def get_indel_info(self, query_name, cigarstr, qrseq, glseq, gene):
    """
    Turn the cigar string for an alignment of <qrseq> to <glseq> into an indel info dict.

    Starts from utils.get_empty_indel(), then:
      - appends one entry to ['indels'] for each 'I' (insertion) or 'D' (deletion) cigar element, with keys
        'type', 'pos', 'len', and 'seqstr' (the inserted query bases, or the deleted germline bases)
      - builds ['reversed_seq'] from the query bases at 'M' positions plus the germline bases at 'D' positions,
        i.e. the query sequence with insertions removed and deletions filled in from germline

    <query_name> and <gene> are only used in the debug printout (when self.debug is set).
    Raises Exception if the cigar string contains a code other than M, S, I, or D.
    """
    # split the cigar string into (code, length) pairs
    cigar_parts = [(chunk[-1], int(chunk[:-1])) for chunk in re.findall('[0-9][0-9]*[A-Z]', cigarstr)]

    indel_types = {'I' : 'insertion', 'D' : 'deletion'}
    indelfo = utils.get_empty_indel()
    codestr = ''  # one cigar code per alignment position
    indel_indices = []  # for each alignment position: index in indelfo['indels'] of the indel at that position (None if there isn't one)
    qpos = 0  # summed length of all the cigar elements before the current one
    for code, length in cigar_parts:
        codestr += length * code
        if code in indel_types:
            indelfo['indels'].append({'type' : indel_types[code], 'pos' : qpos, 'len' : length, 'seqstr' : ''})
            indel_indices += [len(indelfo['indels']) - 1] * length
        else:
            indel_indices += [None] * length
        qpos += length

    qr_pieces, gl_pieces = [], []  # (colored) per-position strings for the debug printout
    iqr, igl = 0, 0  # current positions in <qrseq> and <glseq>
    for icode, code in enumerate(codestr):
        if code == 'S':  # neither index advances at these positions
            continue
        if code == 'M':  # advance both sequences
            qrbase, glbase = qrseq[iqr], glseq[igl]
            qr_pieces.append(qrbase if qrbase == glbase else utils.color('red', qrbase))
            gl_pieces.append(glbase)
            indelfo['reversed_seq'] += qrbase  # add the base to the overall sequence with all indels reversed
            iqr += 1
            igl += 1
        elif code == 'I':  # advance query but not germline
            qr_pieces.append(utils.color('light_blue', qrseq[iqr]))
            gl_pieces.append(utils.color('light_blue', '*'))
            indelfo['indels'][indel_indices[icode]]['seqstr'] += qrseq[iqr]  # and to the sequence of just this indel
            iqr += 1
        elif code == 'D':  # advance germline but not query
            qr_pieces.append(utils.color('light_blue', '*'))
            gl_pieces.append(utils.color('light_blue', glseq[igl]))
            indelfo['reversed_seq'] += glseq[igl]  # add the base to the overall sequence with all indels reversed
            indelfo['indels'][indel_indices[icode]]['seqstr'] += glseq[igl]  # and to the sequence of just this indel
            igl += 1
        else:
            raise Exception('unhandled code %s' % code)

    if self.debug:
        print('\n indels in %s' % query_name)
        print(' %20s %s' % (gene, ''.join(gl_pieces)))
        print(' %20s %s' % ('query', ''.join(qr_pieces)))
        for idl in indelfo['indels']:
            print(' %10s: %d bases at %d (%s)' % (idl['type'], idl['len'], idl['pos'], idl['seqstr']))

    # utils.undo_indels(indelfo)
    # print ' %s' % self.input_info[query_name]['seq']

    return indelfo
def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions):
    """
    Store the annotation for <query_name> in self.info and hand it to the downstream accumulators.

    Fills self.info[query_name] with the k values, the colon-joined names of all matched genes, the cdr3 length and codon
    positions, the deletions and insertions implied by the best match in each region, the indel info, and the sequence,
    then calls utils.add_implicit_info() on it. Also updates self.info['queries'], ['all_best_matches'], and ['all_matches'].
    Afterwards passes the new entry to whichever of self.alfinder, self.pcounter, self.true_pcounter, and self.perfplotter
    are not None, and removes <query_name> from self.remaining_queries.
    """
    assert query_name not in self.info
    self.info['queries'].append(query_name)

    line = {}
    self.info[query_name] = line
    line['unique_id'] = query_name  # redundant, but used somewhere down the line
    line['k_v'] = kvals['v']
    line['k_d'] = kvals['d']
    line['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])  # all gene matches for this query

    line['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3
    line['cyst_position'] = codon_positions['v']
    line['tryp_position'] = codon_positions['j']

    # deletions for the best match in each region: 5p is where the germline match starts, 3p is what's left of the germline gene after the match ends
    for region in ('v', 'd', 'j'):
        gl_bounds = all_germline_bounds[best[region]]
        line[region + '_5p_del'] = gl_bounds[0]
        line[region + '_3p_del'] = len(self.glfo['seqs'][region][best[region]]) - gl_bounds[1]

    # insertions are the bits of the query sequence outside of/between the best matches
    v_qr_bounds, d_qr_bounds, j_qr_bounds = [all_query_bounds[best[r]] for r in ('v', 'd', 'j')]
    line['fv_insertion'] = query_seq[ : v_qr_bounds[0]]
    line['vd_insertion'] = query_seq[v_qr_bounds[1] : d_qr_bounds[0]]
    line['dj_insertion'] = query_seq[d_qr_bounds[1] : j_qr_bounds[0]]
    line['jf_insertion'] = query_seq[j_qr_bounds[1] : ]

    line['indelfo'] = self.info['indels'].get(query_name, utils.get_empty_indel())

    for region in utils.regions:
        line[region + '_gene'] = best[region]
        self.info['all_best_matches'].add(best[region])
        self.info['all_matches'][region] |= set(match_names[region])

    line['seq'] = query_seq  # NOTE this is the seq output by vdjalign, i.e. if we reversed any indels it is the reversed sequence

    # these three were set by hand above
    utils.add_implicit_info(self.glfo, line, multi_seq=False, existing_implicit_keys=('cdr3_length', 'cyst_position', 'tryp_position'))

    if self.debug:
        if not self.args.is_data:
            utils.print_reco_event(self.glfo['seqs'], self.reco_info[query_name], extra_str=' ', label='true:')
        utils.print_reco_event(self.glfo['seqs'], line, extra_str=' ', label='inferred:')

    # NOTE(review): laid out here as four independent checks; confirm against the original indentation that the last two weren't nested under the pcounter check
    if self.alfinder is not None:
        self.alfinder.increment(line)
    if self.pcounter is not None:
        self.pcounter.increment_all_params(line)
    if self.true_pcounter is not None:
        self.true_pcounter.increment_all_params(self.reco_info[query_name])
    if self.perfplotter is not None:
        if query_name in self.info['indels']:  # I just have no idea how to handle naive hamming fraction when there's indels
            print(' skipping performance evaluation of %s because of indels' % query_name)
        else:
            self.perfplotter.evaluate(self.reco_info[query_name], line)

    self.remaining_queries.remove(query_name)
def add_shm_indels(self, reco_event):
    """
    Append an indel info dict to <reco_event>.indelfos for each final sequence, and maybe add an indel to the sequence.

    Each sequence gets an indel with probability self.args.indel_frequency (never, if that's zero). When it does, the
    un-modified sequence is stored as the indelfo's 'reversed_seq' and the entry in <reco_event>.final_seqs is replaced
    by the result of self.add_single_indel().
    """
    debug = self.args.debug
    indel_frequency = self.args.indel_frequency
    if debug and indel_frequency > 0.:
        print(' indels')

    n_seqs = len(reco_event.final_seqs)
    for iseq in range(n_seqs):
        indelfo = utils.get_empty_indel()
        reco_event.indelfos.append(indelfo)  # every sequence gets an indelfo, even if it ends up with no indels

        if indel_frequency == 0.:  # no indels at all (and don't touch the random number generator)
            continue
        if numpy.random.uniform(0, 1) > indel_frequency:  # no indels for this sequence
            if debug:
                print(' 0')
            continue

        new_seq = reco_event.final_seqs[iseq]
        indelfo['reversed_seq'] = new_seq  # the original sequence (i.e. with all the indels reversed)
        n_indels = 1  #numpy.random.geometric(1. / self.args.mean_n_indels)
        if debug:
            print(' %d' % n_indels)
        for _ in range(n_indels):
            new_seq = self.add_single_indel(new_seq, reco_event)
        reco_event.final_seqs[iseq] = new_seq
def try_scratch_erode_insert(tmpline):
    """
    Choose new erosion lengths and insertions for <tmpline> (modified in place), then re-add its implicit info.

    NOTE <self> is not a parameter here: it has to come from the enclosing scope (presumably this is defined inside a method).

    Erosion lengths are geometric draws (minus one), capped so that no more than about half of the gene is removed and,
    for regions with a conserved codon, so that the erosion stops before the codon. For chains other than 'h', the
    length-one dummy d gene is always eroded entirely from the left. Insertion lengths are also geometric draws (minus
    one), with bases chosen according to self.insertion_content_probs.

    Raises AssertionError if the dummy d gene isn't what's expected, or if the resulting line doesn't have exactly one
    entry in 'in_frames'.
    """
    utils.remove_all_implicit_info(tmpline)

    for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
        region = erosion[0]
        gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
        if self.args.chain != 'h' and region == 'd':  # light chains dummy d treatment
            assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.chain]
            tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0  # always erode the whole dummy d from the left
        else:
            # explicit floor division, so that this stays an integer (it ends up as a slice index below) even with true division turned on
            max_erosion = max(0, gene_length // 2 - 2)  # now that, son, is a heuristic
            if region in utils.conserved_codons[self.args.chain]:  # don't erode into the conserved codon
                codon_pos = self.glfo[utils.conserved_codons[self.args.chain][region] + '-positions'][tmpline[region + '_gene']]
                if '3p' in erosion:
                    n_bases_to_codon = gene_length - codon_pos - 3
                elif '5p' in erosion:
                    n_bases_to_codon = codon_pos
                max_erosion = min(max_erosion, n_bases_to_codon)
            tmpline[erosion + '_del'] = min(max_erosion, numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1)

    for bound in utils.boundaries:
        mean_length = utils.scratch_mean_insertion_lengths[self.args.chain][bound]
        length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1
        probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]
        tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

    # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
    gl_seqs = {r : self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
    for erosion in utils.real_erosions:  # chop the eroded bases off of each germline sequence
        region = erosion[0]
        e_length = tmpline[erosion + '_del']
        if '5p' in erosion:
            gl_seqs[region] = gl_seqs[region][e_length:]
        elif '3p' in erosion:
            gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]
    tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
    tmpline['indelfos'] = [utils.get_empty_indel(), ]
    utils.add_implicit_info(self.glfo, tmpline)
    assert len(tmpline['in_frames']) == 1
def add_shm_indels(self, reco_event):
    """
    Set <reco_event>.indelfos to one empty indel info dict per final sequence, and maybe add an indel to each sequence.

    Each sequence gets an indel with probability self.args.indel_frequency (never, if that's zero). When it does, the
    un-modified sequence is stored as its indelfo's 'reversed_seq' and the entry in <reco_event>.final_seqs is replaced
    by the result of self.add_single_indel().
    """
    debug = self.args.debug
    indel_frequency = self.args.indel_frequency
    if debug and indel_frequency > 0.:
        print(' indels')

    n_seqs = len(reco_event.final_seqs)
    reco_event.indelfos = [utils.get_empty_indel() for _ in range(n_seqs)]
    if indel_frequency == 0.:  # no indels at all (and don't touch the random number generator)
        return

    for iseq in range(n_seqs):
        if numpy.random.uniform(0, 1) > indel_frequency:  # no indels for this sequence
            if debug:
                print(' 0')
            continue

        indelfo = reco_event.indelfos[iseq]
        indelfo['reversed_seq'] = reco_event.final_seqs[iseq]  # the original sequence (i.e. with all the indels reversed)
        n_indels = 1  #numpy.random.geometric(1. / self.args.mean_n_indels)
        if debug:
            print(' %d' % n_indels)
        for _ in range(n_indels):
            reco_event.final_seqs[iseq] = self.add_single_indel(reco_event.final_seqs[iseq], indelfo, reco_event.final_codon_positions)
def add_mutants(self, reco_event, irandom):
    """
    Fill <reco_event>.final_seqs with mutated versions of the recombined sequence, one per leaf of a randomly chosen tree.

    Each region is mutated separately (the vd and dj insertions go along with d): a command is prepared with
    self.prepare_bppseqgen() using that region's rescaled tree, the commands are run with utils.run_cmds(), and the
    output is read back with self.read_bppseqgen_output(). The per-region results are then joined for each leaf, the
    conserved codons are reverted, and self.add_shm_indels() is called.

    If self.args.mutation_multiplier is zero, the un-mutated recombined sequence is used as the single final sequence.
    <irandom> is passed on as the seed to self.prepare_bppseqgen().
    """
    mutation_multiplier = self.args.mutation_multiplier
    if mutation_multiplier is not None and mutation_multiplier == 0.:  # some of the stuff below fails if mut mult is actually 0.
        reco_event.final_seqs.append(reco_event.recombined_seq)  # set final sequence in reco_event
        reco_event.indelfos = [utils.get_empty_indel() for _ in range(len(reco_event.final_seqs))]
        return

    # each line looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87
    # where the newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
    chosen_treeinfo = self.treeinfo[random.randint(0, len(self.treeinfo) - 1)]
    treeinfo_fields = chosen_treeinfo.split(';')
    chosen_tree = treeinfo_fields[0] + ';'

    # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the *same* for all the trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to here than at the end of each line in the file
    branch_length_ratios = {}
    for ratio_info in treeinfo_fields[1].split(','):
        ratio_fields = ratio_info.split(':')
        region = ratio_fields[0]
        assert region in utils.regions
        ratio = float(ratio_fields[1])
        if mutation_multiplier is not None:  # multiply the branch lengths by some factor
            ratio *= mutation_multiplier
        branch_length_ratios[region] = ratio

    if self.args.debug:  # NOTE should be the same for t[0-9]... but I guess I should check at some point
        # kind of hackey to just look at t1, but they're all the same anyway and it's just for printing purposes...
        print(' using tree with total depth %f' % treegenerator.get_leaf_node_depths(chosen_tree)['t1'])
        if len(re.findall('t', chosen_tree)) > 1:  # if more than one leaf
            Phylo.draw_ascii(Phylo.read(StringIO(chosen_tree), 'newick'))
        else:
            print(' one leaf')
        ratio_strs = ['%s %f' % (r, branch_length_ratios[r]) for r in utils.regions]
        print(' with branch length ratios ' + ' ' + ', '.join(ratio_strs))  # the extra space is the one that a two-item print statement puts between its items

    scaled_trees = self.get_rescaled_trees(chosen_tree, branch_length_ratios)
    n_leaf_nodes = len(re.findall('t[0-9][0-9]*', chosen_tree))

    cmdfos = []  # one entry per region, in the same order as utils.regions
    for region in utils.regions:
        simstr = reco_event.eroded_seqs[region]
        if region == 'd':  # the insertions get mutated along with d
            simstr = reco_event.insertions['vd'] + simstr + reco_event.insertions['dj']
        cmdfos.append(self.prepare_bppseqgen(simstr, scaled_trees[region], n_leaf_nodes, reco_event.genes[region], reco_event, seed=irandom))

    # shenanigan is to handle zero-length regional seqs
    utils.run_cmds([cmdfo for cmdfo in cmdfos if cmdfo is not None], sleep=False)

    mseqs = {}
    for region, region_cmdfo in zip(utils.regions, cmdfos):
        if region_cmdfo is None:
            mseqs[region] = [''] * n_leaf_nodes  # an empty string for each leaf node
        else:
            mseqs[region] = self.read_bppseqgen_output(region_cmdfo, n_leaf_nodes)

    assert len(reco_event.final_seqs) == 0
    for iseq in range(n_leaf_nodes):
        joined_seq = mseqs['v'][iseq] + mseqs['d'][iseq] + mseqs['j'][iseq]
        # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
        reco_event.final_seqs.append(reco_event.revert_conserved_codons(joined_seq))  # set final sequence in reco_event

    self.add_shm_indels(reco_event)
def try_scratch_erode_insert(self, tmpline, debug=False):
    """
    Choose new erosion lengths and insertions for <tmpline> (modified in place), then re-add its implicit info.

    Erosion lengths are geometric draws (minus one), capped so that no more than about half of the gene is removed and,
    for regions with a conserved codon, so that the erosion stops before the codon. If the locus has no d gene, the
    length-one dummy d gene is always eroded entirely from the left. Insertion lengths are also geometric draws (minus
    one), with bases chosen according to self.insertion_content_probs.

    If <debug> is set, prints the chosen erosions and insertions.
    Raises AssertionError if the dummy d gene isn't what's expected, or if the resulting line doesn't have exactly one
    entry in 'in_frames'.
    """
    utils.remove_all_implicit_info(tmpline)

    for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
        region = erosion[0]
        gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
        if region == 'd' and not utils.has_d_gene(self.args.locus):  # dummy d genes: always erode the whole thing from the left
            assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.locus]
            tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0
        else:
            # explicit floor division, so that this stays an integer (it ends up as a slice index below) even with true division turned on
            max_erosion = max(0, gene_length // 2 - 2)  # heuristic
            if region in utils.conserved_codons[self.args.locus]:  # make sure not to erode a conserved codon
                codon_pos = self.glfo[utils.conserved_codons[self.args.locus][region] + '-positions'][tmpline[region + '_gene']]
                if '3p' in erosion:
                    n_bases_to_codon = gene_length - codon_pos - 3
                elif '5p' in erosion:
                    n_bases_to_codon = codon_pos
                max_erosion = min(max_erosion, n_bases_to_codon)
            tmpline[erosion + '_del'] = min(max_erosion, numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1)

    for bound in utils.boundaries:
        mean_length = utils.scratch_mean_insertion_lengths[self.args.locus][bound]
        length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1
        probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]
        tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

    if debug:
        print(' erosions: %s' % (' '.join([('%s %d' % (e, tmpline[e + '_del'])) for e in utils.real_erosions])))
        print(' insertions: %s' % (' '.join([('%s %s' % (b, tmpline[b + '_insertion'])) for b in utils.boundaries])))

    # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
    gl_seqs = {r : self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
    for erosion in utils.real_erosions:  # chop the eroded bases off of each germline sequence
        region = erosion[0]
        e_length = tmpline[erosion + '_del']
        if '5p' in erosion:
            gl_seqs[region] = gl_seqs[region][e_length:]
        elif '3p' in erosion:
            gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]
    tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
    tmpline['indelfos'] = [utils.get_empty_indel(), ]
    utils.add_implicit_info(self.glfo, tmpline)
    assert len(tmpline['in_frames']) == 1