def add_single_indel(indelfo, pos, length, gapped_codon_positions, keep_in_frame=False, debug=False):
    """
    Add one indel of <length> bases at <pos> to <indelfo>, picking insertion or deletion with equal probability.
    NOTE modifies <indelfo> ('qr_gap_seq', 'gl_gap_seq', and 'indels') and, for insertions, <gapped_codon_positions>.
    If the indel's bases contain gap characters, prints a failure message and returns before modifying anything.
    Always returns None. <keep_in_frame> is accepted but not used here.
    """
    is_insertion = numpy.random.uniform(0, 1) < 0.5  # fifty-fifty chance of insertion and deletion
    if is_insertion:
        # random bases, drawn one at a time from utils.nukes
        indel_bases = ''.join(utils.nukes[random.randint(0, len(utils.nukes) - 1)] for _ in range(length))
        ifo = {'type' : 'insertion', 'pos' : pos, 'len' : length, 'seqstr' : indel_bases}
    else:
        # NOTE it's kind of unclear whether this should be the bit in the qr or gl seq. Using the gl like this probably makes more sense,
        # since it corresponds to what we would infer in s-w (i.e., if we _do_ delete some SHMd positions, we will never know about it, so who cares)
        indel_bases = indelfo['gl_gap_seq'][pos : pos + length]
        ifo = {'type' : 'deletion', 'pos' : pos, 'len' : length, 'seqstr' : indel_bases}

    if utils.gap_len(ifo['seqstr']) > 0:  # this is a backup for the uncommon cases where overlaps() in the calling fcn doesn't catch something
        print(' failed adding indel (overlaps with previous one)')
        return

    old_qr = indelfo['qr_gap_seq']
    gap_str = length * utils.gap_chars[0]
    if is_insertion:
        old_gl = indelfo['gl_gap_seq']
        indelfo['qr_gap_seq'] = old_qr[:pos] + ifo['seqstr'] + old_qr[pos:]
        indelfo['gl_gap_seq'] = old_gl[:pos] + gap_str + old_gl[pos:]
        for region in gapped_codon_positions:
            if gapped_codon_positions[region] > pos:  # this isn't right if the indel is actually in the codon, but in that case we just let the messed up codon through below
                gapped_codon_positions[region] += length
        for otherfo in indelfo['indels']:  # correct the positions of any existing indels that're to the right of this one
            if otherfo['pos'] > pos:
                otherfo['pos'] += ifo['len']
    else:
        indelfo['qr_gap_seq'] = old_qr[:pos] + gap_str + old_qr[pos + length :]

    cyst_ok = utils.codon_unmutated('cyst', indelfo['qr_gap_seq'], gapped_codon_positions['v'])
    if debug and not cyst_ok:
        print(' adding indel within %s codon' % 'cyst')

    indelfo['indels'].append(ifo)
    indelfo['indels'] = sorted(indelfo['indels'], key=lambda info: info['pos'])  # keep the list ordered by position

    if debug:
        print(get_dbg_str(indelfo))
def generate_snpd_gene(gene, cpos, seq, positions):
    """
    Apply one snp to <seq> for each entry in <positions> and return the new allele's info.
    An entry of None means "choose a random position" (one not already used, and that doesn't disturb the cyst codon at <cpos>).
    Returns a dict with keys 'template-gene', 'gene' (the new allele name), and 'seq' (the mutated sequence).
    """
    assert utils.get_region(gene) == 'v'  # others not yet handled

    def choose_position():
        # keep drawing until we get an unused position whose mutation leaves the cyst codon intact
        while True:
            candidate = random.randint(0, len(seq) - 1)  # note that randint() is inclusive
            probe_seq = seq[: candidate] + 'X' + seq[candidate + 1 :]  # for checking cyst position
            if candidate in snpd_positions:
                continue
            if utils.codon_unmutated('cyst', probe_seq, cpos, debug=True):
                return candidate

    snpd_positions = set()  # consulted by choose_position(), i.e. when a position wasn't specified (was None)
    mutfo = OrderedDict()
    for requested_pos in positions:
        snp_pos = choose_position() if requested_pos is None else requested_pos
        snpd_positions.add(snp_pos)

        # draw bases until we get one that differs from the current base
        while True:
            new_base = utils.nukes[random.randint(0, len(utils.nukes) - 1)]
            if new_base != seq[snp_pos]:
                break
        original_base = seq[snp_pos]
        print(' %3d %s --> %s' % (snp_pos, original_base, new_base))
        mutfo[snp_pos] = {'original' : original_base, 'new' : new_base}
        seq = seq[: snp_pos] + new_base + seq[snp_pos + 1 :]

    assert utils.codon_unmutated('cyst', seq, cpos, debug=True)  # this is probably unnecessary
    snpd_name, mutfo = get_new_allele_name_and_change_mutfo(gene, mutfo)
    return {'template-gene' : gene, 'gene' : snpd_name, 'seq' : seq}
def remove_v_genes_with_bad_cysteines(glfo, debug=False):
    """
    Remove from <glfo> every v gene whose cyst codon is mutated or whose sequence is not in frame.
    NOTE modifies <glfo> (via remove_gene()). <debug> is only passed through to remove_gene(); the summary line is always printed.
    """
    prelength = len(glfo['seqs']['v'])
    for gene in list(glfo['seqs']['v']):  # iterate over an explicit copy of the keys, since we modify the dict in the loop
        gene_seq = glfo['seqs']['v'][gene]
        cyst_pos = glfo['cyst-positions'][gene]
        mutated = not utils.codon_unmutated('cyst', gene_seq, cyst_pos)
        in_frame = utils.in_frame_germline_v(gene_seq, cyst_pos)
        if mutated or not in_frame:
            remove_gene(glfo, gene, debug=debug)

    # printed unconditionally (the original guard was 'if True:  # debug:')
    # the denominator is the number of genes we started with, so this reads "removed N out of the original M"
    n_removed = prelength - len(glfo['seqs']['v'])
    print(' removed %d / %d v genes with bad cysteines' % (n_removed, prelength))
def add_single_indel(seq, indelfo, mean_length, codon_positions, indel_location=None, pos=None, keep_in_frame=False, debug=False):
    """
    Add one insertion or deletion (equal probability) to <seq> and return the resulting sequence.
    NOTE modifies <indelfo> and <codon_positions>.
    If <pos> is specified we use that, otherwise <indel_location> decides the part of the sequence from which a position is drawn
    (None: anywhere except near the ends, 'v': within the v, 'cdr3': between the two conserved codons).
    The length is drawn from a geometric distribution with mean <mean_length>; with <keep_in_frame> it is redrawn until it is a multiple of three.
    """
    if pos is None:
        if indel_location is None:  # uniform over entire sequence
            pos = random.randint(5, len(seq) - 6)  # this will actually exclude either before the first index or after the last index. No, I don't care.
        elif indel_location == 'v':  # within the meat of the v
            pos = random.randint(5, codon_positions['v'])
        elif indel_location == 'cdr3':  # inside cdr3
            pos = random.randint(codon_positions['v'], codon_positions['j'])
        else:
            assert False

    success_prob = 1. / mean_length
    length = numpy.random.geometric(success_prob)
    if keep_in_frame:
        n_redraws = 0
        while length % 3 != 0:
            length = numpy.random.geometric(success_prob)
            n_redraws += 1
            if n_redraws > 99:
                raise Exception('tried too many times to get in-frame indel length')

    if numpy.random.uniform(0, 1) < 0.5:  # fifty-fifty chance of insertion and deletion
        new_seq = add_insertion(indelfo, seq, pos, length, debug=debug)
    else:
        new_seq = seq[:pos] + seq[pos + length:]  # delete <length> bases beginning with <pos>
        deletion_info = {'type': 'deletion', 'pos': pos, 'len': length, 'seqstr': seq[pos:pos + length]}
        indelfo['indels'].append(deletion_info)
        if debug:
            print(' deleting %d bases at %d' % (length, pos))

    # shift any codon position that's to the right of the indel
    for region in codon_positions:
        if codon_positions[region] > pos:  # this isn't right if the indel is actually in the codon, but in that case we just let the messed up codon through below
            codon_positions[region] += sign(indelfo['indels'][-1]) * length

    if not utils.codon_unmutated('cyst', new_seq, codon_positions['v']):
        print(' adding indel within %s codon' % 'cyst')

    return new_seq
def revert_conserved_codons(self, seq, debug=False):
    """ put the original (unmutated) bases back at each conserved codon position in <seq>, e.g. if s.h.m. messed them up, and return the result """
    # NOTE this happens *before* shm indels, i.e. we use self.post_erosion_codon_positions rather than self.final_codon_positions
    for region, pos in self.post_erosion_codon_positions.items():
        original_codon = self.unmutated_codons[region]
        current_codon = seq[pos : pos + 3]
        if current_codon != original_codon:
            assert len(original_codon) == 3
            if debug:
                print(' reverting %s --> %s' % (current_codon, original_codon))
            seq = seq[:pos] + original_codon + seq[pos + 3 :]
        # whether or not we had to revert, the codon at <pos> should now be ok
        assert utils.codon_unmutated(utils.conserved_codons[self.glfo['locus']][region], seq, pos)
    return seq
def revert_conserved_codons(self, seq, debug=False):
    """ put the original (unmutated) bases back at each conserved codon position in <seq>, e.g. if s.h.m. messed them up, and return the result """
    # NOTE this happens *before* shm indels, i.e. we use self.post_erosion_codon_positions rather than self.final_codon_positions
    for region, pos in self.post_erosion_codon_positions.items():
        original_codon = self.unmutated_codons[region]
        current_codon = seq[pos:pos + 3]
        if current_codon != original_codon:
            assert len(original_codon) == 3
            if debug:
                # this doesn't happen *much* any more, but bppseqgen barfs if we pass it rates that are exactly zero, so it still happens sometimes
                print(' reverting %s --> %s' % (current_codon, original_codon))
            seq = seq[:pos] + original_codon + seq[pos + 3:]
        # whether or not we had to revert, the codon at <pos> should now be ok
        assert utils.codon_unmutated(utils.conserved_codons[self.glfo['locus']][region], seq, pos)
    return seq
def trim_and_remove_genes(region, gene, seq, glfo, template_glfo, debug=False):
    """
    Compare <seq> (for <gene>) to the most similarly-named gene in <template_glfo>, and make its 5' end consistent with that template.
    Three cases, based on the first column of the pairwise alignment:
      - neither sequence starts with a gap: nothing to do
      - <seq> starts with a gap (it's shorter than the template): remove <gene> from <glfo>
      - the template starts with a gap (<seq> has extra leading bases): trim those bases from <seq>, then either store the trimmed
        sequence and shift the cyst position, or remove <gene> if the trimmed sequence is already in <glfo>
    NOTE modifies <glfo>. Always returns None.
    """
    nearest_template_gene = glutils.find_nearest_gene_using_names(template_glfo, gene)
    nearest_template_seq = template_glfo['seqs'][region][nearest_template_gene]
    # extra_bases = glfo['cyst-positions'][gene] - template_glfo['cyst-positions'][nearest_template_gene]  # not right if there's some internal gaps in the alignment
    aligned_nearest_template_seq, aligned_seq = utils.align_seqs(nearest_template_seq, seq)

    if debug:
        print(' %s' % utils.color_gene(gene))
        utils.color_mutants(aligned_nearest_template_seq, aligned_seq, print_result=True, ref_label='template ', extra_str=' ')

    if aligned_seq[0] in utils.gap_chars:  # <seq> is missing bases that the template has
        if debug:
            print(' %s, removing' % utils.color('red', 'too small'))
        glutils.remove_gene(glfo, gene)
        return

    if aligned_nearest_template_seq[0] not in utils.gap_chars:  # both start at the same place
        if debug:
            print(' ok')
        return

    # the template starts with gaps, i.e. <seq> has extra bases on the left
    if debug:
        print(' extra bases %s' % utils.color_gene(gene))
    # count leading gaps using the same set of gap characters that we used to detect them above (a hard-coded '-' would give
    # zero for any other gap character, in which case the untrimmed seq would match its own entry in <glfo> and get removed)
    all_gap_chars = ''.join(utils.gap_chars)
    extra_bases = len(aligned_nearest_template_seq) - len(aligned_nearest_template_seq.lstrip(all_gap_chars))
    seq = seq[extra_bases:]
    if debug:
        print(' removed %d bases' % extra_bases)

    if seq in glfo['seqs'][region].values():
        duplicate_names = [utils.color_gene(g) for g, s in glfo['seqs'][region].items() if s == seq]
        print(' trimmed seq already in glfo under name %s, so removing it' % ' '.join(duplicate_names))
        glutils.remove_gene(glfo, gene, debug=True)
        return

    glfo['seqs'][region][gene] = seq
    glfo['cyst-positions'][gene] -= extra_bases
    # utils.color_mutants(nearest_template_seq, seq, print_result=True, ref_label='template ', align=True, extra_str=' ')
    assert utils.codon_unmutated('cyst', glfo['seqs'][region][gene], glfo['cyst-positions'][gene], debug=True)
def check_a_bunch_of_codons(codon, seqons, extra_str='', debug=False):  # seqons: list of (seq, pos) pairs
    """ check <codon> at <pos> in each of a list of sequences, and (if <debug>) print a one-line summary of how many were ok, too short, or mutated """
    n_total = n_ok = n_too_short = n_bad_codons = 0
    for seq, pos in seqons:
        n_total += 1
        if len(seq) < pos + 3:  # sequence ends before the end of the codon
            n_too_short += 1
        elif utils.codon_unmutated(codon, seq, pos):
            n_ok += 1
        else:
            n_bad_codons += 1

    if not debug:
        return

    # build the whole summary line, then print it once (categories with zero counts are skipped)
    pieces = ['%s%d %s positions:' % (extra_str, n_total, codon)]
    for count, template in ((n_ok, ' %d ok'), (n_too_short, ' %d too short'), (n_bad_codons, ' %d mutated')):
        if count > 0:
            pieces.append(template % count)
    pieces.append('')  # gives the line a single trailing space before the newline
    print(' '.join(pieces))
def choose_position():
    """ draw random positions in <seq> until we find one that isn't in <snpd_positions> and whose mutation leaves the cyst codon at <cpos> intact """
    while True:
        candidate = random.randint(0, len(seq) - 1)  # note that randint() is inclusive
        probe_seq = seq[: candidate] + 'X' + seq[candidate + 1 :]  # for checking cyst position
        if candidate in snpd_positions:  # already used
            continue
        if utils.codon_unmutated('cyst', probe_seq, cpos, debug=True):
            return candidate
def get_missing_codon_info(glfo, debug=False):
    """
    Fill in conserved codon positions for any genes in <glfo> that have a sequence but no entry in glfo[<codon> + '-positions'].
    For each (region, codon) pair in utils.conserved_codons for this locus: align the region's sequences, find one gene with a known, unmutated
    codon position, convert that to a position in the alignment, and from that work out the unaligned position for each missing gene.
    NOTE modifies <glfo>. Raises Exception if no usable known position exists. Set <debug> greater than 1 to also print each aligned sequence.
    """
    # debug = 2
    for region, codon in utils.conserved_codons[glfo['locus']].items():
        # genes for which we have a sequence but no codon position
        missing_genes = set(glfo['seqs'][region]) - set(glfo[codon + '-positions'])
        if len(missing_genes) == 0:
            if debug:
                print ' no missing %s info' % codon
            continue

        if debug:
            print ' missing %d %s positions' % (len(missing_genes), codon)

        aligned_seqs = get_new_alignments(glfo, region, debug=debug)

        # if region == 'j':
        #     raise Exception('missing tryp position for %s, and we can\'t infer it because tryp positions don\'t reliably align to the same position' % ' '.join(missing_genes))

        # existing codon position (this assumes that once aligned, all genes have the same codon position -- which is only really true for the imgt-gapped alignment)
        if len(glfo[codon + '-positions']) > 0:
            known_gene, known_pos = None, None
            # genes that we skip, kept only so we can list them in the exception message below
            known_but_not_in_glfo, known_but_unaligned, known_but_mutated = [], [], []
            for gene, pos in glfo[codon + '-positions'].items():  # take the first one for which we have the sequence (NOTE it would be safer to check that they're all the same)
                if gene not in glfo['seqs'][region]:
                    known_but_not_in_glfo.append(gene)
                    continue
                if gene not in aligned_seqs:
                    known_but_unaligned.append(gene)
                    continue
                if not utils.codon_unmutated(codon, glfo['seqs'][region][gene], pos):
                    known_but_mutated.append(gene)
                    continue
                known_gene, known_pos = gene, pos
                break
            if known_gene is None:
                raise Exception('couldn\'t find a known %s position\n known but not in glfo: %s\n known but unaligned: %s\n known but mutated: %s' % (codon, ' '.join(known_but_not_in_glfo), ' '.join(known_but_unaligned), ' '.join(known_but_mutated)))
            # NOTE for cyst, should be 309 if alignments are imgt [which they used to usually be, but now probably aren't] (imgt says 104th codon --> subtract 1 to get zero-indexing, then multiply by three 3 * (104 - 1) = 309
            known_pos_in_alignment = get_pos_in_alignment(codon, aligned_seqs[known_gene], glfo['seqs'][region][known_gene], known_pos, debug=debug)
            if debug:
                print ' using known position %d (aligned %d) from %s' % (known_pos, known_pos_in_alignment, known_gene)
        elif codon == 'cyst':
            # this branch always ends in the raise below, so the assumed position is only ever reported, never used
            known_pos_in_alignment = 309
            print ' assuming aligned %s position is %d (this will %s work if you\'re using imgt alignments)' % (codon, known_pos_in_alignment, utils.color('red', 'only'))
            raise Exception('not really using imgt alignments much any more, so this isn\'t really going to work')
        else:
            raise Exception('no existing %s info, and couldn\'t guess it, either' % codon)

        n_added = 0  # NOTE this also counts <known_gene>, since it's included in the loop below
        seqons = []  # (seq, pos) pairs
        for gene in [known_gene] + list(missing_genes):
            # position in the unaligned sequence: the aligned position minus the number of gaps counted up to it
            unaligned_pos = known_pos_in_alignment - utils.count_gaps(aligned_seqs[gene], istop=known_pos_in_alignment)
            seq_to_check = glfo['seqs'][region][gene]
            seqons.append((seq_to_check, unaligned_pos))
            glfo[codon + '-positions'][gene] = unaligned_pos
            n_added += 1
            if debug > 1:
                # print the aligned sequence with the codon highlighted, flagging it if it isn't one of the expected codons
                tmpseq = aligned_seqs[gene]
                tmppos = known_pos_in_alignment
                print ' %s%s%s %s %3s %5s' % (tmpseq[:tmppos], utils.color('reverse_video', tmpseq[tmppos : tmppos + 3]), tmpseq[tmppos + 3:], utils.color_gene(gene, width=12 if region == 'v' else 8), '' if tmpseq[tmppos : tmppos + 3] in utils.codon_table[codon] else utils.color('red', 'bad'), 'new' if gene != known_gene else '')

        # check (and, if <debug>, summarize) the codons at the positions we just set
        check_a_bunch_of_codons(codon, seqons, extra_str=' ', debug=debug)
        if debug:
            print ' added %d %s positions' % (n_added, codon)
def get_pos_in_alignment(codon, aligned_seq, seq, pos, debug=False):
    """ convert the codon position <pos> in the unaligned <seq> to the corresponding position in <aligned_seq>, and return it """
    # this only gets called on the gene with the *known* position, so it shouldn't fail
    assert utils.codon_unmutated(codon, seq, pos, debug=debug)

    # shift right by the number of gaps reported for <aligned_seq> up to <pos>
    n_gaps_before = get_n_gaps_up_to_pos(aligned_seq, pos)
    pos_in_alignment = pos + n_gaps_before

    # the codon should be intact at the shifted position as well
    assert utils.codon_unmutated(codon, aligned_seq, pos_in_alignment, debug=debug)
    return pos_in_alignment