def __init__(self, l):
    """Populate the gene record from one row of the gene table.

    `l` is indexed by the string keys 'id', 'organism', 'chain', 'region',
    'nucseq', 'aligned_protseq', 'cdrs', 'cdr_columns' and 'frame'.

    Raises AssertionError when the frame is not one of the accepted
    spellings, when the translated nucleotide sequence disagrees with the
    de-gapped aligned protein sequence, or when the listed CDR strings do
    not match the aligned sequence at the listed columns.
    """
    self.id = l['id']
    self.organism = l['organism']
    self.chain = l['chain']
    self.region = l['region']
    self.nucseq = l['nucseq']
    self.alseq = l['aligned_protseq']

    ## an empty 'cdrs' field means the gene has no annotated cdrs
    if l['cdrs']:
        self.cdrs = l['cdrs'].split(cdrs_sep)
    else:
        self.cdrs = []

    ## column ranges are kept 1-indexed, inclusive 'start-stop' pairs
    ## (see the slice in the sanity check at the bottom)
    if self.cdrs:
        self.cdr_columns = [
            [int(bound) for bound in column_range.split('-')]
            for column_range in l['cdr_columns'].split(cdrs_sep)
        ]
    else:
        self.cdr_columns = []

    frame = l['frame']
    assert frame in ['+1', '+2', '+3', '1', '2', '3']
    ## last character is the frame digit; store it as 0, 1 or 2
    self.nucseq_offset = int(frame[-1]) - 1

    self.protseq = translation.get_translation(self.nucseq, frame)[0]

    ## sanity check: translation must equal the aligned sequence minus gaps
    assert self.protseq == self.alseq.replace(gap_character, '')

    if self.cdrs:
        cdrs_from_columns = [self.alseq[cols[0] - 1:cols[1]]
                             for cols in self.cdr_columns]
        assert self.cdrs == cdrs_from_columns
id = '' for line in open(fastafile, 'r'): if line[0] == '>': id = line[1:-1] myfasta[np][id] = '' else: assert id myfasta[np][id] += line[:-1] all_fasta_sd[organism][ab][vj] = myfasta for id in myfasta[prot]: assert id in myfasta[nuc] pseq = myfasta[prot][id] nseq = myfasta[nuc][id] myframe = -1 for i in range(3): tseq = get_translation(nseq, '+{}'.format(i + 1))[0] if pseq in tseq: myframe = i + 3 * tseq.index(pseq) assert myframe >= 0 num_after = len(nseq) - 3 * len(pseq) - myframe all_offsets[organism][ab][vj][id] = (myframe, num_after) ## make a single tsv file with the following fields ## ## id organism region ## chain is A or B -- where A means alpha-like (VJ recombining) and B means beta-like (VDJ recombining) ## region is V D J ## ## cdrs: comma-separated list of protein sequences for cdr regions ## outfields = "id organism chain region nucseq frame aligned_protseq cdr_columns cdrs".split(
def beta_cdr3_protseq_probability(theid, organism, v_gene, j_gene, cdr3_protseq, cdr3_nucseq='', error_threshold=0.05, verbose=False, allow_early_nucseq_mismatches=True, return_final_cdr3_nucseq=False): nucleotide_match = (cdr3_nucseq != '') if nucleotide_match: assert not cdr3_protseq cdr3_protseq = translation.get_translation(cdr3_nucseq, '+1')[0] assert len(cdr3_nucseq) == 3 * len(cdr3_protseq) ab = 'B' assert all_genes[organism][v_gene].chain == ab v_nucseq = get_v_cdr3_nucseq(organism, v_gene) j_nucseq = get_j_cdr3_nucseq(organism, j_gene) ## what is the largest amount of these nucseqs we could preserve and still get cdr3_protseq max_v_germline = 0 max_j_germline = 0 len_v_nucseq = len(v_nucseq) len_j_nucseq = len(j_nucseq) len_cdr3_nucseq = len(cdr3_nucseq) len_cdr3_protseq = len(cdr3_protseq) if nucleotide_match: if allow_early_nucseq_mismatches: mismatch_score = default_mismatch_score_for_cdr3_nucseq_probabilities else: mismatch_score = -100 max_v_germline = count_matches(v_nucseq, cdr3_nucseq, mismatch_score) max_j_germline = count_matches(''.join(reversed(list(j_nucseq))), ''.join(reversed(list(cdr3_nucseq))), mismatch_score) if allow_early_nucseq_mismatches: ## obliterate the mismatches now max_v, max_j = max_v_germline, max_j_germline if max_v + max_j > len(cdr3_nucseq): ## some overlap! extra = max_v + max_j - len(cdr3_nucseq) #print 'TRIM extra',extra fake_v_trim = extra / 2 ## now dterministic fake_j_trim = extra - fake_v_trim max_v -= fake_v_trim max_j -= fake_j_trim old_cdr3_nucseq = cdr3_nucseq[:] cdr3_nucseq = v_nucseq[:max_v] + \ cdr3_nucseq[ max_v : len_cdr3_nucseq-max_j ] + \ j_nucseq[len_j_nucseq-max_j:] if old_cdr3_nucseq != cdr3_nucseq: Log('{} early_cdr3a_nucseq_mismatch: before {} after {}'. format(theid, old_cdr3_nucseq, cdr3_nucseq)) assert len(cdr3_nucseq) == len(old_cdr3_nucseq) else: ## V for i in range(len(v_nucseq)): i_aa = i / 3 ## which aa do we code for? 
len_codon = (i % 3) + 1 if i_aa >= len(cdr3_protseq): break start = 3 * i_aa codon = v_nucseq[start:start + len_codon] target_aa = cdr3_protseq[i_aa] matched = False for c in reverse_genetic_code[target_aa]: if c.startswith(codon): matched = True if verbose: print 'V', codon, target_aa, matched if matched: max_v_germline = i + 1 else: break ## J for i in range(len_j_nucseq): i_aa = i / 3 ## which aa do we code for? len_codon = (i % 3) + 1 if i_aa >= len(cdr3_protseq): break end = len(j_nucseq) - 3 * i_aa codon = j_nucseq[max(0, end - len_codon):end] target_aa = cdr3_protseq[len_cdr3_protseq - 1 - i_aa] matched = False for c in reverse_genetic_code[target_aa]: if c.endswith(codon): matched = True if verbose: print 'J', codon, target_aa, matched if matched: max_j_germline = i + 1 else: break if verbose: print 'max_v_germline:', max_v_germline, len(v_nucseq) ## how about J? min_insert = 3 * len_cdr3_protseq - max_v_germline - max_j_germline if verbose: print 'max_j_germline:',max_j_germline, len_j_nucseq,cdr3_protseq,\ all_genes[organism][j_gene].protseq print 'min_insert:', min_insert, max_v_germline, max_j_germline if organism in ['human', 'mouse'] and j_gene[3] == 'B': trbj_index = int(j_gene[4]) ## to decide which d genes to allow assert trbj_index in [1, 2] else: ## no D/J compatibility check trbj_index = 0 total_prob = 0.0 min_extra_trim = max(0, -1 * min_insert) dids = tcr_rearrangement.all_trbd_nucseq[organism].keys() for extra_trim in range(min_extra_trim, 100): old_total_prob = total_prob total_prob_this_trim = 0.0 for extra_v_trim in range(0, extra_trim + 1): extra_j_trim = extra_trim - extra_v_trim v_trim = len_v_nucseq - max_v_germline + extra_v_trim j_trim = len_j_nucseq - max_j_germline + extra_j_trim if v_trim > len_v_nucseq or j_trim > len_j_nucseq: continue n_insert = min_insert + extra_v_trim + extra_j_trim assert n_insert >= 0 ## b/c of min_extra_trim total_prob_this_insert = 0.0 ## now we are looking to fit part of the D gene into this middle 
region and still code for the right aas for did in dids: if trbj_index == 1: if did == 1: did_prob = 1.0 else: continue else: did_prob = 1.0 / float(len(dids)) d_nucseq = tcr_rearrangement.all_trbd_nucseq[organism][did] len_d_nucseq = len(d_nucseq) for d0_trim in range(len_d_nucseq + 1): for d1_trim in range(len_d_nucseq + 1): len_d_insert = len_d_nucseq - d0_trim - d1_trim if len_d_insert < 0 or len_d_insert > n_insert: continue #if len_d_insert == 0 and d1_trim: continue ## only hit this one once! d_insert = d_nucseq[d0_trim:len_d_nucseq - d1_trim] num_n = n_insert - len_d_insert for num_n_before_d in range(num_n + 1): num_n_after_d = num_n - num_n_before_d assert num_n_after_d >= 0 n_nucseq = (v_nucseq[:len_v_nucseq - v_trim] + 'n' * num_n_before_d + d_insert + 'n' * num_n_after_d + j_nucseq[j_trim:]) assert len(n_nucseq) == 3 * len_cdr3_protseq trim_prob = tcr_rearrangement.get_beta_trim_probs( organism, did, v_trim, d0_trim, d1_trim, j_trim, num_n_before_d, num_n_after_d) if not trim_prob: continue if nucleotide_match: assert len(n_nucseq) == len_cdr3_nucseq matched = True #print n_nucseq, cdr3_nucseq for a, b in zip(n_nucseq, cdr3_nucseq): if a != b and a != 'n': matched = False if matched: coding_prob = 0.25**num_n else: coding_prob = 0.0 else: coding_prob = get_coding_probability( n_nucseq, cdr3_protseq) prob = did_prob * coding_prob * trim_prob total_prob_this_insert += prob ## just for status output total_prob_this_trim += prob total_prob += prob if verbose and coding_prob: print 'coding_prob:',cdr3_protseq,"trims:",v_trim,d0_trim,d1_trim,j_trim,\ "inserts:",num_n_before_d,num_n_after_d,\ "d_insert:",d_insert,\ "total_prob:",total_prob,"prob:",prob,"coding_prob:",coding_prob,\ "trim_prob:",trim_prob,n_nucseq if verbose: print 'n_insert:',n_insert,extra_v_trim,extra_j_trim,'total_prob:',total_prob,\ 'total_prob_this_insert:',total_prob_this_insert if extra_trim > 2 and total_prob_this_trim < error_threshold * old_total_prob: break if 
return_final_cdr3_nucseq: return total_prob, cdr3_nucseq else: return total_prob
def alpha_cdr3_protseq_probability(theid, organism, v_gene, j_gene, cdr3_protseq, cdr3_nucseq='', error_threshold=0.05, verbose=False, allow_early_nucseq_mismatches=True, return_final_cdr3_nucseq=False): nucleotide_match = (cdr3_nucseq != '') if nucleotide_match: assert not cdr3_protseq cdr3_protseq = translation.get_translation(cdr3_nucseq, '+1')[0] assert len(cdr3_nucseq) == 3 * len(cdr3_protseq) ab = 'A' assert all_genes[organism][v_gene].chain == ab v_nucseq = get_v_cdr3_nucseq(organism, v_gene) j_nucseq = get_j_cdr3_nucseq(organism, j_gene) ## what is the largest amount of these nucseqs we could preserve and still get cdr3_protseq max_v_germline = 0 len_v_nucseq = len(v_nucseq) max_j_germline = 0 len_j_nucseq = len(j_nucseq) len_cdr3_protseq = len(cdr3_protseq) len_cdr3_nucseq = len(cdr3_nucseq) if nucleotide_match: if allow_early_nucseq_mismatches: mismatch_score = default_mismatch_score_for_cdr3_nucseq_probabilities else: mismatch_score = -100 max_v_germline = count_matches(v_nucseq, cdr3_nucseq, mismatch_score) max_j_germline = count_matches(''.join(reversed(list(j_nucseq))), ''.join(reversed(list(cdr3_nucseq))), mismatch_score) if allow_early_nucseq_mismatches: ## obliterate the mismatches now max_v, max_j = max_v_germline, max_j_germline if max_v + max_j > len(cdr3_nucseq): ## some overlap! extra = max_v + max_j - len(cdr3_nucseq) #print 'TRIM extra',extra fake_v_trim = extra / 2 ## now dterministic fake_j_trim = extra - fake_v_trim max_v -= fake_v_trim max_j -= fake_j_trim old_cdr3_nucseq = cdr3_nucseq[:] cdr3_nucseq = v_nucseq[:max_v] + \ cdr3_nucseq[ max_v : len_cdr3_nucseq-max_j ] + \ j_nucseq[len_j_nucseq-max_j:] if old_cdr3_nucseq != cdr3_nucseq: Log('{} early_cdr3a_nucseq_mismatch: {} {} before {} after {}'. format(theid, v_gene, j_gene, old_cdr3_nucseq, cdr3_nucseq)) assert len(cdr3_nucseq) == len(old_cdr3_nucseq) else: for i in range(len(v_nucseq)): i_aa = i / 3 ## which aa do we code for? 
len_codon = (i % 3) + 1 if i_aa >= len(cdr3_protseq): break start = 3 * i_aa codon = v_nucseq[start:start + len_codon] target_aa = cdr3_protseq[i_aa] matched = False for c in reverse_genetic_code[target_aa]: if c.startswith(codon): matched = True if verbose: print 'V', codon, target_aa, matched if matched: max_v_germline = i + 1 else: break ## how about J? for i in range(len_j_nucseq): i_aa = i / 3 ## which aa do we code for? len_codon = (i % 3) + 1 if i_aa >= len(cdr3_protseq): break end = len(j_nucseq) - 3 * i_aa codon = j_nucseq[max(0, end - len_codon):end] target_aa = cdr3_protseq[len_cdr3_protseq - 1 - i_aa] matched = False for c in reverse_genetic_code[target_aa]: if c.endswith(codon): matched = True if verbose: print 'J', codon, target_aa, matched if matched: max_j_germline = i + 1 else: break min_insert = 3 * len_cdr3_protseq - max_v_germline - max_j_germline if verbose: print 'max_v_germline:', max_v_germline, len_v_nucseq, v_nucseq, cdr3_nucseq print 'max_j_germline:',max_j_germline, len_j_nucseq, j_nucseq, cdr3_nucseq, \ all_genes[organism][j_gene].protseq print 'min_insert:', min_insert, max_v_germline, max_j_germline total_prob = 0.0 min_extra_trim = max(0, -1 * min_insert) for extra_trim in range(min_extra_trim, 100): old_total_prob = total_prob total_prob_this_trim = 0.0 for extra_v_trim in range(0, extra_trim + 1): extra_j_trim = extra_trim - extra_v_trim v_trim = len_v_nucseq - max_v_germline + extra_v_trim j_trim = len_j_nucseq - max_j_germline + extra_j_trim if v_trim > len_v_nucseq or j_trim > len_j_nucseq: continue n_insert = min_insert + extra_v_trim + extra_j_trim n_nucseq = v_nucseq[:len_v_nucseq - v_trim] + 'n' * n_insert + j_nucseq[j_trim:] assert len(n_nucseq) == 3 * len_cdr3_protseq if nucleotide_match: coding_prob = 0.25**n_insert else: coding_prob = get_coding_probability(n_nucseq, cdr3_protseq) trim_prob = tcr_rearrangement.get_alpha_trim_probs( organism, v_trim, j_trim, n_insert) total_prob_this_trim += coding_prob * trim_prob 
total_prob += coding_prob * trim_prob if verbose: print 'coding_prob:', cdr3_protseq, v_trim, j_trim, n_insert, total_prob, coding_prob, trim_prob, n_nucseq if extra_trim > 2 and total_prob_this_trim < error_threshold * old_total_prob: break if return_final_cdr3_nucseq: return total_prob, cdr3_nucseq else: return total_prob
if ( a == '*' and not allow_stop_codons) or ( a == 'X' and not allow_X ): Log('{} skipping: badseq: {} {}'.format(theid, cdr3a_protseq,cdr3b_protseq)) skip_me = True break if skip_me: continue ## probs are computed by reps va_reps = l['va_reps'].split(';') ja_reps = l['ja_reps'].split(';') va_countreps = l['va_countreps'].split(';') ja_countreps = l['ja_countreps'].split(';') va_cdr3_nucseq = tcr_sampler.get_v_cdr3_nucseq( organism, va_gene ) ja_cdr3_nucseq = tcr_sampler.get_j_cdr3_nucseq( organism, ja_gene ) va_cdr3_protseq,codons = get_translation( va_cdr3_nucseq, '+1' ) ja_cdr3_protseq,codons = get_translation( ja_cdr3_nucseq, '+{}'.format(1+len(ja_cdr3_nucseq)%3)) if no_probabilities or not tcr_rearrangement.probs_data_exist( organism,'A'): ##all probabilities will be set to 1 if this flag is set aprob_nucseq = 1 aprob_protseq = 1 else: aprob_nucseq,new_cdr3a_nucseq = tcr_sampler.alpha_cdr3_protseq_probability( theid, organism, va_gene, ja_gene, cdr3_protseq='', cdr3_nucseq=cdr3a_nucseq, verbose=verbose, return_final_cdr3_nucseq=True ) if new_cdr3a_nucseq != cdr3a_nucseq: ## note note note print 'new_cdr3a_nucseq:',len(new_cdr3a_nucseq),new_cdr3a_nucseq print 'old_cdr3a_nucseq:',len(cdr3a_nucseq),cdr3a_nucseq
def parse_unpaired_dna_sequence_blastn(organism, ab, blast_seq, info, verbose, nocleanup, hide_nucseq, extended_cdr3, return_all_good_hits=False, max_bit_score_delta_for_good_hits=50, max_missing_aas_at_cdr3_cterm=2): ## make this a little more unique blast_tmpfile = 'tmp%d%s%s%f%s.fa' % (len(blast_seq), organism, ab, random.random(), blast_seq[:3]) #print 'blast_tmpfile:',blast_tmpfile #assert not exists(blast_tmpfile) genes = ('UNK', 'UNK', [100, 0], 'UNK', 'UNK', [100, 0], '-') status = [] evalues = {'V' + ab: (1, 0), 'J' + ab: (1, 0)} all_good_hits_with_scores = [[], []] if verbose: print 'blast_seq:', info, ab, blast_seq if len(blast_seq) <= 20: status.append('short_{}_blast_seq_{}'.format(ab, len(blast_seq))) else: out = open(blast_tmpfile, 'w') out.write('>tmp\n%s\n' % blast_seq) out.close() ## now blast against V and J top_hits = [] for ivj, vj in enumerate('VJ'): dbfile = get_blast_nucseq_database( organism, ab, vj) # also ensures that it exists assert exists(dbfile) blastall_exe = path_to_blast_executables + '/blastall' assert exists(blastall_exe) cmd = '%s -F F -p blastn -i %s -d %s -v 100 -b 1 -o %s.blast'\ %( blastall_exe, blast_tmpfile, dbfile, blast_tmpfile ) #print cmd system(cmd) if verbose: print 'blast:', info, ab, vj, '=' * 50 print ''.join(open(blast_tmpfile + '.blast', 'r').readlines()) print '=' * 80 ## try parsing the results evalue_threshold = 1e-1 identity_threshold = 20 hits = blast.parse_blast_alignments(blast_tmpfile + '.blast', evalue_threshold, identity_threshold) hits_scores = get_all_hits_with_evalues_and_scores( blast_tmpfile + '.blast') ## id,bitscore,evalue if hits and hits[hits.keys()[0]]: top_hit = hits[hits.keys()[0]][0] top_id, top_bit_score, top_evalue = hits_scores[0] all_good_hits_with_scores[ivj] \ = [ x for x in hits_scores if top_bit_score-x[1] <= max_bit_score_delta_for_good_hits ] assert top_hit.hit_id == top_id ## figure out the score gap to the next non-equivalen bit_score_gap = top_bit_score top_rep = 
all_genes[organism][top_id].rep for (id, bit_score, evalue) in hits_scores[1:]: if all_genes[organism][id].rep != top_rep: bit_score_gap = top_bit_score - bit_score break evalues[vj + ab] = (top_hit.evalue, bit_score_gap) top_hits.append(top_hit) else: status.append('no_{}{}_blast_hits'.format(vj, ab)) if len(top_hits) == 2: ## hits in both v and j v_hit = top_hits[0] j_hit = top_hits[1] v_gene = v_hit.hit_id j_gene = j_hit.hit_id v_rep = all_genes[organism][v_gene].rep j_rep = all_genes[organism][j_gene].rep v_nucseq = all_fasta[organism][ab]['V'][nuc][v_hit.hit_id] j_nucseq = all_fasta[organism][ab]['J'][nuc][j_hit.hit_id] v_protseq = all_fasta[organism][ab]['V'][prot][v_hit.hit_id] ## this might fail if these guys are pseudo-genes... ## so filter out the non-aa-matching j genes... ## v_hitseq_frame = all_offsets[organism][ab]['V'][v_hit.hit_id] j_hitseq_frame = all_offsets[organism][ab]['J'][j_hit.hit_id] ## tricky if the hits are on different strands! ## assert v_hit.q_strand == 1 ## I think this is the blastn convention... assert j_hit.q_strand == 1 if v_hit.h_strand != j_hit.h_strand: Log( ` ('ERR V/J strand mismatch:', v_hit.h_strand, v_hit.evalue, j_hit.h_strand, j_hit.evalue) `) genes = (v_gene.replace('TRAV', 'TRaV').replace('TRBV', 'TRbV'), v_rep.replace('TRAV', 'TRaV').replace('TRBV', 'TRbV'), [100, 0], j_gene.replace('TRAJ', 'TRaJ').replace('TRBJ', 'TRbJ'), j_rep.replace('TRAJ', 'TRaJ').replace('TRBJ', 'TRbJ'), [100, 0], '-') status.append('vj_{}_strand_mismatch'.format(ab)) else: v_q2hmap = v_hit.q2hmap j_q2hmap = j_hit.q2hmap if v_hit.h_strand == -1: ## switch stuff around... 
## have to mess with the alignment too v_q2hmap = reverse_q2hmap(blast_seq, v_nucseq, v_hit) j_q2hmap = reverse_q2hmap(blast_seq, j_nucseq, j_hit) blast_seq = logo_tools.reverse_complement(blast_seq) if verbose: print 'reverse-comp blast_seq:', ab q_vframes = {} for qpos, (vpos, vna) in v_q2hmap.iteritems(): if vpos >= 0: f = (qpos - vpos + v_hitseq_frame) % 3 q_vframes[f] = q_vframes.get(f, 0) + 1 q_vframe = max([(count, x) for x, count in q_vframes.iteritems()])[1] q_jframes = {} for qpos, (jpos, jna) in j_q2hmap.iteritems(): if jpos >= 0: f = (qpos - jpos + j_hitseq_frame) % 3 q_jframes[f] = q_jframes.get(f, 0) + 1 q_jframe = max([(count, x) for x, count in q_jframes.iteritems()])[1] #q_frame_vstart = ( v_hitseq_frame + v_hit.q_start - v_hit.h_start )%3 #q_frame_jstart = ( j_hitseq_frame + j_hit.q_start - j_hit.h_start )%3 ## construct a protein sequence alignment between translation of blast_seq and q2v_align = {} for qpos, (vpos, vna) in sorted(v_q2hmap.iteritems()): if vpos >= 0: f = (qpos - vpos + v_hitseq_frame) % 3 if f != q_vframe: continue v_protpos = (vpos - v_hitseq_frame) / 3 q_protpos = (qpos - q_vframe) / 3 if q_protpos in q2v_align: if q2v_align[q_protpos] != v_protpos: Log('indel?? {} {} {}'.format( organism, ab, info)) q2v_align[q_protpos] = v_protpos ## this could be aligning a position that's not actually in the translated protein ## sequence if there are 1 or 2 nucleotides at the end... 
if q_vframe != q_jframe: ## out of frame Log( ` ('ERR frame mismatch:', q_vframe, v_hit.evalue, q_jframe, j_hit.evalue) `) if verbose: print 'frame mismatch', q_vframe, q_jframe # genes = ( v_gene.replace('TRAV','TRaV' ).replace('TRBV','TRbV'), # v_rep .replace('TRAV','TRaV' ).replace('TRBV','TRbV'), [100,0], # j_gene.replace('TRAJ','TRaJ' ).replace('TRBJ','TRbJ'), # j_rep .replace('TRAJ','TRaJ' ).replace('TRBJ','TRbJ'), [100,0], '-' ) status.append('vj_{}_frame_mismatch'.format(ab)) ## fiddle with blast_seq ## for each 'extra' nucleotide inserted between v and j, add two '#' characters after the nucleotide last_v_align_pos = max(v_q2hmap.keys()) first_j_align_pos = min(j_q2hmap.keys()) ## add some '#' characters to blast_seq to get V and J back into frame num_to_insert = (q_vframe - q_jframe) % 3 insertpos = max(last_v_align_pos + 1, (last_v_align_pos + first_j_align_pos) / 2) blast_seq = blast_seq[: insertpos] + '#' * num_to_insert + blast_seq[ insertpos:] # num_inserted_nucleotides = (q_jframe - q_vframe)%3 # new_blast_seq = blast_seq[:last_q_align_pos+1] # extra_seq = blast_seq[last_q_align_pos+1:] # for i in range(num_inserted_nucleotides): # new_blast_seq += extra_seq[0] + '##' # extra_seq = extra_seq[1:] # new_blast_seq += extra_seq # blast_seq = new_blast_seq[:] qseq, codons = get_translation(blast_seq, '+%d' % (q_vframe + 1)) cdr3, v_mm, j_mm, errors = parse_cdr3.parse_cdr3( organism, ab, qseq, v_hit.hit_id, j_hit.hit_id, q2v_align, extended_cdr3=extended_cdr3, max_missing_aas_at_cdr3_cterm=max_missing_aas_at_cdr3_cterm ) if verbose: print 'cdr3:', ab, cdr3, cdr3 in qseq, 'q_vframe:', q_vframe status.extend(errors) if cdr3 != '-': ## the cdr3 sequence should be contained in qseq, unless qseq was missing 1-2 rsds at cterm if not hide_nucseq: if cdr3 in qseq: ## the old way, without any missing C-term rsds of CDR3 offset = qseq.find(cdr3) cdr3_codons = codons[offset:offset + len(cdr3)] cdr3 += '-' + ''.join(cdr3_codons) else: num_missing_cterm_aas = 1 while 
num_missing_cterm_aas < max_missing_aas_at_cdr3_cterm and \ cdr3[:-1*num_missing_cterm_aas] not in qseq: num_missing_cterm_aas += 1 assert cdr3[:-1 * num_missing_cterm_aas] in qseq ## this is a nuisance... assert extended_cdr3 # it's the new default anyhow jg = all_genes[organism][j_hit.hit_id] j_nucseq = jg.nucseq j_cdr3len = len(jg.cdrs[0].replace( gap_character, '')) j_cdr3_nucseq = jg.nucseq[:jg.nucseq_offset + 3 * j_cdr3len] missing_nucseq = j_cdr3_nucseq[ -3 * num_missing_cterm_aas:] offset = qseq.find(cdr3[:-1 * num_missing_cterm_aas]) cdr3_codons = codons[offset:offset + len(cdr3) - num_missing_cterm_aas] cdr3_nucseq = ''.join(cdr3_codons) + missing_nucseq assert len(cdr3_nucseq) == 3 * len(cdr3) assert get_translation(cdr3_nucseq, '+1')[0] == cdr3 cdr3 += '-' + cdr3_nucseq # if verbose: # cdr3_nucseq = ''.join( cdr3_codons ).upper() # nucseq_startpos = 3*offset + q_vframe # alt_nucseq = blast_seq[ nucseq_startpos:nucseq_startpos+len(cdr3_nucseq) ] # rc1 = logo_tools.reverse_complement(cdr3_nucseq) # rc2 = logo_tools.reverse_complement(blast_seq) # print 'cdr3_nucseq',ab,offset,cdr3_nucseq,cdr3_nucseq in blast_seq,\ # blast_seq.index(cdr3_nucseq),alt_nucseq,rc1 in rc2 if '#' in blast_seq: ## sign of out-of-frame v_gene = v_gene.replace('TRAV', 'TRaV').replace('TRBV', 'TRbV') v_rep = v_rep.replace('TRAV', 'TRaV').replace('TRBV', 'TRbV') j_gene = j_gene.replace('TRAJ', 'TRaJ').replace('TRBJ', 'TRbJ') j_rep = j_rep.replace('TRAJ', 'TRaJ').replace('TRBJ', 'TRbJ') protseq, nucseq = cdr3.split('-') if protseq and nucseq: if protseq.count('#') != nucseq.count('#'): assert nucseq.count('#') == 2 assert protseq.count('#') == 1 protseq = protseq.replace('#', '##') cdr3 = '{}-{}'.format(protseq, nucseq) genes = (v_gene, v_rep, v_mm, j_gene, j_rep, j_mm, cdr3) if cdr3 != "-": cdr3aa = cdr3.split("-")[0] if len(cdr3aa) < 5: status.append('cdr3{}_len_too_short'.format(ab)) if not nocleanup: files = glob(blast_tmpfile + '*') for file in files: remove(file) assert 
len(genes) == 7 if return_all_good_hits: return genes, evalues, status, all_good_hits_with_scores ## status is a list, maybe be empty else: return genes, evalues, status ## status is a list, maybe be empty
if len(cdr3_nucseq) % 3: if woof: print 'OOF {} {} {} {} {:d} {:d} {} {} {} {}:{}:{}'\ .format( v_gene, v_rep, j_gene, j_rep, v_score, j_score, ','.join(all_v_genes), ','.join(all_j_genes), cdr3_nucseq, logfile, fastq_file, seqid ) continue if len(cdr3_nucseq) / 3 < min_cdr3_len: continue ## in frame cdr3_protseq, codons = get_translation(cdr3_nucseq, '+1') if '*' in cdr3_protseq or 'X' in cdr3_protseq: continue original_cdr3_nucseq = cdr3_nucseq[:] original_cdr3_protseq = cdr3_protseq[:] if chain == 'A': if correct_cdr3_seqs: tmp_results = tcr_sampler.analyze_junction\ ( organism, v_gene, j_gene, cdr3_protseq, cdr3_nucseq, return_corrected_cdr3_seqs = True, mismatch_score = mismatch_score_for_correcting_cdr3_seqs ) corrected_cdr3_nucseq, corrected_cdr3_protseq = list( tmp_results)[-2:]