warnings.warn( "Incomplete TCR information - need V/J/CDR3 as minimum." ) else: tcr_bits = fxn.autofill_input(tcr_bits, c) tcr_bits = fxn.tweak_thimble_input( tcr_bits, input_args) try: out_list, stitched = st.stitch( tcr_bits, c, tcr_dat[c], tcr_functionality[c], codons) sorted_row_bits[c + '_nt'] = stitched sorted_row_bits[ c + '_aa'] = fxn.translate_nt(stitched) sorted_row_bits.update( dict( list( zip([ c + x for x in stitch_list_fields ], out_list)))) except Exception as message: sorted_row_bits[ 'Warnings/Errors'] += '(' + c + ') ' + str( message) sorted_row_bits[ 'Warnings/Errors'] += 'Cannot stitch a sequence for ' + c + '. '
# Get input arguments, determine the TCR chain in use, get codon table, then load the IMGT data in fxn.check_scripts_dir() input_args, chain, codons = fxn.sort_input(vars(args())) imgt_dat, tcr_functionality = fxn.get_imgt_data(chain, gene_types, input_args['species']) out_list, stitched = stitch(input_args, chain, imgt_dat, tcr_functionality, codons) out_str = '|'.join(out_list) + '(L)' print( '----------------------------------------------------------------------------------------------' ) print(fxn.fastafy('nt|' + out_str, stitched)) print(fxn.fastafy('aa|' + out_str, fxn.translate_nt(stitched))) # If a known/partial amino acid sequence provided, ensure they match up with a quick printed alignment if 'aa' in input_args: from Bio import pairwise2 from Bio.pairwise2 import format_alignment alignments = pairwise2.align.globalxx(input_args['aa'], fxn.translate_nt(stitched)) for i in range(0, 600, 60): print('\n') if i > len(alignments[0][0]): break for y in [ x[i:i + 60] for x in format_alignment(*alignments[0]).split('\n')[:3] ]:
def stitch(specific_args, tcr_info, functionality, partial_info, codon_dict, j_warning_threshold, preferences):
    """
    Core function, that performs the actual TCR stitching

    NB: specific_args is modified in place ('cdr3_nt' is added, and 'cdr3' is replaced by its translation, when a
    nucleotide CDR3 is provided without seamless mode).

    :param specific_args: basic input arguments of a given rearrangement (e.g. V/J/CDR3)
    :param tcr_info: sequence data for the alleles of a specific locus read in from IMGT data
    :param functionality: predicted functionality of different TCR genes, as according to IMGT
    :param partial_info: genes filtered out from input TCR data on account of being incomplete in the database
    :param codon_dict: dictionary of which codons to use for which amino acids
    :param j_warning_threshold: int threshold value, if a J substring length match is shorter it will throw a warning
    :param preferences: nested dict of preferred alleles, like the tcr_info dict but one level shallower
    :return: list of details of the TCR as constructed, the stitched together nucleotide sequence, and the number
        of bases (0-2) with which to 5' pad that sequence so that it translates in frame
    :raises ValueError: if a gene/allele sequence can't be found, the CDR3 contains unexpected characters,
        or seamless stitching fails to find a usable overlap / detects an indel
    :raises IOError: if seamless mode is requested with a non-DNA CDR3, or a cDNA-only '(F)' prototypical
        J allele is used (as its 3' terminal base can't be recovered)
    """

    # Find each of the appropriate sequences
    done = {}
    used_alleles = {}

    for r in fxn.regions:
        region = fxn.regions[r]

        # First establish what the input gene and allele values are
        gene_allele_in = specific_args[r]
        if '*' not in gene_allele_in:
            gene_allele_in += '*'
        in_gene, in_allele = gene_allele_in.split('*')
        gene, allele = '', ''

        # First check whether the gene exists in the IMGT data for this species
        if in_gene in tcr_info[region]:
            gene = in_gene
        else:
            # If it's a leader sequence, it might be a user-defined DNA sequence
            if r == 'l' and fxn.dna_check(specific_args['l']):
                # If it is, add that info in...
                used_alleles[r] = 'UserSpecifiedLeader*' + specific_args['l']
                done[r] = specific_args['l']

                # Check it's likely to translate in frame
                if len(specific_args['l']) % 3 != 0:
                    warnings.warn("User specified leader sequence is not evenly divisible by 3 - "
                                  "stitched TCR frame will likely be wrong. ")

                # ...and jump ahead to the stitching (skipping irrelevant TCR gene checks)
                continue

            # NB: report the gene as requested ('gene' itself is still empty at this point)
            raise ValueError("Error: a " + region + " sequence region has not been found for gene " + in_gene
                             + " in the IMGT data for this chain/species. Please check your TCR and species data. ")

        # Having established the gene, then need to determine the allele
        if in_allele:
            # If an allele is provided, check whether it exists in the database for this gene and isn't partial
            if in_allele in tcr_info[region][gene]:
                # NB: look up the requested allele ('allele' itself hasn't been assigned yet)
                if partial_info[gene][in_allele]:
                    warnings.warn("Cannot use " + gene + "*" + in_allele + " " + region.lower() + "sequence, "
                                  "as it is classed as '" + partial_info[gene][in_allele] + "' (partial). ")
                else:
                    allele = in_allele
            else:
                warnings.warn("Cannot find sequence for requested allele, " + gene + "*" + in_allele + " for the "
                              + region.lower() + " sequence in the input FASTA data. ")

        # If no allele supplied (or is supplied but invalid) then use 1) a preferred allele or 2) the prototypical *01
        if not allele:
            warnings.warn("No valid " + region.lower() + " region allele determined yet for " + gene + ". ")
            allele = '01'

            if preferences:
                if gene in preferences[region]:
                    allele = preferences[region][gene]
                    warnings.warn("Defaulting to allele *" + allele + " for the " + region.lower()
                                  + " sequence, as specified in the preferred allele file. ")
                    # NB: we don't have to worry about partial alleles in the preference list,
                    # as those are filtered out
                else:
                    warnings.warn("Defaulting to *01, as " + gene + " isn't specified in the preferred allele file for"
                                  " the " + region.lower() + " region. ")
            else:
                warnings.warn("Defaulting to *01 for the " + region.lower() + " region, "
                              "in the absence of a preferred allele file being specified. ")

            # Flag up a warning if indeed *01 has defaulted on when there are other alleles available for that gene
            if allele == '01' and len(tcr_info[region][gene]) > 1:
                # Only warn (once) if at least one of those other alleles is not partial
                if any(not partial_info[gene][other_allele]
                       for other_allele in tcr_info[region][gene] if other_allele != '01'):
                    warnings.warn("NB: the prototypical '*01' allele is being used for the " + region.lower()
                                  + " region by default, but other alleles are "
                                  "available - consider double checking the right allele is asked for. ")

        # Catchall double check both gene and allele are sorted, and that the sequence actually exists
        # (e.g. a defaulted *01 might not be present in the data for this gene)
        if gene and allele and allele in tcr_info[region][gene]:
            done[r] = tcr_info[region][gene][allele]
            used_alleles[r] = gene + '*' + allele

            func_err_base = "Warning: gene " + gene + '*' + allele + " has a IMGT-assigned functionality of '" \
                            + functionality[gene][allele] + "', "

            # Check functionality
            if fxn.strip_functionality(functionality[gene][allele]) != 'F':
                warnings.warn(func_err_base + "and thus may not express or function correctly. ")

            # Special check to account for IMGT not including the 3' terminal J residue if allele from cDNA
            if functionality[gene][allele] == '(F)' and r == 'j':
                cdna_j_err = func_err_base + "meaning it was only detected in cDNA, and thus IMGT doesn't record its" \
                                             " 3' terminal nucleotide - "

                # If this is an (F) call for a prototypical allele we have to skip, as there's no reference to go off
                if allele == '01':
                    raise IOError(cdna_j_err + " unable to fix as this is the prototypical allele (*01). ")

                # Otherwise, use the 01 terminal residue
                done[r] = done[r] + tcr_info[region][gene]['01'][-1]
                warnings.warn(cdna_j_err + "substituting the *01 allele terminal base to maintain reading frame. ")

        else:
            raise ValueError("Cannot find TCR sequence data for " + r.upper() + " gene: "
                             + in_gene + '*' + in_allele + ". ")

    # Get information about the C-terminal residue of the CDR3 *should* be, given that J gene
    j_residues, low_confidence_js = fxn.get_j_motifs(specific_args['species'])

    # And the motifs required for the correct frame inference and delineation of the constant region sequences
    c_motifs = fxn.get_c_motifs(specific_args['species'])

    # Throw a warning if the J gene is one in which the C-terminal residue cannot be confidently identified
    if used_alleles['j'] in low_confidence_js:
        warnings.warn("Warning: " + used_alleles['j'] + " has a 'low confidence' CDR3-ending motif. ")

    # Then determine whether CDR3 has been provided in amino or nucleic acid form
    if fxn.dna_check(specific_args['cdr3']):
        input_type = 'nt'
        specific_args['cdr3_nt'] = specific_args['cdr3']
        warnings.warn("CDR3 junction provided as DNA sequence: '" + specific_args['cdr3_nt'] + "'. ")
    else:
        input_type = 'aa'

    # Determine whether seamless integrated is requested (in platform independent way)
    seamless = 'seamless' in specific_args and bool(specific_args['seamless'])

    # Get the germline encoded bits
    n_term_nt_raw = done['l'] + done['v']
    c_term_nt_raw = done['j'] + done['c']

    # Run the appropriate form of non-templated integration
    # First test eligibility for seamless integration
    if input_type == 'nt' and seamless:

        warnings.warn(
            "Seamless option selected: stitched sequence may not be accurate is nucleotide sequence provided is too "
            "short or contains polymorphisms or errors relative to the chosen genes/alleles near the edges. ")

        # Optimistic warning to ensure additional sequence provided for seamless stitching
        if fxn.translate_nt(specific_args['cdr3_nt'][:3]) == 'C':
            warnings.warn("Cys residue detected in first codon of provided seamless CDR3 junction: "
                          "note that seamless stitching requires additional sequence 5' of the start of the CDR3 "
                          "(disregard this warning if this is not the CDR3 starting residue). ")

        n_term_nt_trimmed, v_overlap = fxn.find_v_overlap(n_term_nt_raw, specific_args['cdr3_nt'])

        # Check for suspiciously short V gene overlaps
        if len(v_overlap) < 10:
            warnings.warn("Only short V gene overlap detected (" + v_overlap + ") for seamless stitching. ")

            # Most common cause = unexpected polymorphism (e.g. SNP or PCR error) in the 5' of the padding sequence
            # Try the overlap search again, starting from one position upstream of the previous match
            skipped = len(v_overlap) + 1
            n_term_nt_trimmed_2, v_overlap_2 = fxn.find_v_overlap(n_term_nt_raw, specific_args['cdr3_nt'][skipped:])

            if len(v_overlap_2) > 10:
                warnings.warn("A longer (" + str(len(v_overlap_2)) + " nt) overlap was found after trimming the "
                              "short overlap +1 off, and stitching continued (NB: presumed SNP or PCR error). ")
                # Trim the germline further back to make room for the bases skipped in the second search,
                # and treat everything up to the end of the second match as the V overlap
                n_term_nt_trimmed = n_term_nt_trimmed_2[:-skipped]
                v_overlap = specific_args['cdr3_nt'][:skipped + len(v_overlap_2)]
            else:
                raise ValueError("No longer overlap was found even after trimming that short overlap + 1. Please check "
                                 "the V gene call and that the CDR3 padding sequence doesn't contain polymorphisms. ")

        c_term_nt_trimmed = fxn.find_j_overlap(specific_args['cdr3_nt'][len(v_overlap):], c_term_nt_raw)

        stitched_nt = n_term_nt_trimmed + specific_args['cdr3_nt'] + c_term_nt_trimmed

        # Use the tidy_c_term functionality to frame check/trim excess (the translation it returns isn't needed here)
        stitched_nt, _ = fxn.tidy_c_term(stitched_nt, False, c_motifs, used_alleles['c'])

        # Catch more 5' SNP errors: if there's a SNP in the edges of the contextual padding this can cause an indel,
        # ... which means tidy_c_term will trim some nt from the 5' of the gene, which we can use to trigger an error
        if not stitched_nt.startswith(done['l']):
            raise ValueError("An indel has been detected during seamless stitching, which is usually caused by "
                             "polymorphisms in the padding sequence relative to the genes selected: please either "
                             "ensure selected alleles are correct or provide more context beyond any polymorphisms. ")

    # Otherwise run regular amino-acid based germline determination
    else:

        # If an exact nucleotide junction is provided, first translate
        if input_type == 'nt':
            specific_args['cdr3'] = fxn.translate_nt(specific_args['cdr3_nt'])

            # Frame check (if not providing extra context for seamless integration)
            if len(specific_args['cdr3_nt']) % 3 != 0 and not seamless:
                warnings.warn("Warning: length of CDR3 DNA sequence provided is not evenly divisible by 3 "
                              "and seamless stitching not selected: stitched TCR frame will likely be wrong. ")

        # And check that users haven't asked for amino acid/seamless, which won't work
        elif seamless:
            raise IOError("The seamless option has been selected, yet provided CDR3-containing sequence is not DNA. ")

        # Get codon data, and use to check that there's no unexpected characters in the CDR3
        if any(residue not in codon_dict for residue in specific_args['cdr3']):
            raise ValueError("Unexpected character in CDR3 string. "
                             "Please use only one-letter standard amino acid designations. ")

        # Then check the C-terminus of the CDR3 has an appropriate residue (defaulting to F if the J isn't listed)
        expected_j_residue = j_residues.get(used_alleles['j'], 'F')

        if specific_args['cdr3'][-1] != expected_j_residue:
            warnings.warn("CDR3 provided does not end with the expected residue for this J gene ("
                          + expected_j_residue + "). Deletion this far in to the J is extremely unlikely. ")

        # Tidy up the germline edges to be coding for whole codons without any remainders
        n_term_nt_inframe, n_term_aa = fxn.tidy_n_term(n_term_nt_raw)
        c_term_nt_inframe, c_term_aa = fxn.tidy_c_term(c_term_nt_raw, specific_args['skip_c_checks'],
                                                       c_motifs, used_alleles['c'])

        # Figure out where the AA CDR3 will slot in: look at the CDR3 edges & see how much overlap needs to be removed
        # Start with 4 residues chunks, move from end of V gene up to 10 residues in (very generous deletion allowance)
        n_term_nt_trimmed, cdr3_n_offset = fxn.determine_v_interface(specific_args['cdr3'],
                                                                     n_term_nt_inframe, n_term_aa)
        c_term_nt_trimmed, cdr3_c_end = fxn.determine_j_interface(specific_args['cdr3'][cdr3_n_offset:],
                                                                  c_term_nt_inframe, c_term_aa,
                                                                  len(done['j']), j_warning_threshold)

        # Generate the non-templated sequences using either supplied nucleotides or common codons established earlier
        if input_type == 'nt':
            non_templated_nt = specific_args['cdr3_nt'][cdr3_n_offset * 3:(cdr3_n_offset + cdr3_c_end) * 3]
        else:
            non_templated_aa = specific_args['cdr3'][cdr3_n_offset:cdr3_n_offset + cdr3_c_end]
            non_templated_nt = fxn.rev_translate(non_templated_aa, codon_dict)

        # Then finally stitch all that info together
        stitched_nt = n_term_nt_trimmed + non_templated_nt + c_term_nt_trimmed

    # If optional 5'/3' sequences are specified, add them to the relevant place
    if specific_args['5_prime_seq']:
        stitched_nt = specific_args['5_prime_seq'] + stitched_nt
        # Translation offset allows simple translation of output NT without having to figure out the frame
        # The outer modulo keeps this at 0 (rather than 3) when the 5' sequence is already a whole number of codons
        transl_offset = (3 - (len(specific_args['5_prime_seq']) % 3)) % 3
    else:
        transl_offset = 0

    if specific_args['3_prime_seq']:
        stitched_nt += specific_args['3_prime_seq']

    # Compile the details describing the stitched TCR for the output header
    out_bits = [specific_args['name'], used_alleles['v'], used_alleles['j'], used_alleles['c'],
                specific_args['cdr3'], used_alleles['l'] + '(L)']

    # TODO add information to output header if additional 5'/3' sequences specified?
    return out_bits, stitched_nt, transl_offset
# If no TCR features present, just skip continue elif len(featured_bits) in [1, 2]: warnings.warn("Incomplete TCR information - need V/J/CDR3 as minimum.") else: tcr_bits = fxn.autofill_input(populate_blanks(tcr_bits, pre_stitch_list_fields), c) tcr_bits = fxn.tweak_thimble_input(tcr_bits) try: out_list, stitched, offset = st.stitch(tcr_bits, tcr_dat[c], tcr_functionality[c], partial, codons, input_args['j_warning_threshold'], preferences[c]) sorted_row_bits[c + '_nt'] = stitched sorted_row_bits[c + '_aa'] = fxn.translate_nt('N' * offset + stitched) sorted_row_bits.update(dict(list(zip([c + x for x in post_stitch_list_fields], out_list)))) except Exception as message: sorted_row_bits['Warnings/Errors'] += '(' + c + ') ' + str(message) sorted_row_bits['Warnings/Errors'] += 'Cannot stitch a sequence for ' + c + '. ' # Store all chain related warning messages too, in same field, ignoring irrelevant errors sorted_row_bits['Warnings/Errors'] += ' '.join( ['(' + c + ') ' + str(chain_warnings[x].message) for x in range(len(chain_warnings)) if 'DeprecationWarning' not in str(chain_warnings[x].category)]) with warnings.catch_warnings(record=True) as link_warnings: warnings.simplefilter("always")
"Unknown output mode detected: " + input_args['mode'] + ". \n" "Should be one of 'BOTH_FA' (default), 'AA_FA', 'NT_FA', 'AA', 'NT'." ) if '_FA' in input_args['mode']: print( '----------------------------------------------------------------------------------------------' ) if input_args['mode'] == 'BOTH_FA' or input_args['mode'] == 'NT_FA': print(fxn.fastafy('nt|' + out_str, stitched)) if input_args['mode'] == 'BOTH_FA' or input_args['mode'] == 'AA_FA': # Use the offset to 5' pad the stitched sequence with 'N's to make up for non-codon length 5' added seqs print( fxn.fastafy('aa|' + out_str, fxn.translate_nt('N' * offset + stitched))) elif input_args['mode'] == 'NT': print(stitched) elif input_args['mode'] == 'AA': print(fxn.translate_nt('N' * offset + stitched)) # If a known/partial amino acid sequence provided, ensure they match up with a quick printed alignment if input_args['aa']: from Bio import pairwise2 from Bio.pairwise2 import format_alignment alignments = pairwise2.align.globalxx( input_args['aa'], fxn.translate_nt('N' * offset + stitched)) for i in range(0, 600, 60): print('\n')