def get_TRBC(FASTA_DIR):
    """Load the TRBC sequences from ``TRBC.fasta``, one entry per gene.

    Every record of ``<FASTA_DIR>/TRBC.fasta`` is read through
    ``Fasta.Fasta``.  The gene name is taken from the second
    ``|``-separated header field, truncated at the first ``*`` (the part
    after ``*`` is presumably the allele designation -- confirm against the
    FASTA source).  Only the first record seen for each gene name is kept;
    later records with the same gene name are ignored.

    Args:
        FASTA_DIR: directory that contains ``TRBC.fasta``.

    Returns:
        dict mapping gene name to a dict with the keys ``"Sequence"``,
        ``"Gene"`` and ``"Length"`` taken from the FASTA record.

    Raises:
        IndexError: if a record header has no second ``|``-separated field.
    """
    import os

    # os.path.join avoids producing "/TRBC.fasta" (filesystem root) when
    # FASTA_DIR is empty, and a doubled separator when it ends with one.
    trbc_fasta = os.path.join(FASTA_DIR, "TRBC.fasta")

    trb_c_dict = {}
    for record in Fasta.Fasta(trbc_fasta):
        trb_gene = record.header.split("|")[1].split("*")[0]

        # first record wins: skip genes that were already stored
        if trb_gene in trb_c_dict:
            continue

        trb_c_dict[trb_gene] = {
            "Sequence": record.sequence,
            "Gene": trb_gene,
            "Length": record.length,
        }

    return trb_c_dict
# Per-chain output fields, in the order they are written to each row.  The
# file header is built from this same tuple (prefixed with TRA_/TRB_) so the
# header and the data columns cannot drift apart.
_TCR_CHAIN_FIELDS = (
    "V", "J", "len", "RNA", "entropy", "AA", "Peptide", "Valid",
    "CDR1_entropy", "CDR1", "CDR2_entropy", "CDR2", "CDR3_entropy", "CDR3",
)


def _peptide_entropy(peptide):
    """Return ``stats.entropy`` of the normalised ``counts`` of a peptide record."""
    record = Fasta.FastaRecordPeptide(header='', sequence=peptide)
    counts = pd.Series(record.counts)
    return stats.entropy(counts / counts.sum())


def _cdr_entropies(peptide):
    """Return ``(entropy, sequence)`` pairs for CDR1, CDR2 and CDR3 of ``peptide``.

    A ``TypeError`` anywhere in the extraction marks all three regions as
    missing (``nan`` entropy, empty sequence).  An ``UnboundLocalError``
    while handling CDR3 marks only CDR3 as missing.
    NOTE(review): both exceptions presumably signal that ``get_CDR`` could
    not locate the region -- confirm against ``get_CDR``.
    """
    missing = (np.nan, "")
    try:
        cdr1_aa = get_CDR(peptide, 1)
        cdr1_en = _peptide_entropy(cdr1_aa)

        cdr2_aa = get_CDR(peptide, 2)
        cdr2_en = _peptide_entropy(cdr2_aa)

        try:
            cdr3_aa = get_CDR(peptide, 3)
            cdr3_en = _peptide_entropy(cdr3_aa)
        except UnboundLocalError:
            cdr3_en, cdr3_aa = missing
    except TypeError:
        # Reset the sequences as well as the entropies, so that a failed
        # extraction never reports a sequence left over from an earlier TCR
        # (or crashes on the very first one).
        return (missing, missing, missing)

    return ((cdr1_en, cdr1_aa), (cdr2_en, cdr2_aa), (cdr3_en, cdr3_aa))


def _chain_fields(chain):
    """Return the output values of one TCR chain, keyed by ``_TCR_CHAIN_FIELDS``."""
    peptide = chain['AA']
    fields = {
        "V": chain['V'],
        "J": chain['J'],
        "len": len(chain['RNA']),
        "RNA": chain['RNA'],
        # entropy is computed on the translated AA sequence, not the nucleotides
        "entropy": _peptide_entropy(peptide),
        "AA": len(peptide),
        "Peptide": peptide,
        "Valid": chain["Valid.Chain"],
    }
    for number, (cdr_en, cdr_aa) in enumerate(_cdr_entropies(peptide), start=1):
        fields["CDR{}_entropy".format(number)] = cdr_en
        fields["CDR{}".format(number)] = cdr_aa
    return fields


def create_dataset(n_success, fasta_dir, outfile):
    '''
    Create a data set of `n_success` valid TCRs, record valid and failed chains.

    TCRs are generated one at a time with ``generate_TCR(1, fasta_dir)``
    until ``valid_counter`` reaches ``n_success``; a TCR counts as valid
    when both its alpha and its beta chain have a truthy ``"Valid.Chain"``.
    Every generated TCR, valid or not, is written as one tab-separated row
    to ``outfile``.

    The header row and the data rows are both derived from
    ``_TCR_CHAIN_FIELDS``: the columns are ``TRA_<field>`` for each field,
    then ``TRB_<field>`` for each field, then ``Valid.TCR``.  This includes
    the ``TRA_RNA``/``TRB_RNA`` columns, which were previously written to
    the rows without a header entry.

    Args:
        n_success: number of fully valid TCRs to generate before stopping.
        fasta_dir: directory handed through to ``generate_TCR``.
        outfile: path of the tab-separated output file (overwritten).
    '''

    valid_counter = 0

    alpha_v = []
    beta_v = []
    alpha_len = []
    beta_len = []
    alpha_diversity = []
    beta_diversity = []
    alpha_j = []
    beta_j = []
    alpha_aa = []
    beta_aa = []
    alpha_pep = []
    beta_pep = []
    alpha_keep = []
    beta_keep = []
    alpha_cdr1 = []
    alpha_cdr2 = []
    alpha_cdr3 = []
    beta_cdr1 = []
    beta_cdr2 = []
    beta_cdr3 = []
    alpha_cdr1_diversity = []
    alpha_cdr2_diversity = []
    alpha_cdr3_diversity = []
    beta_cdr1_diversity = []
    beta_cdr2_diversity = []
    beta_cdr3_diversity = []
    valid_tcr_chains = []

    # NOTE(review): nothing in this block reads the accumulator lists above;
    # they are kept (one entry per generated TCR each) in case later code
    # relies on them -- confirm, and drop them if they are unused.
    alpha_columns = {
        "V": alpha_v, "J": alpha_j, "len": alpha_len,
        "entropy": alpha_diversity, "AA": alpha_aa, "Peptide": alpha_pep,
        "Valid": alpha_keep,
        "CDR1_entropy": alpha_cdr1_diversity, "CDR1": alpha_cdr1,
        "CDR2_entropy": alpha_cdr2_diversity, "CDR2": alpha_cdr2,
        "CDR3_entropy": alpha_cdr3_diversity, "CDR3": alpha_cdr3,
    }
    beta_columns = {
        "V": beta_v, "J": beta_j, "len": beta_len,
        "entropy": beta_diversity, "AA": beta_aa, "Peptide": beta_pep,
        "Valid": beta_keep,
        "CDR1_entropy": beta_cdr1_diversity, "CDR1": beta_cdr1,
        "CDR2_entropy": beta_cdr2_diversity, "CDR2": beta_cdr2,
        "CDR3_entropy": beta_cdr3_diversity, "CDR3": beta_cdr3,
    }

    header = (["TRA_" + name for name in _TCR_CHAIN_FIELDS]
              + ["TRB_" + name for name in _TCR_CHAIN_FIELDS]
              + ["Valid.TCR"])

    with open(outfile, "wt") as ofile:
        ofile.write("\t".join(header) + "\n")

        while valid_counter < n_success:
            tcr_chains = generate_TCR(1, fasta_dir)
            for x in tcr_chains:
                tcra = x['TCRA']
                tcrb = x['TCRB']

                alpha = _chain_fields(tcra)
                for name, column in alpha_columns.items():
                    column.append(alpha[name])

                beta = _chain_fields(tcrb)
                for name, column in beta_columns.items():
                    column.append(beta[name])

                # check for validity: both chains must be valid
                validchain = bool(tcra["Valid.Chain"] and tcrb["Valid.Chain"])
                if validchain:
                    valid_counter += 1
                valid_tcr_chains.append(validchain)

                row = ([alpha[name] for name in _TCR_CHAIN_FIELDS]
                       + [beta[name] for name in _TCR_CHAIN_FIELDS]
                       + [validchain])
                ofile.write(
                    "\t".join("{}".format(value) for value in row) + "\n")