def test_amino_acid_composition(self):
    """Check terminal-aware amino acid composition on the simple sequences.

    With ``term_aa=True`` the first residue must be counted once under an
    ``'nterm'`` label and, for sequences longer than one residue, the last
    residue once under a ``'cterm'`` label. The total number of counted
    residues must equal the total obtained without ``term_aa``.
    """
    for sequence in self.simple_sequences:
        with_termini = parser.amino_acid_composition(
            sequence, term_aa=True, labels=uppercase)
        without_termini = parser.amino_acid_composition(
            sequence, labels=uppercase)

        n_terminal_key = 'nterm' + sequence[0]
        self.assertEqual(1, with_termini[n_terminal_key])

        # A single-residue sequence has no separate C-terminal entry to check.
        if len(sequence) >= 2:
            c_terminal_key = 'cterm' + sequence[-1]
            self.assertEqual(1, with_termini[c_terminal_key])

        expected_total = sum(without_termini.values())
        observed_total = sum(with_termini.values())
        self.assertEqual(expected_total, observed_total)
def add_aa_counts(self):
    """Append one ``'<label> count'`` column per amino acid label.

    The composition of every entry in ``self.data_frame['sequence']`` is
    computed with ``parser.amino_acid_composition``; labels missing from a
    sequence are counted as 0. ``self.data_frame`` is replaced by a new
    frame holding the original columns followed by the count columns.
    """
    compositions = [
        parser.amino_acid_composition(sequence)
        for sequence in self.data_frame['sequence']
    ]
    # The ``downcast`` keyword of ``fillna`` is deprecated in recent pandas;
    # the values are occurrence counts, so cast to integers explicitly.
    aa_counts = pd.DataFrame.from_records(compositions).fillna(0)
    aa_counts = aa_counts.astype('int64')
    aa_counts.columns = [
        '{} count'.format(column) for column in aa_counts.columns
    ]
    # ``pd.concat(axis=1)`` aligns on index labels. Re-use the labels of
    # ``self.data_frame`` so rows stay paired with their sequence even when
    # the frame does not carry a default 0..n-1 index.
    aa_counts.index = self.data_frame.index
    self.data_frame = pd.concat([self.data_frame, aa_counts], axis=1)
def add_relative_counts(self):
    """Append one ``'<label> relative_count'`` column per amino acid label.

    For every entry in ``self.data_frame['sequence']`` each count returned
    by ``parser.amino_acid_composition`` is divided by
    ``parser.length(sequence)``. Labels missing from a sequence get 0.
    ``self.data_frame`` is replaced by a new frame holding the original
    columns followed by the relative-count columns (always floating point).
    """
    record_list = []
    for sequence in self.data_frame['sequence']:
        composition = parser.amino_acid_composition(sequence)
        if composition:
            # Compute the length once per sequence, not once per label.
            length = float(parser.length(sequence))
            record = {
                label: count / length
                for label, count in composition.items()
            }
        else:
            record = {}
        record_list.append(record)

    # The ``downcast`` keyword of ``fillna`` is deprecated in recent pandas;
    # relative counts are fractions, so the columns are kept as floats.
    aa_counts = pd.DataFrame.from_records(record_list).fillna(0.0)
    aa_counts.columns = [
        '{} relative_count'.format(column) for column in aa_counts.columns
    ]
    # ``pd.concat(axis=1)`` aligns on index labels. Re-use the labels of
    # ``self.data_frame`` so rows stay paired with their sequence even when
    # the frame does not carry a default 0..n-1 index.
    aa_counts.index = self.data_frame.index
    self.data_frame = pd.concat([self.data_frame, aa_counts], axis=1)
def test_modparser(self):
    """Check that extracted modifications match the expected ``Mods`` column.

    Reads ``data/TestModParser.csv``, cleans the ``Sequence`` column with
    ``PF.remove_brackets`` and ``PF.replace_numbers``, extracts the
    modifications and compares the first detected entry per row with the
    ``Mods`` column. The composition of every ``NewSeqs`` entry is printed
    using the standard labels extended by the detected modifications.
    """
    table = pd.read_csv("data/TestModParser.csv")

    cleaned = table["Sequence"].apply(PF.remove_brackets)
    table["Sequence"] = cleaned
    table["Sequence"] = table["Sequence"].apply(PF.replace_numbers)

    mod_dic, mods_seq = PF.extract_modifications(table["Sequence"], True)
    table["detected"] = [entry[0] for entry in mods_seq]

    matches = []
    for expected, detected in zip(table["Mods"], table["detected"]):
        matches.append(bool(expected == detected))
    table["Test"] = matches

    # Extra labels are built as <dictionary value><dictionary key>.
    mods = [value + key for key, value in mod_dic.items()]
    for new_sequence in table["NewSeqs"]:
        print(new_sequence)
        composition = parser.amino_acid_composition(
            new_sequence, labels=parser.std_labels + mods)
        print(composition)

    all_rows_match = table["Test"].all()
    assert (True == all_rows_match)
def test_amino_acid_composition_simple(self):
    """Check per-residue counts against ``str.count`` on simple sequences."""
    for sequence in self.simple_sequences:
        composition = parser.amino_acid_composition(sequence,
                                                    labels=uppercase)
        distinct_residues = set(sequence)
        for residue in distinct_residues:
            expected = sequence.count(residue)
            self.assertEqual(expected, composition[residue])
def get_AA_matrix(sequences,
                  pos_specific=False,
                  ntermini=5,
                  lcp=1,
                  mods=0,
                  correct=False,
                  residues=parser.std_amino_acids):
    """
    Count the amino acids in peptide sequences.

    Counting uses the pyteomics ``parser.amino_acid_composition``, so
    whatever labels that function reports for a sequence become columns.

    Parameters:
    -----------------------------------
    sequences: collection of sequence strings (must provide ``.copy()``,
               e.g. a pandas Series or a list).
    pos_specific: bool, if False (default) only the raw composition counts
               are returned and ``ntermini``, ``lcp`` and ``correct`` are
               ignored. If True, additional position specific columns are
               generated (see below).
    ntermini: int, number of positions considered at each terminus. For
               sequences shorter than ``2 * ntermini`` only
               ``len(seq) // 2`` positions per terminus are used so the
               N- and C-terminal windows do not overlap.
    lcp: float, factor of the length correction
               ``1 + lcp * ln(len(sequence))`` (see the pyteomics
               documentation).
    mods: currently unused; kept for backward compatibility.
    correct: bool, if True (and ``pos_specific`` is True) every row is
               multiplied by the length correction term.
    residues: residues for which position specific columns are created.

    Examples:
    -----------------------------------
    #modification and termini supporting
    mystr = "nAAAAAAAAAAAAAAAGAAGcK"

    #just aa composition
    mystr = "AAAAAAAAAAAAAAAGAAGK"

    Returns:
    --------------------------------------
    dataframe with amino acid count columns. With ``pos_specific=True`` the
    frame additionally holds the columns ``N<res><i>`` (i = 0..ntermini-1,
    counted from the first character) and ``C<res><i>`` (i = 1..ntermini-1,
    counted from the last character; position 0 is skipped on purpose), and
    the plain ``<res>`` columns are reduced by the counts moved into the
    positional columns.
    """
    df = pd.DataFrame()
    df["Sequence"] = sequences.copy()

    # overall composition per sequence; labels absent in a sequence count 0
    aa_counts = [parser.amino_acid_composition(seq) for seq in df["Sequence"]]
    aa_count_df = pd.DataFrame(aa_counts).replace(np.nan, 0)

    # NOTE(review): the length correction is never applied on this path,
    # even if correct=True. Kept as is to not change existing results.
    if not pos_specific:
        return aa_count_df

    # Use the labels of the input so that the join and the row-wise
    # multiplication below pair each row with its own sequence, also when
    # ``sequences`` carries a non-default index.
    aa_count_df.index = df.index

    # positional columns; the last residue (C-terminal position 0) is
    # usually K/R and therefore gets no column
    n_columns = ["N" + res + str(i)
                 for res in residues for i in range(0, ntermini)]
    c_columns = ["C" + res + str(i)
                 for res in residues for i in range(1, ntermini)]
    columns = sorted(set(n_columns + c_columns))
    column_pos = {name: k for k, name in enumerate(columns)}
    counts = np.zeros((df.shape[0], len(columns)))

    residue_set = set(residues)
    for row, seq in enumerate(df["Sequence"]):
        # shorten the termini for short peptides so the windows from both
        # ends cannot overlap
        n_positions = min(ntermini, len(seq) // 2)
        for i in range(0, n_positions):
            n_res = seq[i]
            if n_res in residue_set:
                counts[row, column_pos["N" + n_res + str(i)]] += 1
            c_res = seq[-i - 1]
            if i != 0 and c_res in residue_set:
                counts[row, column_pos["C" + c_res + str(i)]] += 1

    count_df = pd.DataFrame(counts, index=df.index, columns=columns)
    new_df = aa_count_df.join(count_df)

    # correct the internal counts by subtracting the position specific ones
    for res in residues:
        if res not in new_df.columns:
            # residue occurs in no sequence: nothing was counted for it,
            # so there is nothing to correct (and no column to index)
            continue
        res_columns = (["N" + res + str(i) for i in range(0, ntermini)] +
                       ["C" + res + str(i) for i in range(1, ntermini)])
        new_df[res] = new_df[res] - new_df[res_columns].sum(axis=1)

    # multiply each raw value by a correction term, see pyteomics docu
    # for details ("lcp")
    if correct:
        cfactor = 1. + lcp * np.log(df["Sequence"].apply(len))
        new_df = new_df.mul(cfactor, axis=0)
        new_df = new_df.replace(np.nan, 0)
    return new_df
def _centered_positions(length):
    """Return position labels relative to the centre of a sequence.

    Odd lengths give ``-k, ..., 0, ..., k``; even lengths skip 0 and give
    ``-k, ..., -1, 1, ..., k``.
    """
    start_pos = -1 * (length // 2)
    if length % 2 == 1:
        return list(range(start_pos, start_pos + length))
    return list(range(start_pos, 0)) + list(range(1, start_pos + length + 1))


def handcrafted_features(data, tags):
    """Build the handcrafted feature table for paired alpha/beta chains.

    DOI 10.1007/s00251-017-1023-5
    Code from https://github.com/bittremieux/TCR-Classifier/blob/master/tcr_classifier.ipynb
    Modified to apply handcrafted features twice, once to the alpha chain
    and again to the beta chain.
    Modified to handle split for training, validation, and test cohorts.
    Modified for multinomial classification.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide, for each chain prefix ``tra`` and ``trb``, the columns
        ``<chain>_vgene``, ``<chain>_jgene`` and ``<chain>_cdr3``, plus
        ``weights``, ``split`` and one ``labels_<tag>`` column per tag.
    tags : iterable of str
        Tags whose ``labels_<tag>`` columns are copied into the result.

    Returns
    -------
    pandas.DataFrame
        All features concatenated column-wise, followed by ``weights``, the
        label columns and ``split``. The per-chain scalar features
        (``length``, ``avg_basicity``, ``avg_hydrophobicity``,
        ``avg_helicity``, ``pI``, ``avg_mutation_stability``, ``mass``) use
        the same column name for both chains, so these names occur twice.

    Notes
    -----
    ``mutation_stability`` has no entries for 'B', 'X' and 'Z'; a CDR3
    containing one of them raises ``KeyError``.
    """
    # physicochemical amino acid properties
    basicity = {
        'A': 206.4, 'B': 210.7, 'C': 206.2, 'D': 208.6, 'E': 215.6,
        'F': 212.1, 'G': 202.7, 'H': 223.7, 'I': 210.8, 'K': 221.8,
        'L': 209.6, 'M': 213.3, 'N': 212.8, 'P': 214.4, 'Q': 214.2,
        'R': 237.0, 'S': 207.6, 'T': 211.7, 'V': 208.7, 'W': 216.1,
        'X': 210.2, 'Y': 213.1, 'Z': 214.9
    }
    hydrophobicity = {
        'A': 0.16, 'B': -3.14, 'C': 2.50, 'D': -2.49, 'E': -1.50,
        'F': 5.00, 'G': -3.31, 'H': -4.63, 'I': 4.41, 'K': -5.00,
        'L': 4.76, 'M': 3.23, 'N': -3.79, 'P': -4.92, 'Q': -2.76,
        'R': -2.77, 'S': -2.85, 'T': -1.08, 'V': 3.02, 'W': 4.88,
        'X': 4.59, 'Y': 2.00, 'Z': -2.13
    }
    helicity = {
        'A': 1.24, 'B': 0.92, 'C': 0.79, 'D': 0.89, 'E': 0.85,
        'F': 1.26, 'G': 1.15, 'H': 0.97, 'I': 1.29, 'K': 0.88,
        'L': 1.28, 'M': 1.22, 'N': 0.94, 'P': 0.57, 'Q': 0.96,
        'R': 0.95, 'S': 1.00, 'T': 1.09, 'V': 1.27, 'W': 1.07,
        'X': 1.29, 'Y': 1.11, 'Z': 0.91
    }
    mutation_stability = {
        'A': 13, 'C': 52, 'D': 11, 'E': 12, 'F': 32, 'G': 27, 'H': 15,
        'I': 10, 'K': 24, 'L': 34, 'M': 6, 'N': 6, 'P': 20, 'Q': 10,
        'R': 17, 'S': 10, 'T': 11, 'V': 17, 'W': 55, 'Y': 31
    }

    def mean_property(table):
        # average of a per-residue property over one sequence
        return lambda seq: sum(table[aa] for aa in seq) / parser.length(seq)

    def records_to_frame(records):
        # Frames built from records get a fresh 0..n-1 index. Give them the
        # labels of ``data`` so the final column-wise concat, which aligns
        # on index labels, keeps every row paired with its source row.
        frame = pd.DataFrame.from_records(records).fillna(0)
        frame.index = data.index
        return frame

    # feature conversion and generation
    features_list = []
    for chain in ['tra', 'trb']:
        # one-hot encoded V and J genes
        onehot_encoder = feature_extraction.DictVectorizer(sparse=False)
        gene_records = data[[chain + '_vgene',
                             chain + '_jgene']].to_dict(orient='records')
        gene_matrix = onehot_encoder.fit_transform(gene_records)
        features_list.append(
            pd.DataFrame(gene_matrix,
                         columns=onehot_encoder.feature_names_,
                         index=data.index))

        cdr3 = data[chain + '_cdr3']

        # sequence length
        features_list.append(cdr3.apply(parser.length).to_frame(name='length'))

        # number of occurences of each amino acid
        aa_counts = records_to_frame(
            [parser.amino_acid_composition(sequence) for sequence in cdr3])
        aa_counts.columns = [
            chain + '_count_{}'.format(column) for column in aa_counts.columns
        ]
        features_list.append(aa_counts)

        # physicochemical properties: (average) basicity, (average)
        # hydrophobicity, (average) helicity, pI, (average) mutation
        # stability, followed by the peptide mass
        scalar_features = [
            ('avg_basicity', mean_property(basicity)),
            ('avg_hydrophobicity', mean_property(hydrophobicity)),
            ('avg_helicity', mean_property(helicity)),
            ('pI', electrochem.pI),
            ('avg_mutation_stability', mean_property(mutation_stability)),
            ('mass', mass.fast_mass),
        ]
        for column_name, compute in scalar_features:
            features_list.append(
                cdr3.apply(compute).to_frame(name=column_name))

        # positional features
        # amino acid occurence and physicochemical properties at a given
        # position from the center
        pos_aa, pos_basicity, pos_hydro, pos_helicity, pos_pI, pos_mutation = [
            [] for _ in range(6)
        ]
        for sequence in cdr3:
            pos_range = _centered_positions(parser.length(sequence))
            pairs = list(zip(pos_range, sequence))
            pos_aa.append({
                chain + '_pos_{}_{}'.format(pos, aa): 1
                for pos, aa in pairs
            })
            pos_basicity.append({
                chain + '_pos_{}_basicity'.format(pos): basicity[aa]
                for pos, aa in pairs
            })
            pos_hydro.append({
                chain + '_pos_{}_hydrophobicity'.format(pos):
                hydrophobicity[aa]
                for pos, aa in pairs
            })
            pos_helicity.append({
                chain + '_pos_{}_helicity'.format(pos): helicity[aa]
                for pos, aa in pairs
            })
            pos_pI.append({
                chain + '_pos_{}_pI'.format(pos): electrochem.pI(aa)
                for pos, aa in pairs
            })
            pos_mutation.append({
                chain + '_pos_{}_mutation_stability'.format(pos):
                mutation_stability[aa]
                for pos, aa in pairs
            })
        for records in (pos_aa, pos_basicity, pos_hydro, pos_helicity,
                        pos_pI, pos_mutation):
            features_list.append(records_to_frame(records))

    # NOTE(review): weights, labels and split are appended once, after both
    # chains have been processed - confirm against the original layout.
    features_list.append(data['weights'])
    for tag in tags:
        features_list.append(data['labels_' + tag])
    features_list.append(data['split'])

    # combine all features
    data_processed = pd.concat(features_list, axis=1)

    return data_processed