def test(class_, data, mhc, model_path, model='lstm'):
    '''
    Evaluation protocol

    Load the binding data in `data`, keep the entries of allele `mhc`,
    score them with the network whose weights are stored at `model_path`
    and print the AUC, F1 and Kendall tau of the predictions.

    class_     -- MHC class, 'I' or 'II' (case-insensitive); selects the
                  peptide mask length
    data       -- path of a comma-separated file with 'mhc', 'peptide'
                  and 'IC50(nM)' columns
    mhc        -- allele to evaluate
    model_path -- weights file loaded into the network
    model      -- architecture name; only 'lstm' can be built here

    Raises ValueError for an unknown class or architecture.
    '''

    # print out options
    print('Testing\nMHC: %s\nData: %s\nModel: %s\nSave path: %s' %
          (mhc, data, model, model_path))

    # load the evaluation data
    test_data = Dataset.from_csv(filename=data,
                                 sep=',',
                                 allele_column_name='mhc',
                                 peptide_column_name='peptide',
                                 affinity_column_name='IC50(nM)')

    # set the length; an unknown class used to surface later as an
    # UnboundLocalError on mask_len
    mhc_class = class_.upper()
    if mhc_class == 'I':
        mask_len = MHCI_MASK_LEN
    elif mhc_class == 'II':
        mask_len = MHCII_MASK_LEN
    else:
        raise ValueError("class_ must be 'I' or 'II', got %r" % (class_,))

    # apply cut/pad or mask to same length
    if 'lstm' in model or 'gru' in model or 'attn' in model:
        test_data.mask_peptides(max_len=mask_len)
    else:
        test_data.cut_pad_peptides()

    # get the allele specific data
    mhc_test = test_data.get_allele(mhc)

    # define model; without the else branch `model` stayed a plain string
    # and the load_weights call below failed with an AttributeError
    if model == 'lstm':
        model = mhcnuggets_lstm(input_size=(mask_len, NUM_AAS))
    else:
        raise ValueError("unsupported model %r, expected 'lstm'" % (model,))

    # load weights and compile model
    model.load_weights(model_path)
    model.compile(loss='mse', optimizer=Adam(lr=0.001))

    # get tensorized values for testing
    test_peptides, test_continuous, test_binary = mhc_test.tensorize_keras(
        embed_type='softhot')

    # test
    preds_continuous, preds_binary = get_predictions(test_peptides, model)
    test_auc = roc_auc_score(test_binary, preds_continuous)
    test_f1 = f1_score(test_binary, preds_binary)
    test_ktau = kendalltau(test_continuous, preds_continuous)[0]
    print('Test AUC: %.4f, F1: %.4f, KTAU: %.4f' %
          (test_auc, test_f1, test_ktau))
def predict(class_, peptides_path, mhc, model='lstm', weights_path=None,
            output=None):
    '''
    Prediction protocol

    Read one peptide per line from `peptides_path`, predict an IC50 for
    each with the network for allele `mhc` and write a 'peptide,ic50'
    CSV table.

    class_        -- MHC class, 'I' or 'II' (case-insensitive)
    peptides_path -- text file with one peptide per line
    mhc           -- allele to predict for
    model         -- architecture name; only 'lstm' can be built here
    weights_path  -- weights file to load; when empty, the production
                     weights of the closest allele are used
    output        -- path of the result file; when empty, results go to
                     standard output

    Raises ValueError for an unknown class or architecture.
    '''

    # set the length; checked first so that a bad class fails before any
    # file is touched instead of as an UnboundLocalError further down
    mhc_class = class_.upper()
    if mhc_class == 'I':
        mask_len = MHCI_MASK_LEN
    elif mhc_class == 'II':
        mask_len = MHCII_MASK_LEN
    else:
        raise ValueError("class_ must be 'I' or 'II', got %r" % (class_,))

    # read peptides; the context manager closes the input file
    with open(peptides_path) as peptides_file:
        peptides = [p.strip() for p in peptides_file]

    print('Predicting for %d peptides' % (len(peptides)))

    # apply cut/pad or mask to same length
    if 'lstm' in model or 'gru' in model:
        normed_peptides = mask_peptides(peptides, max_len=mask_len)
    else:
        normed_peptides = cut_pad_peptides(peptides)

    # get tensorized values for prediction
    peptides_tensor = tensorize_keras(normed_peptides, embed_type='softhot')

    # make model
    print('Building model')
    # define model; without the else branch `model` stayed a plain string
    # and load_weights failed with an AttributeError
    if model == 'lstm':
        model = mhcnuggets_lstm(input_size=(mask_len, NUM_AAS))
    else:
        raise ValueError("unsupported model %r, expected 'lstm'" % (model,))

    if weights_path:
        model.load_weights(weights_path)
    else:
        if mhc_class == 'I':
            predictor_mhc = closest_mhcI(mhc)
        else:
            predictor_mhc = closest_mhcII(mhc)
        print("Closest allele found", predictor_mhc)
        model.load_weights(
            os.path.join(MHCNUGGETS_HOME, "saves", "production",
                         predictor_mhc + '.h5'))

    model.compile(loss='mse', optimizer=Adam(lr=0.001))

    # test model
    preds_continuous, preds_binary = get_predictions(peptides_tensor, model)
    ic50s = [map_proba_to_ic50(p[0]) for p in preds_continuous]

    # write out results
    filehandle = open(output, 'w') if output else sys.stdout
    try:
        print(','.join(('peptide', 'ic50')), file=filehandle)
        for i, peptide in enumerate(peptides):
            print(','.join((peptide, str(ic50s[i]))), file=filehandle)
    finally:
        # only close what was opened here; stdout belongs to the caller
        if filehandle is not sys.stdout:
            filehandle.close()
def calculate_relation(mhc, data, model, weights_dir, mass_spec, rand_negs,
                       ic50_threshold, max_ic50, binary=False,
                       embed_peptides=False):
    '''
    Find the alleles whose saved weights best predict the data of `mhc`

    Every other allele in `data.alleles` that has a '<allele>.h5' file in
    `weights_dir` is loaded into `model` and scored on the peptides of
    `mhc` by AUC, F1 and PPV over the top `num_positives` predictions.

    mhc            -- allele whose data is being predicted
    data           -- dataset providing `alleles` and `get_allele`
    model          -- network the candidate weights are loaded into
    weights_dir    -- directory holding the per-allele weight files
    mass_spec, rand_negs, ic50_threshold
                   -- forwarded to `data.get_allele`
    ic50_threshold, max_ic50, binary, embed_peptides
                   -- forwarded to `get_predictions`

    Returns the tuple
        (best_auc_mhc, best_auc, best_f1_mhc, best_f1,
         best_ppv_top_mhc, best_ppv_top, num_mhc, num_tuning_mhc)
    where `num_mhc` is the number of peptides of `mhc`. An allele name is
    None when no candidate scored above 0 for that metric, and
    `num_tuning_mhc` is None when no candidate was evaluated at all;
    these cases used to raise UnboundLocalError at the return statement.
    '''

    print('Calculating tuning MHC for %s' % mhc)

    # get the allele specific data
    mhc_data, num_positives, num_random_negatives, num_real_negatives = \
        data.get_allele(mhc, mass_spec, rand_negs, ic50_threshold)
    train_peptides, train_continuous, train_binary = \
        mhc_data.tensorize_keras(embed_type='softhot')
    num_mhc = len(mhc_data.peptides)

    # every returned name gets a value up front so the return statement is
    # valid even when the loop never records a winner
    best_auc_mhc = None
    best_f1_mhc = None
    best_ppv_top_mhc = None
    best_auc = 0
    best_f1 = 0
    best_ppv_top = 0
    num_tuning_mhc = None

    for tuning_mhc in sorted(set(data.alleles)):
        # don't want to tune with ourselves
        if mhc == tuning_mhc:
            continue

        # alleles without readable weights are skipped
        try:
            model_path = os.path.join(weights_dir, tuning_mhc + '.h5')
            model.load_weights(model_path)
        except IOError:
            continue

        preds_continuous, preds_binary = get_predictions(
            train_peptides, model, binary, embed_peptides,
            ic50_threshold, max_ic50)

        try:
            auc = roc_auc_score(train_binary, preds_continuous)
            f1 = f1_score(train_binary, preds_binary)

            # stack continuous predictions, binary predictions and true
            # labels as columns, sort by continuous prediction and compute
            # the precision on the top num_positives rows only
            raveled_preds_continuous = np.array(preds_continuous,
                                                dtype='float32').ravel()
            np_lists = np.array(
                [raveled_preds_continuous, preds_binary, train_binary])
            columns = ['pred_cont', 'pred_bin', 'true_bin']
            dframe = pd.DataFrame(np_lists.T, columns=columns)
            dframe.sort_values('pred_cont', inplace=True, ascending=False)
            dframe_head = dframe.head(num_positives)
            sorted_pred_bin = dframe_head['pred_bin'].tolist()
            sorted_true_bin = dframe_head['true_bin'].tolist()
            ppv_top = precision_score(sorted_true_bin, sorted_pred_bin,
                                      pos_label=1)

            if auc > best_auc:
                best_auc_mhc = tuning_mhc
                best_auc = auc

            if f1 > best_f1:
                best_f1_mhc = tuning_mhc
                best_f1 = f1

            if ppv_top > best_ppv_top:
                best_ppv_top_mhc = tuning_mhc
                best_ppv_top = ppv_top

            # NOTE(review): reconstructed from flattened source as running
            # for every scored allele, so the count belongs to the last
            # allele evaluated rather than to a best one -- confirm intent
            adata, num_pos, num_rand_neg, num_real_neg = data.get_allele(
                tuning_mhc, mass_spec, rand_negs, ic50_threshold)
            num_tuning_mhc = len(adata.peptides)
        except ValueError:
            # a ValueError while scoring or fetching this allele skips it
            continue

    return (best_auc_mhc, best_auc, best_f1_mhc, best_f1,
            best_ppv_top_mhc, best_ppv_top, num_mhc, num_tuning_mhc)
def train(class_, data, mhc, save_path, n_epoch, model='lstm', lr=0.001,
          transfer_path=None):
    '''
    Training protocol

    Fit a network on the entries of allele `mhc` found in `data`, one
    epoch at a time, and save the weights to `save_path` each time the
    F1 score on the training data exceeds the best seen so far.

    class_        -- MHC class, 'I' or 'II' (case-insensitive); selects
                     the peptide mask length
    data          -- path of a comma-separated file with 'mhc', 'peptide'
                     and 'IC50(nM)' columns
    mhc           -- allele to train on
    save_path     -- where the best weights are written
    n_epoch       -- number of single-epoch fits to run
    model         -- architecture name; only 'lstm' can be built here
    lr            -- learning rate handed to the Adam optimizer
    transfer_path -- optional weights file to start from

    Raises ValueError for an unknown class or architecture.
    '''

    # print out options
    print('Training\nMHC: %s\nData: %s\nModel: %s\nSave path: %s\nTransfer: %s' %
          (mhc, data, model, save_path, transfer_path))

    # set the length; checked before loading anything so a bad class fails
    # fast instead of as an UnboundLocalError on mask_len
    mhc_class = class_.upper()
    if mhc_class == 'I':
        mask_len = MHCI_MASK_LEN
    elif mhc_class == 'II':
        mask_len = MHCII_MASK_LEN
    else:
        raise ValueError("class_ must be 'I' or 'II', got %r" % (class_,))

    # load training
    train_data = Dataset.from_csv(filename=data,
                                  sep=',',
                                  allele_column_name='mhc',
                                  peptide_column_name='peptide',
                                  affinity_column_name='IC50(nM)')

    # apply cut/pad or mask to same length
    if 'lstm' in model or 'gru' in model or 'attn' in model:
        train_data.mask_peptides(max_len=mask_len)
    else:
        train_data.cut_pad_peptides()

    # get the allele specific data
    mhc_train = train_data.get_allele(mhc)
    print('Training on %d peptides' % len(mhc_train.peptides))

    # define model; without the else branch `model` stayed a plain string
    # and the calls below failed with an AttributeError
    if model == 'lstm':
        model = mhcnuggets_lstm(input_size=(mask_len, NUM_AAS))
    else:
        raise ValueError("unsupported model %r, expected 'lstm'" % (model,))

    # check if we need to do transfer learning
    if transfer_path:
        model.load_weights(transfer_path)

    # compile model with the requested learning rate (it used to be
    # hard-coded to 0.001, silently ignoring the `lr` argument)
    model.compile(loss='mse', optimizer=Adam(lr=lr))

    # get tensorized values for training
    train_peptides, train_continuous, train_binary = \
        mhc_train.tensorize_keras(embed_type='softhot')

    # convergence criterion
    highest_f1 = -1

    for epoch in range(n_epoch):
        # train
        model.fit(train_peptides, train_continuous, epochs=1, verbose=0)

        # test model on training data
        train_preds_cont, train_preds_bin = get_predictions(train_peptides,
                                                            model)
        train_auc = roc_auc_score(train_binary, train_preds_cont)
        train_f1 = f1_score(train_binary, train_preds_bin)
        train_ktau = kendalltau(train_continuous, train_preds_cont)[0]
        print('epoch %d / %d' % (epoch, n_epoch))
        print('Train AUC: %.4f, F1: %.4f, KTAU: %.4f' %
              (train_auc, train_f1, train_ktau))

        # convergence: keep the weights of the best epoch so far
        if train_f1 > highest_f1:
            highest_f1 = train_f1
            model.save_weights(save_path)

    print('Done!')
def train(class_, data, mhc, save_path, n_epoch, model='lstm', lr=0.001,
          transfer_path=None, mass_spec=False, ic50_threshold=500,
          max_ic50=50000):
    '''
    Training protocol

    Fit an LSTM network on the entries of allele `mhc` found in `data`,
    one epoch at a time, and save the weights to `save_path` each time
    the PPV over the top `n_pos` predictions on the training data exceeds
    the best seen so far.

    class_         -- MHC class, 'I' or 'II' (case-insensitive); selects
                      the peptide mask length
    data           -- path of a comma-separated file with 'mhc',
                      'peptide', 'IC50(nM)', 'measurement_type' and
                      'measurement_source' columns
    mhc            -- allele to train on
    save_path      -- where the best weights are written
    n_epoch        -- number of single-epoch fits to run
    model          -- only echoed in the option printout; the network
                      built is always mhcnuggets_lstm
    lr             -- learning rate handed to the Adam optimizer
    transfer_path  -- optional weights file to start from
    mass_spec      -- forwarded to `get_allele`; also selects
                      binary cross-entropy loss instead of MSE
    ic50_threshold -- forwarded to `Dataset.from_csv` and `get_allele`
    max_ic50       -- forwarded to `Dataset.from_csv`

    Raises ValueError for an unknown class.
    '''

    # print out options
    print('Training\nMHC: %s\nData: %s\nModel: %s\nSave path: %s\nTransfer: %s\nMassSpec: %s' %
          (mhc, data, model, save_path, transfer_path, mass_spec))

    # set the length; checked before loading anything so a bad class fails
    # fast instead of as an UnboundLocalError on mask_len
    mhc_class = class_.upper()
    if mhc_class == 'I':
        mask_len = MHCI_MASK_LEN
    elif mhc_class == 'II':
        mask_len = MHCII_MASK_LEN
    else:
        raise ValueError("class_ must be 'I' or 'II', got %r" % (class_,))

    # load training
    train_data = Dataset.from_csv(filename=data,
                                  ic50_threshold=ic50_threshold,
                                  max_ic50=max_ic50,
                                  sep=',',
                                  allele_column_name='mhc',
                                  peptide_column_name='peptide',
                                  affinity_column_name='IC50(nM)',
                                  type_column_name='measurement_type',
                                  source_column_name='measurement_source')

    train_data.mask_peptides(max_len=mask_len)

    # get the allele specific data
    mhc_train, n_pos, n_rand_neg, n_real_neg = train_data.get_allele(
        mhc, mass_spec, ic50_threshold)

    # define model
    input_size = (mask_len, NUM_AAS)
    model = mhcnuggets_lstm(input_size)

    # check if we need to do transfer learning
    if transfer_path:
        model.load_weights(transfer_path)

    # select appropriate loss function for binding affinity data
    # (continuous) or mass spec data (binary); the learning rate now comes
    # from `lr` instead of a hard-coded 0.001 that ignored the argument
    loss = 'binary_crossentropy' if mass_spec else 'mse'
    model.compile(loss=loss, optimizer=Adam(lr=lr))

    # convergence criterion
    highest_ppv_top = -1

    # get tensorized values of the whole dataset for epoch training and
    # for testing
    train_peptides, train_continuous, train_binary = \
        mhc_train.tensorize_keras(embed_type='softhot')

    for epoch in range(n_epoch):
        # train
        model.fit(train_peptides, train_continuous, epochs=1, verbose=0)

        # test model on training data
        train_preds_cont, train_preds_bin = get_predictions(train_peptides,
                                                            model)
        train_auc = roc_auc_score(train_binary, train_preds_cont)
        train_f1 = f1_score(train_binary, train_preds_bin)
        train_ktau = kendalltau(train_continuous, train_preds_cont)[0]
        raveled_train_preds_cont = np.array(train_preds_cont,
                                            dtype='float32').ravel()
        train_pearsonr = pearsonr(train_continuous,
                                  raveled_train_preds_cont)[0]
        train_ppv = precision_score(train_binary, train_preds_bin,
                                    pos_label=1)

        # stack continuous predictions, binary predictions and true labels
        # as columns, sort by continuous prediction and compute the
        # precision on the top n_pos rows only
        np_lists = np.array([raveled_train_preds_cont, train_preds_bin,
                             train_binary])
        columns = ['pred_cont', 'pred_bin', 'true_bin']
        dframe = pd.DataFrame(np_lists.T, columns=columns)
        dframe.sort_values('pred_cont', inplace=True, ascending=False)
        dframe_head = dframe.head(n_pos)
        sorted_pred_bin = dframe_head['pred_bin'].tolist()
        sorted_true_bin = dframe_head['true_bin'].tolist()
        train_ppv_top = precision_score(sorted_true_bin, sorted_pred_bin,
                                        pos_label=1)

        print('epoch %d / %d' % (epoch, n_epoch))
        print('Num pos: %.4f\nTrain AUC: %.4f, F1: %.4f, KTAU: %.4f, PCC: %.4f, PPV: %.4f, PPVtop: %.4f' %
              (n_pos, train_auc, train_f1, train_ktau, train_pearsonr,
               train_ppv, train_ppv_top))

        # convergence: keep the weights of the best epoch so far
        if train_ppv_top > highest_ppv_top:
            highest_ppv_top = train_ppv_top
            model.save_weights(save_path)

    print('Done!')
def predict(class_, peptides_path, mhc,
            pickle_path='data/production/examples_per_allele.pkl',
            model='lstm', model_weights_path="saves/production/",
            output=None, mass_spec=False, ic50_threshold=500,
            max_ic50=50000, embed_peptides=False, binary_preds=False,
            ba_models=False):
    '''
    Prediction protocol

    Read one peptide per line from `peptides_path`, predict an IC50 for
    each with an LSTM network and write a 'peptide,ic50' CSV table with
    the IC50 rounded to two decimals.

    class_             -- MHC class, 'I' or 'II' (case-insensitive)
    peptides_path      -- text file with one peptide per line
    mhc                -- allele to predict for
    pickle_path        -- forwarded to the closest-allele lookup
    model              -- not used to pick the network; mhcnuggets_lstm
                          is always built
    model_weights_path -- when left at "saves/production/", weights of the
                          closest allele are taken from that directory
                          under MHCNUGGETS_HOME; any other value is loaded
                          directly as a weights file
    output             -- path of the result file; when empty, results go
                          to standard output
    mass_spec          -- compile with binary cross-entropy instead of MSE
    ic50_threshold, max_ic50, embed_peptides, binary_preds
                       -- forwarded to `get_predictions`; `max_ic50` is
                          also used to map predictions to IC50
    ba_models          -- force the '<allele>_BA.h5' weights even when
                          '<allele>_BA_to_HLAp.h5' exists

    Raises ValueError for an unknown class.
    '''

    # set the length; checked first so that a bad class fails before any
    # file is touched instead of as an UnboundLocalError further down
    mhc_class = class_.upper()
    if mhc_class == 'I':
        mask_len = MHCI_MASK_LEN
    elif mhc_class == 'II':
        mask_len = MHCII_MASK_LEN
    else:
        raise ValueError("class_ must be 'I' or 'II', got %r" % (class_,))
    input_size = (mask_len, NUM_AAS)

    # read peptides; the context manager closes the input file
    with open(peptides_path) as peptides_file:
        peptides = [p.strip() for p in peptides_file]

    print('Predicting for %d peptides' % (len(peptides)))

    # apply cut/pad or mask to same length
    normed_peptides, original_peptides = mask_peptides(peptides,
                                                       max_len=mask_len)
    # get tensorized values for prediction
    peptides_tensor = tensorize_keras(normed_peptides, embed_type='softhot')

    # make model
    print('Building model')
    model = mhcnuggets_lstm(input_size)
    if mhc_class == 'I':
        predictor_mhc = closest_mhcI(mhc, pickle_path)
    else:
        predictor_mhc = closest_mhcII(mhc, pickle_path)
    print("Closest allele found", predictor_mhc)

    # candidate production weight files for the closest allele
    ba_weights = os.path.join(MHCNUGGETS_HOME, model_weights_path,
                              predictor_mhc + '_BA.h5')
    ba_to_hlap_weights = os.path.join(MHCNUGGETS_HOME, model_weights_path,
                                      predictor_mhc + '_BA_to_HLAp.h5')

    if model_weights_path != "saves/production/":
        print('Predicting with user-specified model: ' + model_weights_path)
        model.load_weights(model_weights_path)
    elif ba_models:
        print('Predicting with only binding affinity trained models')
        model.load_weights(ba_weights)
    elif os.path.isfile(ba_to_hlap_weights):
        print('BA_to_HLAp model found, predicting with BA_to_HLAp model...')
        model.load_weights(ba_to_hlap_weights)
    else:
        print('No BA_to_HLAp model found, predicting with BA model...')
        model.load_weights(ba_weights)

    if mass_spec:
        model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001))
    else:
        model.compile(loss='mse', optimizer=Adam(lr=0.001))

    # test model
    preds_continuous, preds_binary = get_predictions(
        peptides_tensor, model, binary_preds, embed_peptides,
        ic50_threshold, max_ic50)
    ic50s = [map_proba_to_ic50(p[0], max_ic50) for p in preds_continuous]

    # write out results
    filehandle = open(output, 'w') if output else sys.stdout
    try:
        print(','.join(('peptide', 'ic50')), file=filehandle)
        for i, peptide in enumerate(original_peptides):
            print(','.join((peptide, str(round(ic50s[i], 2)))),
                  file=filehandle)
    finally:
        # only close what was opened here; stdout belongs to the caller
        if filehandle is not sys.stdout:
            filehandle.close()
def test(class_, data, mhc, model_path, model='lstm', mass_spec=False,
         ic50_threshold=500, max_ic50=50000):
    '''
    Evaluation protocol

    Load the binding data in `data`, keep the entries of allele `mhc`,
    score them with an LSTM network whose weights are stored at
    `model_path` and print AUC, F1, Kendall tau, Pearson correlation, PPV
    and PPV over the top `npos` predictions.

    class_         -- MHC class, 'I' or 'II' (case-insensitive); selects
                      the peptide mask length
    data           -- path of a comma-separated file with 'mhc',
                      'peptide', 'IC50(nM)', 'measurement_type' and
                      'measurement_source' columns
    mhc            -- allele to evaluate
    model_path     -- weights file loaded into the network
    model          -- only echoed in the option printout; the network
                      built is always mhcnuggets_lstm
    mass_spec      -- forwarded to `get_allele`; also selects binary
                      cross-entropy loss instead of MSE
    ic50_threshold -- forwarded to `Dataset.from_csv`, `get_allele` and
                      `get_predictions`
    max_ic50       -- forwarded to `Dataset.from_csv` and
                      `get_predictions`

    Raises ValueError for an unknown class.
    '''

    # print out options
    print(
        'Testing\nMHC: %s\nData: %s\nModel: %s\nSave path: %s\nMass spec: %s\nIC50 threshold: %s\nMax IC50: %s\n'
        % (mhc, data, model, model_path, mass_spec, ic50_threshold,
           max_ic50))

    # load the evaluation data
    test_data = Dataset.from_csv(filename=data,
                                 ic50_threshold=ic50_threshold,
                                 max_ic50=max_ic50,
                                 sep=',',
                                 allele_column_name='mhc',
                                 peptide_column_name='peptide',
                                 affinity_column_name='IC50(nM)',
                                 type_column_name='measurement_type',
                                 source_column_name='measurement_source')

    # define model; an unknown class used to surface as an
    # UnboundLocalError on input_size
    mhc_class = class_.upper()
    if mhc_class == 'I':
        mask_len = MHCI_MASK_LEN
    elif mhc_class == 'II':
        mask_len = MHCII_MASK_LEN
    else:
        raise ValueError("class_ must be 'I' or 'II', got %r" % (class_,))
    input_size = (mask_len, NUM_AAS)

    model = mhcnuggets_lstm(input_size)

    test_data.mask_peptides(max_len=mask_len)

    # get the allele specific data
    mhc_test, npos, nrandneg, nrealneg = test_data.get_allele(
        mhc, mass_spec, ic50_threshold, length=None)

    # load weights and compile model
    model.load_weights(model_path)
    if mass_spec:
        model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001))
    else:
        model.compile(loss='mse', optimizer=Adam(lr=0.001))

    # get tensorized values for testing
    test_peptides, test_continuous, test_binary = mhc_test.tensorize_keras(
        embed_type='softhot')

    # test
    preds_continuous, preds_binary = get_predictions(
        test_peptides, model, ic50_threshold=ic50_threshold,
        max_ic50=max_ic50)
    test_auc = roc_auc_score(test_binary, preds_continuous)
    test_f1 = f1_score(test_binary, preds_binary)
    test_ktau = kendalltau(test_continuous, preds_continuous)[0]
    raveled_preds_continuous = np.array(preds_continuous,
                                        dtype='float32').ravel()
    test_pearsonr = pearsonr(test_continuous, raveled_preds_continuous)[0]
    test_ppv = precision_score(test_binary, preds_binary, pos_label=1)

    # stack continuous predictions, binary predictions and true labels as
    # columns, sort by continuous prediction and compute the precision on
    # the top npos rows only
    np_lists = np.array([raveled_preds_continuous, preds_binary,
                         test_binary])
    columns = ['pred_cont', 'pred_bin', 'true_bin']
    dframe = pd.DataFrame(np_lists.T, columns=columns)
    dframe.sort_values('pred_cont', inplace=True, ascending=False)
    dframe_head = dframe.head(npos)
    sorted_pred_bin = dframe_head['pred_bin'].tolist()
    sorted_true_bin = dframe_head['true_bin'].tolist()
    test_ppv_top = precision_score(sorted_true_bin, sorted_pred_bin,
                                   pos_label=1)

    print(
        'Test AUC: %.4f, F1: %.4f, KTAU: %.4f, PCC: %.4f, PPV: %.4f, PPVtop: %.4f'
        % (test_auc, test_f1, test_ktau, test_pearsonr, test_ppv,
           test_ppv_top))