def get_acc(prediction_dict, label_seen_pos, label_unseen_pos, prefix=""):
    """Print evaluation metrics for one prediction dictionary on three label
    subsets: the whole label set, the seen ("original") labels, and the
    unseen labels.

    NOTE: ``eval`` here is the project-level evaluation helper (it shadows the
    Python builtin) and ``evaluation_metric`` is a project module.
    """
    print('\nmodel type {}'.format(prefix))
    print('\nsize {}\n'.format(prediction_dict['prediction'].shape))
    # (header, label positions); None means score every label column.
    subsets = [
        ('\nwhole', None),
        ('\noriginal', label_seen_pos),
        ('\nunseen', label_unseen_pos),
    ]
    for header, positions in subsets:
        print(header)
        if positions is None:
            evaluation_metric.print_metrics(eval(prediction_dict))
        else:
            evaluation_metric.print_metrics(eval(prediction_dict, positions))
def submitJobs(onto, label_original, count_file, method, path, filter_down):
    """Evaluate one prediction pickle, reporting metrics on the whole label
    set and on the low/middle/high-frequency GO-term subsets.

    @onto            ontology short name ('cc', 'mf' or 'bp'), used in file names.
    @label_original  path to a tab-separated training-label file (one GO id per
                     row in column 0).
    @count_file      directory containing 'CountGoInTrain-<onto>.tsv'.
    @method          path to the prediction pickle (dict with key 'prediction').
    @path            output directory passed through to @eval.
    @filter_down     the string 'none' disables filtering; any other value
                     enables it (only forwarded for the low-frequency subset).

    NOTE: ``eval`` is the project evaluation helper (shadows the builtin).
    """
    filter_down = filter_down != 'none'  # string flag -> bool
    # label_original = pd.read_csv('/u/scratch/d/datduong/deepgo/data/train/deepgo.'+onto+'.csv',sep="\t",header=None)
    label_original = pd.read_csv(label_original, sep="\t", header=None)
    label_original = sorted(list(label_original[0]))  ## we sort labels in training
    #### compute accuracy by frequency
    low, middle, high = get_label_by_count(count_file + '/CountGoInTrain-' + onto + '.tsv')
    # Sets give O(1) membership tests while scanning for label positions.
    low, middle, high = set(low), set(middle), set(high)
    low_index = np.array([index for index, value in enumerate(label_original) if value in low])
    middle_index = np.array([index for index, value in enumerate(label_original) if value in middle])
    high_index = np.array([index for index, value in enumerate(label_original) if value in high])
    # Context manager closes the pickle handle instead of leaking it.
    with open(method, "rb") as handle:
        prediction_dict = pickle.load(handle)
    print('\nsize {}\n'.format(prediction_dict['prediction'].shape))
    print('\nwhole {}'.format(onto))
    evaluation_metric.print_metrics(
        eval(prediction_dict, path=path, add_name='whole'))
    print('\nlow {}'.format(onto))
    evaluation_metric.print_metrics(
        eval(prediction_dict, low_index, path=path, add_name='low', filter_down=filter_down))
    print('\nmiddle {}'.format(onto))
    evaluation_metric.print_metrics(
        eval(prediction_dict, middle_index, path=path, add_name='middle'))
    print('\nhigh {}'.format(onto))
    evaluation_metric.print_metrics(
        eval(prediction_dict, high_index, path=path, add_name='high'))
def submitJobs(where, count_file, method, save_file_type, filter_down):
    """Evaluate fold-1 test predictions for each ontology ('cc', 'mf', 'bp'),
    reporting metrics on the whole label set and on the low/middle/high
    frequency GO-term subsets.

    @where           working directory to chdir into.
    @count_file      directory containing 'CountGoInTrain-<onto>.tsv'.
    @method          method sub-directory name under .../train/fold_1/.
    @save_file_type  'prediction_train_all_on_test' selects the 'data' set,
                     anything else selects 'dataExpandGoSet16Jan2020'.
    @filter_down     'none' disables filtering; any other string enables it
                     (only forwarded for the low-frequency evaluation).

    NOTE: ``eval`` is the project evaluation helper (shadows the builtin).
    """
    filter_down = filter_down != 'none'  # string flag -> bool
    os.chdir(where)
    for onto in ['cc', 'mf', 'bp']:
        print('\n\ntype {}'.format(onto))
        if save_file_type == 'prediction_train_all_on_test':
            data_type = 'data'
        else:
            data_type = 'dataExpandGoSet16Jan2020'
        label_original = pd.read_csv(
            '/u/scratch/d/datduong/deepgo/' + data_type + '/train/deepgo.' + onto + '.csv',
            sep="\t", header=None)
        label_original = sorted(list(label_original[0]))  ## we sort labels in training
        #### compute accuracy by frequency
        low, middle, high = get_label_by_count(count_file + '/CountGoInTrain-' + onto + '.tsv')
        # Sets give O(1) membership tests while scanning for label positions.
        low, middle, high = set(low), set(middle), set(high)
        low_index = np.array([index for index, value in enumerate(label_original) if value in low])
        middle_index = np.array([index for index, value in enumerate(label_original) if value in middle])
        high_index = np.array([index for index, value in enumerate(label_original) if value in high])
        # Context manager closes the pickle handle instead of leaking it.
        with open("/u/scratch/d/datduong/deepgo/" + data_type + "/train/fold_1/" + method +
                  "/test-" + onto + "-prediction.pickle", "rb") as handle:
            prediction_dict = pickle.load(handle)
        path = "/u/scratch/d/datduong/deepgo/" + data_type + "/train/fold_1/" + method + "/" + onto
        # Race-free replacement for the exists()+mkdir() pair.
        os.makedirs(path, exist_ok=True)
        print('\nsize {}\n'.format(prediction_dict['prediction'].shape))
        print('\nwhole {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, path=path, add_name='whole'))
        print('\nlow {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, low_index, path=path, add_name='low', filter_down=filter_down))
        print('\nmiddle {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, middle_index, path=path, add_name='middle'))
        print('\nhigh {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, high_index, path=path, add_name='high'))
def do_eval(self, prot_loader, **kwargs):
    """Run the model in eval mode over @prot_loader and report multi-label metrics.

    @prot_loader  iterable of batches; each batch unpacks to
                  (prot_idx, prot_len, mask, label_ids, prot_interact_emb)
                  when self.args.has_ppi_emb is set, otherwise the last slot
                  is ignored.
    @kwargs       forwarded to self.forward; may also carry 'GoCount' together
                  with index arrays 'quant25', 'quant75', 'betweenQ25Q75'
                  used to score GO terms grouped by training frequency.

    Returns (result, output, tr_loss): @result is the metric dict from the
    last evaluation_metric.all_metrics call, @output holds the stacked raw
    predictions and truth arrays, and @tr_loss is the summed batch loss.
    """
    torch.cuda.empty_cache()
    self.eval()  # disable dropout / batch-norm updates during evaluation
    tr_loss = 0
    preds = []
    true_label = []
    for step, batch in enumerate(prot_loader):
        with torch.no_grad():  ## no gradient for everything in this section
            batch = tuple(t for t in batch)
            if self.args.has_ppi_emb:
                prot_idx, prot_len, mask, label_ids, prot_interact_emb = batch  ## @label_ids must be of size @args.num_label_to_test
            else:
                prot_idx, prot_len, mask, label_ids, _ = batch
            # Trim the padded sequence/mask down to the longest sequence in this batch.
            prot_idx = prot_idx[:, 0:int(max(prot_len))]  ## trim down
            mask = mask[:, 0:int(max(prot_len))]
            if self.args.has_ppi_emb and (self.args.prot_interact_vec_dim > 0):
                prot_interact_emb = prot_interact_emb.cuda()
            else:
                prot_interact_emb = None
            pred, loss = self.forward(prot_idx.cuda(), mask.cuda(), prot_interact_emb, label_ids.cuda(), **kwargs)
            # loss = self.classify_loss ( pred, label_ids.cuda() )
            tr_loss = tr_loss + loss
            ## take sigmoid here, if sigmoid was not taken inside @forward
            # NOTE(review): F.sigmoid is deprecated in recent torch; torch.sigmoid is the modern spelling.
            if self.loss_type == 'BCEWithLogitsLoss':
                pred = F.sigmoid(pred)
            # Accumulate every batch row-stacked into a single array held at index 0.
            if len(preds) == 0:
                preds.append(pred.detach().cpu().numpy())
                true_label.append(label_ids.detach().cpu().numpy())
            else:
                preds[0] = np.append(preds[0], pred.detach().cpu().numpy(), axis=0)
                true_label[0] = np.append(true_label[0], label_ids.detach().cpu().numpy(), axis=0)  # row array
    # end eval
    true_label = true_label[0]
    preds = preds[0]
    print('loss {}'.format(tr_loss))
    print('pred label')
    print(preds)
    print('true label')
    print(true_label)
    # Per-subset metric histories, keyed by 'full_data' or a frequency-cutoff name.
    trackF1macro = {}
    trackF1micro = {}  # metrics["f1_micro"]
    trackMacroPrecision = {}  # [MACRO] accuracy, precision, recall
    trackMacroRecall = {}
    trackMicroPrecision = {}
    trackMicroRecall = {}
    ##!! DO NOT NEED TO DO THIS ALL THE TIME DURING TRAINING
    # if self.args.not_train:
    #   rounding = np.arange(.1,1,.4)
    # else:
    rounding = [0.5]  # probability cutoff(s) used to binarize predictions
    for round_cutoff in rounding:
        print('\n\nround cutoff {}'.format(round_cutoff))
        preds_round = 1.0 * (round_cutoff < preds)  ## converted into 0/1
        result = evaluation_metric.all_metrics(
            preds_round, true_label, yhat_raw=preds, k=[10, 20])  ## we can pass vector of P@k and R@k
        evaluation_metric.print_metrics(result)
        if 'full_data' not in trackF1macro:
            trackF1macro['full_data'] = [result["f1_macro"]]
            trackF1micro['full_data'] = [result["f1_micro"]]
            trackMacroPrecision['full_data'] = [result["prec_macro"]]
            trackMicroPrecision['full_data'] = [result["prec_micro"]]
            trackMacroRecall['full_data'] = [result["rec_macro"]]
            trackMicroRecall['full_data'] = [result["rec_micro"]]
        else:
            trackF1macro['full_data'].append(result["f1_macro"])
            trackF1micro['full_data'].append(result["f1_micro"])
            trackMacroPrecision['full_data'].append(result["prec_macro"])
            trackMicroPrecision['full_data'].append(result["prec_micro"])
            trackMacroRecall['full_data'].append(result["rec_macro"])
            trackMicroRecall['full_data'].append(result["rec_micro"])
        if ('GoCount' in kwargs) and (self.args.not_train):  ## do not need to do this all the time
            print('\n\nsee if method improves accuracy conditioned on frequency of GO terms')
            ## frequency less than 25 quantile and over 75 quantile
            ## indexing must be computed ahead of time to avoid redundant calculation
            for cutoff in ['quant25', 'quant75', 'betweenQ25Q75']:
                ## indexing of the column to pull out , @pred is num_prot x num_go
                result = evaluation_metric.all_metrics(
                    preds_round[:, kwargs[cutoff]],
                    true_label[:, kwargs[cutoff]],
                    yhat_raw=preds[:, kwargs[cutoff]],
                    k=[10, 20])
                print("\nless than {} count".format(cutoff))
                evaluation_metric.print_metrics(result)
                if cutoff not in trackF1macro:
                    trackF1macro[cutoff] = [result["f1_macro"]]
                    trackF1micro[cutoff] = [result["f1_micro"]]
                    trackMacroPrecision[cutoff] = [result["prec_macro"]]
                    trackMicroPrecision[cutoff] = [result["prec_micro"]]
                    trackMacroRecall[cutoff] = [result["rec_macro"]]
                    trackMicroRecall[cutoff] = [result["rec_micro"]]
                else:
                    trackF1macro[cutoff].append(result["f1_macro"])
                    trackF1micro[cutoff].append(result["f1_micro"])
                    trackMacroPrecision[cutoff].append(result["prec_macro"])
                    trackMicroPrecision[cutoff].append(result["prec_micro"])
                    trackMacroRecall[cutoff].append(result["rec_macro"])
                    trackMicroRecall[cutoff].append(result["rec_micro"])
    ##
    # Final summary dump, only outside of training runs.
    if self.args.not_train:
        print('\n\ntracking f1 compile into list\n')
        # print ('\nmacro f1 prec rec')
        for k, v in trackF1macro.items():
            print('macroF1 ' + k + " " + " ".join(str(s) for s in v))
        for k, v in trackMacroPrecision.items():
            print('macroPrec ' + k + " " + " ".join(str(s) for s in v))
        for k, v in trackMacroRecall.items():
            print('macroRec ' + k + " " + " ".join(str(s) for s in v))
        # print ('\nmicro f1 prec rec')
        for k, v in trackF1micro.items():
            print('microF1 ' + k + " " + " ".join(str(s) for s in v))
        for k, v in trackMicroPrecision.items():
            print('microPrec ' + k + " " + " ".join(str(s) for s in v))
        for k, v in trackMicroRecall.items():
            print('microRec ' + k + " " + " ".join(str(s) for s in v))
    output = {
        'prediction': preds,
        'truth': true_label
    }  ##!! make life easier if we have both
    return result, output, tr_loss
def submitJobs(where, method, save_file_type, filter_down):
    """Compare prediction accuracy on the original (seen) label set versus the
    unseen labels of the expanded GO set, for each ontology ('cc', 'mf', 'bp').

    @where           working directory to chdir into.
    @method          method sub-directory name under .../BertNotFtAARawSeqGO/<onto>/.
    @save_file_type  pickle base name, e.g. 'prediction_train_all_on_test'
                     or 'save_prediction_expand'.
    @filter_down     'none' disables filtering; any other string enables it
                     (only forwarded for the unseen-label evaluation).

    The seen/original prediction may change relative to a seen-only model
    because prediction is joint, so attention weights affect the outcome.
    NOTE: ``eval`` is the project evaluation helper (shadows the builtin).
    """
    os.chdir(where)
    filter_down = filter_down != 'none'  # string flag -> bool
    for onto in ['cc', 'mf', 'bp']:
        print('\n\ntype {}'.format(onto))
        label_original = pd.read_csv(
            '/u/scratch/d/datduong/deepgo/data/train/deepgo.' + onto + '.csv',
            sep="\t", header=None)
        label_original = set(list(label_original[0]))
        label_large = pd.read_csv(
            '/u/scratch/d/datduong/deepgo/dataExpandGoSet16Jan2020/train/deepgo.' + onto + '.csv',
            sep="\t", header=None)
        label_large = set(list(label_large[0]))
        label_unseen = sorted(list(label_large - label_original))
        label_large = sorted(list(label_large))  ## by default we sort label for the model
        # Set views give O(1) membership below; testing 'in' against the sorted
        # lists made the position scans quadratic.
        unseen_set = set(label_unseen)
        seen_set = set(label_original)
        label_lookup = {value: counter for counter, value in enumerate(label_large)}
        label_unseen_pos = np.array([label_lookup[v] for v in label_lookup if v in unseen_set])
        label_seen_pos = np.array([label_lookup[v] for v in label_lookup if v in seen_set])
        #### want to compute accuracy on original set of labels, then on unseen labels
        ##!! prediction_train_all_on_test.pickle save_prediction_expand
        pickle_path = ("/u/scratch/d/datduong/deepgo/data/BertNotFtAARawSeqGO/" +
                       onto + "/" + method + "/" + save_file_type + ".pickle")
        try:
            print(pickle_path)
            with open(pickle_path, "rb") as handle:
                prediction_dict = pickle.load(handle)
        except (OSError, pickle.UnpicklingError, EOFError):
            # Missing or unreadable prediction file: skip this ontology.
            # (Was a bare `except:` which also swallowed KeyboardInterrupt.)
            print('\npass {}'.format(onto))
            continue
        path = "/u/scratch/d/datduong/deepgo/data/BertNotFtAARawSeqGO/" + onto + "/" + method
        print('\nsize {}\n'.format(prediction_dict['prediction'].shape))
        # if save_file_type == 'prediction_train_all_on_test':
        print('\nwhole {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, path=path, add_name='whole'))
        # if save_file_type == 'save_prediction_expand':
        print('\noriginal {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, label_seen_pos, path=path, add_name='original'))
        print('\nunseen {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, label_unseen_pos, path=path, add_name='unseen', filter_down=filter_down))
def submitJobs(where, count_file, method, save_file_type, filter_down):
    """For each ontology ('cc', 'mf', 'bp'), load one saved prediction pickle
    and print metrics on the whole label set and on the low/middle/high
    frequency GO-term subsets.

    @filter_down: the string 'none' disables filtering; anything else enables
    it (only forwarded for the low-frequency evaluation).
    NOTE: ``eval`` is the project evaluation helper (shadows the builtin).
    """
    os.chdir(where)
    filter_down = filter_down != 'none'  # string flag -> bool
    for onto in ['cc', 'mf', 'bp']:
        print('\n\ntype {}'.format(onto))
        label_original = pd.read_csv(
            '/u/scratch/d/datduong/deepgo/data/train/deepgo.' + onto + '.csv',
            sep="\t", header=None)
        label_original = sorted(list(label_original[0]))  ## we sort labels in training
        #### compute accuracy by frequency
        low, middle, high = get_label_by_count(count_file + '/CountGoInTrain-' + onto + '.tsv')

        def positions_of(group):
            # Column positions (in training-label order) of labels in @group.
            return np.array([pos for pos, go in enumerate(label_original) if go in group])

        low_index = positions_of(low)
        middle_index = positions_of(middle)
        high_index = positions_of(high)
        prediction_dict = pickle.load(
            open(
                "/u/scratch/d/datduong/deepgo/data/BertNotFtAARawSeqGO/" + onto +
                "/" + method + "/" + save_file_type + ".pickle", "rb"))
        path = "/u/scratch/d/datduong/deepgo/data/BertNotFtAARawSeqGO/" + onto + "/" + method
        print('\nsize {}\n'.format(prediction_dict['prediction'].shape))
        print('\nwhole {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, path=path, add_name='whole'))
        print('\nlow {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, low_index, path=path, add_name='low', filter_down=filter_down))
        print('\nmiddle {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, middle_index, path=path, add_name='middle'))
        print('\nhigh {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, high_index, path=path, add_name='high'))
def submitJobs(where_train, set_type, where_test, add_name, do_split):  ## @do_split is needed if we use metaGO data
    """Build gene x GO-term prediction and ground-truth matrices from saved
    pickles, then print precision/recall@k and Fmax overall and per bio type.

    @where_train  directory holding seq_seq_predict_go_*.pickle,
                  label_index_map.pickle and label_bio_type.pickle.
    @set_type     prefix of '<set_type>_gene_annot.pickle' in @where_test.
    @where_test   directory holding the test-gene annotation pickle.
    @add_name     suffix for the prediction pickle; 'none' means empty.
    @do_split     1 means annotations are ';'- or ','-separated strings
                  (metaGO data); otherwise they are already lists.
    """
    if add_name == 'none':
        add_name = ""
    os.chdir('/u/flashscratch/d/datduong/goAndGeneAnnotationDec2018/')
    # Context managers close the pickle handles instead of leaking them.
    with open(where_test + "/" + set_type + "_gene_annot.pickle", "rb") as handle:
        test_gene_annot = pickle.load(handle)
    print('num of gene to be tested {}'.format(len(test_gene_annot)))
    print('\n\nmust use the prot names in the annot, not psiblast outcome\n\n')
    genes = list(test_gene_annot.keys())
    genes.sort()  ## alphabet
    with open(where_train + "/seq_seq_predict_go_" + add_name + ".pickle", "rb") as handle:
        prediction = pickle.load(handle)
    ## for each gene, fill in the prediction matrix
    with open(where_train + "/label_index_map.pickle", "rb") as handle:
        label_index_map = pickle.load(handle)
    prediction_np = np.zeros((len(genes), len(label_index_map)))
    # enumerate() gives the row number directly; calling genes.index(g)
    # inside the loop made the fill O(n^2).
    for row, g in enumerate(genes):
        if g not in prediction:
            continue
        go_assign = sorted(prediction[g].keys())
        score = [prediction[g][go] for go in go_assign]
        location = [label_index_map[go] for go in go_assign]
        ## assign the score
        prediction_np[row, location] = score
    ## convert np into pd to get row names
    df = pd.DataFrame(prediction_np, index=genes)
    with open(where_train + "/seq_seq_predict_go_" + add_name + ".pd.pickle", "wb") as handle:
        pickle.dump(df, handle)
    ## filter out to only go terms in training set
    truth_np = np.zeros((len(genes), len(label_index_map)))
    for row, g in enumerate(genes):
        if do_split == 1:
            # metaGO annotation strings are ';' or ',' separated.
            if ";" in test_gene_annot[g][0]:
                go_assign = test_gene_annot[g][0].strip().split(";")
            else:
                go_assign = test_gene_annot[g][0].strip().split(",")
        else:
            go_assign = test_gene_annot[g]
        # go_assign.sort()
        go_assign = [re.sub("GO:", "", go) for go in go_assign]
        ## record only GO we saw in training
        location = [label_index_map[go] for go in go_assign if go in label_index_map]
        ## assign the score
        truth_np[row, location] = 1
    print('animo GO prediction')
    print(prediction_np)
    track_prec = []
    track_rec = []
    for k in [5, 10, 15, 20, 25, 30, 35, 40]:
        animo_go_metric = evaluation_metric.all_metrics(
            np.round(prediction_np), truth_np, yhat_raw=prediction_np, k=k)  ## [ 0:(16*3) , :]
        if k == 5:
            evaluation_metric.print_metrics(animo_go_metric)
        track_prec.append(animo_go_metric['prec_at_' + str(k)])
        track_rec.append(animo_go_metric['rec_at_' + str(k)])
    #
    fmax_val = fmax.f_max(truth_np, prediction_np, threshold=np.arange(0, 1, .02))
    print('fmax value {}'.format(fmax_val))
    print('precision/recall at K')
    print(track_prec)
    print(track_rec)
    with open(where_train + '/label_bio_type.pickle', 'rb') as handle:
        label_bio_type = pickle.load(handle)
    # common30 = pickle.load ( open(where_train+"/common_index30.pickle","rb"))
    # label_bio_type['common30'] = common30
    for bio_type in label_bio_type:
        index = label_bio_type[bio_type]
        print("\n\n" + bio_type)
        print(index[0:10])
        track_prec = []
        track_rec = []
        for k in [5, 10, 15, 20, 25, 30, 35, 40]:
            animo_go_metric = evaluation_metric.all_metrics(
                np.round(prediction_np[:, index]), truth_np[:, index],
                yhat_raw=prediction_np[:, index], k=k)
            if k == 5:
                evaluation_metric.print_metrics(animo_go_metric)
            track_prec.append(animo_go_metric['prec_at_' + str(k)])
            track_rec.append(animo_go_metric['rec_at_' + str(k)])
        fmax_val = fmax.f_max(truth_np[:, index], prediction_np[:, index], threshold=np.arange(0, 1, .02))
        print('fmax value {}'.format(fmax_val))
        print('precision/recall at K')
        print(track_prec)
        print(track_rec)
def submitJobs(main_dir, data_dir, blast_result_dir, what_set, ontology_type, all_test_label, add_name='none'):
    """Predict GO terms for test proteins by transferring annotations from
    their BLAST and PSI-BLAST hits in the training set, then score the result.

    @main_dir         directory to chdir into.
    @data_dir         directory with 'train-<onto>.tsv' / 'test-<onto>.tsv'.
    @blast_result_dir directory with '<what_set>-<onto>.blast.txt' and
                      '.psiblast.txt' hit tables; the prediction pickle is
                      written back here.
    @what_set         data-split name used in the blast result file names.
    @ontology_type    ontology short name used in file names.
    @all_test_label   file of all labels to be tested (one per row, col 0),
                      so the output is a num_obs x num_label matrix.
    @add_name         extra file-name suffix; 'none' means empty.
    """
    if add_name == 'none':
        add_name = ""
    #### blast and psi-blast will have the same format.
    ## @all_test_label is file of all labels to be tested, adding this so that we return a matrix num_ob x num_label
    os.chdir(main_dir)
    ## labels to be tested
    all_test_label = pd.read_csv(all_test_label, header=None)
    print('\nsort labels to be tested, we do the same when using NN model.')
    all_test_label = sorted(list(all_test_label[0]))
    # Maps label -> column index in the prediction matrix.
    label_lookup = {value: index for index, value in enumerate(all_test_label)}
    ## prot annotation train set, will be used later to infer assignment in testset
    ## can only predict what is found in train set if we use blast
    print('load go annotation for train data')
    ## we can convert text into dict on-the-fly
    # try:
    #   prot_annot = pickle.load ( open (data_dir+'train-'+ontology_type+'.TrueLabel.pickle','rb') )
    # except:
    # train-mf.tsv
    prot_annot, prot_name_train = MakeGroundTruthText2Dict(data_dir + 'train-' + ontology_type + add_name + '.tsv')
    print('\nnum of prots in train data {}\n'.format(len(prot_annot)))
    print('load go annotation for test data')
    ## COMMENT get true labels
    ## COMMENT 'test-'+ontology_type+'.tsv' has different ordering than 'test-'+ontology_type+'-input.tsv'
    print('test file name {}'.format(data_dir + 'test-' + ontology_type + add_name + '.tsv'))
    ##!!##!!
    ground_truth, prot_name_test = load_true_data(data_dir + 'test-' + ontology_type + add_name + '.tsv', label_lookup)
    ##!!##!!
    print('\nnum of prots in test data {}\n'.format(len(prot_name_test)))
    print('\nread psiblast result')
    df_psiblast = pd.read_csv(blast_result_dir + what_set + "-" + ontology_type + ".psiblast.txt", header=None, skip_blank_lines=True)
    df_psiblast = df_psiblast.dropna()
    df_psiblast = df_psiblast.reset_index(drop=True)
    prot_name_in_psi = sorted(list(set(list(df_psiblast[0]))))
    print('\nnum of prots from test found in psiblast {}, we may be unable to find match for all test sequence\n'.format(len(prot_name_in_psi)))
    print('\nread blast result')
    df_blast = pd.read_csv(blast_result_dir + what_set + "-" + ontology_type + ".blast.txt", header=None, skip_blank_lines=True)
    ## should make prediction as a matrix
    # prediction = {}
    prediction = np.zeros([len(prot_name_test), len(label_lookup)])
    # Fast membership sets: column 0 is the query protein name in both tables.
    in_psi = set(df_psiblast[0])
    in_blast = set(df_blast[0])
    for index, this_prot in tqdm(enumerate(prot_name_test)):
        if (this_prot not in in_psi) and (this_prot not in in_blast):
            # Row stays all-zero for proteins with no hit in either table.
            print('not found in both blast and psiblast {}'.format(this_prot))
            continue
        df_psiblast_g = df_psiblast[df_psiblast[0] == this_prot]
        df_psiblast_g = df_psiblast_g[df_psiblast_g[1] != this_prot]  ## don't compare to self
        df_blast_g = df_blast[df_blast[0] == this_prot]
        df_blast_g = df_blast_g[df_blast_g[1] != this_prot]  ## don't compare to self
        # Each tally returns ({go: score}, weight); exact semantics of the
        # weight are defined in tally_over_n_template (project helper).
        psiblast_go_score_array, w_psiblast = tally_over_n_template(df_psiblast_g, prot_annot)
        blast_go_score_array, _ = tally_over_n_template(df_blast_g, prot_annot)
        final_score = {}
        psiblast_go = list(psiblast_go_score_array.keys())
        blast_go = list(blast_go_score_array.keys())
        go_found = set(psiblast_go + blast_go)
        if len(go_found) == 0:  ## funky stuffs ??
            # NOTE(review): this stores None under the protein name in a dict
            # that is discarded by the `continue` below — appears to be dead code.
            print('pass 1st screen in blast+psiblast but not found any go term ?? {}'.format(this_prot))
            final_score[this_prot] = None
            continue
        for g in go_found:
            ## average between psiblast and blast
            # NOTE(review): the psiblast score is weighted by (1 - w_psiblast)
            # and the blast score by w_psiblast — confirm this isn't inverted.
            if (g in psiblast_go_score_array) and (g in blast_go_score_array):
                x1 = psiblast_go_score_array[g] * (1 - w_psiblast) + blast_go_score_array[g] * (w_psiblast)
            if (g in psiblast_go_score_array) and (g not in blast_go_score_array):
                x1 = psiblast_go_score_array[g]
            if (g not in psiblast_go_score_array) and (g in blast_go_score_array):
                x1 = blast_go_score_array[g]
            final_score[g] = x1  ## each GO term has a score for this one protein
        ## done with this one protein
        prediction[index] = order_go_score(final_score, label_lookup)
        ## filter down original set so things run faster
        # df[~df.countries.isin(countries)]
        # df_psiblast = df_psiblast[ ~df_psiblast[0].isin([this_prot]) ]
        # df_blast = df_blast[ ~df_blast[0].isin([this_prot]) ]
        # if index > 10:
        #   print (prediction[0:10])
        #   exit()
    ## finish all proteins
    pickle.dump({'prediction': prediction, 'true_label': ground_truth}, open(blast_result_dir + what_set + "-" + ontology_type + "-prediction.pickle", "wb"))
    result = evaluation_metric.all_metrics(np.round(prediction), ground_truth, yhat_raw=prediction, k=[5, 10, 15, 20, 25])  ## we can pass vector of P@k and R@k
    evaluation_metric.print_metrics(result)