def main(go_file, terms_file, train_data_file, test_data_file, ont):
    go = Ontology(go_file, with_rels=True)
    logging.info('GO loaded')
    go_set = go.get_namespace_terms(NAMESPACES[ont])

    terms_df = pd.read_pickle(terms_file)
    tcnt = 0
    print('Total terms', len(terms_df))
    for go_id in terms_df['terms']:
        if go_id in go_set:
            tcnt += 1

    trdf = pd.read_pickle(train_data_file)
    print('Total train', len(trdf))
    cnt = 0
    for i, row in trdf.iterrows():
        ok = False
        for go_id in row['annotations']:
            if go_id in go_set:
                ok = True
                break
        if ok:
            cnt += 1
    print('Number of training proteins', cnt)

    tsdf = pd.read_pickle(test_data_file)
    print('Total test', len(tsdf))
    cnt = 0
    for i, row in tsdf.iterrows():
        ok = False
        for go_id in row['annotations']:
            if go_id in go_set:
                ok = True
                break
        if ok:
            cnt += 1
    print('Number of testing proteins', cnt)
    print('Number of terms', tcnt)
def main(train_data_file, test_data_file, ont):
    go_rels = Ontology('data/go.obo', with_rels=True)
    # terms_df = pd.read_pickle('data-deepgo/' + ont + '.pkl')
    # terms = terms_df['functions'].values.flatten()
    # terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = pd.read_pickle(test_data_file)
    test_annotations = test_df['annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))

    go_rels.calculate_ic(annotations + test_annotations)
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])

    annotations = list(
        map(lambda x: set(filter(lambda y: y in go_set, x)), annotations))

    # Naive baseline: score each term by its relative annotation frequency in the training set
    cnt = Counter()
    max_n = 0
    for x in annotations:
        cnt.update(x)
    print(cnt.most_common(10))
    max_n = cnt.most_common(1)[0][1]
    print(max_n)
    scores = {}
    for go_id, n in cnt.items():
        score = n / max_n
        scores[go_id] = score  # frequency-based score; an IC-weighted score could be used instead

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))

    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for threshold in np.arange(0.005, .5, .01):
        preds = []
        annots = set()
        for go_id, score in scores.items():
            if score >= threshold:
                annots.add(go_id)
        # new_annots = set()
        # for go_id in annots:
        #     new_annots |= go_rels.get_anchestors(go_id)
        # new_annots = set(filter(lambda y: y in go_set, new_annots))
        for i, row in enumerate(test_df.itertuples()):
            preds.append(annots.copy())
        fscore, prec, rec, s = evaluate_annotations(go_rels, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')

    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.3f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig('aupr.pdf')
    plt.show()
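# The evaluation loops above and below call evaluate_annotations(), which is not shown in
# this section. The following is a minimal, hypothetical sketch of what it is assumed to
# compute (CAFA-style protein-centric precision/recall plus the semantic distance S built
# from remaining uncertainty and misinformation). The repository's own implementation may
# differ; the extended variant used further below additionally returns ru, mi and the
# per-protein false-positive/false-negative sets.
import math


def evaluate_annotations(go, real_annots, pred_annots):
    total = 0
    p = 0.0
    r = 0.0
    p_total = 0
    ru = 0.0
    mi = 0.0
    for real, pred in zip(real_annots, pred_annots):
        tp = real.intersection(pred)
        fp = pred - tp
        fn = real - tp
        # Remaining uncertainty (missed terms) and misinformation (wrong terms),
        # both weighted by information content
        ru += sum(go.get_ic(go_id) for go_id in fn)
        mi += sum(go.get_ic(go_id) for go_id in fp)
        total += 1
        if len(real) > 0:
            r += len(tp) / (len(tp) + len(fn))
        if len(pred) > 0:
            p_total += 1
            p += len(tp) / (len(tp) + len(fp))
    ru /= total
    mi /= total
    r /= total
    p = p / p_total if p_total > 0 else 0.0
    f = 2 * p * r / (p + r) if p + r > 0 else 0.0
    s = math.sqrt(ru * ru + mi * mi)
    return f, p, r, s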
def main(train_data_file, test_data_file, diamond_scores_file, ont):
    go_rels = Ontology('data/go.obo', with_rels=True)

    train_df = pd.read_pickle(train_data_file)
    annotations = train_df['prop_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = pd.read_pickle(test_data_file)
    test_annotations = test_df['prop_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))

    go_rels.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond): query protein -> {hit protein -> bitscore}
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN: transfer GO terms from similar training proteins,
        # weighting each term by the normalized bitscores of the hits that carry it
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))

    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for t in range(101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in blast_preds[i].items():
                if score >= threshold:
                    annots.add(go_id)
            new_annots = set()
            for go_id in annots:
                new_annots |= go_rels.get_anchestors(go_id)
            preds.append(new_annots)
        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
        fscore, prec, rec, s = evaluate_annotations(go_rels, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')

    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.3f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig('aupr.pdf')
    plt.show()
def main(model_file, terms_file, annotations_file):
    go_rels = Ontology('data/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()

    df = pd.read_pickle(annotations_file)
    annotations = df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    go_rels.calculate_ic(annotations)

    # df = df[df['orgs'] == '559292']
    sl = 0
    annotations = df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    prot_ids = df['proteins'].values
    ids, data = get_data(df['sequences'])

    # Load CNN model
    model = load_model(model_file)
    preds = model.predict(data, batch_size=100, verbose=1)
    assert preds.shape[1] == len(terms)

    mf_set = go_rels.get_namespace_terms(NAMESPACES['mf'])
    # terms = ['GO:0008047']
    for l in range(len(terms)):
        # if terms[l] not in mf_set:
        #     continue
        deep_preds = {}
        for i, j in enumerate(ids):
            prot_id = prot_ids[j]
            if prot_id not in deep_preds:
                deep_preds[prot_id] = {}
            if preds[i, l] >= 0.01:  # Filter out very low scores
                if terms[l] not in deep_preds[prot_id]:
                    deep_preds[prot_id][terms[l]] = preds[i, l]
                else:
                    deep_preds[prot_id][terms[l]] = max(
                        deep_preds[prot_id][terms[l]], preds[i, l])
        go_set = set([terms[l]])
        # go_set.remove(FUNC_DICT['mf'])
        labels = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), annotations))
        bin_labels = list(map(lambda x: len(x), labels))
        pos_cnt = sum(bin_labels)

        fmax = 0.0
        tmax = 0.0
        smin = 1000
        for t in range(0, 100):
            threshold = t / 100.0
            predictions = []
            for i, row in enumerate(df.itertuples()):
                annots_dict = deep_preds[row.proteins] or {}
                annots = set()
                for go_id, score in annots_dict.items():
                    if score >= threshold:
                        annots.add(go_id)
                # new_annots = set()
                # for go_id in annots:
                #     new_annots |= go_rels.get_anchestors(go_id)
                predictions.append(annots)
            # Filter classes
            predictions = list(
                map(lambda x: set(filter(lambda y: y in go_set, x)),
                    predictions))
            fscore, prec, rec, s = evaluate_annotations(
                go_rels, labels, predictions)
            # print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
            if fmax < fscore:
                fmax = fscore
                tmax = threshold
            if smin > s:
                smin = s
        print(
            f'{terms[l]} {pos_cnt} Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
        )
def main(train_data_file, test_data_file, terms_file, diamond_scores_file,
         ont, alpha):
    alpha /= 100.0
    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    go_rels.calculate_ic(annotations + test_annotations)

    # Print IC values of terms
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            ind = np.argsort(-sim)
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    # DeepGOPlus
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_df['annotations'].values
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    # print(len(go_set))

    deep_preds = []
    alphas = {
        NAMESPACES['mf']: 0.55,
        NAMESPACES['bp']: 0.59,
        NAMESPACES['cc']: 0.46
    }
    for i, row in enumerate(test_df.itertuples()):
        annots_dict = blast_preds[i].copy()
        for go_id in annots_dict:
            annots_dict[go_id] *= alphas[go_rels.get_namespace(go_id)]
        for j, score in enumerate(row.preds):
            go_id = terms[j]
            score *= 1 - alphas[go_rels.get_namespace(go_id)]
            if go_id in annots_dict:
                annots_dict[go_id] += score
            else:
                annots_dict[go_id] = score
        deep_preds.append(annots_dict)

    print('AUTHOR DeepGOPlus')
    print('MODEL 1')
    print('KEYWORDS sequence alignment.')
    for i, row in enumerate(test_df.itertuples()):
        prot_id = row.proteins
        for go_id, score in deep_preds[i].items():
            print(f'{prot_id}\t{go_id}\t{score:.2f}')
    print('END')
    return
    # NOTE: the unconditional return above means the threshold sweep below is never executed.

    # Propagate scores
    # deepgo_preds = []
    # for annots_dict in deep_preds:
    #     annots = {}
    #     for go_id, score in annots_dict.items():
    #         for a_id in go_rels.get_anchestors(go_id):
    #             if a_id in annots:
    #                 annots[a_id] = max(annots[a_id], score)
    #             else:
    #                 annots[a_id] = score
    #     deepgo_preds.append(annots)

    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    rus = []
    mis = []
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in deep_preds[i].items():
                if score >= threshold:
                    annots.add(go_id)
            new_annots = set()
            for go_id in annots:
                new_annots |= go_rels.get_anchestors(go_id)
            preds.append(new_annots)
        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
        fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
            go_rels, labels, preds)
        avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
        avg_ic = sum(
            map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                fps)) / len(fps)
        print(f'{avg_fp} {avg_ic}')
        precisions.append(prec)
        recalls.append(rec)
        print(
            f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}'
        )
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')

    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')

    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
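# For reference, the score combination used above can be isolated into a small helper.
# This is an illustrative sketch only: combine_scores is a hypothetical name, and it
# assumes both input scores are already scaled to [0, 1].
def combine_scores(blast_score, cnn_score, alpha):
    # DeepGOPlus-style blend: alpha weights the Diamond/BlastKNN score,
    # (1 - alpha) weights the CNN score for the same GO term.
    return alpha * blast_score + (1 - alpha) * cnn_score

# Example with the MF weight of 0.55 used above:
# combine_scores(0.8, 0.3, 0.55) == 0.8 * 0.55 + 0.3 * 0.45 == 0.575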
def main(train_data_file, valid_data_file, terms_file, diamond_scores_file,
         ont):
    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    valid_df = pd.read_pickle(valid_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    valid_annotations = valid_df['annotations'].values
    valid_annotations = list(map(lambda x: set(x), valid_annotations))
    go_rels.calculate_ic(annotations + valid_annotations)

    # Print IC values of terms
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(valid_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            ind = np.argsort(-sim)
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    # DeepGOPlus
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = valid_df['annotations'].values
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))

    best_fmax = 0.0
    best_alpha = 0.0
    for alpha in range(44, 70):
        alpha /= 100.0
        deep_preds = []
        for i, row in enumerate(valid_df.itertuples()):
            annots_dict = blast_preds[i].copy()
            for go_id in annots_dict:
                annots_dict[go_id] *= alpha
            for j, score in enumerate(row.preds):
                go_id = terms[j]
                score *= 1 - alpha
                if go_id in annots_dict:
                    annots_dict[go_id] += score
                else:
                    annots_dict[go_id] = score
            deep_preds.append(annots_dict)

        fmax = 0.0
        tmax = 0.0
        precisions = []
        recalls = []
        smin = 1000000.0
        rus = []
        mis = []
        for t in range(14, 20):
            threshold = t / 100.0
            preds = []
            for i, row in enumerate(valid_df.itertuples()):
                annots = set()
                for go_id, score in deep_preds[i].items():
                    if score >= threshold:
                        annots.add(go_id)
                new_annots = set()
                for go_id in annots:
                    new_annots |= go_rels.get_anchestors(go_id)
                preds.append(new_annots)
            # Filter classes
            preds = list(
                map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
            fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
                go_rels, labels, preds)
            avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
            avg_ic = sum(
                map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                    fps)) / len(fps)
            print(
                f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}'
            )
            if fmax < fscore:
                fmax = fscore
                tmax = threshold
            if smin > s:
                smin = s
        if best_fmax < fmax:
            best_fmax = fmax
            best_alpha = alpha
        print(
            f'Alpha: {alpha} Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
        )
    print(f'{best_alpha} {best_fmax}')
def main(train_data_file, test_data_file, terms_file, diamond_scores_file,
         ont, alpha):
    alpha /= 100.0
    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    # terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))

    # Note: @annotations and @test_annotations are used to compute the IC scores,
    # so we must not pre-filter them by namespace here.
    go_rels.calculate_ic(annotations + test_annotations)

    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])  # all terms of the chosen namespace (MF, BP or CC)
    # Optionally filter @terms to the chosen namespace:
    # terms = [t for t in terms if t in go_set]
    # print('number of terms kept from terms_file {}'.format(len(terms)))

    # Compute and save the IC values of the terms
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)
    pickle.dump(ics, open("data-cafa/ICsValueTable.pickle", "wb"))

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond); the Diamond output can be reused as-is
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            ind = np.argsort(-sim)
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    # DeepGOPlus
    go_set.remove(FUNC_DICT[ont])
    labels = test_df['annotations'].values
    # Filter the true labels by @go_set
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print("total labels {}".format(len(go_set)))

    # Read the neural-network predictions; the BLAST scores are not mixed in here,
    # so only @deep_preds is used below
    deep_preds = []
    # alphas = {NAMESPACES['mf']: 0.55, NAMESPACES['bp']: 0.59, NAMESPACES['cc']: 0.46}
    for i, row in enumerate(test_df.itertuples()):
        annots_dict = {}
        # annots_dict = blast_preds[i].copy()  # copy the BLAST scores
        # for go_id in annots_dict:            # or zero them out
        #     annots_dict[go_id] = 0           # *= alphas[go_rels.get_namespace(go_id)]  # scale down the BLAST score
        for j, score in enumerate(row.preds):  # CNN predictions for @test_df
            go_id = terms[j]
            # if go_id not in go_set:  # faster label filter, since ancestors are not added anyway
            #     continue
            # score *= 1 - alphas[go_rels.get_namespace(go_id)]
            # if go_id in annots_dict:
            #     annots_dict[go_id] += score  # add to the BLAST score
            # else:
            annots_dict[go_id] = score  # use the CNN score on its own
        deep_preds.append(annots_dict)

    # CAFA-style output (disabled)
    # print('AUTHOR DeepGOPlus')
    # print('MODEL 1')
    # print('KEYWORDS sequence alignment.')
    # for i, row in enumerate(test_df.itertuples()):
    #     prot_id = row.proteins
    #     for go_id, score in deep_preds[i].items():
    #         print(f'{prot_id}\t{go_id}\t{score:.2f}')
    # print('END')
    # return

    # Propagate scores (disabled)
    # deepgo_preds = []
    # for annots_dict in deep_preds:
    #     annots = {}
    #     for go_id, score in annots_dict.items():
    #         for a_id in go_rels.get_anchestors(go_id):
    #             if a_id in annots:
    #                 annots[a_id] = max(annots[a_id], score)
    #             else:
    #                 annots[a_id] = score
    #     deepgo_preds.append(annots)

    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    rus = []
    mis = []
    print('\nontology {}\n'.format(ont))

    for threshold in np.arange(0.005, .4, .01):  # alternative: np.arange(0.005, 1, .01)
        print('\n')
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in deep_preds[i].items():
                if go_id not in go_set:  # faster label filter, since ancestors are not added anyway
                    continue
                if score >= threshold:
                    annots.add(go_id)
            preds.append(annots)
            # Optionally propagate to parent terms (disabled):
            # new_annots = set()
            # for go_id in annots:
            #     new_annots |= go_rels.get_anchestors(go_id)
            # preds.append(new_annots)
        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
        # print('see 1 prediction')
        # print(preds[10])
        # print('see 1 label')
        # print(labels[10])
        fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
            go_rels, labels, preds)
        avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
        avg_ic = sum(
            map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                fps)) / len(fps)
        print(f'{avg_fp} {avg_ic}')
        precisions.append(prec)
        recalls.append(rec)
        print(
            f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}'
        )
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'\nFmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')

    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')

    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
def main(train_data_file, preds_file, ont):
    go = Ontology('data/go.obo', with_rels=True)
    terms_df = pd.read_pickle('data-deepgo/' + ont + '.pkl')
    terms = terms_df['functions'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = pd.read_pickle(preds_file)
    # Annotations
    test_annotations = []
    for i, row in enumerate(test_df.itertuples()):
        annots = set()
        for go_id in row.gos:
            if go.has_term(go_id):
                annots |= go.get_anchestors(go_id)
        test_annotations.append(annots)
    go.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # DeepGO
    go_set = go.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))

    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for threshold in np.arange(0.005, 1, .01):
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for j, score in enumerate(row.predictions):
                if score >= threshold:
                    annots.add(terms[j])
            new_annots = set()
            for go_id in annots:
                new_annots |= go.get_anchestors(go_id)
            preds.append(new_annots)
        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
        fscore, prec, rec, s = evaluate_annotations(go, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')

    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.3f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig('aupr.pdf')
    plt.show()
def helper(train_df, test_df, ont):
    go = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle('data-cafa/' + ont + '.pkl')
    terms = terms_df['functions'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = train_df.rename(columns={"gos": "annotations"})
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = test_df.rename(columns={"gos": "annotations"})
    # Annotations
    test_annotations = []
    for i, row in enumerate(test_df.itertuples()):
        annots = set()
        for go_id in row.annotations:
            if go.has_term(go_id):
                annots |= go.get_anchestors(go_id)
        test_annotations.append(annots)
    go.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # DeepGO
    go_set = go.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))

    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for t in range(1, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for j, score in enumerate(row.predictions):
                if score >= threshold:
                    annots.add(terms[j])
            new_annots = set()
            for go_id in annots:
                new_annots |= go.get_anchestors(go_id)
            preds.append(new_annots)
        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
        fscore, prec, rec, s = evaluate_annotations(go, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print('Fscore: {}, S: {}, threshold: {}'.format(fscore, s, threshold))
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print('Fmax: {:0.3f}, Smin: {:0.3f}, threshold: {}'.format(
        fmax, smin, tmax))

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print('AUPR: {:0.3f}'.format(aupr))
    return [recalls, precisions, aupr]
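# A minimal sketch of how helper() might be driven to compare the three ontologies on a
# single plot. The data frames and the output file name are hypothetical; the original
# script presumably prepares train_df and test_df elsewhere.
import matplotlib.pyplot as plt


def plot_all_ontologies(train_df, test_df):
    plt.figure()
    for ont in ('mf', 'bp', 'cc'):
        recalls, precisions, aupr = helper(train_df, test_df, ont)
        plt.plot(recalls, precisions, lw=2,
                 label=f'{ont.upper()} (AUPR = {aupr:0.3f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curves per ontology')
    plt.legend(loc='lower right')
    plt.savefig('aupr_all.pdf')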