def main(go_file, uniprot_file, filter_exp, prop_annots, cafa_targets, out_file): go = Ontology(go_file, with_rels=True) proteins, accessions, sequences, annotations, interpros, orgs = load_data( uniprot_file) df = pd.DataFrame({ 'proteins': proteins, 'accessions': accessions, 'sequences': sequences, 'annotations': annotations, 'interpros': interpros, 'orgs': orgs }) if filter_exp: logging.info('Filtering proteins with experimental annotations') index = [] annotations = [] for i, row in enumerate(df.itertuples()): annots = [] for annot in row.annotations: go_id, code = annot.split('|') if is_exp_code(code): annots.append(go_id) # Ignore proteins without experimental annotations if len(annots) == 0: continue index.append(i) annotations.append(annots) df = df.iloc[index] df = df.reset_index() df['annotations'] = annotations if cafa_targets: logging.info('Filtering cafa target proteins') index = [] for i, row in enumerate(df.itertuples()): if is_cafa_target(row.orgs): index.append(i) df = df.iloc[index] df = df.reset_index() if prop_annots: prop_annotations = [] for i, row in df.iterrows(): # Propagate annotations annot_set = set() annots = row['annotations'] for go_id in annots: go_id = go_id.split('|')[0] # In case if it has code annot_set |= go.get_anchestors(go_id) annots = list(annot_set) prop_annotations.append(annots) df['annotations'] = prop_annotations df.to_pickle(out_file) logging.info('Successfully saved %d proteins' % (len(df), ))
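# Both loaders above and below keep only experimentally supported GO annotations
# via is_exp_code(), and optionally restrict to CAFA target organisms via
# is_cafa_target(). Minimal sketches follow, assuming the usual CAFA-style
# experimental evidence codes and NCBI taxon ids; the exact sets used by the
# original scripts may differ (the taxon list below is an illustrative subset).
EXP_CODES = set(['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'IC'])
CAFA_TARGETS = set(['9606', '10090', '10116', '3702', '7227', '7955',
                    '559292', '284812', '83333', '44689'])

def is_exp_code(code):
    # True for evidence codes indicating experimental support
    return code in EXP_CODES

def is_cafa_target(org):
    # True if the NCBI taxon id belongs to a CAFA target organism
    return org in CAFA_TARGETS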
def main(go_file, swissprot_file, out_file): go = Ontology(go_file, with_rels=True) if swissprot_file.endswith("gz"): proteins, accessions, sequences, annotations, interpros, orgs = load_data_gzip( swissprot_file) else: proteins, accessions, sequences, annotations, interpros, orgs = load_data( swissprot_file) df = pd.DataFrame({ 'proteins': proteins, 'accessions': accessions, 'sequences': sequences, 'annotations': annotations, 'interpros': interpros, 'orgs': orgs }) logging.info('Filtering proteins with experimental annotations') index = [] annotations = [] for i, row in enumerate(df.itertuples()): annots = [] for annot in row.annotations: go_id, code = annot.split('|') if is_exp_code(code): annots.append(go_id) # Ignore proteins without experimental annotations if len(annots) == 0: continue index.append(i) annotations.append(annots) df = df.iloc[index] df = df.reset_index() df['exp_annotations'] = annotations prop_annotations = [] for i, row in df.iterrows(): # Propagate annotations annot_set = set() annots = row['exp_annotations'] for go_id in annots: annot_set |= go.get_anchestors(go_id) annots = list(annot_set) prop_annotations.append(annots) df['prop_annotations'] = prop_annotations cafa_target = [] for i, row in enumerate(df.itertuples()): if is_cafa_target(row.orgs): cafa_target.append(True) else: cafa_target.append(False) df['cafa_target'] = cafa_target df.to_pickle(out_file) logging.info('Successfully saved %d proteins' % (len(df), ))
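# load_data() (and the gzip-aware load_data_gzip() used above) is assumed to be
# a minimal UniProt/SwissProt flat-file parser returning parallel lists of
# protein ids, accessions, sequences, 'GO_ID|EVIDENCE_CODE' annotations,
# InterPro ids and NCBI taxon ids. The sketch below reflects that assumption,
# not the original implementation; parse_swissprot is a hypothetical helper
# name, and the variant used by the human-specific script additionally returns
# gene ids and gene names.
import gzip

def load_data(swissprot_file):
    with open(swissprot_file, 'rt') as f:
        return parse_swissprot(f)

def load_data_gzip(swissprot_file):
    # same parser, reading through gzip
    with gzip.open(swissprot_file, 'rt') as f:
        return parse_swissprot(f)

def parse_swissprot(f):
    proteins, accessions, sequences = [], [], []
    annotations, interpros, orgs = [], [], []
    prot_id, accs, seq, org = '', '', '', ''
    annots, iprs = [], []
    reading_seq = False
    for line in f:
        if line.startswith('ID   '):
            prot_id, accs, seq, org = line.split()[1], '', '', ''
            annots, iprs, reading_seq = [], [], False
        elif line.startswith('AC   ') and not accs:
            accs = line.split()[1]
        elif line.startswith('OX   NCBI_TaxID='):
            org = line[len('OX   NCBI_TaxID='):].strip().rstrip(';').split(' ')[0]
        elif line.startswith('DR   GO;'):
            it = line[5:].strip().split('; ')
            annots.append(it[1] + '|' + it[3].split(':')[0])  # 'GO:XXXXXXX|EVIDENCE'
        elif line.startswith('DR   InterPro;'):
            iprs.append(line[5:].strip().split('; ')[1])
        elif line.startswith('SQ   '):
            reading_seq = True
        elif line.startswith('//'):
            proteins.append(prot_id)
            accessions.append(accs)
            sequences.append(seq)
            annotations.append(annots)
            interpros.append(iprs)
            orgs.append(org)
            reading_seq = False
        elif reading_seq:
            seq += line.strip().replace(' ', '')
    return proteins, accessions, sequences, annotations, interpros, orgs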
def main(go_file, out_file):
    go = Ontology(go_file, with_rels=False)
    # Top-level classes of each GO sub-ontology
    cc = get_top_classes(go, FUNC_DICT['cc'])
    mf = get_top_classes(go, FUNC_DICT['mf'])
    bp = get_top_classes(go, FUNC_DICT['bp'])
    # Convert GO ids to OBO PURL IRIs
    cc = list(map(lambda x: f'<http://purl.obolibrary.org/obo/{x.replace(":","_")}>', cc))
    mf = list(map(lambda x: f'<http://purl.obolibrary.org/obo/{x.replace(":","_")}>', mf))
    bp = list(map(lambda x: f'<http://purl.obolibrary.org/obo/{x.replace(":","_")}>', bp))
    # Write every cross-namespace pair in both directions.
    # Note: the cc-mf pairs are re-written on every pass of the innermost loop,
    # so the output contains duplicate lines.
    f = open(out_file, 'w')
    for id1 in cc:
        for id2 in mf:
            for id3 in bp:
                f.write(id1 + '\t' + id2 + '\n')
                f.write(id2 + '\t' + id1 + '\n')
                f.write(id2 + '\t' + id3 + '\n')
                f.write(id3 + '\t' + id2 + '\n')
                f.write(id1 + '\t' + id3 + '\n')
                f.write(id3 + '\t' + id1 + '\n')
    f.close()
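# get_top_classes() is not defined in these scripts. A plausible reading, taken
# as an assumption for this sketch, is "the most general classes directly below
# a namespace root" (e.g. the direct children of FUNC_DICT['mf']); the original
# helper may select classes differently.
def get_top_classes(go, root_id):
    top = []
    for term_id in go.get_term_set(root_id):
        term = go.get_term(term_id)
        if term and root_id in term.get('is_a', []):
            top.append(term_id)
    return top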
def main(in_file, hp_file, terms_file, out_file, map_file):
    # Load the HPO and the list of prediction terms
    hp = Ontology(hp_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    df = pd.read_pickle(in_file)
    # Optional gene-id mapping (second column -> first column)
    mapping = {}
    with open(map_file) as f:
        for line in f:
            it = line.strip().split()
            mapping[it[1]] = it[0]
    w = open(out_file, 'w')
    # w.write('AUTHOR Hoehndorf Lab - DeepGO team\n')
    # w.write('MODEL 1\n')
    # w.write('KEYWORDS machine learning, sequence alignment.\n')
    for row in df.itertuples():
        prot_id = row.genes
        # if prot_id not in mapping:
        #     continue
        # prot_id = mapping[prot_id]
        for i, score in enumerate(row.preds):
            if score >= 0.1:
                w.write(prot_id + '\t' + terms[i] + '\t%.2f\n' % score)
    # w.write('END\n')
    w.close()
def main(go_file, uniprot_file, filter_exp, prop_annots, out_file): go = Ontology(go_file, with_rels=True) proteins, accessions, sequences, annotations, interpros, orgs, genes, gene_names = load_data(uniprot_file) df = pd.DataFrame({ 'proteins': proteins, 'accessions': accessions, 'sequences': sequences, 'annotations': annotations, 'interpros': interpros, 'orgs': orgs, 'genes': genes, 'gene_names': gene_names }) # Filter proteins df = df[df['orgs'] == '9606'] logging.info('Filtering proteins with experimental annotations') index = [] annotations = [] iea_annotations = [] for i, row in enumerate(df.itertuples()): annots = set() iea_annots = set() for annot in row.annotations: go_id, code = annot.split('|') anch_set = go.get_anchestors(go_id) if is_exp_code(code): annots |= anch_set iea_annots |= anch_set annots = list(annots) iea_annots = list(iea_annots) annotations.append(annots) iea_annotations.append(iea_annots) df['exp_annotations'] = annotations df['iea_annotations'] = iea_annotations df.to_pickle(out_file) logging.info('Successfully saved %d proteins' % (len(df),) )
def main(go_file, old_data_file, new_data_file, out_terms_file, train_data_file, test_data_file, min_count): go = Ontology(go_file, with_rels=True) logging.info('GO loaded') df = pd.read_pickle(old_data_file) logging.info('Processing annotations') cnt = Counter() annotations = list() for i, row in df.iterrows(): for term in row['annotations']: cnt[term] += 1 train_prots = set() for row in df.itertuples(): p_id = row.proteins train_prots.add(p_id) df.to_pickle(train_data_file) # Filter terms with annotations more than min_count res = {} for key, val in cnt.items(): if val >= min_count: ont = key.split(':')[0] if ont not in res: res[ont] = [] res[ont].append(key) terms = [] for key, val in res.items(): print(key, len(val)) terms += val logging.info(f'Number of terms {len(terms)}') # Save the list of terms df = pd.DataFrame({'terms': terms}) df.to_pickle(out_terms_file) # Save testing data df = pd.read_pickle(new_data_file) index = [] for i, row in enumerate(df.itertuples()): p_id = row.proteins if p_id not in train_prots: index.append(i) df = df.iloc[index] print('Number of test proteins', len(df)) df.to_pickle(test_data_file)
def main(go_file, terms_file, train_data_file, test_data_file, ont): go = Ontology(go_file, with_rels=True) logging.info('GO loaded') go_set = go.get_namespace_terms(NAMESPACES[ont]) terms_df = pd.read_pickle(terms_file) tcnt = 0 print('Total terms', len(terms_df)) for go_id in terms_df['terms']: if go_id in go_set: tcnt += 1 trdf = pd.read_pickle(train_data_file) print('Total train', len(trdf)) cnt = 0 for i, row in trdf.iterrows(): ok = False for go_id in row['annotations']: if go_id in go_set: ok = True break if ok: cnt += 1 print('Number of training proteins', cnt) tsdf = pd.read_pickle(test_data_file) print('Total test', len(tsdf)) cnt = 0 for i, row in tsdf.iterrows(): ok = False for go_id in row['annotations']: if go_id in go_set: ok = True break if ok: cnt += 1 print('Number of testing proteins', cnt) print('Number of terms', tcnt)
def main(go_file, data_file, out_terms_file, train_data_file, test_data_file, min_count): go = Ontology(go_file, with_rels=True) logging.info('GO loaded') df = pd.read_pickle(data_file) print("DATA FILE", len(df)) logging.info('Processing annotations') cnt = Counter() annotations = list() for i, row in df.iterrows(): for term in row['prop_annotations']: cnt[term] += 1 # Filter terms with annotations more than min_count res = {} for key, val in cnt.items(): if val >= min_count: ont = key.split(':')[0] if ont not in res: res[ont] = [] res[ont].append(key) terms = [] for key, val in res.items(): print(key, len(val)) terms += val logging.info(f'Number of terms {len(terms)}') # Save the list of terms terms_df = pd.DataFrame({'terms': terms}) terms_df.to_pickle(out_terms_file) n = len(df) # Split train/valid index = np.arange(n) train_n = int(n * 0.95) np.random.seed(seed=0) np.random.shuffle(index) train_df = df.iloc[index[:train_n]] test_df = df.iloc[index[train_n:]] print('Number of train proteins', len(train_df)) train_df.to_pickle(train_data_file) print('Number of test proteins', len(test_df)) test_df.to_pickle(test_data_file)
def main(go_file, hp_file, terms_file, preds_file, gene): go = Ontology(go_file, with_rels=True) print('GO loaded') hp = Ontology(hp_file, with_rels=True) print('HP loaded') terms_df = pd.read_pickle(terms_file) global terms terms = terms_df['terms'].values.flatten() labels = terms_df['labels'].values.flatten() print('Phenotypes', len(terms)) global term_set term_set = set(terms) terms_dict = {v: i for i, v in enumerate(terms)} df = pd.read_pickle(preds_file) row = df.loc[df['genes'] == gene] with open(f'data/{gene}.deepgo_annotations.txt', 'w') as f: dg = [x.split('|') for x in row['deepgo_annotations'].values[0]] dg = sorted(dg, key=lambda x: float(x[1]), reverse=True) for go_id, score in dg: name = go.get_term(go_id)['name'] f.write(f'{go_id}\t{name}\t{score}\n') with open(f'data/{gene}.go_annotations.txt', 'w') as f: dg = [x for x in row['go_annotations'].values[0]] for go_id in dg: name = go.get_term(go_id)['name'] f.write(f'{go_id}\t{name}\n') with open(f'data/{gene}.deeppheno_annotations.txt', 'w') as f: dp = [(terms[i], score) for i, score in enumerate(row['preds'].values[0])] dp = sorted(dp, key=lambda x: x[1], reverse=True) for hp_id, score in dp: name = hp.get_term(hp_id)['name'] f.write(f'{hp_id}\t{name}\t{score}\n') if score < 0.01: break
def main(train_data_file, test_data_file, diamond_scores_file, ont): go_rels = Ontology('data/go.obo', with_rels=True) train_df = pd.read_pickle(train_data_file) annotations = train_df['prop_annotations'].values annotations = list(map(lambda x: set(x), annotations)) test_df = pd.read_pickle(test_data_file) test_annotations = test_df['prop_annotations'].values test_annotations = list(map(lambda x: set(x), test_annotations)) go_rels.calculate_ic(annotations + test_annotations) prot_index = {} for i, row in enumerate(train_df.itertuples()): prot_index[row.proteins] = i # BLAST Similarity (Diamond) diamond_scores = {} with open(diamond_scores_file) as f: for line in f: it = line.strip().split() if it[0] not in diamond_scores: diamond_scores[it[0]] = {} diamond_scores[it[0]][it[1]] = float(it[2]) blast_preds = [] for i, row in enumerate(test_df.itertuples()): annots = {} prot_id = row.proteins # BlastKNN if prot_id in diamond_scores: sim_prots = diamond_scores[prot_id] allgos = set() total_score = 0.0 for p_id, score in sim_prots.items(): allgos |= annotations[prot_index[p_id]] total_score += score allgos = list(sorted(allgos)) sim = np.zeros(len(allgos), dtype=np.float32) for j, go_id in enumerate(allgos): s = 0.0 for p_id, score in sim_prots.items(): if go_id in annotations[prot_index[p_id]]: s += score sim[j] = s / total_score for go_id, score in zip(allgos, sim): annots[go_id] = score blast_preds.append(annots) go_set = go_rels.get_namespace_terms(NAMESPACES[ont]) go_set.remove(FUNC_DICT[ont]) labels = test_annotations labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels)) print(len(go_set)) fmax = 0.0 tmax = 0.0 smin = 1000.0 precisions = [] recalls = [] for t in range(101): threshold = t / 100.0 preds = [] for i, row in enumerate(test_df.itertuples()): annots = set() for go_id, score in blast_preds[i].items(): if score >= threshold: annots.add(go_id) new_annots = set() for go_id in annots: new_annots |= go_rels.get_anchestors(go_id) preds.append(new_annots) # Filter classes preds = list( map(lambda x: set(filter(lambda y: y in go_set, x)), preds)) fscore, prec, rec, s = evaluate_annotations(go_rels, labels, preds) precisions.append(prec) recalls.append(rec) print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}') if fmax < fscore: fmax = fscore tmax = threshold if smin > s: smin = s print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}') precisions = np.array(precisions) recalls = np.array(recalls) sorted_index = np.argsort(recalls) recalls = recalls[sorted_index] precisions = precisions[sorted_index] aupr = np.trapz(precisions, recalls) print(f'AUPR: {aupr:0.3f}') plt.figure() lw = 2 plt.plot(recalls, precisions, color='darkorange', lw=lw, label=f'AUPR curve (area = {aupr:0.3f})') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('Recall') plt.ylabel('Precision') plt.title('Area Under the Precision-Recall curve') plt.legend(loc="lower right") plt.savefig('aupr.pdf') plt.show()
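# evaluate_annotations() drives the Fmax/Smin sweeps in this and the following
# scripts. A minimal sketch of the CAFA-style protein-centric evaluation is
# given below, assuming go.get_ic() returns the information content computed by
# calculate_ic(). Some scripts further down call an extended variant that also
# returns remaining uncertainty (ru), misinformation (mi) and the per-protein
# false-positive/false-negative sets; this sketch covers the 4-value form.
import math

def evaluate_annotations(go, real_annots, pred_annots):
    total = 0
    p = 0.0
    r = 0.0
    p_total = 0
    ru = 0.0
    mi = 0.0
    for i in range(len(real_annots)):
        if len(real_annots[i]) == 0:
            continue
        tp = real_annots[i].intersection(pred_annots[i])
        fp = pred_annots[i] - tp
        fn = real_annots[i] - tp
        for go_id in fp:
            mi += go.get_ic(go_id)   # misinformation: IC of false positives
        for go_id in fn:
            ru += go.get_ic(go_id)   # remaining uncertainty: IC of false negatives
        total += 1
        r += len(tp) / (len(tp) + len(fn))
        if len(pred_annots[i]) > 0:
            p_total += 1
            p += len(tp) / (len(tp) + len(fp))
    ru /= total
    mi /= total
    r /= total
    if p_total > 0:
        p /= p_total
    f = 2 * p * r / (p + r) if p + r > 0 else 0.0
    s = math.sqrt(ru * ru + mi * mi)
    return f, p, r, s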
def main(hp_file, data_file, terms_file, gos_file, out_file, fold, batch_size, epochs, load_model, logger_file, threshold, device, estimators): gos_df = pd.read_pickle(gos_file) gos = gos_df['gos'].values.flatten() gos_dict = {v: i for i, v in enumerate(gos)} # cross validation settings out_file = f'fold{fold}_exp-' + out_file params = {'n_estimators': estimators} print('Params:', params) global hpo hpo = Ontology(hp_file, with_rels=True) terms_df = pd.read_pickle(terms_file) global terms terms = terms_df['terms'].values.flatten() print('Phenotypes', len(terms)) global term_set term_set = set(terms) train_df, valid_df, test_df = load_data(data_file, terms, fold) terms_dict = {v: i for i, v in enumerate(terms)} nb_classes = len(terms) params['nb_classes'] = nb_classes print(len(terms_dict)) test_steps = int(math.ceil(len(test_df) / batch_size)) test_generator = DFGenerator(test_df, gos_dict, terms_dict, len(test_df)) valid_steps = int(math.ceil(len(valid_df) / batch_size)) train_steps = int(math.ceil(len(train_df) / batch_size)) xy_generator = DFGenerator(train_df, gos_dict, terms_dict, len(train_df)) x, y = xy_generator[0] val_generator = DFGenerator(valid_df, gos_dict, terms_dict, len(valid_df)) val_x, val_y = val_generator[0] test_x, test_y = test_generator[0] if load_model: logging.info(f'Loading RandomForest_{estimators} classifier') clf = load(f'data/rf_{estimators}.joblib') else: logging.info('Training RandomForest classifier') clf = RandomForestRegressor(n_estimators=params['n_estimators']) clf.fit(x, y) dump(clf, f'data/rf_{estimators}.joblib') logging.info('Evaluating model') val_preds = clf.predict(val_x) # val_accuracy = accuracy_score(val_preds, val_y) # print('Val accuracy', val_accuracy) preds = clf.predict(test_x) # test_accuracy = accuracy_score(preds, test_y) # print('Test accuracy', test_accuracy) all_terms_df = pd.read_pickle('data/all_terms.pkl') all_terms = all_terms_df['terms'].values all_terms_dict = {v: k for k, v in enumerate(all_terms)} all_labels = np.zeros((len(test_df), len(all_terms)), dtype=np.int32) for i, row in enumerate(test_df.itertuples()): for hp_id in row.hp_annotations: if hp_id in all_terms_dict: all_labels[i, all_terms_dict[hp_id]] = 1 all_preds = np.zeros((len(test_df), len(all_terms)), dtype=np.float32) for i in range(len(test_df)): for j in range(nb_classes): all_preds[i, all_terms_dict[terms[j]]] = preds[i, j] logging.info('Computing performance:') roc_auc = compute_roc(all_labels, all_preds) print('ROC AUC: %.2f' % (roc_auc, )) test_df['preds'] = list(preds) print(test_df) logging.info('Saving predictions') test_df.to_pickle(out_file)
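# compute_roc() above is assumed to be a micro-averaged ROC AUC over the
# flattened label/score arrays (it is also applied per-term elsewhere, where the
# inputs are already 1-D); sketch under that assumption.
import numpy as np
from sklearn.metrics import roc_curve, auc

def compute_roc(labels, preds):
    fpr, tpr, _ = roc_curve(np.asarray(labels).flatten(), np.asarray(preds).flatten())
    return auc(fpr, tpr)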
def main(model_file, terms_file, annotations_file): go_rels = Ontology('data/go.obo', with_rels=True) terms_df = pd.read_pickle(terms_file) terms = terms_df['terms'].values.flatten() df = pd.read_pickle(annotations_file) annotations = df['annotations'].values annotations = list(map(lambda x: set(x), annotations)) go_rels.calculate_ic(annotations) # df = df[df['orgs'] == '559292'] sl = 0 annotations = df['annotations'].values annotations = list(map(lambda x: set(x), annotations)) prot_ids = df['proteins'].values ids, data = get_data(df['sequences']) # Load CNN model model = load_model(model_file) preds = model.predict(data, batch_size=100, verbose=1) assert preds.shape[1] == len(terms) mf_set = go_rels.get_namespace_terms(NAMESPACES['mf']) # terms = ['GO:0008047'] for l in range(len(terms)): # if terms[l] not in mf_set: # continue deep_preds = {} for i, j in enumerate(ids): prot_id = prot_ids[j] if prot_id not in deep_preds: deep_preds[prot_id] = {} if preds[i, l] >= 0.01: # Filter out very low scores if terms[l] not in deep_preds[prot_id]: deep_preds[prot_id][terms[l]] = preds[i, l] else: deep_preds[prot_id][terms[l]] = max( deep_preds[prot_id][terms[l]], preds[i, l]) go_set = set([terms[l]]) # go_set.remove(FUNC_DICT['mf']) labels = list( map(lambda x: set(filter(lambda y: y in go_set, x)), annotations)) bin_labels = list(map(lambda x: len(x), labels)) pos_cnt = sum(bin_labels) fmax = 0.0 tmax = 0.0 smin = 1000 for t in range(0, 100): threshold = t / 100.0 predictions = [] for i, row in enumerate(df.itertuples()): annots_dict = deep_preds[row.proteins] or {} annots = set() for go_id, score in annots_dict.items(): if score >= threshold: annots.add(go_id) # new_annots = set() # for go_id in annots: # new_annots |= go_rels.get_anchestors(go_id) predictions.append(annots) # Filter classes predictions = list( map(lambda x: set(filter(lambda y: y in go_set, x)), predictions)) fscore, prec, rec, s = evaluate_annotations( go_rels, labels, predictions) # print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}') if fmax < fscore: fmax = fscore tmax = threshold if smin > s: smin = s print( f'{terms[l]} {pos_cnt} Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}' )
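# get_data() above is assumed to turn protein sequences into fixed-length
# one-hot matrices for the CNN, splitting sequences longer than MAXLEN into
# chunks and recording, for every chunk, the index of the protein it came from
# (so chunk-level scores can be max-pooled back to the protein, as done above).
# MAXLEN, the amino-acid alphabet and the chunking scheme are assumptions.
import numpy as np

MAXLEN = 2000
AALETTER = 'ARNDCQEGHILKMFPSTWYV'
AAINDEX = {aa: i + 1 for i, aa in enumerate(AALETTER)}

def to_onehot(seq):
    onehot = np.zeros((MAXLEN, len(AALETTER) + 1), dtype=np.float32)
    l = min(MAXLEN, len(seq))
    for i in range(l):
        onehot[i, AAINDEX.get(seq[i], 0)] = 1
    onehot[l:, 0] = 1  # padding channel
    return onehot

def get_data(sequences):
    ids, data = [], []
    for i, seq in enumerate(sequences):
        for start in range(0, max(len(seq), 1), MAXLEN):
            ids.append(i)
            data.append(to_onehot(seq[start:start + MAXLEN]))
    return np.array(ids), np.array(data, dtype=np.float32)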
def main(train_data_file, test_data_file, out_file, terms_file, root_class, fold): # Cross validation evaluation out_file = f'fold{fold}_' + out_file test_data_file = f'fold{fold}_' + test_data_file hp = Ontology('data/hp.obo', with_rels=True) terms_df = pd.read_pickle(terms_file) terms = terms_df['terms'].values.flatten() terms_dict = {v: i for i, v in enumerate(terms)} train_df = pd.read_pickle(train_data_file) naive_annots = {} for i, row in train_df.iterrows(): for hp_id in row.hp_annotations: if hp_id in naive_annots: naive_annots[hp_id] += 1 else: naive_annots[hp_id] = 1 for hp_id in naive_annots: naive_annots[hp_id] /= 1.0 * len(train_df) test_df = pd.read_pickle(test_data_file) annotations = train_df['hp_annotations'].values annotations = list(map(lambda x: set(x), annotations)) test_annotations = test_df['hp_annotations'].values test_annotations = list(map(lambda x: set(x), test_annotations)) hp.calculate_ic(annotations) hp_set = set(terms) all_classes = hp.get_term_set(root_class) hp_set = hp_set.intersection(all_classes) hp_set.discard(root_class) print(len(hp_set)) labels = test_annotations labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels)) # Compute AUC auc_terms = list(hp_set) auc_terms_dict = {v: i for i, v in enumerate(auc_terms)} auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32) auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32) for i in range(len(labels)): for j, hp_id in enumerate(auc_terms): auc_preds[i, j] = naive_annots[hp_id] if hp_id in labels[i]: auc_labels[i, j] = 1 # Compute macro AUROC roc_auc = 0.0 total = 0 for i, hp_id in enumerate(auc_terms): if np.sum(auc_labels[:, i]) == 0: continue total += 1 auc = compute_roc(auc_labels[:, i], auc_preds[:, i]) if not math.isnan(auc): roc_auc += auc else: roc_auc += 1 roc_auc /= total print(roc_auc) return fmax = 0.0 tmax = 0.0 pmax = 0.0 rmax = 0.0 precisions = [] recalls = [] smin = 1000000.0 max_preds = None for t in range(0, 101): threshold = t / 100.0 gene_id = row.genes annots = set() for hp_id, score in naive_annots.items(): if score >= threshold: annots.add(hp_id) new_annots = set() for hp_id in annots: new_annots |= hp.get_anchestors(hp_id) preds = [] for i, row in enumerate(test_df.itertuples()): preds.append(new_annots) # Filter classes fscore, prec, rec, s = evaluate_annotations(hp, labels, preds) precisions.append(prec) recalls.append(rec) print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}') if fmax < fscore: fmax = fscore tmax = threshold pmax = prec rmax = rec max_preds = preds if smin > s: smin = s test_df['hp_preds'] = max_preds test_df.to_pickle(out_file) precisions = np.array(precisions) recalls = np.array(recalls) sorted_index = np.argsort(recalls) recalls = recalls[sorted_index] precisions = precisions[sorted_index] aupr = np.trapz(precisions, recalls) print(f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
def main(go_file, data_file, neg_data_file, cls_embeds_file, rel_embeds_file, margin): go = Ontology(go_file, with_rels=False) cls_df = pd.read_pickle(cls_embeds_file) rel_df = pd.read_pickle(rel_embeds_file) nb_classes = len(cls_df) nb_relations = len(rel_df) embeds_list = cls_df['embeddings'].values classes = {v: k for k, v in enumerate(cls_df['classes'])} rembeds_list = rel_df['embeddings'].values relations = {v: k for k, v in enumerate(rel_df['relations'])} size = len(embeds_list[0]) embeds = np.zeros((nb_classes, size), dtype=np.float32) for i, emb in enumerate(embeds_list): embeds[i, :] = emb rs = np.abs(embeds[:, -1]) embeds = embeds[:, :-1] rsize = len(rembeds_list[0]) rembeds = np.zeros((nb_relations, rsize), dtype=np.float32) for i, emb in enumerate(rembeds_list): rembeds[i, :] = emb data, _, _, _ = load_data(data_file, neg_data_file, index=False) print(relations) # Evaluate normal form 1 axioms n = 0 s = 0 for c, d in data['nf1']: if c not in classes or d not in classes: continue n += 1 c, d = classes[c], classes[d] ec = embeds[c, :] rc = rs[c] ed = embeds[d, :] rd = rs[d] if is_inside(ec, rc, ed, rd, margin): s += 1 print('Normal form 1', n, s, s / n) # Normal form 2 axioms n = 0 s = 0 ns = 0 for c, d, e in data['nf2']: if c not in classes or d not in classes or e not in classes: continue n += 1 c, d, e = classes[c], classes[d], classes[e] ec = embeds[c, :] rc = rs[c] ed = embeds[d, :] rd = rs[d] ee = embeds[e, :] re = rs[e] dst = np.linalg.norm(ec - ed) - margin if dst <= rc + rd and (is_inside(ec, rc, ee, re, margin) or is_inside(ed, rd, ee, re, margin)): s += 1 elif (dst > rc and dst > rd) and dst <= rc + rd: x = (dst * dst - rc * rc + rd * rd) / (2 * dst) rx = math.sqrt(rd * rd - x * x) c = x / dst ex = ed + (ec - ed) * c if is_inside(ex, rx, ee, re, margin): s += 1 elif dst > rc + rd: ns += 1 print('Normal form 2', n, s, s / n, ns) # Evaluate normal form 3 axioms # C subclassOf R some D n = 0 # len(data['nf3']) s = 0 for c, r, d in data['nf3']: if c not in classes or d not in classes or r not in relations: continue c, r, d = classes[c], relations[r], classes[d] if r not in [0, 1, 3, 7, 9, 10, 15]: continue n += 1 ec = embeds[c, :] rc = rs[c] ed = embeds[d, :] rd = rs[d] er = rembeds[r, :] ec = ec + er if is_inside(ec, rc, ed, rd, margin): s += 1 print('Normal form 3', n, s, s / n) # Evaluate normal form 4 axioms # R some C subclassOf D n = 0 s = 0 for r, c, d in data['nf4']: if c not in classes or d not in classes or r not in relations: continue n += 1 r, c, d = relations[r], classes[c], classes[d] ec = embeds[c, :] rc = rs[c] ed = embeds[d, :] rd = rs[d] er = rembeds[r, :] ec = ec - er if is_intersect(ec, rc, ed, rd, margin): s += 1 print('Normal form 4', n, s, s / n) # Disjointness axioms n = len(data['disjoint']) s = 0 for c, d, e in data['disjoint']: c, d = classes[c], classes[d] ec = embeds[c, :] rc = rs[c] ed = embeds[d, :] rd = rs[d] if not is_intersect(ec, rc, ed, rd): s += 1 print('Disjointness', n, s, s / n) # plot_embeddings(embeds, rs, classes) return g = {} for i in range(len(embeds)): g[i] = [] for c, d in data['nf1']: g[d].append(c) sub_n = 1000 labels = np.zeros((sub_n, len(embeds)), dtype=np.int8) print('Building labels') for i in range(sub_n): q = deque() for ch in g[i]: q.append(ch) while len(q) > 0: c = q.popleft() for ch in g[c]: q.append(ch) labels[i, c] = 1 print('Running inference') preds = np.zeros((sub_n, len(embeds)), dtype=np.int8) for i in range(sub_n): c = embeds[i, :] rc = rs[i] dst = np.linalg.norm(embeds - c, axis=1) dst = dst + rs - margin 
        subs = (dst <= rc).astype(np.int8)
        preds[i, :] = subs
    tp = np.sum((labels == 1) & (preds == 1))
    fp = np.sum((labels == 0) & (preds == 1))
    fn = np.sum((labels == 1) & (preds == 0))
    precision = tp / (fp + tp)
    recall = tp / (fn + tp)
    f = 2 * precision * recall / (precision + recall)
    print(f, precision, recall)
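# The EL-embedding evaluation above relies on is_inside() and is_intersect().
# Minimal sketches follow, assuming every class is embedded as an n-ball
# (centre, radius) and that margin loosens the geometric test, matching the way
# the two functions are called above.
import numpy as np

def is_inside(ec, rc, ed, rd, margin=0.0):
    # ball (ec, rc) is contained in ball (ed, rd), up to the margin
    return np.linalg.norm(ec - ed) + rc - margin <= rd

def is_intersect(ec, rc, ed, rd, margin=0.0):
    # the two balls overlap, up to the margin
    return np.linalg.norm(ec - ed) - margin <= rc + rd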
def main(train_data_file, test_data_file, terms_file, out_file, root_class, fold): # Cross validation evaluation out_file = f'fold{fold}_' + out_file test_data_file = f'fold{fold}_' + test_data_file hp = Ontology('data/hp.obo', with_rels=True) terms_df = pd.read_pickle(terms_file) terms = terms_df['terms'].values.flatten() terms_dict = {v: i for i, v in enumerate(terms)} train_df = pd.read_pickle(train_data_file) test_df = pd.read_pickle(test_data_file) annotations = train_df['hp_annotations'].values annotations = list(map(lambda x: set(x), annotations)) test_annotations = test_df['hp_annotations'].values test_annotations = list(map(lambda x: set(x), test_annotations)) hp.calculate_ic(annotations) hp_set = set(terms) all_classes = hp.get_term_set(root_class) hp_set = hp_set.intersection(all_classes) hp_set.discard(root_class) print(len(hp_set)) labels = test_annotations labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels)) # Compute AUC auc_terms = list(hp_set) auc_terms_dict = {v: i for i, v in enumerate(auc_terms)} auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32) auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32) for i, row in enumerate(test_df.itertuples()): for j, hp_id in enumerate(auc_terms): auc_preds[i, j] = row.preds[terms_dict[hp_id]] if hp_id in labels[i]: auc_labels[i, j] = 1 # Compute macro AUROC roc_auc = 0.0 total = 0 for i, hp_id in enumerate(auc_terms): if np.sum(auc_labels[:, i]) == 0: continue total += 1 auc = compute_roc(auc_labels[:, i], auc_preds[:, i]) if not math.isnan(auc): roc_auc += auc else: roc_auc += 1 roc_auc /= total print(roc_auc) return fmax = 0.0 tmax = 0.0 pmax = 0.0 rmax = 0.0 precisions = [] recalls = [] smin = 1000000.0 max_preds = None for t in range(0, 101): threshold = t / 100.0 preds = [] for i, row in enumerate(test_df.itertuples()): gene_id = row.genes annots_dict = {} for j, score in enumerate(row.preds): hp_id = terms[j] # score = score * (1 - alpha) if hp_id in annots_dict: annots_dict[hp_id] += score else: annots_dict[hp_id] = score annots = set() for hp_id, score in annots_dict.items(): if score >= threshold: annots.add(hp_id) new_annots = set() for hp_id in annots: new_annots |= hp.get_anchestors(hp_id) new_annots = new_annots.intersection(hp_set) preds.append(new_annots) # Filter classes fscore, prec, rec, s = evaluate_annotations(hp, labels, preds) precisions.append(prec) recalls.append(rec) print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}') if fmax < fscore: fmax = fscore tmax = threshold max_preds = preds pmax = prec rmax = rec if smin > s: smin = s test_df['hp_preds'] = max_preds test_df.to_pickle(out_file) precisions = np.array(precisions) recalls = np.array(recalls) sorted_index = np.argsort(recalls) recalls = recalls[sorted_index] precisions = precisions[sorted_index] aupr = np.trapz(precisions, recalls) print( f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}' ) plt.figure() lw = 2 plt.plot(recalls, precisions, color='darkorange', lw=lw, label=f'AUPR curve (area = {aupr:0.2f})') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('Recall') plt.ylabel('Precision') plt.title('Area Under the Precision-Recall curve') plt.legend(loc="lower right") df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
def main(train_data_file, test_data_file, terms_file, rules_file): hp = Ontology('data/hp.obo', with_rels=True) terms_df = pd.read_pickle(terms_file) terms = terms_df['terms'].values.flatten() terms_dict = {v: i for i, v in enumerate(terms)} train_df = pd.read_pickle(train_data_file) rule_annots = {} with open(rules_file) as f: for line in f: it = line.strip().split() go_id = it[0].replace('_', ':') hp_id = it[1].replace('_', ':') if go_id not in rule_annots: rule_annots[go_id] = set() rule_annots[go_id].add(hp_id) test_df = pd.read_pickle(test_data_file) annotations = train_df['hp_annotations'].values annotations = list(map(lambda x: set(x), annotations)) test_annotations = test_df['hp_annotations'].values test_annotations = list(map(lambda x: set(x), test_annotations)) hp.calculate_ic(annotations) hp_set = set(terms) hp_set_anch = set() for hp_id in hp_set: hp_set_anch |= hp.get_anchestors(hp_id) labels = test_annotations # labels = list(map(lambda x: set(filter(lambda y: y in hp_set_anch, x)), labels)) fmax = 0.0 tmax = 0.0 precisions = [] recalls = [] smin = 1000000.0 max_preds = None for t in range(0, 101): threshold = t / 100.0 preds = [] for i, row in enumerate(test_df.itertuples()): gene_id = row.genes annots = set() for item in row.deepgo_annotations: go_id, score = item.split('|') score = float(score) if score >= threshold and go_id in rule_annots: annots |= rule_annots[go_id] new_annots = set() for hp_id in annots: new_annots |= hp.get_anchestors(hp_id) preds.append(new_annots) # Filter classes fscore, prec, rec, s = evaluate_annotations(hp, labels, preds) precisions.append(prec) recalls.append(rec) print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}') if fmax < fscore: fmax = fscore tmax = threshold max_preds = preds if smin > s: smin = s print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}') test_df['hp_preds'] = max_preds test_df.to_pickle('data/predictions_max.pkl') precisions = np.array(precisions) recalls = np.array(recalls) sorted_index = np.argsort(recalls) recalls = recalls[sorted_index] precisions = precisions[sorted_index] aupr = np.trapz(precisions, recalls) print(f'AUPR: {aupr:0.3f}')
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont, alpha): alpha /= 100.0 go_rels = Ontology('data-cafa/go.obo', with_rels=True) terms_df = pd.read_pickle(terms_file) terms = terms_df['terms'].values.flatten() # terms_dict = {v: i for i, v in enumerate(terms)} train_df = pd.read_pickle(train_data_file) test_df = pd.read_pickle(test_data_file) annotations = train_df['annotations'].values annotations = list(map(lambda x: set(x), annotations)) test_annotations = test_df['annotations'].values test_annotations = list(map(lambda x: set(x), test_annotations)) #### ? notice that @annotations and @test_annotations are used to get IC scores, so we are not allowed to do pre-filtering go_rels.calculate_ic(annotations + test_annotations) go_set = go_rels.get_namespace_terms( NAMESPACES[ont]) #? consider all the MF or CC or BP #### ? filter terms to have only mf ? # terms = [t for t in terms if t in go_set] # print ('number of terms kept from terms_file {}'.format(len(terms))) # Print IC values of terms ics = {} for term in terms: ics[term] = go_rels.get_ic(term) ##!! let's save this pickle.dump(ics, open("data-cafa/ICsValueTable.pickle", "wb")) prot_index = {} for i, row in enumerate(train_df.itertuples()): prot_index[row.proteins] = i #### # BLAST Similarity (Diamond) #! we can use same call, we have their output diamond_scores = {} with open(diamond_scores_file) as f: for line in f: it = line.strip().split() if it[0] not in diamond_scores: diamond_scores[it[0]] = {} diamond_scores[it[0]][it[1]] = float(it[2]) blast_preds = [] for i, row in enumerate(test_df.itertuples()): annots = {} prot_id = row.proteins # BlastKNN if prot_id in diamond_scores: sim_prots = diamond_scores[prot_id] allgos = set() total_score = 0.0 for p_id, score in sim_prots.items(): allgos |= annotations[prot_index[p_id]] total_score += score allgos = list(sorted(allgos)) sim = np.zeros(len(allgos), dtype=np.float32) for j, go_id in enumerate(allgos): s = 0.0 for p_id, score in sim_prots.items(): if go_id in annotations[prot_index[p_id]]: s += score sim[j] = s / total_score ind = np.argsort(-sim) for go_id, score in zip(allgos, sim): annots[go_id] = score blast_preds.append(annots) #### # DeepGOPlus # go_set = go_rels.get_namespace_terms(NAMESPACES[ont]) #? consider all the MF or CC or BP go_set.remove(FUNC_DICT[ont]) labels = test_df['annotations'].values labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels)) ##! filter true labels by @go_set print("total labels {}".format(len(go_set))) deep_preds = [] # alphas = {NAMESPACES['mf']: 0.55, NAMESPACES['bp']: 0.59, NAMESPACES['cc']: 0.46} for i, row in enumerate( test_df.itertuples()): #! read in prediction of neural net annots_dict = {} # annots_dict = blast_preds[i].copy() #! copy blast score # for go_id in annots_dict: # * set 0 for all @blast_prediction # annots_dict[go_id] = 0 # *= alphas[go_rels.get_namespace(go_id)] #! scale down blast score. for j, score in enumerate(row.preds): #! prediction of @test_df go_id = terms[j] # if go_id not in go_set: #? faster filter of labels because we don't add ancestor anyway # continue # score *= 1 - alphas[go_rels.get_namespace(go_id)] # x *= 1-0.5 --> x = x * (1-0.5) # if go_id in annots_dict: #? should not need this line?? # annots_dict[go_id] += score #! add into blast score # else: #! are we going to see error?? annots_dict[go_id] = score #! replace blast score deep_preds.append(annots_dict) #! 
later on, we use only @deep_preds # print('AUTHOR DeepGOPlus') # print('MODEL 1') # print('KEYWORDS sequence alignment.') # for i, row in enumerate(test_df.itertuples()): # prot_id = row.proteins # for go_id, score in deep_preds[i].items(): # print(f'{prot_id}\t{go_id}\t{score:.2f}') # print('END') # return # Propagate scores # deepgo_preds = [] # for annots_dict in deep_preds: # annots = {} # for go_id, score in annots_dict.items(): # for a_id in go_rels.get_anchestors(go_id): # if a_id in annots: # annots[a_id] = max(annots[a_id], score) # else: # annots[a_id] = score # deepgo_preds.append(annots) fmax = 0.0 tmax = 0.0 precisions = [] recalls = [] smin = 1000000.0 rus = [] mis = [] print('\nontology {}\n'.format(ont)) #### for threshold in np.arange(0.005, .4, .01): # np.arange(0.005,1,.01) # threshold = t / 100.0 print('\n') preds = [] for i, row in enumerate(test_df.itertuples()): annots = set() for go_id, score in deep_preds[i].items(): if go_id not in go_set: #? faster filter of labels because we don't add ancestor anyway continue if score >= threshold: annots.add(go_id) preds.append(annots) ##!! append parent terms or something ?? # new_annots = set() # for go_id in annots: # new_annots |= go_rels.get_anchestors(go_id) # preds.append(new_annots) # Filter classes preds = list( map(lambda x: set(filter(lambda y: y in go_set, x)), preds)) # print ('see 1 prediction') # print (preds[10]) # print ('see 1 label') # print (labels[10]) fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations( go_rels, labels, preds) avg_fp = sum(map(lambda x: len(x), fps)) / len(fps) avg_ic = sum( map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)), fps)) / len(fps) print(f'{avg_fp} {avg_ic}') precisions.append(prec) recalls.append(rec) print( f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}' ) if fmax < fscore: fmax = fscore tmax = threshold if smin > s: smin = s print(f'\nFmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}') precisions = np.array(precisions) recalls = np.array(recalls) sorted_index = np.argsort(recalls) recalls = recalls[sorted_index] precisions = precisions[sorted_index] aupr = np.trapz(precisions, recalls) print(f'AUPR: {aupr:0.3f}') plt.figure() lw = 2 plt.plot(recalls, precisions, color='darkorange', lw=lw, label=f'AUPR curve (area = {aupr:0.2f})') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('Recall') plt.ylabel('Precision') plt.title('Area Under the Precision-Recall curve') plt.legend(loc="lower right") plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf') df = pd.DataFrame({'precisions': precisions, 'recalls': recalls}) df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
def main(train_data_file, test_data_file, ont): go_rels = Ontology('data/go.obo', with_rels=True) # terms_df = pd.read_pickle('data-deepgo/' + ont + '.pkl') # terms = terms_df['functions'].values.flatten() # terms_dict = {v: i for i, v in enumerate(terms)} train_df = pd.read_pickle(train_data_file) annotations = train_df['annotations'].values annotations = list(map(lambda x: set(x), annotations)) test_df = pd.read_pickle(test_data_file) test_annotations = test_df['annotations'].values test_annotations = list(map(lambda x: set(x), test_annotations)) go_rels.calculate_ic(annotations + test_annotations) go_set = go_rels.get_namespace_terms(NAMESPACES[ont]) go_set.remove(FUNC_DICT[ont]) annotations = list(map(lambda x: set(filter(lambda y: y in go_set, x)), annotations)) cnt = Counter() max_n = 0 for x in annotations: cnt.update(x) print(cnt.most_common(10)) max_n = cnt.most_common(1)[0][1] print(max_n) scores = {} for go_id, n in cnt.items(): score = n / max_n scores[go_id] = score #! IC score? prot_index = {} for i, row in enumerate(train_df.itertuples()): prot_index[row.proteins] = i labels = test_annotations labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels)) print(len(go_set)) fmax = 0.0 tmax = 0.0 smin = 1000.0 precisions = [] recalls = [] for threshold in np.arange(0.005,.5,.01): # # threshold = t / 100.0 preds = [] annots = set() for go_id, score in scores.items(): if score >= threshold: annots.add(go_id) # new_annots = set() # for go_id in annots: # new_annots |= go_rels.get_anchestors(go_id) # new_annots = set(filter(lambda y: y in go_set, new_annots)) for i, row in enumerate(test_df.itertuples()): preds.append(annots.copy()) fscore, prec, rec, s = evaluate_annotations(go_rels, labels, preds) precisions.append(prec) recalls.append(rec) print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}') if fmax < fscore: fmax = fscore tmax = threshold if smin > s: smin = s print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}') precisions = np.array(precisions) recalls = np.array(recalls) sorted_index = np.argsort(recalls) recalls = recalls[sorted_index] precisions = precisions[sorted_index] aupr = np.trapz(precisions, recalls) print(f'AUPR: {aupr:0.3f}') plt.figure() lw = 2 plt.plot(recalls, precisions, color='darkorange', lw=lw, label=f'AUPR curve (area = {aupr:0.3f})') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('Recall') plt.ylabel('Precision') plt.title('Area Under the Precision-Recall curve') plt.legend(loc="lower right") plt.savefig('aupr.pdf') plt.show()
def main(benchmark_file, train_data_file, hpo_file, terms_file, root_class): hp = Ontology(hpo_file, with_rels=True) terms_df = pd.read_pickle(terms_file) terms = terms_df['terms'].values.flatten() terms_dict = {v: i for i, v in enumerate(terms)} noknowledge_prots = set() with open('data-cafa/noknowledge_targets.txt') as f: for line in f: noknowledge_prots.add(line.strip()) bench_annots = {} with open(benchmark_file) as f: for line in f: it = line.strip().split('\t') t_id = it[0] if t_id not in noknowledge_prots: continue hp_id = it[1] if t_id not in bench_annots: bench_annots[t_id] = set() bench_annots[t_id] |= hp.get_anchestors(hp_id) train_df = pd.read_pickle(train_data_file) naive_annots = {} for i, row in train_df.iterrows(): for hp_id in row.hp_annotations: if hp_id in naive_annots: naive_annots[hp_id] += 1 else: naive_annots[hp_id] = 1 for hp_id in naive_annots: naive_annots[hp_id] /= 1.0 * len(train_df) annotations = train_df['hp_annotations'].values annotations = list(map(lambda x: set(x), annotations)) hp.calculate_ic(annotations) hp_set = set(terms) all_classes = hp.get_term_set(root_class) hp_set = hp_set.intersection(all_classes) hp_set.discard(root_class) print(len(hp_set)) labels = [] for t_id, hps in bench_annots.items(): labels.append(hps) labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels)) # Compute AUC auc_terms = list(hp_set) auc_terms_dict = {v: i for i, v in enumerate(auc_terms)} auc_preds = np.zeros((len(bench_annots), len(hp_set)), dtype=np.float32) auc_labels = np.zeros((len(bench_annots), len(hp_set)), dtype=np.int32) for i in range(len(labels)): for j, hp_id in enumerate(auc_terms): auc_preds[i, j] = naive_annots[hp_id] if hp_id in labels[i]: auc_labels[i, j] = 1 # Compute macro AUROC roc_auc = 0.0 total = 0 for i, hp_id in enumerate(auc_terms): if np.sum(auc_labels[:, i]) == 0: continue total += 1 auc = compute_roc(auc_labels[:, i], auc_preds[:, i]) if not math.isnan(auc): roc_auc += auc else: roc_auc += 1 roc_auc /= total print(roc_auc) return fmax = 0.0 tmax = 0.0 pmax = 0.0 rmax = 0.0 precisions = [] recalls = [] smin = 1000000.0 max_preds = None for t in range(0, 101): threshold = t / 100.0 annots = set() for hp_id, score in naive_annots.items(): if score >= threshold: annots.add(hp_id) new_annots = set() for hp_id in annots: new_annots |= hp.get_anchestors(hp_id) preds = [] for t_id, hps in bench_annots.items(): preds.append(new_annots) fscore, prec, rec, s = evaluate_annotations(hp, labels, preds) precisions.append(prec) recalls.append(rec) print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}') if fmax < fscore: fmax = fscore pmax = prec rmax = rec tmax = threshold max_preds = preds if smin > s: smin = s precisions = np.array(precisions) recalls = np.array(recalls) sorted_index = np.argsort(recalls) recalls = recalls[sorted_index] precisions = precisions[sorted_index] aupr = np.trapz(precisions, recalls) print( f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}' )
def main(go_file, train_data_file, test_data_file, terms_file, model_file, out_file, split, batch_size, epochs, load, logger_file, threshold, device, params_index): params = { 'max_kernel': 129, 'initializer': 'glorot_normal', 'dense_depth': 0, 'nb_filters': 512, 'optimizer': Adam(lr=3e-4), 'loss': 'binary_crossentropy' } # SLURM JOB ARRAY INDEX pi = params_index if params_index != -1: kernels = [33, 65, 129, 257, 513] dense_depths = [0, 1, 2] nb_filters = [32, 64, 128, 256, 512] params['max_kernel'] = kernels[pi % 5] pi //= 5 params['dense_depth'] = dense_depths[pi % 3] pi //= 3 params['nb_filters'] = nb_filters[pi % 5] pi //= 5 out_file = f'data/predictions_{params_index}.pkl' logger_file = f'data/training_{params_index}.csv' model_file = f'data/model_{params_index}.h5' print('Params:', params) go = Ontology(go_file, with_rels=True) terms_df = pd.read_pickle(terms_file) terms = terms_df['terms'].values.flatten() train_df, valid_df = load_data(train_data_file, terms, split) test_df = pd.read_pickle(test_data_file) terms_dict = {v: i for i, v in enumerate(terms)} nb_classes = len(terms) with tf.device('/' + device): test_steps = int(math.ceil(len(test_df) / batch_size)) test_generator = DFGenerator(test_df, terms_dict, nb_classes, batch_size) if load: logging.info('Loading pretrained model') model = load_model(model_file) else: logging.info('Creating a new model') model = create_model(nb_classes, params) logging.info("Training data size: %d" % len(train_df)) logging.info("Validation data size: %d" % len(valid_df)) checkpointer = ModelCheckpoint(filepath=model_file, verbose=1, save_best_only=True) earlystopper = EarlyStopping(monitor='val_loss', patience=6, verbose=1) logger = CSVLogger(logger_file) logging.info('Starting training the model') valid_steps = int(math.ceil(len(valid_df) / batch_size)) train_steps = int(math.ceil(len(train_df) / batch_size)) train_generator = DFGenerator(train_df, terms_dict, nb_classes, batch_size) valid_generator = DFGenerator(valid_df, terms_dict, nb_classes, batch_size) model.summary() model.fit(train_generator, steps_per_epoch=train_steps, epochs=epochs, validation_data=valid_generator, validation_steps=valid_steps, max_queue_size=batch_size, workers=12, callbacks=[logger, checkpointer, earlystopper]) logging.info('Loading best model') model = load_model(model_file) logging.info('Evaluating model') loss = model.evaluate(test_generator, steps=test_steps) logging.info('Test loss %f' % loss) logging.info('Predicting') test_generator.reset() preds = model.predict(test_generator, steps=test_steps) # valid_steps = int(math.ceil(len(valid_df) / batch_size)) # valid_generator = DFGenerator(valid_df, terms_dict, # nb_classes, batch_size) # logging.info('Predicting') # valid_generator.reset() # preds = model.predict_generator(valid_generator, steps=valid_steps) # valid_df.reset_index() # valid_df['preds'] = list(preds) # train_df.to_pickle('data/train_data_train.pkl') # valid_df.to_pickle('data/train_data_valid.pkl') test_labels = np.zeros((len(test_df), nb_classes), dtype=np.int32) for i, row in enumerate(test_df.itertuples()): for go_id in row.prop_annotations: if go_id in terms_dict: test_labels[i, terms_dict[go_id]] = 1 logging.info('Computing performance:') roc_auc = compute_roc(test_labels, preds) logging.info('ROC AUC: %.2f' % (roc_auc, )) test_df['labels'] = list(test_labels) test_df['preds'] = list(preds) logging.info('Saving predictions') test_df.to_pickle(out_file)
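# DFGenerator and load_data() above are not defined here. The sketches below
# assume DFGenerator is a keras Sequence yielding (one-hot sequence, multi-hot
# GO label) batches, reusing the MAXLEN/AALETTER/to_onehot helpers sketched
# earlier, and that load_data() splits the training pickle into train/validation
# parts; both are per-script assumptions, not the original implementation.
import math
import numpy as np
import pandas as pd
from tensorflow.keras.utils import Sequence

class DFGenerator(Sequence):
    def __init__(self, df, terms_dict, nb_classes, batch_size):
        self.df = df
        self.terms_dict = terms_dict
        self.nb_classes = nb_classes
        self.batch_size = batch_size

    def __len__(self):
        return int(math.ceil(len(self.df) / self.batch_size))

    def __getitem__(self, idx):
        batch = self.df.iloc[idx * self.batch_size:(idx + 1) * self.batch_size]
        data = np.zeros((len(batch), MAXLEN, len(AALETTER) + 1), dtype=np.float32)
        labels = np.zeros((len(batch), self.nb_classes), dtype=np.int32)
        for i, row in enumerate(batch.itertuples()):
            data[i] = to_onehot(row.sequences)
            for go_id in row.prop_annotations:
                if go_id in self.terms_dict:
                    labels[i, self.terms_dict[go_id]] = 1
        return data, labels

    def reset(self):
        # kept for API compatibility with the calls above; batches are indexed,
        # so there is no internal cursor to rewind in this sketch
        pass

def load_data(train_data_file, terms, split=0.9):
    # terms is accepted for signature compatibility; the original may use it
    # to filter rows without any annotated term
    df = pd.read_pickle(train_data_file)
    index = np.arange(len(df))
    np.random.seed(0)
    np.random.shuffle(index)
    train_n = int(len(df) * split)
    return df.iloc[index[:train_n]], df.iloc[index[train_n:]]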
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont): go_rels = Ontology('data/go.obo', with_rels=True) terms_df = pd.read_pickle(terms_file) terms = terms_df['terms'].values.flatten() terms_dict = {v: i for i, v in enumerate(terms)} train_df = pd.read_pickle(train_data_file) test_df = pd.read_pickle(test_data_file) print("Length of test set: " + str(len(test_df))) annotations = train_df['prop_annotations'].values annotations = list(map(lambda x: set(x), annotations)) test_annotations = test_df['prop_annotations'].values test_annotations = list(map(lambda x: set(x), test_annotations)) go_rels.calculate_ic(annotations + test_annotations) # Print IC values of terms ics = {} for term in terms: ics[term] = go_rels.get_ic(term) prot_index = {} for i, row in enumerate(train_df.itertuples()): prot_index[row.proteins] = i # BLAST Similarity (Diamond) diamond_scores = {} with open(diamond_scores_file) as f: for line in f: it = line.strip().split() if it[0] not in diamond_scores: diamond_scores[it[0]] = {} diamond_scores[it[0]][it[1]] = float(it[2]) blast_preds = [] #print('Diamond preds') for i, row in enumerate(test_df.itertuples()): annots = {} prot_id = row.proteins # BlastKNN if prot_id in diamond_scores: sim_prots = diamond_scores[prot_id] allgos = set() total_score = 0.0 for p_id, score in sim_prots.items(): allgos |= annotations[prot_index[p_id]] total_score += score allgos = list(sorted(allgos)) sim = np.zeros(len(allgos), dtype=np.float32) for j, go_id in enumerate(allgos): s = 0.0 for p_id, score in sim_prots.items(): if go_id in annotations[prot_index[p_id]]: s += score sim[j] = s / total_score ind = np.argsort(-sim) for go_id, score in zip(allgos, sim): annots[go_id] = score blast_preds.append(annots) last_release_metadata = 'metadata/last_release.json' with open(last_release_metadata, 'r') as f: last_release_data = json.load(f) last_release_data['alphas'][ont] = find_alpha(ont, test_df, blast_preds, go_rels, terms) with open(last_release_metadata, 'w') as f: json.dump(last_release_data, f)
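# find_alpha() is not shown in this script. Assuming it mirrors the alpha grid
# search used by the validation script further below (blend BLAST and CNN
# scores, propagate ancestors, keep the alpha giving the best Fmax), a sketch;
# the search ranges are illustrative and evaluate_annotations is the 4-value
# variant sketched earlier.
def find_alpha(ont, valid_df, blast_preds, go_rels, terms):
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.discard(FUNC_DICT[ont])
    labels = [set(filter(lambda y: y in go_set, annots))
              for annots in valid_df['prop_annotations'].values]
    best_fmax, best_alpha = 0.0, 0.0
    for a in range(10, 90):
        alpha = a / 100.0
        fmax = 0.0
        for t in range(10, 50):
            threshold = t / 100.0
            preds = []
            for i, row in enumerate(valid_df.itertuples()):
                # blend BLAST-kNN scores with the CNN predictions
                annots_dict = {g: s * alpha for g, s in blast_preds[i].items()}
                for j, score in enumerate(row.preds):
                    go_id = terms[j]
                    annots_dict[go_id] = annots_dict.get(go_id, 0.0) + score * (1 - alpha)
                annots = set()
                for go_id, score in annots_dict.items():
                    if score >= threshold:
                        annots |= go_rels.get_anchestors(go_id)
                preds.append(annots & go_set)
            fscore, prec, rec, s = evaluate_annotations(go_rels, labels, preds)
            fmax = max(fmax, fscore)
        if fmax > best_fmax:
            best_fmax, best_alpha = fmax, alpha
    return best_alpha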
def main(in_file, outdirectory, go_file, model_file, terms_file, annotations_file, chunk_size, diamond_file, threshold, batch_size, alpha): global GO, Terms, annotations, diamond_predictions Path(outdirectory).mkdir(exist_ok=True, parents=True) # Load GO and read list of all terms GO = Ontology(go_file, with_rels=True) Terms = pd.read_pickle(terms_file).terms.values.flatten() # Read known experimental annotations annotations = _collate_experimental_annotations(annotations_file) print("{:<20s} ✓".format("Annotations")) # Parse diamond prediction file diamond_predictions = _collate_diamond_predictions(diamond_file, annotations) print("{:<20s} ✓".format("Diamond predictions")) Ns = len(diamond_predictions) # Load CNN model model = load_model(model_file) print("{:<20s} ✓".format("CNN model")) total_seq = 0 start_time = time.time() display = "{i} ({start}, {end}) | time={time}" calc_start = lambda i: i * chunk_size calc_end = lambda i: i * chunk_size + chunk_size - 1 # creates a MP queue for depositing predictions and delegating to # tabulating processes and writer manager = multiprocessing.Manager() write_q = manager.Queue() worker = partial(_process_predictions, alpha=alpha, threshold=threshold) pool = multiprocessing.Pool(processes=25) # create the writer #pool.apply_async(_write_predictions, (write_q, outdirectory)) results = [] for i, (prot_ids, sequences) in enumerate(read_fasta(in_file, chunk_size)): total_seq += len(prot_ids) ids, data = get_data(sequences) start = datetime.now() # make a prediction preds = model.predict(data, batch_size=batch_size) assert preds.shape[1] == len(Terms) # display the time it took and other helpful data elapsed = _calc_elapsed(start, fmt=True) print( display.format(i=i, start=calc_start(i), end=calc_end(i), time=_calc_elapsed(start))) # fire up a prediction tabulator #result = worker(ids, preds, prot_ids) result = pool.apply_async(worker, (ids, preds, prot_ids)) results.append(result) for annot in chain.from_iterable(map(lambda result: result.get(), results)): #for annot in result.get(): filename = Path(outdirectory) / f"{annot['accession']}.json" _write_result(annot, filename) pool.close() pool.join() total_time = time.time() - start_time print('Total prediction time for %d sequences is %d' % (total_seq, total_time))
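# read_fasta() above is assumed to stream (protein ids, sequences) chunks of
# chunk_size records from a FASTA file; a sketch of that assumption.
def read_fasta(filename, chunk_size):
    prot_ids, seqs = [], []
    prot_id, seq = None, []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if prot_id is not None:
                    prot_ids.append(prot_id)
                    seqs.append(''.join(seq))
                    if len(prot_ids) == chunk_size:
                        yield prot_ids, seqs
                        prot_ids, seqs = [], []
                prot_id, seq = line[1:].split()[0], []
            else:
                seq.append(line)
    if prot_id is not None:
        prot_ids.append(prot_id)
        seqs.append(''.join(seq))
    if prot_ids:
        yield prot_ids, seqs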
def main(go_file, train_data_file, valid_data_file, test_data_file, sim_score_file): go = Ontology(go_file, with_rels=False) with open(sim_score_file, 'r') as f: proteins = next(f).strip().split('\t') prots_dict = {v: k for k, v in enumerate(proteins)} sim = np.zeros((len(proteins), len(proteins)), dtype=np.float32) i = 0 for line in f: line = line.replace('null', '0.0') s = line.strip().split('\t') s = np.array(list(map(float, s)), dtype=np.float32) sim[i, :] = s i += 1 train_data = load_data(train_data_file, prots_dict) valid_data = load_data(valid_data_file, prots_dict) trlabels = np.ones((len(proteins), len(proteins)), dtype=np.int32) for c, d in train_data: trlabels[c, d] = 0 for c, d in valid_data: trlabels[c, d] = 0 test_data = load_data(test_data_file, prots_dict) top10 = 0 top100 = 0 mean_rank = 0 ftop10 = 0 ftop100 = 0 fmean_rank = 0 n = len(test_data) labels = np.zeros((len(proteins), len(proteins)), dtype=np.int32) ranks = {} franks = {} with ck.progressbar(test_data) as prog_data: for c, d in prog_data: labels[c, d] = 1 index = rankdata(-sim[c, :], method='average') rank = index[d] if rank <= 10: top10 += 1 if rank <= 100: top100 += 1 mean_rank += rank if rank not in ranks: ranks[rank] = 0 ranks[rank] += 1 # Filtered rank fil = sim[c, :] * (labels[c, :] | trlabels[c, :]) index = rankdata(-fil, method='average') rank = index[d] if rank <= 10: ftop10 += 1 if rank <= 100: ftop100 += 1 fmean_rank += rank if rank not in franks: franks[rank] = 0 franks[rank] += 1 print() top10 /= n top100 /= n mean_rank /= n ftop10 /= n ftop100 /= n fmean_rank /= n rank_auc = compute_rank_roc(ranks, len(proteins)) frank_auc = compute_rank_roc(franks, len(proteins)) print(f'{top10:.2f} {top100:.2f} {mean_rank:.2f} {rank_auc:.2f}') print(f'{ftop10:.2f} {ftop100:.2f} {fmean_rank:.2f} {frank_auc:.2f}')
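# load_data() and compute_rank_roc() above are assumed as follows: the loader
# reads whitespace-separated interacting-protein pairs and maps them to matrix
# indices, and the rank AUC integrates the cumulative distribution of ranks up
# to the total number of proteins. Both are sketches of those assumptions.
import numpy as np

def load_data(pairs_file, prots_dict):
    data = []
    with open(pairs_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] in prots_dict and it[1] in prots_dict:
                data.append((prots_dict[it[0]], prots_dict[it[1]]))
    return data

def compute_rank_roc(ranks, n_prots):
    auc_x = sorted(ranks.keys())
    auc_y = []
    tpr = 0
    total = sum(ranks.values())
    for x in auc_x:
        tpr += ranks[x]
        auc_y.append(tpr / total)
    auc_x.append(n_prots)
    auc_y.append(1)
    return np.trapz(auc_y, auc_x) / n_prots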
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont, alpha): alpha /= 100.0 go_rels = Ontology('data-cafa/go.obo', with_rels=True) terms_df = pd.read_pickle(terms_file) terms = terms_df['terms'].values.flatten() terms_dict = {v: i for i, v in enumerate(terms)} train_df = pd.read_pickle(train_data_file) test_df = pd.read_pickle(test_data_file) annotations = train_df['annotations'].values annotations = list(map(lambda x: set(x), annotations)) test_annotations = test_df['annotations'].values test_annotations = list(map(lambda x: set(x), test_annotations)) go_rels.calculate_ic(annotations + test_annotations) # Print IC values of terms ics = {} for term in terms: ics[term] = go_rels.get_ic(term) prot_index = {} for i, row in enumerate(train_df.itertuples()): prot_index[row.proteins] = i # BLAST Similarity (Diamond) diamond_scores = {} with open(diamond_scores_file) as f: for line in f: it = line.strip().split() if it[0] not in diamond_scores: diamond_scores[it[0]] = {} diamond_scores[it[0]][it[1]] = float(it[2]) blast_preds = [] for i, row in enumerate(test_df.itertuples()): annots = {} prot_id = row.proteins # BlastKNN if prot_id in diamond_scores: sim_prots = diamond_scores[prot_id] allgos = set() total_score = 0.0 for p_id, score in sim_prots.items(): allgos |= annotations[prot_index[p_id]] total_score += score allgos = list(sorted(allgos)) sim = np.zeros(len(allgos), dtype=np.float32) for j, go_id in enumerate(allgos): s = 0.0 for p_id, score in sim_prots.items(): if go_id in annotations[prot_index[p_id]]: s += score sim[j] = s / total_score ind = np.argsort(-sim) for go_id, score in zip(allgos, sim): annots[go_id] = score blast_preds.append(annots) # DeepGOPlus go_set = go_rels.get_namespace_terms(NAMESPACES[ont]) go_set.remove(FUNC_DICT[ont]) labels = test_df['annotations'].values labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels)) # print(len(go_set)) deep_preds = [] alphas = { NAMESPACES['mf']: 0.55, NAMESPACES['bp']: 0.59, NAMESPACES['cc']: 0.46 } for i, row in enumerate(test_df.itertuples()): annots_dict = blast_preds[i].copy() for go_id in annots_dict: annots_dict[go_id] *= alphas[go_rels.get_namespace(go_id)] for j, score in enumerate(row.preds): go_id = terms[j] score *= 1 - alphas[go_rels.get_namespace(go_id)] if go_id in annots_dict: annots_dict[go_id] += score else: annots_dict[go_id] = score deep_preds.append(annots_dict) print('AUTHOR DeepGOPlus') print('MODEL 1') print('KEYWORDS sequence alignment.') for i, row in enumerate(test_df.itertuples()): prot_id = row.proteins for go_id, score in deep_preds[i].items(): print(f'{prot_id}\t{go_id}\t{score:.2f}') print('END') return # Propagate scores # deepgo_preds = [] # for annots_dict in deep_preds: # annots = {} # for go_id, score in annots_dict.items(): # for a_id in go_rels.get_anchestors(go_id): # if a_id in annots: # annots[a_id] = max(annots[a_id], score) # else: # annots[a_id] = score # deepgo_preds.append(annots) fmax = 0.0 tmax = 0.0 precisions = [] recalls = [] smin = 1000000.0 rus = [] mis = [] for t in range(0, 101): threshold = t / 100.0 preds = [] for i, row in enumerate(test_df.itertuples()): annots = set() for go_id, score in deep_preds[i].items(): if score >= threshold: annots.add(go_id) new_annots = set() for go_id in annots: new_annots |= go_rels.get_anchestors(go_id) preds.append(new_annots) # Filter classes preds = list( map(lambda x: set(filter(lambda y: y in go_set, x)), preds)) fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations( go_rels, labels, 
            preds)
        avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
        avg_ic = sum(map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                         fps)) / len(fps)
        print(f'{avg_fp} {avg_ic}')
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
def main(train_data_file, valid_data_file, terms_file, diamond_scores_file, ont): go_rels = Ontology('data-cafa/go.obo', with_rels=True) terms_df = pd.read_pickle(terms_file) terms = terms_df['terms'].values.flatten() terms_dict = {v: i for i, v in enumerate(terms)} train_df = pd.read_pickle(train_data_file) valid_df = pd.read_pickle(valid_data_file) annotations = train_df['annotations'].values annotations = list(map(lambda x: set(x), annotations)) valid_annotations = valid_df['annotations'].values valid_annotations = list(map(lambda x: set(x), valid_annotations)) go_rels.calculate_ic(annotations + valid_annotations) # Print IC values of terms ics = {} for term in terms: ics[term] = go_rels.get_ic(term) prot_index = {} for i, row in enumerate(train_df.itertuples()): prot_index[row.proteins] = i # BLAST Similarity (Diamond) diamond_scores = {} with open(diamond_scores_file) as f: for line in f: it = line.strip().split() if it[0] not in diamond_scores: diamond_scores[it[0]] = {} diamond_scores[it[0]][it[1]] = float(it[2]) blast_preds = [] for i, row in enumerate(valid_df.itertuples()): annots = {} prot_id = row.proteins # BlastKNN if prot_id in diamond_scores: sim_prots = diamond_scores[prot_id] allgos = set() total_score = 0.0 for p_id, score in sim_prots.items(): allgos |= annotations[prot_index[p_id]] total_score += score allgos = list(sorted(allgos)) sim = np.zeros(len(allgos), dtype=np.float32) for j, go_id in enumerate(allgos): s = 0.0 for p_id, score in sim_prots.items(): if go_id in annotations[prot_index[p_id]]: s += score sim[j] = s / total_score ind = np.argsort(-sim) for go_id, score in zip(allgos, sim): annots[go_id] = score blast_preds.append(annots) # DeepGOPlus go_set = go_rels.get_namespace_terms(NAMESPACES[ont]) go_set.remove(FUNC_DICT[ont]) labels = valid_df['annotations'].values labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels)) print(len(go_set)) best_fmax = 0.0 best_alpha = 0.0 for alpha in range(44, 70): alpha /= 100.0 deep_preds = [] for i, row in enumerate(valid_df.itertuples()): annots_dict = blast_preds[i].copy() for go_id in annots_dict: annots_dict[go_id] *= alpha for j, score in enumerate(row.preds): go_id = terms[j] score *= 1 - alpha if go_id in annots_dict: annots_dict[go_id] += score else: annots_dict[go_id] = score deep_preds.append(annots_dict) fmax = 0.0 tmax = 0.0 precisions = [] recalls = [] smin = 1000000.0 rus = [] mis = [] for t in range(14, 20): threshold = t / 100.0 preds = [] for i, row in enumerate(valid_df.itertuples()): annots = set() for go_id, score in deep_preds[i].items(): if score >= threshold: annots.add(go_id) new_annots = set() for go_id in annots: new_annots |= go_rels.get_anchestors(go_id) preds.append(new_annots) # Filter classes preds = list( map(lambda x: set(filter(lambda y: y in go_set, x)), preds)) fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations( go_rels, labels, preds) avg_fp = sum(map(lambda x: len(x), fps)) / len(fps) avg_ic = sum( map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)), fps)) / len(fps) print( f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}' ) if fmax < fscore: fmax = fscore tmax = threshold if smin > s: smin = s if best_fmax < fmax: best_fmax = fmax best_alpha = alpha print( f'Alpha: {alpha} Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}' ) print(f'{best_alpha} {best_fmax}')
def main(go_file, train_data_file, valid_data_file, test_data_file,
         cls_embeds_file, rel_embeds_file, margin, params_array_index):
    embedding_size = 50
    reg_norm = 1
    org = 'human'
    go = Ontology(go_file, with_rels=False)
    pai = params_array_index
    if params_array_index != -1:
        orgs = ['human', 'yeast']
        sizes = [50, 100, 200, 400]
        margins = [-0.1, -0.01, 0.0, 0.01, 0.1]
        reg_norms = [1,]
        reg_norm = reg_norms[0]
        # params_array_index //= 2
        margin = margins[params_array_index % 5]
        params_array_index //= 5
        embedding_size = sizes[params_array_index % 4]
        params_array_index //= 4
        org = orgs[params_array_index % 2]
        print('Params:', org, embedding_size, margin, reg_norm)
        if org == 'human':
            train_data_file = f'data/data-train/9606.protein.links.v10.5.txt'
            valid_data_file = f'data/data-valid/9606.protein.links.v10.5.txt'
            test_data_file = f'data/data-test/9606.protein.links.v10.5.txt'
        cls_embeds_file = f'data/{org}_{pai}_{embedding_size}_{margin}_{reg_norm}_cls.pkl'
        rel_embeds_file = f'data/{org}_{pai}_{embedding_size}_{margin}_{reg_norm}_rel.pkl'
        loss_file = f'data/{org}_{pai}_{embedding_size}_{margin}_{reg_norm}_loss.csv'
        if os.path.exists(loss_file):
            df = pd.read_csv(loss_file)
            print('Loss:', df['loss'].values[-1])

    cls_df = pd.read_pickle(cls_embeds_file)
    rel_df = pd.read_pickle(rel_embeds_file)
    nb_classes = len(cls_df)
    nb_relations = len(rel_df)
    embeds_list = cls_df['embeddings'].values
    classes = {v: k for k, v in enumerate(cls_df['classes'])}
    rembeds_list = rel_df['embeddings'].values
    relations = {v: k for k, v in enumerate(rel_df['relations'])}
    size = len(embeds_list[0])
    embeds = np.zeros((nb_classes, size), dtype=np.float32)
    for i, emb in enumerate(embeds_list):
        embeds[i, :] = emb
    proteins = {}
    for k, v in classes.items():
        if not k.startswith('<http://purl.obolibrary.org/obo/GO_'):
            proteins[k] = v
    rs = np.abs(embeds[:, -1]).reshape(-1, 1)
    embeds = embeds[:, :-1]
    prot_index = list(proteins.values())
    prot_rs = rs[prot_index, :]
    prot_embeds = embeds[prot_index, :]
    prot_dict = {v: k for k, v in enumerate(prot_index)}

    rsize = len(rembeds_list[0])
    rembeds = np.zeros((nb_relations, rsize), dtype=np.float32)
    for i, emb in enumerate(rembeds_list):
        rembeds[i, :] = emb

    train_data = load_data(train_data_file, classes, relations)
    valid_data = load_data(valid_data_file, classes, relations)
    trlabels = {}
    for c, r, d in train_data:
        c, r, d = prot_dict[classes[c]], relations[r], prot_dict[classes[d]]
        if r not in trlabels:
            trlabels[r] = np.ones((len(prot_embeds), len(prot_embeds)), dtype=np.int32)
        trlabels[r][c, d] = 1000
    # for c, r, d in valid_data:
    #     c, r, d = prot_dict[classes[c]], relations[r], prot_dict[classes[d]]
    #     if r not in trlabels:
    #         trlabels[r] = np.ones((len(prot_embeds), len(prot_embeds)), dtype=np.int32)
    #     trlabels[r][c, d] = 1000

    test_data = load_data(test_data_file, classes, relations)
    top1 = 0
    top10 = 0
    top100 = 0
    mean_rank = 0
    ftop1 = 0
    ftop10 = 0
    ftop100 = 0
    fmean_rank = 0
    labels = {}
    preds = {}
    ranks = {}
    franks = {}
    eval_data = test_data
    n = len(eval_data)
    with ck.progressbar(eval_data) as prog_data:
        for c, r, d in prog_data:
            c, r, d = prot_dict[classes[c]], relations[r], prot_dict[classes[d]]
            if r not in labels:
                labels[r] = np.zeros((len(prot_embeds), len(prot_embeds)), dtype=np.int32)
            if r not in preds:
                preds[r] = np.zeros((len(prot_embeds), len(prot_embeds)), dtype=np.float32)
            labels[r][c, d] = 1

            ec = prot_embeds[c, :]
            rc = prot_rs[c, :]
            er = rembeds[r, :]
            ec += er

            dst = np.linalg.norm(prot_embeds - ec.reshape(1, -1), axis=1)
            dst = dst.reshape(-1, 1)
            # if rc > 0:
            #     overlap = np.maximum(0, (2 * rc - np.maximum(dst + rc - prot_rs - margin, 0)) / (2 * rc))
            # else:
            #     overlap = (np.maximum(dst - prot_rs - margin, 0) == 0).astype('float32')
            # edst = np.maximum(0, dst - rc - prot_rs - margin)
            # res = (overlap + 1 / np.exp(edst)) / 2
            res = np.maximum(0, dst - rc - prot_rs - margin)
            res = res.flatten()
            preds[r][c, :] = res
            index = rankdata(res, method='average')
            rank = index[d]
            if rank == 1:
                top1 += 1
            if rank <= 10:
                top10 += 1
            if rank <= 100:
                top100 += 1
            mean_rank += rank
            if rank not in ranks:
                ranks[rank] = 0
            ranks[rank] += 1

            # Filtered rank
            index = rankdata((res * trlabels[r][c, :]), method='average')
            rank = index[d]
            if rank == 1:
                ftop1 += 1
            if rank <= 10:
                ftop10 += 1
            if rank <= 100:
                ftop100 += 1
            fmean_rank += rank
            if rank not in franks:
                franks[rank] = 0
            franks[rank] += 1

    top1 /= n
    top10 /= n
    top100 /= n
    mean_rank /= n
    ftop1 /= n
    ftop10 /= n
    ftop100 /= n
    fmean_rank /= n

    rank_auc = compute_rank_roc(ranks, len(proteins))
    frank_auc = compute_rank_roc(franks, len(proteins))

    print(f'{org} {embedding_size} {margin} {reg_norm} {top10:.2f} {top100:.2f} {mean_rank:.2f} {rank_auc:.2f}')
    print(f'{org} {embedding_size} {margin} {reg_norm} {ftop10:.2f} {ftop100:.2f} {fmean_rank:.2f} {frank_auc:.2f}')
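# ---- Added sketch (editor's note, not part of the original scripts) ----
# compute_rank_roc() is called above but not defined in this section. One
# plausible implementation turns the {rank: count} histogram into a cumulative
# hit curve and integrates it over all possible ranks, normalized by the number
# of candidate proteins:

import numpy as np

def compute_rank_roc_sketch(ranks, n_prots):
    auc_x = list(sorted(ranks.keys()))
    auc_x.append(n_prots)
    auc_y = []
    tpr = 0
    total = sum(ranks.values())
    for x in sorted(ranks.keys()):
        tpr += ranks[x]
        auc_y.append(tpr / total)
    auc_y.append(1.0)
    # Area under the cumulative curve, scaled to [0, 1] by the maximum rank.
    return np.trapz(auc_y, auc_x) / n_prots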
def main(in_file, out_file, go_file, model_file, terms_file, annotations_file,
         chunk_size, diamond_file, threshold, batch_size, alpha):
    # Load GO and read list of all terms
    go = Ontology(go_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()

    # Read known experimental annotations
    annotations = {}
    df = pd.read_pickle(annotations_file)
    for row in df.itertuples():
        annotations[row.proteins] = set(row.prop_annotations)

    go.calculate_ic(annotations.values())

    diamond_preds = {}
    mapping = {}
    with gzip.open(diamond_file, 'rt') as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in mapping:
                mapping[it[0]] = {}
            mapping[it[0]][it[1]] = float(it[2])
    for prot_id, sim_prots in mapping.items():
        annots = {}
        allgos = set()
        total_score = 0.0
        for p_id, score in sim_prots.items():
            allgos |= annotations[p_id]
            total_score += score
        allgos = list(sorted(allgos))
        sim = np.zeros(len(allgos), dtype=np.float32)
        for j, go_id in enumerate(allgos):
            s = 0.0
            for p_id, score in sim_prots.items():
                if go_id in annotations[p_id]:
                    s += score
            sim[j] = s / total_score
        for go_id, score in zip(allgos, sim):
            annots[go_id] = score
        diamond_preds[prot_id] = annots

    # Load CNN model
    model = load_model(model_file)
    # Alphas for the latest model
    alphas = {NAMESPACES['mf']: 0.55, NAMESPACES['bp']: 0.59, NAMESPACES['cc']: 0.46}
    # Alphas for the cafa2 model
    # alphas = {NAMESPACES['mf']: 0.63, NAMESPACES['bp']: 0.68, NAMESPACES['cc']: 0.48}

    start_time = time.time()
    total_seq = 0
    w = gzip.open(out_file, 'wt')
    for prot_ids, sequences in read_fasta(in_file, chunk_size):
        total_seq += len(prot_ids)
        deep_preds = {}
        ids, data = get_data(sequences)
        preds = model.predict(data, batch_size=batch_size)
        assert preds.shape[1] == len(terms)
        for i, j in enumerate(ids):
            prot_id = prot_ids[j]
            if prot_id not in deep_preds:
                deep_preds[prot_id] = {}
            for l in range(len(terms)):
                if preds[i, l] >= 0.01:  # Filter out very low scores
                    if terms[l] not in deep_preds[prot_id]:
                        deep_preds[prot_id][terms[l]] = preds[i, l]
                    else:
                        deep_preds[prot_id][terms[l]] = max(
                            deep_preds[prot_id][terms[l]], preds[i, l])

        # Combine diamond preds and deepgo
        for prot_id in prot_ids:
            annots = {}
            if prot_id in diamond_preds:
                for go_id, score in diamond_preds[prot_id].items():
                    annots[go_id] = score * alphas[go.get_namespace(go_id)]
            for go_id, score in deep_preds[prot_id].items():
                if go_id in annots:
                    annots[go_id] += (1 - alphas[go.get_namespace(go_id)]) * score
                else:
                    annots[go_id] = (1 - alphas[go.get_namespace(go_id)]) * score

            # Propagate scores with ontology structure
            gos = list(annots.keys())
            for go_id in gos:
                for g_id in go.get_anchestors(go_id):
                    if g_id in annots:
                        annots[g_id] = max(annots[g_id], annots[go_id])
                    else:
                        annots[g_id] = annots[go_id]

            sannots = sorted(annots.items(), key=lambda x: x[1], reverse=True)
            for go_id, score in sannots:
                if score >= threshold:
                    w.write(prot_id + '\t' + go_id + '\t'
                            + go.get_term(go_id)['name']
                            + '\t%.2f' % go.get_ic(go_id)
                            + '\t%.3f\n' % score)
            w.write('\n')
    w.close()
    total_time = time.time() - start_time
    print('Total prediction time for %d sequences is %d' % (total_seq, total_time))
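# ---- Added sketch (editor's note, not part of the original scripts) ----
# read_fasta() above is assumed to yield (protein_ids, sequences) in chunks of at
# most chunk_size records, so large FASTA files never have to fit in memory at
# once. A minimal generator with that contract (support for gzipped input is a
# guess, and the name is illustrative):

import gzip

def read_fasta_sketch(filename, chunk_size):
    ids, seqs = [], []
    opener = gzip.open if filename.endswith('.gz') else open
    with opener(filename, 'rt') as f:
        cur_id, cur_seq = None, []
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if cur_id is not None:
                    ids.append(cur_id)
                    seqs.append(''.join(cur_seq))
                    if len(ids) == chunk_size:
                        yield ids, seqs
                        ids, seqs = [], []
                cur_id, cur_seq = line[1:].split()[0], []
            else:
                cur_seq.append(line)
        if cur_id is not None:
            ids.append(cur_id)
            seqs.append(''.join(cur_seq))
    if ids:
        yield ids, seqs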
def main(train_data_file, test_data_file, terms_file, diamond_scores_file,
         ont, alpha):
    alpha /= 100.0
    mp = Ontology('data/mp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['mp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['mp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    mp.calculate_ic(annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # GO2HP preds
    rules = {}
    with open('data/go2hp.txt') as f:
        for line in f:
            it = line.strip().split('\t')
            go_id = it[0].replace('_', ':')
            mp_ids = list(map(lambda x: x.replace('_', ':'), it[1:]))
            if go_id not in rules:
                rules[go_id] = []
            rules[go_id] = mp_ids
    pheno2go_preds = {}
    for i, row in enumerate(test_df.itertuples()):
        prot_id = row.proteins
        if prot_id not in pheno2go_preds:
            pheno2go_preds[prot_id] = {}
        for item in row.deepgo_annotations:
            go_id, score = item.split('|')
            if go_id in rules:
                for mp_id in rules[go_id]:
                    pheno2go_preds[prot_id][mp_id] = max(
                        float(score), pheno2go_preds[prot_id].get(mp_id, 0))

    labels = test_annotations
    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    for t in range(101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            prot_id = row.proteins
            annots_dict = {}  # pheno2go_preds[prot_id]

            for j, score in enumerate(row.preds):
                mp_id = terms[j]
                annots_dict[mp_id] = max(score, annots_dict.get(mp_id, 0))

            annots = set()
            for mp_id, score in annots_dict.items():
                if score >= threshold:
                    annots.add(mp_id)
            new_annots = set()
            for mp_id in annots:
                new_annots |= mp.get_anchestors(mp_id)
            preds.append(new_annots)

        # Filter classes
        fscore, prec, rec, s = evaluate_annotations(mp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
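# ---- Added sketch (editor's note, not part of the original scripts) ----
# The precision/recall pairs pickled above can be reloaded later to recompute the
# AUPR or redraw the curve without rerunning the evaluation. A minimal example;
# the file name pattern matches the to_pickle call above, the helper name is
# illustrative:

import numpy as np
import pandas as pd

def reload_aupr_sketch(ont, alpha):
    pr = pd.read_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
    # Recall must be in ascending order for the trapezoidal integral.
    order = np.argsort(pr['recalls'].values)
    return np.trapz(pr['precisions'].values[order], pr['recalls'].values[order])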
def main(hp_file, train_data_file, terms_file, dis_phenotypes, omim_file,
         predictions_file, gene_annots_file, dis_annots_file, fold):
    hp = Ontology(hp_file, with_rels=True)
    print('HP loaded')

    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    predictions_file = f'fold{fold}_exp-' + predictions_file
    gene_annots_file = f'fold{fold}_exp-' + gene_annots_file
    dis_annots_file = f'fold{fold}_exp-' + dis_annots_file
    real_annots_file = f'fold{fold}_exp-data/gene_annotations_real.tab'

    diseases = set()
    genes = set()
    with open(omim_file, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            it = line.strip().split('\t')
            omim_id = it[0].split(', ')[-1].split()[0]
            gene_symbols = it[1].split(', ')
            genes |= set(gene_symbols)
            diseases.add('OMIM:' + omim_id)
    print(len(diseases), len(genes))

    dis_annots = {}
    with open(dis_phenotypes) as f:
        for line in f:
            it = line.strip().split('\t')
            dis_id = it[0] + ':' + it[1]
            if dis_id not in diseases:
                continue
            hp_id = it[4]
            if not hp.has_term(hp_id):
                continue
            if dis_id not in dis_annots:
                dis_annots[dis_id] = set()
            dis_annots[dis_id].add(hp_id)

    with open(dis_annots_file, 'w') as w:
        for dis_id, annots in dis_annots.items():
            w.write(dis_id)
            for hp_id in annots:
                w.write('\t' + hp_id)
            w.write('\n')

    df = pd.read_pickle(predictions_file)
    with open(gene_annots_file, 'w') as w:
        for i, row in df.iterrows():
            w.write(row['genes'])
            for hp_id in row['hp_preds']:
                w.write('\t' + hp_id)
            w.write('\n')

    with open(real_annots_file, 'w') as w:
        for i, row in df.iterrows():
            w.write(row['genes'])
            for hp_id in row['hp_annotations']:
                w.write('\t' + hp_id)
            w.write('\n')
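# ---- Added sketch (editor's note, not part of the original scripts) ----
# The three .tab files written above share one layout: an entity identifier
# followed by its tab-separated HP identifiers, one entity per line. A small
# reader built on that assumption (name is illustrative):

def read_annots_tab_sketch(filename):
    annots = {}
    with open(filename) as f:
        for line in f:
            it = line.strip().split('\t')
            if not it or not it[0]:
                continue
            annots[it[0]] = set(it[1:])
    return annots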
def main(go_file, mp_file, mp_annots_file, deepgo_annots_file, id_mapping_file,
         data_file, out_data_file, out_terms_file, min_count):
    go = Ontology(go_file, with_rels=True)
    logging.info('GO loaded')
    mp = Ontology(mp_file, with_rels=True)
    logging.info('MP loaded')

    logging.info('Load MP2Uniprot mapping')
    gene2prot = {}  # was "prot2gene"; the code below fills and reads gene2prot
    with open(id_mapping_file) as f:
        next(f)
        for line in f:
            it = line.strip().split('\t')
            if it[0] not in gene2prot:
                gene2prot[it[0]] = []
            gene2prot[it[0]] += list(it[6].split())

    logging.info('Loading MP annotations')
    mp_annots = {}
    df = pd.read_pickle(data_file)
    acc2prot = {}
    for row in df.itertuples():
        p_id = row.proteins
        acc_ids = row.accessions.split('; ')
        for acc_id in acc_ids:
            acc2prot[acc_id] = p_id
    with open(mp_annots_file) as f:
        next(f)
        for line in f:
            it = line.strip().split('\t')
            for mgi in it[6].split('|'):
                if mgi not in gene2prot:
                    continue
                prot_ids = gene2prot[mgi]
                mp_id = it[4]
                for prot_id in prot_ids:
                    if prot_id not in acc2prot:
                        continue
                    prot_id = acc2prot[prot_id]
                    if prot_id not in mp_annots:
                        mp_annots[prot_id] = set()
                    if mp.has_term(mp_id):
                        mp_annots[prot_id] |= mp.get_anchestors(mp_id)
    print('MP Annotations', len(mp_annots))

    dg_annots = {}
    gos = set()
    with open(deepgo_annots_file) as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            annots = []
            for item in it[1:]:
                go_id, score = item.split('|')
                score = float(score)
                annots.append(go_id)
            dg_annots[prot_id] = it[1:]
            gos |= set(annots)
    print('DeepGO Annotations', len(dg_annots))

    print('Number of GOs', len(gos))
    go_df = pd.DataFrame({'gos': list(gos)})
    go_df.to_pickle('data/gos.pkl')

    logging.info('Processing annotations')

    cnt = Counter()
    annotations = list()
    for prot_id, annots in mp_annots.items():
        for term in annots:
            cnt[term] += 1

    deepgo_annots = []
    go_annots = []
    mpos = []
    prots = []
    sequences = []
    for row in df.itertuples():
        p_id = row.proteins
        if p_id in mp_annots:
            prots.append(p_id)
            mpos.append(mp_annots[p_id])
            go_annots.append(row.annotations)
            deepgo_annots.append(dg_annots[p_id])
            sequences.append(row.sequences)

    # Report MP-annotated proteins that are missing from the dataframe
    prots_set = set(prots)
    for key, val in mp_annots.items():
        if key not in prots_set:
            print(key)

    df = pd.DataFrame({
        'proteins': prots,
        'mp_annotations': mpos,
        'go_annotations': go_annots,
        'deepgo_annotations': deepgo_annots,
        'sequences': sequences
    })
    df.to_pickle(out_data_file)
    print(f'Number of proteins {len(df)}')

    # Keep terms that are annotated at least min_count times
    res = {}
    for key, val in cnt.items():
        if key == 'MP:0000001':
            continue
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)
    terms = []
    for key, val in res.items():
        print(key, len(val))
        terms += val

    logging.info(f'Number of terms {len(terms)}')

    # Save the list of terms
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)
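# ---- Added sketch (editor's note, not part of the original scripts) ----
# Downstream training code is expected to read the two pickles written above and
# turn mp_annotations into multi-hot label vectors indexed by the saved term
# list. A minimal version of that step, under that assumption (name is
# illustrative):

import numpy as np
import pandas as pd

def load_labels_sketch(out_data_file, out_terms_file):
    terms = pd.read_pickle(out_terms_file)['terms'].values.flatten()
    terms_dict = {t: i for i, t in enumerate(terms)}
    data_df = pd.read_pickle(out_data_file)
    labels = np.zeros((len(data_df), len(terms)), dtype=np.float32)
    for i, row in enumerate(data_df.itertuples()):
        for mp_id in row.mp_annotations:
            if mp_id in terms_dict:
                labels[i, terms_dict[mp_id]] = 1.0
    return data_df, labels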