def human_go_annotations():
    go = get_gene_ontology()
    annots = {}
    df = pd.read_pickle('data/cafa3/swissprot_exp.pkl')
    for i, row in df.iterrows():
        acc = row['accessions']
        gos = set()
        for go_id in row['annots']:
            go_id = go_id.split('|')
            if go_id[1] in EXP_CODES and go_id[0] in go:
                gos.add(go_id[0])
        if len(gos) > 0:
            annots[acc] = gos
    id_df = pd.read_pickle('data/idmapping.9606.pkl')
    st_ids = dict()
    for i, row in id_df.iterrows():
        if isinstance(row['string'], str):
            st_ids[row['accessions']] = row['string']
    with open('data/human_annotations.tab', 'w') as f:
        for acc, gos in annots.items():
            if acc in st_ids:
                f.write(st_ids[acc])
                for go_id in gos:
                    f.write('\t' + go_id)
                f.write('\n')

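# Each entry in row['annots'] above is assumed to look like
# 'GO:0005515|IPI': a GO ID and an evidence code joined by '|'; only
# experimentally supported codes (EXP_CODES) are kept. A quick
# illustration with a hypothetical annotation string:
annot = 'GO:0005515|IPI'
go_id, evidence = annot.split('|')
print(go_id, evidence)  # GO:0005515 IPI
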
def table():
    bp = get_data('bp.res')
    mf = get_data('mf.res')
    cc = get_data('cc.res')
    bp_seq = get_data('bp-seq.res')
    mf_seq = get_data('mf-seq.res')
    cc_seq = get_data('cc-seq.res')
    go = get_gene_ontology('go.obo')
    gos = go[BIOLOGICAL_PROCESS]['children']
    res = list()
    for go_id in gos:
        if go_id in bp:
            res.append((
                go_id, go[go_id]['name'],
                bp[go_id][0], bp[go_id][1],
                bp_seq[go_id][0], bp_seq[go_id][1]))
    for row in sorted(res, key=lambda x: x[2], reverse=True):
        print('%s & %s & %f & %f & %f & %f \\\\' % row)
    gos = go[MOLECULAR_FUNCTION]['children']
    print()
    res = list()
    for go_id in gos:
        if go_id in mf:
            res.append((
                go_id, go[go_id]['name'],
                mf[go_id][0], mf[go_id][1],
                mf_seq[go_id][0], mf_seq[go_id][1]))
    for row in sorted(res, key=lambda x: x[2], reverse=True):
        print('%s & %s & %f & %f & %f & %f \\\\' % row)
    gos = go[CELLULAR_COMPONENT]['children']
    print()
    res = list()
    for go_id in gos:
        if go_id in cc:
            res.append((
                go_id, go[go_id]['name'],
                cc[go_id][0], cc[go_id][1],
                cc_seq[go_id][0], cc_seq[go_id][1]))
    for row in sorted(res, key=lambda x: x[2], reverse=True):
        print('%s & %s & %f & %f & %f & %f \\\\' % row)

def main(function, device, org, train):
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    global node_names
    node_names = set()
    with tf.device('/' + device):
        model(is_train=train)

def main(split):
    global SPLIT
    SPLIT = split
    global GO_IDS
    GO_IDS = list(FUNC_DICT.values())
    global go
    go = get_gene_ontology('go.obo')
    func_df = pd.read_pickle(DATA_ROOT + 'bp.pkl')
    global functions
    functions = func_df['functions'].values
    func_df = pd.read_pickle(DATA_ROOT + 'mf.pkl')
    functions = np.concatenate((functions, func_df['functions'].values))
    func_df = pd.read_pickle(DATA_ROOT + 'cc.pkl')
    functions = np.concatenate((functions, func_df['functions'].values))
    global func_set
    func_set = (
        get_go_set(go, GO_IDS[0])
        | get_go_set(go, GO_IDS[1])
        | get_go_set(go, GO_IDS[2]))
    print(len(functions))
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    run()

def get_annotations():
    gene_ontology = get_gene_ontology()
    annots = dict()
    gene_name = dict()
    with open(DATA_ROOT + 'gene_association.sgd', 'r') as f:
        for line in f:
            if line[0] == '!':
                continue
            items = line.strip().split('\t')
            if items[3] == 'NOT' or items[6] == 'ND':
                continue
            gene_id = items[1]
            gene_name[gene_id] = items[2]
            go = items[4]
            if gene_id not in annots:
                annots[gene_id] = set()
            if go in gene_ontology:
                annots[gene_id].add(go)
    # Group genes by how many GO terms they are annotated with
    groups = dict()
    for gene_id, gos in annots.items():
        l = len(gos)
        if l not in groups:
            groups[l] = list()
        groups[l].append((list(gos), gene_id))
    with open(DATA_ROOT + 'sgd_annotations_genes2.txt', 'w') as f:
        for group in sorted(groups.keys()):
            gos_list = groups[group]
            print(group)
            for gos, gene_id in gos_list:
                f.write(gene_name[gene_id] + '\t')
                f.write(gos[0])
                for go in gos[1:]:
                    f.write('\t' + go)
                f.write('\n')

def main(function, device, org, train, param, embeddingmethod,
         shuffleseed, buildmethod, evomodel, cached):
    global CACHED
    CACHED = cached
    global BUILDMETHOD
    BUILDMETHOD = buildmethod
    global EVOMODEL
    EVOMODEL = evomodel
    global EMBEDDINGMETHOD
    EMBEDDINGMETHOD = embeddingmethod
    global SEED
    SEED = shuffleseed
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    global experiment_id
    experiment_id = str(function) + '-' + str(embeddingmethod) + '-' + str(
        shuffleseed) + '-' + str(buildmethod) + '-' + str(evomodel)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    global resdir
    resdir = 'results/' + experiment_id
    if not os.path.isdir(resdir):
        os.mkdir(resdir)
    if ORG is not None:
        logging.info('Organism %s' % ORG)
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    global node_names
    node_names = set()
    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 3,
            'nb_dense': 2,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32
        }
        model(params, is_train=train)

def compute_performance(func):
    go = get_gene_ontology()
    train_df = pd.read_pickle('data/swissexp/train-' + func + '.pkl')
    test_df = pd.read_pickle('data/swissexp/test-' + func + '.pkl')
    train_labels = {}
    test_labels = {}
    for i, row in train_df.iterrows():
        go_set = set()
        for go_id in row['gos']:
            if go_id in go:
                go_set |= get_anchestors(go, go_id)
        # note: go_set is computed but unused; labels come from the pickle
        train_labels[row['proteins']] = row['labels']
    for i, row in test_df.iterrows():
        go_set = set()
        for go_id in row['gos']:
            if go_id in go:
                go_set |= get_anchestors(go, go_id)
        test_labels[row['proteins']] = row['labels']
    # Transfer the label vector of each test protein's best BLAST hit
    preds = list()
    test = list()
    with open('data/swissexp/blast-' + func + '.res') as f:
        for line in f:
            it = line.strip().split('\t')
            preds.append(train_labels[it[1]])
            test.append(test_labels[it[0]])
    total = 0
    p = 0.0
    r = 0.0
    f = 0.0
    p_total = 0
    for label, pred in zip(test, preds):
        tp = np.sum(label * pred)
        fp = np.sum(pred) - tp
        fn = np.sum(label) - tp
        # tp = len(label.intersection(pred))
        # fp = len(pred) - tp
        # fn = len(label) - tp
        if tp == 0 and fp == 0 and fn == 0:
            continue
        total += 1
        if tp != 0:
            p_total += 1
            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            p += precision
            r += recall
    p /= p_total
    r /= total
    f = 2 * p * r / (p + r)
    return f, p, r

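# A minimal worked example of the per-protein precision/recall averaging in
# compute_performance above, on toy binary label vectors (hypothetical data,
# not taken from the datasets in this repo):
import numpy as np

label = np.array([1, 1, 0, 1])  # true GO-term indicator vector for one protein
pred = np.array([1, 0, 0, 1])   # labels transferred from the best BLAST hit

tp = np.sum(label * pred)             # 2 true positives
fp = np.sum(pred) - tp                # 0 false positives
fn = np.sum(label) - tp               # 1 false negative
precision = tp / (1.0 * (tp + fp))    # 1.0
recall = tp / (1.0 * (tp + fn))       # ~0.667
f1 = 2 * precision * recall / (precision + recall)  # 0.8
print(precision, recall, f1)
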
def main(function, device, org, train, param):
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    global node_names
    node_names = set()
    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 3,
            'nb_dense': 2,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32
        }
        model(params, is_train=train)
        dims = [64, 128, 256, 512]
        nb_filters = [16, 32, 64, 128]
        nb_convs = [1, 2, 3, 4]
        nb_dense = [1, 2, 3, 4]
        for i in range(param * 32, param * 32 + 32):
            # decode the flat index i into four base-4 grid coordinates
            dim = i % 4
            i //= 4
            nb_fil = i % 4
            i //= 4
            conv = i % 4
            i //= 4
            den = i
            params['embedding_dims'] = dims[dim]
            params['nb_filter'] = nb_filters[nb_fil]
            params['nb_conv'] = nb_convs[conv]
            params['nb_dense'] = nb_dense[den]

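# The loop above decodes a flat search index into base-4 digits, one digit
# per hyperparameter. A standalone illustration with a hypothetical index
# (any value in 0..255 selects one of the 4*4*4*4 configurations):
dims = [64, 128, 256, 512]
nb_filters = [16, 32, 64, 128]
nb_convs = [1, 2, 3, 4]
nb_dense = [1, 2, 3, 4]

i = 147
dim = i % 4      # 147 % 4 = 3 -> dims[3] = 512
i //= 4          # i = 36
nb_fil = i % 4   # 36 % 4 = 0 -> nb_filters[0] = 16
i //= 4          # i = 9
conv = i % 4     # 9 % 4 = 1 -> nb_convs[1] = 2
i //= 4          # i = 2
den = i          # nb_dense[2] = 3
print(dims[dim], nb_filters[nb_fil], nb_convs[conv], nb_dense[den])
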
def get_real_annotations():
    go = get_gene_ontology()
    df = pd.read_pickle('data/cafa3/swissprot_exp.pkl')
    annots = {}
    for i, row in df.iterrows():
        go_set = set()
        for go_id in row['annots']:
            go_id = go_id.split('|')
            if go_id[0] in go and go_id[1] in EXP_CODES:
                go_set |= get_anchestors(go, go_id[0])
        annots[row['proteins']] = go_set
    return annots

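# get_anchestors is imported from utils and not shown in this file. A
# plausible sketch of what it does, assuming each ontology entry carries an
# 'is_a' list of parent IDs (mirroring the 'children' BFS used by get_go_set
# elsewhere in this repo; the real helper may differ):
from collections import deque

def get_anchestors_sketch(go, go_id):
    # collect go_id together with all of its ancestors up to the root
    go_set = set()
    q = deque([go_id])
    while len(q) > 0:
        g_id = q.popleft()
        go_set.add(g_id)
        for parent_id in go[g_id]['is_a']:
            if parent_id in go:
                q.append(parent_id)
    return go_set
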
def main(function, test_df, device):
    org = None
    param = 0
    filename = 'ResultSequenceStructPPI.txt'
    train = False
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    global go_indexes
    go_indexes = dict()
    # will be used for my prediction list
    indexes_for_prediction = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
        indexes_for_prediction[ind] = go_id
    global node_names
    global FILENAME
    FILENAME = filename
    global PARAMS
    node_names = set()
    global prediction_list
    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 1,
            'nb_dense': 1,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32
        }
        PARAMS = params
        prediction_list = model(params, test_df, is_train=train)
    return prediction_list

def main(data_root, go_filename, go_domain, split):
    global DATA_ROOT
    DATA_ROOT = data_root
    global go
    go = get_gene_ontology()
    global FUNCTION
    FUNCTION = go_domain
    df = pd.read_pickle(DATA_ROOT + go_domain + '.pkl')
    global functions
    functions = list(df['functions'])
    global func_set
    func_set = set(functions)
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    dataset = load_data(split=split)
    train_model(dataset)

def specific_predictions():
    root = 'data/cafa3/'
    go = get_gene_ontology()
    fw = open(root + 'test_predictions_specific.tab', 'w')
    with open(root + 'test_predictions.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            go_set = set(items[1:])
            gos = go_set.copy()
            for go_id in gos:
                anchestors = get_anchestors(go, go_id)
                anchestors.remove(go_id)
                go_set -= anchestors
            fw.write(items[0])
            for go_id in go_set:
                fw.write('\t' + go_id)
            fw.write('\n')
    fw.close()

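# A tiny runnable illustration of the ancestor-stripping above, using a
# hypothetical three-term chain A -> B -> C (not real GO IDs):
toy_go = {
    'A': {'is_a': []},       # most general term
    'B': {'is_a': ['A']},
    'C': {'is_a': ['B']},    # most specific term
}

def toy_anchestors(go, go_id):
    # assumed contract of utils.get_anchestors: the term plus all ancestors
    result = {go_id}
    for parent in go[go_id]['is_a']:
        result |= toy_anchestors(go, parent)
    return result

go_set = {'A', 'B', 'C'}
for go_id in go_set.copy():
    anchestors = toy_anchestors(toy_go, go_id)
    anchestors.remove(go_id)
    go_set -= anchestors
print(go_set)  # {'C'}: only the most specific prediction survives
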
def main(function, device, org, train, param, filename):
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    logging.info('Functions: %s %d' % (FUNCTION, len(functions)))
    if ORG is not None:
        logging.info('Organism %s' % ORG)
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    global node_names
    global FILENAME
    FILENAME = filename
    global PARAMS
    node_names = set()
    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'nb_conv': 1,
            'nb_dense': 1,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32
        }
        PARAMS = params
        model(params, is_train=train)

def main(device, org, train):
    global GO_IDS
    # list() so that GO_IDS supports indexing under Python 3
    GO_IDS = list(FUNC_DICT.values())
    global go
    go = get_gene_ontology('go.obo')
    global ORG
    ORG = org
    func_df = pd.read_pickle(DATA_ROOT + 'bp.pkl')
    global functions
    functions = func_df['functions'].values
    func_df = pd.read_pickle(DATA_ROOT + 'mf.pkl')
    functions = np.concatenate((functions, func_df['functions'].values))
    func_df = pd.read_pickle(DATA_ROOT + 'cc.pkl')
    functions = np.concatenate((functions, func_df['functions'].values))
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = (
        get_go_set(go, GO_IDS[0])
        | get_go_set(go, GO_IDS[1])
        | get_go_set(go, GO_IDS[2]))
    logging.info('Functions: %d' % (len(functions), ))
    if ORG is not None:
        logging.info('Organism %s' % ORG)
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    global node_names
    node_names = set()
    with tf.device('/' + device):
        params = {
            'fc_output': 1024,
            'learning_rate': 0.001,
            'embedding_dims': 128,
            'embedding_dropout': 0.2,
            'filter_length': 128,
            'nb_filter': 32,
            'pool_length': 64,
            'stride': 32
        }
        model(params, is_train=train)

def main(function, split):
    global SPLIT
    SPLIT = split
    global GO_ID
    GO_ID = FUNC_DICT[function]
    global go
    go = get_gene_ontology('go.obo')
    global FUNCTION
    FUNCTION = function
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = get_go_set(go, GO_ID)
    print(len(functions))
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    run()

def main(function, device, model_name):
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    func_df = pd.read_pickle(DATA_ROOT + FUNCTION + '.pkl')
    global functions
    functions = func_df['functions'].values
    global func_set
    func_set = set(functions)
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    logging.info(len(functions))
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    with tf.device('/' + device):
        model(model_name)

def main(function, annot_num):
    global FUNCTION
    FUNCTION = function
    global GO_ID
    GO_ID = FUNC_DICT[FUNCTION]
    global go
    go = get_gene_ontology('go.obo')
    global functions
    functions = deque()
    dfs(GO_ID)
    functions.remove(GO_ID)
    functions = list(functions)
    print(len(functions))
    global func_set
    func_set = set(functions)
    global go_indexes
    go_indexes = dict()
    for ind, go_id in enumerate(functions):
        go_indexes[go_id] = ind
    get_functions(annot_num)

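# dfs is defined elsewhere in this module. A plausible sketch, assuming it
# walks the ontology from GO_ID and appends every term it visits to the
# global `functions` deque (a hypothetical reconstruction, not the actual
# implementation):
def dfs_sketch(go_id, visited=None):
    if visited is None:
        visited = set()
    if go_id in visited:
        return
    visited.add(go_id)
    functions.append(go_id)
    for ch_id in go[go_id]['children']:
        dfs_sketch(ch_id, visited)
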
def compute_performance():
    root = 'data/cafa3/'
    preds = {}
    annots = {}
    go = get_gene_ontology()
    with open(root + 'test_predictions.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            preds[items[0]] = set(items[1:])
    with open(root + 'test_annotations.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            annots[items[0]] = set()
            for go_id in items[1:]:
                if go_id in go:
                    annots[items[0]] |= get_anchestors(go, go_id)
    total = 0
    p = 0.0
    r = 0.0
    f = 0.0
    for prot, pred_annots in preds.items():
        real_annots = annots[prot]
        if len(real_annots) == 0:
            continue
        tp = len(real_annots.intersection(pred_annots))
        fp = len(pred_annots - real_annots)
        fn = len(real_annots - pred_annots)
        if tp == 0 and fp == 0 and fn == 0:
            continue
        total += 1
        if tp != 0:
            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            p += precision
            r += recall
            f += 2 * precision * recall / (precision + recall)
    print(f / total, p / total, r / total)

from keras.layers import (
    Dense, Dropout, Activation, Flatten)
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.optimizers import SGD
from sklearn.metrics import classification_report
from keras.utils import np_utils
from utils import (
    shuffle, train_val_test_split, get_gene_ontology)
import sys
import os
from collections import deque

LAMBDA = 24
DATA_ROOT = 'data/molecular_functions/paac/'
go = get_gene_ontology()
go_model = dict()


def load_data(go_id):
    pass


def get_model(
        go_id,
        max_features=10000,
        embedding_dims=100,
        nb_filters=250,
        hidden_dims=250,
        pool_length=2,
        filter_length=3):

import pandas as pd
from keras.models import Sequential, Model
from keras.layers import (
    Dense, Dropout, Activation, Input, Flatten, merge)
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import (
    Convolution1D, MaxPooling1D)
from sklearn.metrics import classification_report
from utils import (
    shuffle, get_gene_ontology)
from keras.callbacks import ModelCheckpoint, EarlyStopping
import sys
from aaindex import AAINDEX
from collections import deque
import pdb

DATA_ROOT = 'yeast/'
MAXLEN = 500
go = get_gene_ontology('goslim_yeast.obo')


def get_go_set(go_id):
    go_set = set()
    q = deque()
    q.append(go_id)
    while len(q) > 0:
        g_id = q.popleft()
        go_set.add(g_id)
        for ch_id in go[g_id]['children']:
            q.append(ch_id)
    return go_set


functions = get_go_set('GO:0003674')

def main(function):
    global go
    go = get_gene_ontology()
    func_df = pd.read_pickle(DATA_ROOT + function + '.pkl')
    global functions
    functions = func_df['functions'].values
    func_index = dict()
    for i, go_id in enumerate(functions):
        func_index[go_id] = i
    global func_set
    func_set = set(func_index)
    global GO_ID
    GO_ID = FUNC_DICT[function]
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    pred_df = pd.read_pickle(DATA_ROOT + 'model_preds_' + function + '.pkl')
    # FFPred preds
    preds_dict = {}
    # files = os.listdir('data/ffpred/')
    # for fl in files:
    #     with open('data/gofdr/predictions.tab') as f:
    #         for line in f:
    #             it = line.strip().split('\t')
    #             target_id = it[0]
    #             if function[1].upper() != it[2]:
    #                 continue
    #             if target_id not in preds_dict:
    #                 preds_dict[target_id] = list()
    #             preds_dict[target_id].append((it[1], float(it[3])))
    # print(len(preds_dict))
    target_ids = list()
    predictions = list()
    for key, val in preds_dict.items():
        target_ids.append(key)
        predictions.append(val)
    # pred_df = pd.DataFrame({'targets': target_ids, 'predictions': predictions})
    targets = dict()
    with open('data/cafa3/CAFA3_benchmark20170605/groundtruth/leafonly_'
              + function.upper() + 'O_unique.txt') as f:
        for line in f:
            it = line.strip().split('\t')
            target = it[0]
            go_id = it[1]
            if target not in targets:
                targets[target] = list()
            targets[target].append(go_id)
    target_ids = list()
    labels = list()
    go_ids = list()
    for target, gos in targets.items():
        go_set = set()
        for go_id in gos:
            if go_id in all_functions:
                go_set |= get_anchestors(go, go_id)
        label = np.zeros((len(functions), ), dtype=np.int32)
        for go_id in go_set:
            if go_id in func_index:
                label[func_index[go_id]] = 1
        target_ids.append(target)
        go_ids.append(go_set)
        labels.append(label)
    df = pd.DataFrame({'targets': target_ids, 'gos': go_ids, 'labels': labels})
    df = pd.merge(df, pred_df, on='targets', how='inner')
    df.to_pickle(DATA_ROOT + 'model_preds_filtered_' + function + '.pkl')

    def reshape(values):
        values = np.hstack(values).reshape(len(values), len(values[0]))
        return values

    preds = reshape(df['predictions'].values)
    labels = reshape(df['labels'].values)
    # preds = df['predictions'].values
    gos = df['gos'].values
    f, p, r, t, preds_max = compute_performance(preds, labels, gos)
    print(f, p, r)
    # labels = list()
    # scores = list()
    # for i in range(len(preds)):
    #     all_gos = set()
    #     for go_id in gos[i]:
    #         if go_id in all_functions:
    #             all_gos |= get_anchestors(go, go_id)
    #     all_gos.discard(GO_ID)
    #     scores_dict = {}
    #     for val in preds[i]:
    #         go_id, score = val
    #         if go_id in all_functions:
    #             go_set = get_anchestors(go, go_id)
    #             for g_id in go_set:
    #                 if g_id not in scores_dict or scores_dict[g_id] < score:
    #                     scores_dict[g_id] = score
    #     all_preds = set(scores_dict)  # | all_gos
    #     all_preds.discard(GO_ID)
    #     for go_id in all_preds:
    #         if go_id in scores_dict:
    #             scores.append(scores_dict[go_id])
    #         else:
    #             scores.append(0)
    #         if go_id in all_gos:
    #             labels.append(1)
    #         else:
    #             labels.append(0)
    # scores = np.array(scores)
    # labels = np.array(labels)
    roc_auc = compute_roc(preds, labels)
    print(roc_auc)
    # preds_max = (scores > t).astype(np.int32)
    mcc = compute_mcc(preds_max, labels)
    print(mcc)

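# compute_roc and compute_mcc are defined elsewhere in this repo. A minimal
# sketch of plausible implementations using scikit-learn, assuming
# micro-averaging over the flattened (protein, term) score matrices (an
# assumption, not necessarily the exact logic used here):
from sklearn.metrics import roc_curve, auc, matthews_corrcoef

def compute_roc_sketch(preds, labels):
    # flatten all per-protein, per-term scores into one vector
    fpr, tpr, _ = roc_curve(labels.flatten(), preds.flatten())
    return auc(fpr, tpr)

def compute_mcc_sketch(preds_max, labels):
    # preds_max holds thresholded binary predictions
    return matthews_corrcoef(labels.flatten(), preds_max.flatten())
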
from keras.layers.convolutional import (
    Convolution1D, MaxPooling1D)
from sklearn.metrics import classification_report
from utils import (
    shuffle, get_gene_ontology)
from keras.callbacks import ModelCheckpoint, EarlyStopping
import sys
from aaindex import AAINDEX
from collections import deque
import pdb

DATA_ROOT = 'yeast/'
MAXLEN = 500
go = get_gene_ontology('goslim_yeast.obo')


def get_go_set(go_id):
    go_set = set()
    q = deque()
    q.append(go_id)
    while len(q) > 0:
        g_id = q.popleft()
        go_set.add(g_id)
        for ch_id in go[g_id]['children']:
            q.append(ch_id)
    return go_set


functions = get_go_set('GO:0003674')
functions.remove('GO:0003674')

from utils import (
    get_gene_ontology)
import os
import sys
import pdb
from keras.optimizers import Adam
import shutil
from collections import deque
import pandas as pd

LAMBDA = 24
DATA_ROOT = 'data/fofe/'
CUR_LEVEL = 'level_1/'
NEXT_LEVEL = 'level_2/'
go = get_gene_ontology()
go_model = dict()
MAXLEN = 500


def get_gos_by_prot_id():
    data = dict()
    with open(DATA_ROOT + 'train.txt', 'r') as f:
        prot_id = 0
        for line in f:
            line = line.strip().split('\t')
            gos = line[2].split('; ')
            go_set = set()
            for go_id in gos:
                go_set.add(go_id)

#!/usr/bin/env python
import sys
import numpy as np
import pandas as pd
from keras.utils import np_utils
from utils import get_gene_ontology
from collections import deque

DATA_ROOT = 'data/fofe/'
FILENAME = 'train.txt'
go = get_gene_ontology('go.obo')


def get_go_set(go_id):
    go_set = set()
    q = deque()
    q.append(go_id)
    while len(q) > 0:
        g_id = q.popleft()
        go_set.add(g_id)
        for ch_id in go[g_id]['children']:
            q.append(ch_id)
    return go_set


functions = get_go_set('GO:0003674')


def get_anchestors(go_id):
    # body completed by analogy with get_go_set above: BFS over 'is_a'
    # parent links (assumes each term dict carries an 'is_a' list of
    # parent IDs)
    go_set = set()
    q = deque()
    q.append(go_id)
    while len(q) > 0:
        g_id = q.popleft()
        go_set.add(g_id)
        for parent_id in go[g_id]['is_a']:
            if parent_id in go:
                q.append(parent_id)
    return go_set

def get_predictions():
    root = 'data/cafa3/'
    annots = {}
    preds = {}
    go = get_gene_ontology()
    mf = pd.read_pickle(root + 'mf.pkl')
    mf_df = pd.read_pickle(root + 'test-mf-preds.pkl')
    functions = mf['functions']
    for i, row in mf_df.iterrows():
        prot_id = row['proteins']
        if prot_id not in preds:
            preds[prot_id] = set()
        for j in range(len(functions)):
            if row['predictions'][j] == 1:
                preds[prot_id].add(functions[j])
        if prot_id not in annots:
            annots[prot_id] = row['gos']
    cc = pd.read_pickle(root + 'cc.pkl')
    cc_df = pd.read_pickle(root + 'test-cc-preds.pkl')
    functions = cc['functions']
    for i, row in cc_df.iterrows():
        prot_id = row['proteins']
        if prot_id not in preds:
            preds[prot_id] = set()
        for j in range(len(functions)):
            if row['predictions'][j] == 1:
                preds[prot_id].add(functions[j])
        if prot_id not in annots:
            annots[prot_id] = row['gos']
    bp = pd.read_pickle(root + 'bp.pkl')
    bp_df = pd.read_pickle(root + 'test-bp-preds.pkl')
    functions = bp['functions']
    for i, row in bp_df.iterrows():
        prot_id = row['proteins']
        if prot_id not in preds:
            preds[prot_id] = set()
        for j in range(len(functions)):
            if row['predictions'][j] == 1:
                preds[prot_id].add(functions[j])
        if prot_id not in annots:
            annots[prot_id] = row['gos']
    # Removing parent classes: keep only the most specific predicted terms
    for prot_id in preds:
        go_set = preds[prot_id]
        gos = go_set.copy()
        for go_id in gos:
            anchestors = get_anchestors(go, go_id)
            anchestors.remove(go_id)
            go_set -= anchestors
    proteins = sorted(
        annots.keys(), key=lambda x: (x.split('_')[1], x.split('_')[0]))
    with open(root + 'test_predictions.tab', 'w') as f:
        for prot_id in proteins:
            f.write(prot_id)
            for go_id in preds[prot_id]:
                f.write('\t' + go_id)
            f.write('\n')
    with open(root + 'test_annotations.tab', 'w') as f:
        for prot_id in proteins:
            f.write(prot_id)
            for go_id in annots[prot_id]:
                if go_id in go:
                    f.write('\t' + go_id)
            f.write('\n')