def load_data():
    """Split the annotation table 80/20 and expand the training rows.

    The rows returned by ``get_data()`` are shuffled with a fixed NumPy
    seed; the last 20% are pickled to ``data/test_data.pkl`` and the first
    80% are expanded through ``get_anchestors``.

    Returns:
        A pair ``(functions, phenotypes)`` of equally long lists holding,
        for every training row, the expanded set of function terms and the
        expanded set of phenotype terms with the root terms discarded.
    """
    df = get_data()
    n = len(df)
    train_n = int(0.8 * n)

    # Fixed seed keeps the partition reproducible between runs.
    index = np.arange(n)
    np.random.seed(seed=0)
    np.random.shuffle(index)

    train_df = df.loc[index[:train_n]]
    test_df = df.loc[index[train_n:]]
    test_df.to_pickle('data/test_data.pkl')

    functions = []
    phenotypes = []
    for _, row in train_df.iterrows():
        funcs = set()
        for func in row['functions']:
            funcs.update(get_anchestors(go, func))
        phenos = set()
        for pheno in row['phenotypes']:
            phenos.update(get_anchestors(hp, pheno))

        # Root terms carry no information, so they are dropped.
        phenos.discard('HP:0000001')
        for root_id in (MOLECULAR_FUNCTION, BIOLOGICAL_PROCESS,
                        CELLULAR_COMPONENT):
            funcs.discard(root_id)

        functions.append(funcs)
        phenotypes.append(phenos)
    return functions, phenotypes
def compute_performance(func):
    """Score BLAST-based label transfer for one ontology branch.

    Reads the train/test pickles and the tab-separated BLAST result file
    for ``func`` from ``data/swissexp/``. Every result line pairs a test
    protein (column 0) with a training protein (column 1); the label vector
    of the training protein is used as the prediction for the test protein.

    Precision is averaged over the pairs with at least one true positive,
    recall over all pairs that have any positive label or prediction.

    Args:
        func: Suffix used in the file names (e.g. the ontology branch).

    Returns:
        Tuple ``(f, p, r)``: F-measure, average precision, average recall.
        ``(0.0, 0.0, 0.0)`` is returned when no pair produced a true
        positive, instead of failing with a division by zero.
    """
    train_df = pd.read_pickle('data/swissexp/train-' + func + '.pkl')
    test_df = pd.read_pickle('data/swissexp/test-' + func + '.pkl')

    # Only the stored label vectors are needed; the per-row ancestor
    # expansion done previously was never used and has been dropped.
    train_labels = {}
    for _, row in train_df.iterrows():
        train_labels[row['proteins']] = row['labels']
    test_labels = {}
    for _, row in test_df.iterrows():
        test_labels[row['proteins']] = row['labels']

    preds = list()
    test = list()
    with open('data/swissexp/blast-' + func + '.res') as res_file:
        for line in res_file:
            it = line.strip().split('\t')
            preds.append(train_labels[it[1]])
            test.append(test_labels[it[0]])

    total = 0
    p_total = 0
    p = 0.0
    r = 0.0
    for label, pred in zip(test, preds):
        tp = np.sum(label * pred)
        fp = np.sum(pred) - tp
        fn = np.sum(label) - tp
        if tp == 0 and fp == 0 and fn == 0:
            continue
        total += 1
        if tp != 0:
            p_total += 1
            p += tp / (1.0 * (tp + fp))
            r += tp / (1.0 * (tp + fn))

    # p_total > 0 implies total > 0 and p, r > 0, so the divisions are safe.
    if p_total == 0:
        return 0.0, 0.0, 0.0
    p /= p_total
    r /= total
    f = 2 * p * r / (p + r)
    return f, p, r
def deepgo_stats():
    """Report the overlap between the BP functions and the rule terms.

    Loads the function list from ``data/bp.pkl`` and the first column of
    ``data/rules_prop.txt`` (with ``_`` replaced by ``:``), prints the set
    sizes, writes the shared identifiers to ``data/overlap.txt``, expands
    the overlap through ``get_anchestors`` and pickles the functions that
    fall inside the expanded set to ``data/phenogo.pkl``.
    """
    df = pd.read_pickle('data/bp.pkl')
    functions = set(df['functions'].values)

    rules = set()
    with open('data/rules_prop.txt') as f:
        for line in f:
            first_col = line.strip().split('\t')[0]
            rules.add(first_col.replace('_', ':'))

    print('Functions: ', len(functions))
    print('Rules: ', len(rules))

    inter = functions.intersection(rules)
    with open('data/overlap.txt', 'w') as f:
        f.writelines(go_id + '\n' for go_id in inter)
    print('Overlap: ', len(inter))

    go = get_ontology('data/go.obo')
    # Iterate over a snapshot because the set grows during the expansion.
    for go_id in list(inter):
        inter |= get_anchestors(go, go_id)
    print(len(inter))

    res = [func for func in functions if func in inter]
    print(len(res))

    df = pd.DataFrame({'functions': res})
    df.to_pickle('data/phenogo.pkl')
def load_data():
    """Build the n-gram / label table for the configured ontology branch.

    Reads the n-gram vocabulary from ``DATA_ROOT + 'ngrams.pkl'`` and the
    proteins from ``DATA_ROOT + 'swissprot_exp.pkl'``. Proteins whose
    sequence fails ``is_ok`` are skipped. For the remaining ones only the
    annotations with an evidence code in ``EXP_CODES`` are kept, they are
    expanded through ``get_anchestors`` and proteins whose expanded set
    does not contain ``GO_ID`` are dropped.

    Returns:
        A ``pandas.DataFrame`` with the columns ``accessions``,
        ``proteins``, ``ngrams`` (int32 vocabulary indices, 1-based),
        ``labels`` (int32 0/1 vector over ``functions``), ``gos`` and
        ``sequences``.
    """
    ngram_df = pd.read_pickle(DATA_ROOT + 'ngrams.pkl')
    # Index 0 is left unused; vocabulary ids start at 1.
    vocab = {}
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))

    proteins = list()
    gos = list()
    labels = list()
    ngrams = list()
    sequences = list()
    accessions = list()

    df = pd.read_pickle(DATA_ROOT + 'swissprot_exp.pkl')
    # Filtering data by sequences
    index = [i for i, row in df.iterrows() if is_ok(row['sequences'])]
    df = df.loc[index]

    for _, row in df.iterrows():
        # Annotations are stored as 'GO_ID|EVIDENCE_CODE'.
        go_list = []
        for item in row['annots']:
            items = item.split('|')
            if items[1] in EXP_CODES:
                go_list.append(items[0])

        go_set = set()
        for go_id in go_list:
            if go_id in func_set:
                go_set |= get_anchestors(go, go_id)
        if not go_set or GO_ID not in go_set:
            continue
        go_set.remove(GO_ID)

        gos.append(go_list)
        proteins.append(row['proteins'])
        accessions.append(row['accessions'])
        seq = row['sequences']
        sequences.append(seq)

        # `range` works on Python 2 and 3 (`xrange` is Python 2 only); a
        # separate index name avoids shadowing the row index.
        n_grams = len(seq) - gram_len + 1
        grams = np.zeros((n_grams, ), dtype='int32')
        for pos in range(n_grams):
            grams[pos] = vocab[seq[pos:(pos + gram_len)]]
        ngrams.append(grams)

        label = np.zeros((len(functions), ), dtype='int32')
        for go_id in go_set:
            if go_id in go_indexes:
                label[go_indexes[go_id]] = 1
        labels.append(label)

    res_df = pd.DataFrame({
        'accessions': accessions,
        'proteins': proteins,
        'ngrams': ngrams,
        'labels': labels,
        'gos': gos,
        'sequences': sequences
    })
    print(len(res_df))
    return res_df
def get_functions(annot_num):
    """Select the functions annotated to at least ``annot_num`` entries.

    Reads ``DATA_ROOT + 'miR2GO_nonIEA_GOA_20180617.pkl'``, expands the
    ``gos`` of every row that are in ``func_set`` through
    ``get_anchestors`` and counts, per term, the number of rows whose
    expanded set contains it. The terms of ``functions`` reaching the
    threshold are pickled (in the order of ``functions``) to
    ``TEST_DATA_ROOT + FUNCTION + '.pkl'``.

    Args:
        annot_num: Minimum number of annotated rows a term needs.
    """
    df = pd.read_pickle(DATA_ROOT + 'miR2GO_nonIEA_GOA_20180617.pkl')
    annots = dict()
    for _, row in df.iterrows():
        go_set = set()
        for go_id in row['gos']:
            if go_id in func_set:
                go_set |= get_anchestors(go, go_id)
        # Each row counts at most once per term.
        for go_id in go_set:
            annots[go_id] = annots.get(go_id, 0) + 1

    filtered = list()
    for go_id in functions:
        if go_id in annots and annots[go_id] >= annot_num:
            filtered.append(go_id)

    # Single-argument print() behaves identically on Python 2 and 3.
    print(len(filtered))
    df = pd.DataFrame({'functions': filtered})
    df.to_pickle(TEST_DATA_ROOT + FUNCTION + '.pkl')
    print('Saved ' + TEST_DATA_ROOT + FUNCTION + '.pkl')
def get_functions(annot_num):
    """Select the functions annotated to at least ``annot_num`` proteins.

    Reads ``DATA_ROOT + 'swissprot_exp.pkl'``, skips proteins whose
    sequence fails ``is_ok``, keeps annotations (``'GO_ID|CODE'``) whose
    code is in ``EXP_CODES`` and whose term is in ``func_set``, expands
    them through ``get_anchestors`` and counts the proteins per term.
    Terms of ``functions`` that reach the threshold are pickled to
    ``DATA_ROOT + FUNCTION + '.pkl'``.

    Args:
        annot_num: Minimum number of annotated proteins a term needs.
    """
    df = pd.read_pickle(DATA_ROOT + 'swissprot_exp.pkl')

    annots = {}
    for _, row in df.iterrows():
        if not is_ok(row['sequences']):
            continue
        go_set = set()
        for annot in row['annots']:
            parts = annot.split('|')
            if parts[1] in EXP_CODES and parts[0] in func_set:
                go_set |= get_anchestors(go, parts[0])
        # A protein contributes one count to every term in its set.
        for go_id in go_set:
            if go_id in annots:
                annots[go_id] += 1
            else:
                annots[go_id] = 1

    filtered = [
        go_id for go_id in functions
        if go_id in annots and annots[go_id] >= annot_num]
    print(len(filtered))

    out_path = DATA_ROOT + FUNCTION + '.pkl'
    pd.DataFrame({'functions': filtered}).to_pickle(out_path)
    print('Saved ' + DATA_ROOT + FUNCTION + '.pkl')
def compute_performance(preds, labels, gos):
    """Find the score threshold that maximises the protein-centric F-measure.

    For each threshold in 0.01 .. 0.99 the scores are binarised and
    compared with ``labels`` row by row. Annotated terms that are outside
    the label space (expanded through ``get_anchestors``, minus ``GO_ID``
    and ``func_set``) are counted as additional false negatives. Precision
    is averaged over rows with a true positive, recall over all rows that
    have any positive label, prediction or missed term.

    Args:
        preds: 2-D array of scores, one row per sample.
        labels: 2-D 0/1 array aligned with ``preds``.
        gos: Per-sample iterables of annotated term ids.

    Returns:
        Tuple ``(f_max, p_max, r_max, t_max, predictions_max)``. When no
        threshold reaches a positive F-measure the first four values are
        ``0`` and ``predictions_max`` is an all-zero int32 array shaped
        like ``preds`` (previously this case raised UnboundLocalError).
    """
    n_samples = preds.shape[0]

    # The number of annotated terms the model cannot predict does not
    # depend on the threshold, so it is computed once per sample.
    missed = []
    for i in range(n_samples):
        all_gos = set()
        for go_id in gos[i]:
            if go_id in all_functions:
                all_gos |= get_anchestors(go, go_id)
        all_gos.discard(GO_ID)
        all_gos -= func_set
        missed.append(len(all_gos))

    f_max = 0
    p_max = 0
    r_max = 0
    t_max = 0
    predictions_max = np.zeros(preds.shape, dtype=np.int32)
    for t in range(1, 100):
        threshold = t / 100.0
        predictions = (preds > threshold).astype(np.int32)
        total = 0
        p_total = 0
        f = 0.0
        p = 0.0
        r = 0.0
        for i in range(n_samples):
            tp = np.sum(predictions[i, :] * labels[i, :])
            fp = np.sum(predictions[i, :]) - tp
            fn = np.sum(labels[i, :]) - tp + missed[i]
            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp != 0:
                p_total += 1
                p += tp / (1.0 * (tp + fp))
                r += tp / (1.0 * (tp + fn))
        if total > 0 and p_total > 0:
            r /= total
            p /= p_total
            if p + r > 0:
                f = 2 * p * r / (p + r)
                if f_max < f:
                    f_max = f
                    p_max = p
                    r_max = r
                    t_max = threshold
                    predictions_max = predictions
    return f_max, p_max, r_max, t_max, predictions_max
def performanc_by_interpro():
    """Log precision/recall/F-measure per top-level InterPro entry.

    Joins the test predictions (``DATA_ROOT + 'test-' + FUNCTION +
    '-preds.pkl'``) with the protein-to-InterPro table and, for every
    InterPro id without parents, scores the proteins carrying that id.
    Annotated terms outside ``func_set`` (expanded through
    ``get_anchestors``, minus ``GO_ID``) count as extra false negatives.
    A tab-separated line ``id, n_proteins, f, precision, recall`` is logged
    for every id with a positive score.
    """
    pred_df = pd.read_pickle(DATA_ROOT + 'test-' + FUNCTION + '-preds.pkl')
    ipro_df = load_prot_ipro()
    df = pred_df.merge(ipro_df, on='proteins', how='left')
    ipro = get_ipro()

    for ipro_id in ipro:
        # Only root entries of the InterPro hierarchy are evaluated.
        if len(ipro[ipro_id]['parents']) > 0:
            continue

        # Left join leaves a non-list value for proteins without entries.
        members = list()
        for _, row in df.iterrows():
            if isinstance(row['ipros'], list) and ipro_id in row['ipros']:
                members.append(
                    (row['labels'], row['predictions'], row['gos']))

        pr = 0
        rc = 0
        total = 0
        p_total = 0
        for label, prediction, annotated in members:
            tp = np.sum(label * prediction)
            fp = np.sum(prediction) - tp
            fn = np.sum(label) - tp

            all_gos = set()
            for go_id in annotated:
                if go_id in all_functions:
                    all_gos |= get_anchestors(go, go_id)
            all_gos.discard(GO_ID)
            all_gos -= func_set
            fn += len(all_gos)

            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp != 0:
                p_total += 1
                pr += tp / (1.0 * (tp + fp))
                rc += tp / (1.0 * (tp + fn))

        if not (total > 0 and p_total > 0):
            continue
        rc /= total
        pr /= p_total
        if pr + rc > 0:
            f = 2 * pr * rc / (pr + rc)
            logging.info('%s\t%d\t%f\t%f\t%f' % (
                ipro_id, len(members), f, pr, rc))
def get_real_annotations():
    """Return the expanded experimental annotations of every protein.

    Reads ``data/cafa3/swissprot_exp.pkl``. Annotations are strings of the
    form ``'GO_ID|CODE'``; those whose term is in the ontology and whose
    code is in ``EXP_CODES`` are expanded through ``get_anchestors``.

    Returns:
        Dict mapping the ``proteins`` value of each row to its set of
        terms (empty when nothing qualified).
    """
    go = get_gene_ontology()
    df = pd.read_pickle('data/cafa3/swissprot_exp.pkl')

    annots = {}
    for _, row in df.iterrows():
        terms = set()
        for annot in row['annots']:
            parts = annot.split('|')
            if parts[0] not in go:
                continue
            if parts[1] in EXP_CODES:
                terms.update(get_anchestors(go, parts[0]))
        annots[row['proteins']] = terms
    return annots
def model(model_name):
    """Run a stored model on the loaded data and pickle its predictions.

    Loads the architecture from ``DATA_ROOT + model_name + '_' + FUNCTION
    + '.json'`` and the weights from ``DATA_ROOT + model_name +
    '_weights_' + FUNCTION + '.pkl'``, predicts with a ``DataGenerator``,
    makes the scores hierarchy-consistent (a parent term never scores
    lower than any of its descendants) and writes a DataFrame with the
    columns ``targets`` and ``predictions`` to
    ``DATA_ROOT + model_name + '_preds_' + FUNCTION + '.pkl'``.

    Args:
        model_name: Prefix of the model, weight and output file names.
    """
    # set parameters:
    batch_size = 128
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    data, targets = load_data()
    data_generator = DataGenerator(batch_size, nb_classes)
    data_generator.fit(data, None)
    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Data size: %d" % len(data[0]))
    logging.info('Loading the model')
    with open(DATA_ROOT + model_name + '_' + FUNCTION + '.json', 'r') as f:
        json_string = next(f)
    net = model_from_json(json_string)
    optimizer = RMSprop()
    net.compile(optimizer=optimizer, loss='binary_crossentropy')
    model_path = DATA_ROOT + model_name + '_weights_' + FUNCTION + '.pkl'
    logging.info('Compilation finished in %d sec' % (time.time() - start_time))
    logging.info('Loading weights')
    load_model_weights(net, model_path)
    logging.info('Predicting')
    preds = net.predict_generator(
        data_generator, val_samples=len(data[0]), nb_worker=12)

    # One output per function: turn each into a column and stack them so
    # that preds[i, j] is the score of sample i for functions[j].
    preds = np.concatenate(
        [output.reshape(-1, 1) for output in preds], axis=1)

    # Propagate scores upwards. The pass covers every prediction row;
    # looping over len(data) would only touch as many rows as there are
    # input arrays, since the sample count is len(data[0]).
    # The ancestors depend only on the function, so they are looked up
    # once per function and applied to all rows at the same time.
    incon = 0
    for j, func_id in enumerate(functions):
        for p_id in get_anchestors(go, func_id):
            if p_id in (GO_ID, func_id):
                continue
            p_idx = go_indexes[p_id]
            lower = preds[:, p_idx] < preds[:, j]
            incon += int(np.sum(lower))
            preds[lower, p_idx] = preds[lower, j]
    logging.info('Inconsistent predictions: %d' % incon)

    predictions = [preds[i] for i in range(len(targets))]
    df = pd.DataFrame({'targets': targets, 'predictions': predictions})
    print(len(df))
    df.to_pickle(DATA_ROOT + model_name + '_preds_' + FUNCTION + '.pkl')
    logging.info('Done in %d sec' % (time.time() - start_time))
def compute_performance(preds, labels, gos):
    """Find the score threshold that maximises the protein-centric F-measure.

    Scores are rounded to two decimals, then for each threshold in
    0.01 .. 0.99 they are binarised and compared with ``labels`` row by
    row. Annotated terms outside the label space (expanded through
    ``get_anchestors``, minus ``GO_ID`` and ``func_set``) count as
    additional false negatives. Precision is averaged over rows with a
    true positive, recall over all rows that have any positive label,
    prediction or missed term.

    Args:
        preds: 2-D array of scores, one row per sample.
        labels: 2-D 0/1 array aligned with ``preds``.
        gos: Per-sample iterables of annotated term ids.

    Returns:
        Tuple ``(f_max, p_max, r_max, t_max, predictions_max)``. When no
        threshold reaches a positive F-measure the first four values are
        ``0`` and ``predictions_max`` is an all-zero int32 array shaped
        like ``preds`` (previously this case raised UnboundLocalError).
    """
    preds = np.round(preds, 2)
    n_samples = labels.shape[0]

    # Threshold-independent part of the false negatives, computed once
    # per sample instead of once per sample and threshold.
    missed = []
    for i in range(n_samples):
        all_gos = set()
        for go_id in gos[i]:
            if go_id in all_functions:
                all_gos |= get_anchestors(go, go_id)
        all_gos.discard(GO_ID)
        all_gos -= func_set
        missed.append(len(all_gos))

    f_max = 0
    p_max = 0
    r_max = 0
    t_max = 0
    predictions_max = np.zeros(preds.shape, dtype=np.int32)
    # `range` works on Python 2 and 3 (`xrange` is Python 2 only).
    for t in range(1, 100):
        threshold = t / 100.0
        predictions = (preds > threshold).astype(np.int32)
        total = 0
        p_total = 0
        f = 0.0
        p = 0.0
        r = 0.0
        for i in range(n_samples):
            tp = np.sum(predictions[i, :] * labels[i, :])
            fp = np.sum(predictions[i, :]) - tp
            fn = np.sum(labels[i, :]) - tp + missed[i]
            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp != 0:
                p_total += 1
                p += tp / (1.0 * (tp + fp))
                r += tp / (1.0 * (tp + fn))
        if p_total == 0:
            continue
        r /= total
        p /= p_total
        if p + r > 0:
            f = 2 * p * r / (p + r)
        if f_max < f:
            f_max = f
            p_max = p
            r_max = r
            t_max = threshold
            predictions_max = predictions
    return f_max, p_max, r_max, t_max, predictions_max
def compute_similarity_performance(train_df, test_df, preds):
    """Score similarity-based annotation transfer against the test set.

    ``get_gos`` is mapped over ``preds`` in a worker pool to obtain the
    predicted terms of every test sample. Predicted and true terms that
    are in ``all_functions`` are expanded through ``get_anchestors`` and
    compared as sets.

    Args:
        train_df: Training table with the columns ``labels`` and ``gos``.
        test_df: Test table with the column ``gos``, aligned with
            ``preds``.
        preds: Per-sample inputs handed to ``get_gos``.

    Returns:
        Tuple ``(f, p, r)`` averaged over the samples that have at least
        one predicted or true term; ``(0.0, 0.0, 0.0)`` when there is no
        such sample (previously a division by zero).
    """
    logging.info("Computing similarity performance")
    logging.info("Training data size %d" % len(train_df))
    train_labels = train_df['labels'].values
    train_gos = train_df['gos'].values

    # Published as a module-level name before the pool is created;
    # presumably get_gos reads it inside the workers - confirm.
    global labels_gos
    labels_gos = list(zip(train_labels, train_gos))

    pool = Pool(64)
    try:
        pred_gos = pool.map(get_gos, preds)
    finally:
        # Release the worker processes even if the map fails.
        pool.close()
        pool.join()

    total = 0
    p = 0.0
    r = 0.0
    f = 0.0
    test_gos = test_df['gos'].values
    for gos, tgos in zip(pred_gos, test_gos):
        pred_terms = set()
        true_terms = set()
        for go_id in gos:
            if go_id in all_functions:
                pred_terms |= get_anchestors(go, go_id)
        for go_id in tgos:
            if go_id in all_functions:
                true_terms |= get_anchestors(go, go_id)
        tp = len(pred_terms.intersection(true_terms))
        fp = len(pred_terms - true_terms)
        fn = len(true_terms - pred_terms)
        if tp == 0 and fp == 0 and fn == 0:
            continue
        total += 1
        if tp != 0:
            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            p += precision
            r += recall
            f += 2 * precision * recall / (precision + recall)

    if total == 0:
        return 0.0, 0.0, 0.0
    return f / total, p / total, r / total
def compute_performance(preds, labels, gos):
    """Find the score threshold that maximises the protein-centric F-measure.

    Scores are rounded to two decimals, then for each threshold in
    0.01 .. 0.99 they are binarised and compared with ``labels`` row by
    row. Annotated terms outside the label space (expanded through
    ``get_anchestors``, minus every id in ``GO_IDS`` and ``func_set``)
    count as additional false negatives. Precision is averaged over rows
    with a true positive, recall over all rows that have any positive
    label, prediction or missed term.

    Args:
        preds: 2-D array of scores, one row per sample.
        labels: 2-D 0/1 array aligned with ``preds``.
        gos: Per-sample iterables of annotated term ids.

    Returns:
        Tuple ``(f_max, p_max, r_max, t_max, predictions_max)``. When no
        threshold reaches a positive F-measure the first four values are
        ``0`` and ``predictions_max`` is an all-zero int32 array shaped
        like ``preds`` (previously this case raised UnboundLocalError).
    """
    preds = np.round(preds, 2)
    n_samples = labels.shape[0]

    # Threshold-independent part of the false negatives, computed once
    # per sample instead of once per sample and threshold.
    missed = []
    for i in range(n_samples):
        all_gos = set()
        for go_id in gos[i]:
            if go_id in all_functions:
                all_gos |= get_anchestors(go, go_id)
        for g_id in GO_IDS:
            all_gos.discard(g_id)
        all_gos -= func_set
        missed.append(len(all_gos))

    f_max = 0
    p_max = 0
    r_max = 0
    t_max = 0
    predictions_max = np.zeros(preds.shape, dtype=np.int32)
    for t in range(1, 100):
        threshold = t / 100.0
        predictions = (preds > threshold).astype(np.int32)
        total = 0
        p_total = 0
        f = 0.0
        p = 0.0
        r = 0.0
        for i in range(n_samples):
            tp = np.sum(predictions[i, :] * labels[i, :])
            fp = np.sum(predictions[i, :]) - tp
            fn = np.sum(labels[i, :]) - tp + missed[i]
            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp != 0:
                p_total += 1
                p += tp / (1.0 * (tp + fp))
                r += tp / (1.0 * (tp + fn))
        if p_total == 0:
            continue
        r /= total
        p /= p_total
        if p + r > 0:
            f = 2 * p * r / (p + r)
        if f_max < f:
            f_max = f
            p_max = p
            r_max = r
            t_max = threshold
            predictions_max = predictions
    return f_max, p_max, r_max, t_max, predictions_max
def specific_predictions():
    """Reduce every prediction line to its most specific terms.

    Reads ``data/cafa3/test_predictions.tab`` (protein id followed by
    tab-separated term ids) and writes
    ``data/cafa3/test_predictions_specific.tab`` in the same layout, where
    a term is dropped when ``get_anchestors`` returns it for another
    predicted term of the same protein.
    """
    root = 'data/cafa3/'
    go = get_gene_ontology()
    # Both files are managed by `with`, so the output is flushed and
    # closed even when a line fails to process.
    with open(root + 'test_predictions_specific.tab', 'w') as fw, \
            open(root + 'test_predictions.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            go_set = set(items[1:])
            # Iterate over a snapshot because go_set shrinks in the loop.
            for go_id in go_set.copy():
                # Build a new set rather than calling remove() on the
                # returned one: this leaves the helper's result untouched
                # and cannot raise KeyError if go_id is absent from it.
                go_set -= get_anchestors(go, go_id) - {go_id}
            fw.write(items[0])
            for go_id in go_set:
                fw.write('\t' + go_id)
            fw.write('\n')
def load_scores():
    """Load per-protein term scores from ``data/cosine.out``.

    Each whitespace-separated line holds a protein, a term id and a score,
    possibly wrapped in tuple punctuation (``(``, ``)``, ``'``, ``,``)
    which is stripped. The score is assigned to the term and to every term
    returned by ``get_anchestors``; when a term is reached more than once
    the maximum is kept.

    Returns:
        Dict mapping protein to a dict ``{term id: best score}``. Proteins
        whose terms are all unknown to the ontology map to an empty dict.
    """
    scores = dict()
    with open('data/cosine.out') as f:
        for line in f:
            fields = line.strip().split()
            prot = fields[0].strip('()\',')
            go_id = fields[1].strip('()\',').upper()
            score = float(fields[2].strip('()\','))

            prot_scores = scores.setdefault(prot, {})
            if go_id not in go:
                continue

            gos = get_anchestors(go, go_id)
            gos.add(go_id)
            for g_id in gos:
                if g_id in prot_scores:
                    prot_scores[g_id] = max(prot_scores[g_id], score)
                else:
                    prot_scores[g_id] = score
    return scores
def load_annotations():
    """Load experimental annotations from ``data/goa_human.gaf``.

    Comment lines (starting with ``!``) are skipped. A record is used when
    its qualifier column does not negate the annotation, its evidence code
    (column 7) is in ``EXP_CODES`` and its accession (column 2) is present
    in the mapping returned by ``load_mapping``. Terms known to the
    ontology are stored together with the terms returned by
    ``get_anchestors``.

    Returns:
        Dict mapping the mapped protein id to its set of term ids. A
        protein whose terms are all unknown maps to an empty set.
    """
    mapping = load_mapping()
    annots = dict()
    with open('data/goa_human.gaf') as f:
        for line in f:
            if line.startswith('!'):
                continue
            it = line.strip().split('\t')
            ac = it[1]
            # The GAF qualifier column may hold several pipe-separated
            # values (e.g. 'NOT|contributes_to'); an equality test against
            # 'NOT' would let those negated annotations through.
            qualifiers = it[3].split('|')
            if 'NOT' in qualifiers or it[6] not in EXP_CODES:
                continue
            go_id = it[4]
            if ac not in mapping:
                continue
            prot = mapping[ac]
            if prot not in annots:
                annots[prot] = set()
            if go_id in go:
                annots[prot].add(go_id)
                annots[prot] |= get_anchestors(go, go_id)
    return annots
def compute_performance():
    """Print F-measure, precision and recall of the CAFA3 test predictions.

    Reads ``data/cafa3/test_predictions.tab`` and
    ``data/cafa3/test_annotations.tab`` (protein id followed by
    tab-separated term ids). Annotated terms known to the ontology are
    expanded through ``get_anchestors``; predictions are used as written.
    Proteins without any expanded annotation are skipped. The three values
    are averaged over the remaining proteins and printed; zeros are
    printed when no protein could be scored (previously a division by
    zero).
    """
    root = 'data/cafa3/'
    preds = {}
    annots = {}
    go = get_gene_ontology()
    with open(root + 'test_predictions.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            preds[items[0]] = set(items[1:])
    with open(root + 'test_annotations.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            annots[items[0]] = set()
            for go_id in items[1:]:
                if go_id in go:
                    annots[items[0]] |= get_anchestors(go, go_id)

    total = 0
    p = 0.0
    r = 0.0
    f = 0.0
    # `.items()` works on Python 2 and 3 (`iteritems` is Python 2 only).
    for prot, pred_annots in preds.items():
        real_annots = annots[prot]
        if len(real_annots) == 0:
            continue
        tp = len(real_annots.intersection(pred_annots))
        fp = len(pred_annots - real_annots)
        fn = len(real_annots - pred_annots)
        if tp == 0 and fp == 0 and fn == 0:
            continue
        total += 1
        if tp != 0:
            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            p += precision
            r += recall
            f += 2 * precision * recall / (precision + recall)

    if total == 0:
        print(0.0, 0.0, 0.0)
        return
    print(f / total, p / total, r / total)
def run():
    """Mine frequent function/phenotype term sets level by level.

    Terms of ``go`` and ``hp`` are numbered, every training sample becomes
    a set of term numbers, and all (function, phenotype) pairs that occur
    in at least ``e`` samples seed the module-level ``counter``. For every
    term of a surviving pair the module-level ``tree`` stores the numbers
    of the terms returned by ``get_anchestors`` and ``get_subset``.
    ``next_level`` is then mapped over the samples in a worker pool until
    no set reaches the support threshold; the surviving sets of every
    level are appended to ``data/results.gz`` as
    ``count<TAB>term<TAB>term...`` lines.
    """
    functions, phenotypes = load_data()
    global counter, tree
    counter = Counter()
    tree = dict()
    e = 100  # minimum number of samples a term set must occur in

    term_index = dict()
    term_list = list()
    for go_id in go:
        term_index[go_id] = len(term_index)
        term_list.append(go_id)
    for hp_id in hp:
        term_index[hp_id] = len(term_index)
        term_list.append(hp_id)

    terms = list()
    for sample_funcs, sample_phenos in zip(functions, phenotypes):
        funcs = set(term_index[x] for x in sample_funcs)
        phenos = set(term_index[x] for x in sample_phenos)
        terms.append(funcs | phenos)
        for func in funcs:
            for pheno in phenos:
                counter[frozenset([func, pheno])] += 1

    # Rebuild the counter instead of deleting while iterating over
    # .items(), which raises RuntimeError on Python 3.
    counter = Counter(
        dict((s, c) for s, c in counter.items() if c >= e))

    for s in counter:
        for term in s:
            if term in tree:
                # Same term, same result: no need to look it up again.
                continue
            term_id = term_list[term]
            onto = go if term_id in go else hp
            related = set(
                term_index[x] for x in get_anchestors(onto, term_id))
            related.update(
                term_index[x] for x in get_subset(onto, term_id))
            tree[term] = related
    print(len(counter))

    gf = gzip.open('data/results.gz', 'w')
    try:
        while len(counter) > 0:
            # NOTE(review): next_level only receives the sample, so it
            # presumably reads the module-level `counter`/`tree`. Workers
            # are started per level so that, with the fork start method,
            # they see the counter of the current level; a single pool
            # would keep the state it was forked with. Confirm against
            # next_level.
            pool = Pool(48)
            try:
                cnts = pool.map(next_level, terms)
            finally:
                pool.close()
                pool.join()

            # sum() needs a Counter start value; the default 0 raises
            # TypeError on `0 + Counter`.
            cnt = sum(cnts, Counter())
            print(counter.most_common(10))
            print(cnt.most_common(10))

            kept = Counter()
            for s, c in cnt.items():
                if c < e:
                    continue
                kept[s] = c
                line = str(c)
                for term in s:
                    line += '\t' + term_list[term]
                line += '\n'
                # The gzip handle is binary, so the text is encoded.
                gf.write(line.encode('utf-8'))
            counter = kept
    finally:
        # Closing writes the gzip trailer; without it the file is corrupt.
        gf.close()
def get_predictions():
    """Merge the per-ontology test predictions into tab-separated files.

    For each of ``mf``, ``cc`` and ``bp`` the function list
    (``<onto>.pkl``) and the binarised test predictions
    (``test-<onto>-preds.pkl``) are read from ``data/cafa3/``. The
    predicted terms of a protein are collected over the three ontologies;
    its annotations are taken from the first table in which the protein
    appears. Predicted terms that ``get_anchestors`` returns for another
    predicted term of the same protein are removed.

    Writes ``test_predictions.tab`` and ``test_annotations.tab`` (protein
    id followed by tab-separated term ids), proteins sorted by the second
    and then the first ``_``-separated part of their id. Only annotations
    known to the ontology are written.
    """
    root = 'data/cafa3/'
    annots = {}
    preds = {}
    go = get_gene_ontology()

    # The three ontologies are processed identically, in this order.
    for onto in ('mf', 'cc', 'bp'):
        func_df = pd.read_pickle(root + onto + '.pkl')
        pred_df = pd.read_pickle(root + 'test-' + onto + '-preds.pkl')
        functions = func_df['functions']
        for _, row in pred_df.iterrows():
            prot_id = row['proteins']
            if prot_id not in preds:
                preds[prot_id] = set()
            # `range` works on Python 2 and 3; a separate index name
            # avoids shadowing the row index.
            for j in range(len(functions)):
                if row['predictions'][j] == 1:
                    preds[prot_id].add(functions[j])
            if prot_id not in annots:
                annots[prot_id] = row['gos']

    # Removing parent classes
    for prot_id in preds:
        go_set = preds[prot_id]
        for go_id in go_set.copy():
            # A new set is built so the helper's result is not mutated
            # and a missing go_id cannot raise KeyError.
            go_set -= get_anchestors(go, go_id) - {go_id}

    proteins = sorted(
        annots.keys(), key=lambda x: (x.split('_')[1], x.split('_')[0]))
    with open(root + 'test_predictions.tab', 'w') as f:
        for prot_id in proteins:
            f.write(prot_id)
            for go_id in preds[prot_id]:
                f.write('\t' + go_id)
            f.write('\n')
    with open(root + 'test_annotations.tab', 'w') as f:
        for prot_id in proteins:
            f.write(prot_id)
            for go_id in annots[prot_id]:
                if go_id in go:
                    f.write('\t' + go_id)
            f.write('\n')
def main(function):
    """Evaluate stored model predictions against the CAFA3 ground truth.

    Sets the module-level names ``go``, ``functions``, ``func_set``,
    ``GO_ID`` and ``all_functions`` for the requested branch, builds label
    vectors from the leaf-only ground-truth file, joins them with
    ``DATA_ROOT + 'model_preds_' + function + '.pkl'`` on ``targets``,
    pickles the joined table to ``DATA_ROOT + 'model_preds_filtered_' +
    function + '.pkl'`` and prints F-measure/precision/recall, the ROC AUC
    and the MCC.

    Args:
        function: Key of ``FUNC_DICT`` selecting the ontology branch; also
            used in the file names.
    """
    global go, functions, func_set, GO_ID, all_functions
    go = get_gene_ontology()
    func_df = pd.read_pickle(DATA_ROOT + function + '.pkl')
    functions = func_df['functions'].values
    func_index = dict()
    for position, term_id in enumerate(functions):
        func_index[term_id] = position
    func_set = set(func_index)
    GO_ID = FUNC_DICT[function]
    all_functions = get_go_set(go, GO_ID)

    pred_df = pd.read_pickle(DATA_ROOT + 'model_preds_' + function + '.pkl')

    # Ground truth: one 'target<TAB>term' pair per line.
    targets = dict()
    with open('data/cafa3/CAFA3_benchmark20170605/groundtruth/leafonly_' + function.upper() + 'O_unique.txt') as f:
        for line in f:
            it = line.strip().split('\t')
            target, go_id = it[0], it[1]
            if target in targets:
                targets[target].append(go_id)
            else:
                targets[target] = [go_id]

    target_ids = list()
    labels = list()
    go_ids = list()
    for target, gos in targets.items():
        go_set = set()
        for go_id in gos:
            if go_id in all_functions:
                go_set |= get_anchestors(go, go_id)
        label = np.zeros((len(functions), ), dtype=np.int32)
        for go_id in go_set:
            if go_id in func_index:
                label[func_index[go_id]] = 1
        target_ids.append(target)
        go_ids.append(go_set)
        labels.append(label)

    df = pd.DataFrame({'targets': target_ids, 'gos': go_ids, 'labels': labels})
    df = pd.merge(df, pred_df, on='targets', how='inner')
    df.to_pickle(DATA_ROOT + 'model_preds_filtered_' + function + '.pkl')

    def reshape(values):
        # Column of equally long vectors -> (n_rows, n_functions) matrix.
        return np.hstack(values).reshape(len(values), len(values[0]))

    preds = reshape(df['predictions'].values)
    labels = reshape(df['labels'].values)
    gos = df['gos'].values

    f, p, r, t, preds_max = compute_performance(preds, labels, gos)
    print(f, p, r)
    roc_auc = compute_roc(preds, labels)
    print(roc_auc)
    mcc = compute_mcc(preds_max, labels)
    print(mcc)