Example #1
def load_data():
    df = get_data()
    functions = list()
    phenotypes = list()
    n = len(df)
    train_n = int(0.8 * n)
    index = np.arange(n)
    np.random.seed(seed=0)
    np.random.shuffle(index)
    train_df = df.loc[index[:train_n]]
    test_df = df.loc[index[train_n:]]
    test_df.to_pickle('data/test_data.pkl')
    for i, row in train_df.iterrows():
        funcs = set()
        phenos = set()
        for func in row['functions']:
            funcs |= get_anchestors(go, func)
        for pheno in row['phenotypes']:
            phenos |= get_anchestors(hp, pheno)
        phenos.discard('HP:0000001')
        funcs.discard(MOLECULAR_FUNCTION)
        funcs.discard(BIOLOGICAL_PROCESS)
        funcs.discard(CELLULAR_COMPONENT)
        functions.append(funcs)
        phenotypes.append(phenos)
    return functions, phenotypes
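Every example on this page leans on get_anchestors(go, go_id) (the misspelling is the project's own, so it is kept as-is). Its body is not shown here, but the call sites pin down its contract: it returns a set of term IDs, and Example #14 explicitly removes the query term from the result, so the term itself must be included. A minimal sketch consistent with that, assuming go is a dict mapping each term ID to a record with an 'is_a' parent list:

from collections import deque

def get_anchestors(go, go_id):
    # Hedged sketch, not the project's actual implementation:
    # breadth-first walk up the 'is_a' links, collecting go_id
    # and every transitive parent.
    ancestors = set()
    queue = deque([go_id])
    while queue:
        t_id = queue.popleft()
        if t_id in ancestors:
            continue
        ancestors.add(t_id)
        for parent_id in go[t_id]['is_a']:
            if parent_id in go:
                queue.append(parent_id)
    return ancestors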
Example #2
def compute_performance(func):
    go = get_gene_ontology()
    train_df = pd.read_pickle('data/swissexp/train-' + func + '.pkl')
    test_df = pd.read_pickle('data/swissexp/test-' + func + '.pkl')

    train_labels = {}
    test_labels = {}
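    # NOTE: go_set is built in both loops below but never used; the labels
    # are taken from the precomputed 'labels' column instead.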
    for i, row in train_df.iterrows():
        go_set = set()
        for go_id in row['gos']:
            if go_id in go:
                go_set |= get_anchestors(go, go_id)
        train_labels[row['proteins']] = row['labels']

    for i, row in test_df.iterrows():
        go_set = set()
        for go_id in row['gos']:
            if go_id in go:
                go_set |= get_anchestors(go, go_id)
        test_labels[row['proteins']] = row['labels']

    preds = list()
    test = list()
    with open('data/swissexp/blast-' + func + '.res') as f:
        for line in f:
            it = line.strip().split('\t')
            preds.append(train_labels[it[1]])
            test.append(test_labels[it[0]])

    total = 0
    p = 0.0
    r = 0.0
    f = 0.0
    p_total = 0
    for label, pred in zip(test, preds):
        tp = np.sum(label * pred)
        fp = np.sum(pred) - tp
        fn = np.sum(label) - tp
        # tp = len(label.intersection(pred))
        # fp = len(pred) - tp
        # fn = len(label) - tp

        if tp == 0 and fp == 0 and fn == 0:
            continue
        total += 1
        if tp != 0:
            p_total += 1
            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            p += precision
            r += recall
    if p_total > 0:
        p /= p_total
    if total > 0:
        r /= total
    if p + r > 0:
        f = 2 * p * r / (p + r)
    return f, p, r
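The bookkeeping here is protein-centric: precision is averaged only over pairs with at least one true positive (p_total), while recall is averaged over every pair that has any prediction or annotation at all (total). A small numeric illustration of that asymmetry, with two hypothetical binary label vectors:

import numpy as np

test = [np.array([1, 1, 0, 0]), np.array([0, 1, 1, 0])]   # hypothetical labels
preds = [np.array([1, 0, 0, 1]), np.array([0, 0, 0, 0])]  # hypothetical predictions
p = r = 0.0
total = p_total = 0
for label, pred in zip(test, preds):
    tp = np.sum(label * pred)
    fp = np.sum(pred) - tp
    fn = np.sum(label) - tp
    if tp == 0 and fp == 0 and fn == 0:
        continue
    total += 1
    if tp != 0:
        p_total += 1
        p += tp / (1.0 * (tp + fp))
        r += tp / (1.0 * (tp + fn))
print(p / p_total, r / total)  # 0.5 0.25: only the first protein counts
                               # toward precision, but both count toward recall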
Example #3
def deepgo_stats():
    df = pd.read_pickle('data/bp.pkl')
    functions = set(df['functions'].values)
    n = 0
    rules = set()
    with open('data/rules_prop.txt') as f:
        for line in f:
            it = line.strip().split('\t')
            rules.add(it[0].replace('_', ':'))

    print('Functions: ', len(functions))
    print('Rules: ', len(rules))
    inter = functions.intersection(rules)
    with open('data/overlap.txt', 'w') as f:
        for go_id in inter:
            f.write(go_id + '\n')

    print('Overlap: ', len(inter))
    go = get_ontology('data/go.obo')
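    # Propagate the overlap upward through the ontology; iterate over a
    # snapshot (list(inter)) because inter grows inside the loop.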
    for go_id in list(inter):
        inter |= get_anchestors(go, go_id)
    print(len(inter))
    res = list()
    for func in functions:
        if func in inter:
            res.append(func)
    print(len(res))
    df = pd.DataFrame({'functions': res})
    df.to_pickle('data/phenogo.pkl')
Example #4
def load_data():
    ngram_df = pd.read_pickle(DATA_ROOT + 'ngrams.pkl')
    vocab = {}
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))
    proteins = list()
    gos = list()
    labels = list()
    ngrams = list()
    sequences = list()
    accessions = list()
    df = pd.read_pickle(DATA_ROOT + 'swissprot_exp.pkl')
    # Filtering data by sequences
    index = list()
    for i, row in df.iterrows():
        if is_ok(row['sequences']):
            index.append(i)
    df = df.loc[index]

    for i, row in df.iterrows():
        go_list = []
        for item in row['annots']:
            items = item.split('|')
            if items[1] in EXP_CODES:
                go_list.append(items[0])
            # go_list.append(items[0])
        go_set = set()
        for go_id in go_list:
            if go_id in func_set:
                go_set |= get_anchestors(go, go_id)
        if not go_set or GO_ID not in go_set:
            continue
        go_set.remove(GO_ID)
        gos.append(go_list)
        proteins.append(row['proteins'])
        accessions.append(row['accessions'])
        seq = row['sequences']
        sequences.append(seq)
        grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
        for i in xrange(len(seq) - gram_len + 1):
            grams[i] = vocab[seq[i:(i + gram_len)]]
        ngrams.append(grams)
        label = np.zeros((len(functions), ), dtype='int32')
        for go_id in go_set:
            if go_id in go_indexes:
                label[go_indexes[go_id]] = 1
        labels.append(label)
    res_df = pd.DataFrame({
        'accessions': accessions,
        'proteins': proteins,
        'ngrams': ngrams,
        'labels': labels,
        'gos': gos,
        'sequences': sequences
    })
    print(len(res_df))
    return res_df
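The n-gram encoding above slides a window of gram_len residues along the sequence and maps each gram to its 1-based vocabulary index. A toy run of the same encoding with a hypothetical 3-gram vocabulary:

import numpy as np

gram_len = 3
vocab = {'MKT': 1, 'KTA': 2, 'TAY': 3}  # hypothetical 3-gram vocabulary
seq = 'MKTAY'
grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
for i in range(len(seq) - gram_len + 1):
    grams[i] = vocab[seq[i:(i + gram_len)]]
print(grams)  # [1 2 3]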
Example #5
def get_functions(annot_num):
    df = pd.read_pickle(DATA_ROOT + 'miR2GO_nonIEA_GOA_20180617.pkl')
    annots = dict()
    for i, row in df.iterrows():
        go_set = set()
        #if not is_ok(row['sequences']):                 #if no sequences, then continue
        #   continue
        for go_id in row['gos']:  #add labels
            #go_id = go_id.split('|')
            #if go_id[1] not in EXP_CODES:
            #    continue
            #go_id = go_id[0]
            if go_id in func_set:
                go_set |= get_anchestors(go, go_id)
        for go_id in go_set:
            if go_id not in annots:
                annots[go_id] = 0
            annots[go_id] += 1
    filtered = list()
    for go_id in functions:
        if go_id in annots and annots[go_id] >= annot_num:
            filtered.append(go_id)
    print(len(filtered))
    df = pd.DataFrame({'functions': filtered})
    df.to_pickle(TEST_DATA_ROOT + FUNCTION + '.pkl')
    print('Saved ' + TEST_DATA_ROOT + FUNCTION + '.pkl')
Example #6
def get_functions(annot_num):
    df = pd.read_pickle(DATA_ROOT + 'swissprot_exp.pkl')
    annots = dict()
    for i, row in df.iterrows():
        go_set = set()
        if not is_ok(row['sequences']):
            continue
        for go_id in row['annots']:
            go_id = go_id.split('|')
            if go_id[1] not in EXP_CODES:
                continue
            go_id = go_id[0]
            if go_id in func_set:
                go_set |= get_anchestors(go, go_id)
        for go_id in go_set:
            if go_id not in annots:
                annots[go_id] = 0
            annots[go_id] += 1
    filtered = list()
    for go_id in functions:
        if go_id in annots and annots[go_id] >= annot_num:
            filtered.append(go_id)
    print(len(filtered))
    df = pd.DataFrame({'functions': filtered})
    df.to_pickle(DATA_ROOT + FUNCTION + '.pkl')
    print('Saved ' + DATA_ROOT + FUNCTION + '.pkl')
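A typical call, assuming the module-level globals this function reads (go, func_set, functions, DATA_ROOT, FUNCTION) are initialized as elsewhere in the project:

get_functions(50)  # keep only GO terms with at least 50 experimentally annotated proteins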
Example #7
def compute_performance(preds, labels, gos):
    # preds = np.round(preds, decimals=2)
    f_max = 0
    p_max = 0
    r_max = 0
    t_max = 0
    predictions_max = None  # guard: avoid a NameError if no threshold improves f_max
    for t in range(1, 100):
        threshold = t / 100.0
        predictions = (preds > threshold).astype(np.int32)
        # predictions = list()
        total = 0
        f = 0.0
        p = 0.0
        r = 0.0
        p_total = 0
        for i in range(preds.shape[0]):
            tp = np.sum(predictions[i, :] * labels[i, :])
            fp = np.sum(predictions[i, :]) - tp
            fn = np.sum(labels[i, :]) - tp
            all_gos = set()
            all_preds = set()
            for go_id in gos[i]:
                if go_id in all_functions:
                    all_gos |= get_anchestors(go, go_id)
            all_gos.discard(GO_ID)
            # for val in preds[i]:
            #     go_id, score = val
            #     if score > threshold and go_id in all_functions:
            #         all_preds |= get_anchestors(go, go_id)
            # all_preds.discard(GO_ID)
            # predictions.append(all_preds)
            # tp = len(all_gos.intersection(all_preds))
            # fp = len(all_preds) - tp
            # fn = len(all_gos) - tp
            all_gos -= func_set
            fn += len(all_gos)

            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp != 0:
                p_total += 1
                precision = tp / (1.0 * (tp + fp))
                recall = tp / (1.0 * (tp + fn))
                p += precision
                r += recall
        if total > 0 and p_total > 0:
            r /= total
            p /= p_total
            if p + r > 0:
                f = 2 * p * r / (p + r)
                if f_max < f:
                    f_max = f
                    p_max = p
                    r_max = r
                    t_max = threshold
                    predictions_max = predictions

    return f_max, p_max, r_max, t_max, predictions_max
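A minimal smoke test of the threshold sweep; all_functions, func_set, go and GO_ID stand in for the module globals the function reads (placeholder values, not the project's), and the empty GO lists skip the ontology propagation entirely:

import numpy as np

all_functions, func_set, go, GO_ID = set(), set(), {}, 'GO:0003674'
preds = np.array([[0.9, 0.2], [0.1, 0.8]])
labels = np.array([[1, 0], [0, 1]])
gos = [[], []]  # no extra annotations to propagate
print(compute_performance(preds, labels, gos)[:4])  # (1.0, 1.0, 1.0, 0.2)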
Example #8
def performance_by_interpro():
    pred_df = pd.read_pickle(DATA_ROOT + 'test-' + FUNCTION + '-preds.pkl')
    ipro_df = load_prot_ipro()
    df = pred_df.merge(ipro_df, on='proteins', how='left')
    ipro = get_ipro()

    def reshape(values):
        values = np.hstack(values).reshape(
            len(values), len(values[0]))
        return values
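    # NOTE: the reshape helper above is defined but never used in this function.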

    for ipro_id in ipro:
        if len(ipro[ipro_id]['parents']) > 0:
            continue
        labels = list()
        predictions = list()
        gos = list()
        for i, row in df.iterrows():
            if not isinstance(row['ipros'], list):
                continue
            if ipro_id in row['ipros']:
                labels.append(row['labels'])
                predictions.append(row['predictions'])
                gos.append(row['gos'])
        pr = 0
        rc = 0
        total = 0
        p_total = 0
        for i in range(len(labels)):
            tp = np.sum(labels[i] * predictions[i])
            fp = np.sum(predictions[i]) - tp
            fn = np.sum(labels[i]) - tp
            all_gos = set()
            for go_id in gos[i]:
                if go_id in all_functions:
                    all_gos |= get_anchestors(go, go_id)
            all_gos.discard(GO_ID)
            all_gos -= func_set
            fn += len(all_gos)
            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp != 0:
                p_total += 1
                precision = tp / (1.0 * (tp + fp))
                recall = tp / (1.0 * (tp + fn))
                pr += precision
                rc += recall
        if total > 0 and p_total > 0:
            rc /= total
            pr /= p_total
            if pr + rc > 0:
                f = 2 * pr * rc / (pr + rc)
                logging.info('%s\t%d\t%f\t%f\t%f' % (
                    ipro_id, len(labels), f, pr, rc))
Example #9
def get_real_annotations():
    go = get_gene_ontology()
    df = pd.read_pickle('data/cafa3/swissprot_exp.pkl')
    annots = {}
    for i, row in df.iterrows():
        go_set = set()
        for go_id in row['annots']:
            go_id = go_id.split('|')
            if go_id[0] in go and go_id[1] in EXP_CODES:
                go_set |= get_anchestors(go, go_id[0])
        annots[row['proteins']] = go_set
    return annots
Example #10
def model(model_name):
    # set parameters:
    batch_size = 128
    nb_classes = len(functions)
    start_time = time.time()
    logging.info("Loading Data")
    data, targets = load_data()
    data_generator = DataGenerator(batch_size, nb_classes)
    data_generator.fit(data, None)

    logging.info("Data loaded in %d sec" % (time.time() - start_time))
    logging.info("Data size: %d" % len(data[0]))
    logging.info('Loading the model')
    with open(DATA_ROOT + model_name + '_' + FUNCTION + '.json', 'r') as f:
        json_string = next(f)
    model = model_from_json(json_string)

    optimizer = RMSprop()
    model.compile(optimizer=optimizer, loss='binary_crossentropy')

    model_path = DATA_ROOT + model_name + '_weights_' + FUNCTION + '.pkl'
    logging.info('Compilation finished in %d sec' % (time.time() - start_time))
    logging.info('Loading weights')
    load_model_weights(model, model_path)

    logging.info('Predicting')
    preds = model.predict_generator(data_generator,
                                    val_samples=len(data[0]),
                                    nb_worker=12)
    for i in xrange(len(preds)):
        preds[i] = preds[i].reshape(-1, 1)
    preds = np.concatenate(preds, axis=1)

    incon = 0
    for i in xrange(len(data)):
        for j in xrange(len(functions)):
            anchestors = get_anchestors(go, functions[j])
            for p_id in anchestors:
                if (p_id not in [GO_ID, functions[j]]
                        and preds[i, go_indexes[p_id]] < preds[i, j]):
                    incon += 1
                    preds[i, go_indexes[p_id]] = preds[i, j]
    logging.info('Inconsistent predictions: %d' % incon)

    predictions = list()
    for i in xrange(len(targets)):
        predictions.append(preds[i])
    df = pd.DataFrame({'targets': targets, 'predictions': predictions})
    print(len(df))
    df.to_pickle(DATA_ROOT + model_name + '_preds_' + FUNCTION + '.pkl')
    logging.info('Done in %d sec' % (time.time() - start_time))
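The loop that counts 'Inconsistent predictions' enforces the true-path rule: an ancestor's score must be at least as high as any descendant's, so an under-scored ancestor is lifted to the child's value. The same rule on one toy row, with a hypothetical two-term parent/child index map:

import numpy as np

go_indexes = {'GO:parent': 0, 'GO:child': 1}  # hypothetical index map
preds = np.array([[0.3, 0.7]])                # parent scored below its child
parent, child = go_indexes['GO:parent'], go_indexes['GO:child']
if preds[0, parent] < preds[0, child]:
    preds[0, parent] = preds[0, child]        # lift the parent's score
print(preds)                                  # [[0.7 0.7]]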
Example #11
def compute_performance(preds, labels, gos):
    #fw = open(TEST_DATA_ROOT + 'pred_miR-' + FUNCTION + '-line_emb_s100_n10_512-p_r.txt','a+')
    preds = np.round(preds, 2)
    f_max = 0
    p_max = 0
    r_max = 0
    t_max = 0
    predictions_max = None  # guard: avoid a NameError if no threshold improves f_max
    for t in xrange(1, 100):
        threshold = t / 100.0
        predictions = (preds > threshold).astype(np.int32)
        total = 0
        f = 0.0
        p = 0.0
        r = 0.0
        p_total = 0
        for i in range(labels.shape[0]):
            tp = np.sum(predictions[i, :] * labels[i, :])
            fp = np.sum(predictions[i, :]) - tp
            fn = np.sum(labels[i, :]) - tp
            all_gos = set()
            for go_id in gos[i]:
                if go_id in all_functions:
                    all_gos |= get_anchestors(go, go_id)
            all_gos.discard(GO_ID)
            all_gos -= func_set
            fn += len(all_gos)
            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp != 0:
                p_total += 1
                precision = tp / (1.0 * (tp + fp))
                recall = tp / (1.0 * (tp + fn))
                p += precision
                r += recall
        if p_total == 0:
            continue
        r /= total
        p /= p_total
        #fw.write(str(p) + '\t' + str(r) + '\t' + str(p_total) + '\t' + str(total) + '\n')
        if p + r > 0:
            f = 2 * p * r / (p + r)
            if f_max < f:
                f_max = f
                p_max = p
                r_max = r
                t_max = threshold
                predictions_max = predictions
    #fw.close()
    return f_max, p_max, r_max, t_max, predictions_max
Example #12
def compute_similarity_performance(train_df, test_df, preds):
    logging.info("Computing similarity performance")
    logging.info("Training data size %d" % len(train_df))
    train_labels = train_df['labels'].values
    train_gos = train_df['gos'].values
    global labels_gos
    labels_gos = list(zip(train_labels, train_gos))
    p = Pool(64)
    pred_gos = p.map(get_gos, preds)
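    # NOTE: 'p' is rebound below from the worker pool to a float accumulator;
    # the pool is left for garbage collection without an explicit close().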
    total = 0
    p = 0.0
    r = 0.0
    f = 0.0
    test_gos = test_df['gos'].values
    for gos, tgos in zip(pred_gos, test_gos):
        preds = set()
        test = set()
        for go_id in gos:
            if go_id in all_functions:
                preds |= get_anchestors(go, go_id)
        for go_id in tgos:
            if go_id in all_functions:
                test |= get_anchestors(go, go_id)
        tp = len(preds.intersection(test))
        fp = len(preds - test)
        fn = len(test - preds)
        if tp == 0 and fp == 0 and fn == 0:
            continue
        total += 1
        if tp != 0:
            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            p += precision
            r += recall
            f += 2 * precision * recall / (precision + recall)
    return f / total, p / total, r / total
Example #13
def compute_performance(preds, labels, gos):
    preds = np.round(preds, 2)
    f_max = 0
    p_max = 0
    r_max = 0
    t_max = 0
    predictions_max = None  # guard: avoid a NameError if no threshold improves f_max
    for t in range(1, 100):
        threshold = t / 100.0
        predictions = (preds > threshold).astype(np.int32)
        total = 0
        f = 0.0
        p = 0.0
        r = 0.0
        p_total = 0
        for i in range(labels.shape[0]):
            tp = np.sum(predictions[i, :] * labels[i, :])
            fp = np.sum(predictions[i, :]) - tp
            fn = np.sum(labels[i, :]) - tp
            all_gos = set()
            for go_id in gos[i]:
                if go_id in all_functions:
                    all_gos |= get_anchestors(go, go_id)
            for g_id in GO_IDS:
                all_gos.discard(g_id)
            all_gos -= func_set
            fn += len(all_gos)
            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp != 0:
                p_total += 1
                precision = tp / (1.0 * (tp + fp))
                recall = tp / (1.0 * (tp + fn))
                p += precision
                r += recall
        if p_total == 0:
            continue
        r /= total
        p /= p_total
        if p + r > 0:
            f = 2 * p * r / (p + r)
            if f_max < f:
                f_max = f
                p_max = p
                r_max = r
                t_max = threshold
                predictions_max = predictions
    return f_max, p_max, r_max, t_max, predictions_max
Example #14
def specific_predictions():
    root = 'data/cafa3/'
    go = get_gene_ontology()
    fw = open(root + 'test_predictions_specific.tab', 'w')
    with open(root + 'test_predictions.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            go_set = set(items[1:])
            gos = go_set.copy()
            for go_id in gos:
                anchestors = get_anchestors(go, go_id)
                anchestors.remove(go_id)
                go_set -= anchestors
            fw.write(items[0])
            for go_id in go_set:
                fw.write('\t' + go_id)
            fw.write('\n')
    fw.close()
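Subtracting every proper ancestor of every predicted term leaves only the most specific (leaf) predictions. A toy run of the same pruning, assuming the get_anchestors sketch after Example #1 and a hypothetical three-term is_a chain:

go = {'GO:a': {'is_a': []},
      'GO:b': {'is_a': ['GO:a']},
      'GO:c': {'is_a': ['GO:b']}}
go_set = {'GO:a', 'GO:b', 'GO:c'}
for go_id in go_set.copy():
    anchestors = get_anchestors(go, go_id)
    anchestors.remove(go_id)  # keep the term itself, drop its ancestors
    go_set -= anchestors
print(go_set)  # {'GO:c'}: only the leaf survives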
Example #15
def load_scores():
    scores = dict()
    with open('data/cosine.out') as f:
        for line in f:
            it = line.strip().split()
            prot = it[0].strip('()\',')
            go_id = it[1].strip('()\',').upper()
            score = float(it[2].strip('()\','))
            if prot not in scores:
                scores[prot] = {}
            if go_id in go:
                gos = get_anchestors(go, go_id)
                gos.add(go_id)
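                # redundant if get_anchestors already returns the term itself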
                for g_id in gos:
                    if g_id not in scores[prot]:
                        scores[prot][g_id] = score
                    else:
                        scores[prot][g_id] = max(scores[prot][g_id], score)
    return scores
Example #16
def load_annotations():
    mapping = load_mapping()
    annots = dict()
    with open('data/goa_human.gaf') as f:
        for line in f:
            if line.startswith('!'):
                continue
            it = line.strip().split('\t')
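            # GAF columns (0-based): 1 = accession, 3 = qualifier,
            # 4 = GO ID, 6 = evidence code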
            ac = it[1]
            if it[3] == 'NOT' or it[6] not in EXP_CODES:
                continue
            go_id = it[4]
            if ac not in mapping:
                continue
            prot = mapping[ac]
            if prot not in annots:
                annots[prot] = set()
            if go_id in go:
                annots[prot].add(go_id)
                annots[prot] |= get_anchestors(go, go_id)
    return annots
Example #17
def compute_performance():
    root = 'data/cafa3/'
    preds = {}
    annots = {}
    go = get_gene_ontology()
    with open(root + 'test_predictions.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            preds[items[0]] = set(items[1:])
    with open(root + 'test_annotations.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            annots[items[0]] = set()
            for go_id in items[1:]:
                if go_id in go:
                    annots[items[0]] |= get_anchestors(go, go_id)

    total = 0
    p = 0.0
    r = 0.0
    f = 0.0
    for prot, pred_annots in preds.items():
        real_annots = annots[prot]
        if len(real_annots) == 0:
            continue
        tp = len(real_annots.intersection(pred_annots))
        fp = len(pred_annots - real_annots)
        fn = len(real_annots - pred_annots)
        if tp == 0 and fp == 0 and fn == 0:
            continue
        total += 1
        if tp != 0:
            precision = tp / (1.0 * (tp + fp))
            recall = tp / (1.0 * (tp + fn))
            p += precision
            r += recall
            f += 2 * precision * recall / (precision + recall)
    print(f / total, p / total, r / total)
Example #18
def run():
    functions, phenotypes = load_data()
    terms = list()
    n = len(functions)
    global counter
    counter = Counter()
    global tree
    tree = dict()
    e = 100
    term_index = dict()
    term_list = list()
    for go_id in go:
        term_index[go_id] = len(term_index)
        term_list.append(go_id)
    for hp_id in hp:
        term_index[hp_id] = len(term_index)
        term_list.append(hp_id)
    for i in xrange(n):
        funcs = set(map(lambda x: term_index[x], functions[i]))
        phenos = set(map(lambda x: term_index[x], phenotypes[i]))
        terms.append(funcs | phenos)
        for func in funcs:
            for pheno in phenos:
                counter[frozenset([func, pheno])] += 1
    for s, c in counter.items():
        if c < e:
            del counter[s]
    for s, c in counter.items():
        for term in s:
            if term_list[term] in go:
                tree[term] = set(
                    map(lambda x: term_index[x],
                        get_anchestors(go, term_list[term])))
                tree[term] |= set(
                    map(lambda x: term_index[x],
                        get_subset(go, term_list[term])))
            else:
                tree[term] = set(
                    map(lambda x: term_index[x],
                        get_anchestors(hp, term_list[term])))
                tree[term] |= set(
                    map(lambda x: term_index[x],
                        get_subset(hp, term_list[term])))

    print(len(counter))
    pool = Pool(48)
    gf = gzip.open('data/results.gz', 'w')
    while len(counter) > 0:
        cnts = pool.map(next_level, terms)
        cnt = sum(cnts, Counter())  # sum() needs a Counter start; the default 0 raises TypeError
        print(counter.most_common(10))
        print(cnt.most_common(10))
        for s, c in cnt.items():
            if c < e:
                del cnt[s]
            else:
                gf.write(str(c))  # c is an int count; write() expects a string
                for term in s:
                    gf.write('\t' + term_list[term])
                gf.write('\n')
        counter = cnt
Example #19
def get_predictions():
    root = 'data/cafa3/'
    annots = {}
    preds = {}
    go = get_gene_ontology()
    mf = pd.read_pickle(root + 'mf.pkl')
    mf_df = pd.read_pickle(root + 'test-mf-preds.pkl')
    functions = mf['functions']
    for i, row in mf_df.iterrows():
        prot_id = row['proteins']
        if prot_id not in preds:
            preds[prot_id] = set()
        for i in xrange(len(functions)):
            if row['predictions'][i] == 1:
                preds[prot_id].add(functions[i])
        if prot_id not in annots:
            annots[prot_id] = row['gos']

    cc = pd.read_pickle(root + 'cc.pkl')
    cc_df = pd.read_pickle(root + 'test-cc-preds.pkl')
    functions = cc['functions']
    for i, row in cc_df.iterrows():
        prot_id = row['proteins']
        if prot_id not in preds:
            preds[prot_id] = set()
        for i in xrange(len(functions)):
            if row['predictions'][i] == 1:
                preds[prot_id].add(functions[i])
        if prot_id not in annots:
            annots[prot_id] = row['gos']

    bp = pd.read_pickle(root + 'bp.pkl')
    bp_df = pd.read_pickle(root + 'test-bp-preds.pkl')
    functions = bp['functions']
    for i, row in bp_df.iterrows():
        prot_id = row['proteins']
        if prot_id not in preds:
            preds[prot_id] = set()
        for i in xrange(len(functions)):
            if row['predictions'][i] == 1:
                preds[prot_id].add(functions[i])
        if prot_id not in annots:
            annots[prot_id] = row['gos']

    # Removing parent classes
    for prot_id in preds:
        go_set = preds[prot_id]
        gos = go_set.copy()
        for go_id in gos:
            anchestors = get_anchestors(go, go_id)
            anchestors.remove(go_id)
            go_set -= anchestors

    proteins = sorted(annots.keys(),
                      key=lambda x: (x.split('_')[1], x.split('_')[0]))
    with open(root + 'test_predictions.tab', 'w') as f:
        for prot_id in proteins:
            f.write(prot_id)
            for go_id in preds[prot_id]:
                f.write('\t' + go_id)
            f.write('\n')

    with open(root + 'test_annotations.tab', 'w') as f:
        for prot_id in proteins:
            f.write(prot_id)
            for go_id in annots[prot_id]:
                if go_id in go:
                    f.write('\t' + go_id)
            f.write('\n')
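The three per-ontology blocks above are identical apart from the file prefix. A hedged consolidation of the repeated block, reusing the original's names (root, preds, annots, Python 2's xrange), with collect being a hypothetical helper:

def collect(prefix, preds, annots):
    # One pass of the repeated per-ontology block above.
    func_df = pd.read_pickle(root + prefix + '.pkl')
    pred_df = pd.read_pickle(root + 'test-' + prefix + '-preds.pkl')
    functions = func_df['functions']
    for _, row in pred_df.iterrows():
        prot_id = row['proteins']
        pred_set = preds.setdefault(prot_id, set())
        for i in xrange(len(functions)):
            if row['predictions'][i] == 1:
                pred_set.add(functions[i])
        annots.setdefault(prot_id, row['gos'])

for prefix in ('mf', 'cc', 'bp'):
    collect(prefix, preds, annots)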
Example #20
def main(function):
    global go
    go = get_gene_ontology()
    func_df = pd.read_pickle(DATA_ROOT + function + '.pkl')
    global functions
    functions = func_df['functions'].values
    func_index = dict()
    for i, go_id in enumerate(functions):
        func_index[go_id] = i
    global func_set
    func_set = set(func_index)
    global GO_ID
    GO_ID = FUNC_DICT[function]
    global all_functions
    all_functions = get_go_set(go, GO_ID)
    pred_df = pd.read_pickle(DATA_ROOT + 'model_preds_' + function + '.pkl')
    # FFPred preds
    preds_dict = {}
    # files = os.listdir('data/ffpred/')
    # for fl in files:
    # with open('data/gofdr/predictions.tab') as f:
    #     for line in f:
    #         it = line.strip().split('\t')
    #         target_id = it[0]
    #         if function[1].upper() != it[2]:
    #             continue
    #         if target_id not in preds_dict:
    #             preds_dict[target_id] = list()
    #         preds_dict[target_id].append((it[1], float(it[3])))
    # print(len(preds_dict))
    target_ids = list()
    predictions = list()
    for key, val in preds_dict.items():
        target_ids.append(key)
        predictions.append(val)
    # pred_df = pd.DataFrame({'targets': target_ids, 'predictions': predictions})

    targets = dict()
    with open('data/cafa3/CAFA3_benchmark20170605/groundtruth/leafonly_' +
              function.upper() + 'O_unique.txt') as f:
        for line in f:
            it = line.strip().split('\t')
            target = it[0]
            go_id = it[1]
            if target not in targets:
                targets[target] = list()
            targets[target].append(go_id)
    target_ids = list()
    labels = list()
    go_ids = list()
    for target, gos in targets.items():
        go_set = set()
        for go_id in gos:
            if go_id in all_functions:
                go_set |= get_anchestors(go, go_id)
        label = np.zeros((len(functions), ), dtype=np.int32)
        for go_id in go_set:
            if go_id in func_index:
                label[func_index[go_id]] = 1
        target_ids.append(target)
        go_ids.append(go_set)
        labels.append(label)
    df = pd.DataFrame({'targets': target_ids, 'gos': go_ids, 'labels': labels})
    df = pd.merge(df, pred_df, on='targets', how='inner')
    df.to_pickle(DATA_ROOT + 'model_preds_filtered_' + function + '.pkl')

    def reshape(values):
        values = np.hstack(values).reshape(len(values), len(values[0]))
        return values

    preds = reshape(df['predictions'].values)
    labels = reshape(df['labels'].values)
    # preds = df['predictions'].values
    gos = df['gos'].values
    f, p, r, t, preds_max = compute_performance(preds, labels, gos)
    print(f, p, r)
    # labels = list()
    # scores = list()
    # for i in range(len(preds)):
    #     all_gos = set()
    #     for go_id in gos[i]:
    #         if go_id in all_functions:
    #             all_gos |= get_anchestors(go, go_id)
    #     all_gos.discard(GO_ID)
    #     scores_dict = {}
    #     for val in preds[i]:
    #         go_id, score = val
    #         if go_id in all_functions:
    #             go_set = get_anchestors(go, go_id)
    #             for g_id in go_set:
    #                 if g_id not in scores_dict or scores_dict[g_id] < score:
    #                     scores_dict[g_id] = score
    #     all_preds = set(scores_dict) # | all_gos
    #     all_preds.discard(GO_ID)
    #     for go_id in all_preds:
    #         if go_id in scores_dict:
    #             scores.append(scores_dict[go_id])
    #         else:
    #             scores.append(0)
    #         if go_id in all_gos:
    #             labels.append(1)
    #         else:
    #             labels.append(0)

    # scores = np.array(scores)
    # labels = np.array(labels)
    roc_auc = compute_roc(preds, labels)
    print(roc_auc)
    # preds_max = (scores > t).astype(np.int32)
    mcc = compute_mcc(preds_max, labels)
    print(mcc)
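A typical invocation, assuming DATA_ROOT, FUNC_DICT, compute_roc, compute_mcc and the pickled inputs are set up as elsewhere in the project:

main('mf')  # evaluate molecular-function predictions against the CAFA3 leaf benchmark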