Example 1
def main(go_file, uniprot_file, filter_exp, prop_annots, cafa_targets,
         out_file):
    go = Ontology(go_file, with_rels=True)

    proteins, accessions, sequences, annotations, interpros, orgs = load_data(
        uniprot_file)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'annotations': annotations,
        'interpros': interpros,
        'orgs': orgs
    })

    if filter_exp:
        logging.info('Filtering proteins with experimental annotations')
        index = []
        annotations = []
        for i, row in enumerate(df.itertuples()):
            annots = []
            for annot in row.annotations:
                go_id, code = annot.split('|')
                if is_exp_code(code):
                    annots.append(go_id)
            # Ignore proteins without experimental annotations
            if len(annots) == 0:
                continue
            index.append(i)
            annotations.append(annots)
        df = df.iloc[index]
        df = df.reset_index()
        df['annotations'] = annotations

    if cafa_targets:
        logging.info('Filtering cafa target proteins')
        index = []
        for i, row in enumerate(df.itertuples()):
            if is_cafa_target(row.orgs):
                index.append(i)
        df = df.iloc[index]
        df = df.reset_index()

    if prop_annots:
        prop_annotations = []
        for i, row in df.iterrows():
            # Propagate annotations
            annot_set = set()
            annots = row['annotations']
            for go_id in annots:
                go_id = go_id.split('|')[0]  # in case the evidence code is still attached
                annot_set |= go.get_anchestors(go_id)
            annots = list(annot_set)
            prop_annotations.append(annots)
        df['annotations'] = prop_annotations

    df.to_pickle(out_file)
    logging.info('Successfully saved %d proteins' % (len(df), ))
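The is_exp_code() helper used above is not shown in these examples; below is a minimal sketch, assuming the standard experimental GO evidence codes (the exact set used by the project may differ).

# Plausible sketch of is_exp_code(); the evidence-code set is an assumption.
EXP_CODES = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'IC'}

def is_exp_code(code):
    # True if the GO evidence code denotes an experimentally supported annotation
    return code in EXP_CODES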
Example 2
def main(go_file, swissprot_file, out_file):
    go = Ontology(go_file, with_rels=True)

    if swissprot_file.endswith("gz"):
        proteins, accessions, sequences, annotations, interpros, orgs = load_data_gzip(
            swissprot_file)
    else:
        proteins, accessions, sequences, annotations, interpros, orgs = load_data(
            swissprot_file)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'annotations': annotations,
        'interpros': interpros,
        'orgs': orgs
    })

    logging.info('Filtering proteins with experimental annotations')
    index = []
    annotations = []
    for i, row in enumerate(df.itertuples()):
        annots = []
        for annot in row.annotations:
            go_id, code = annot.split('|')
            if is_exp_code(code):
                annots.append(go_id)
        # Ignore proteins without experimental annotations
        if len(annots) == 0:
            continue
        index.append(i)
        annotations.append(annots)
    df = df.iloc[index]
    df = df.reset_index()
    df['exp_annotations'] = annotations

    prop_annotations = []
    for i, row in df.iterrows():
        # Propagate annotations
        annot_set = set()
        annots = row['exp_annotations']
        for go_id in annots:
            annot_set |= go.get_anchestors(go_id)
        annots = list(annot_set)
        prop_annotations.append(annots)
    df['prop_annotations'] = prop_annotations

    cafa_target = []
    for i, row in enumerate(df.itertuples()):
        if is_cafa_target(row.orgs):
            cafa_target.append(True)
        else:
            cafa_target.append(False)
    df['cafa_target'] = cafa_target

    df.to_pickle(out_file)
    logging.info('Successfully saved %d proteins' % (len(df), ))
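is_cafa_target() is likewise assumed rather than shown; a minimal sketch that checks the protein's NCBI taxonomy ID against a set of CAFA target species (the two IDs below are illustrative, not the full list).

# Plausible sketch of is_cafa_target(); CAFA_TARGETS here is illustrative only.
CAFA_TARGETS = {'9606', '10090'}  # e.g. human and mouse; the real list is longer

def is_cafa_target(org):
    # True if the organism taxonomy ID belongs to a CAFA target species
    return org in CAFA_TARGETS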
def main(go_file, out_file):
    go = Ontology(go_file, with_rels=False)

    cc = get_top_classes(go, FUNC_DICT['cc'])
    mf = get_top_classes(go, FUNC_DICT['mf'])
    bp = get_top_classes(go, FUNC_DICT['bp'])

    cc = list(
        map(lambda x: f'<http://purl.obolibrary.org/obo/{x.replace(":","_")}>',
            cc))
    mf = list(
        map(lambda x: f'<http://purl.obolibrary.org/obo/{x.replace(":","_")}>',
            mf))
    bp = list(
        map(lambda x: f'<http://purl.obolibrary.org/obo/{x.replace(":","_")}>',
            bp))
    # Write both directions of each cross-namespace pair of top classes
    # (note: each pair is emitted once per term of the third namespace)
    with open(out_file, 'w') as f:
        for id1 in cc:
            for id2 in mf:
                for id3 in bp:
                    f.write(id1 + '\t' + id2 + '\n')
                    f.write(id2 + '\t' + id1 + '\n')
                    f.write(id2 + '\t' + id3 + '\n')
                    f.write(id3 + '\t' + id2 + '\n')
                    f.write(id1 + '\t' + id3 + '\n')
                    f.write(id3 + '\t' + id1 + '\n')
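FUNC_DICT, used above to pick the top classes, maps the sub-ontology keys to their GO root terms. The values below are the standard GO root identifiers; the constant itself is assumed to live in the project's utilities.

# Assumed definition of FUNC_DICT (standard GO root terms per namespace).
FUNC_DICT = {
    'cc': 'GO:0005575',  # cellular_component
    'mf': 'GO:0003674',  # molecular_function
    'bp': 'GO:0008150',  # biological_process
}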
Example 4
def main(in_file, hp_file, terms_file, out_file, map_file):
    # Load HPO and read the list of all terms
    hp = Ontology(hp_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    df = pd.read_pickle(in_file)
    mapping = {}
    with open(map_file) as f:
        for line in f:
            it = line.strip().split()
            mapping[it[1]] = it[0]

    w = open(out_file, 'w')
    # w.write('AUTHOR Hoehndorf Lab - DeepGO team\n')
    # w.write('MODEL 1\n')
    # w.write('KEYWORDS machine learning, sequence alignment.\n')
    for row in df.itertuples():
        prot_id = row.genes
        # if prot_id not in mapping:
        #     continue
        # prot_id = mapping[prot_id]
        for i, score in enumerate(row.preds):
            if score >= 0.1:
                w.write(prot_id + '\t' + terms[i] + '\t%.2f\n' % score)
    # w.write('END\n')
    w.close()
def main(go_file, uniprot_file, filter_exp, prop_annots, out_file):
    go = Ontology(go_file, with_rels=True)

    proteins, accessions, sequences, annotations, interpros, orgs, genes, gene_names = load_data(uniprot_file)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'annotations': annotations,
        'interpros': interpros,
        'orgs': orgs,
        'genes': genes,
        'gene_names': gene_names
    })

    # Filter proteins
    df = df[df['orgs'] == '9606']
    
    logging.info('Filtering proteins with experimental annotations')
    index = []
    annotations = []
    iea_annotations = []
    for i, row in enumerate(df.itertuples()):
        annots = set()
        iea_annots = set()
        for annot in row.annotations:
            go_id, code = annot.split('|')
            anch_set = go.get_anchestors(go_id)
            if is_exp_code(code):
                annots |= anch_set
            iea_annots |= anch_set
        annots = list(annots)
        iea_annots = list(iea_annots)
        annotations.append(annots)
        iea_annotations.append(iea_annots)
    df['exp_annotations'] = annotations
    df['iea_annotations'] = iea_annotations

    
    df.to_pickle(out_file)
    logging.info('Successfully saved %d proteins' % (len(df),) )
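Several of these scripts propagate annotations with Ontology.get_anchestors() (the original spelling is kept). A minimal sketch of such a method, assuming the ontology stores each term's is_a parents in a dict attribute named ont (an assumption, not the project's exact layout).

from collections import deque

def get_anchestors(self, term_id):
    # Return the term itself plus all of its ancestors, following is_a links upward.
    if term_id not in self.ont:
        return set()
    result = set()
    queue = deque([term_id])
    while queue:
        t_id = queue.popleft()
        if t_id in result:
            continue
        result.add(t_id)
        queue.extend(self.ont[t_id].get('is_a', []))
    return result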
Example 6
def main(go_file, old_data_file, new_data_file, out_terms_file,
         train_data_file, test_data_file, min_count):
    go = Ontology(go_file, with_rels=True)
    logging.info('GO loaded')

    df = pd.read_pickle(old_data_file)

    logging.info('Processing annotations')

    cnt = Counter()
    annotations = list()
    for i, row in df.iterrows():
        for term in row['annotations']:
            cnt[term] += 1

    train_prots = set()
    for row in df.itertuples():
        p_id = row.proteins
        train_prots.add(p_id)

    df.to_pickle(train_data_file)

    # Filter terms with annotations more than min_count
    res = {}
    for key, val in cnt.items():
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)
    terms = []
    for key, val in res.items():
        print(key, len(val))
        terms += val

    logging.info(f'Number of terms {len(terms)}')

    # Save the list of terms
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)

    # Save testing data
    df = pd.read_pickle(new_data_file)

    index = []
    for i, row in enumerate(df.itertuples()):
        p_id = row.proteins
        if p_id not in train_prots:
            index.append(i)
    df = df.iloc[index]
    print('Number of test proteins', len(df))
    df.to_pickle(test_data_file)
Example 7
def main(go_file, terms_file, train_data_file, test_data_file, ont):
    go = Ontology(go_file, with_rels=True)
    logging.info('GO loaded')

    go_set = go.get_namespace_terms(NAMESPACES[ont])
    terms_df = pd.read_pickle(terms_file)
    tcnt = 0
    print('Total terms', len(terms_df))

    for go_id in terms_df['terms']:
        if go_id in go_set:
            tcnt += 1
    trdf = pd.read_pickle(train_data_file)
    print('Total train', len(trdf))
    cnt = 0
    for i, row in trdf.iterrows():
        ok = False
        for go_id in row['annotations']:
            if go_id in go_set:
                ok = True
                break
        if ok:
            cnt += 1
    print('Number of training proteins', cnt)

    tsdf = pd.read_pickle(test_data_file)
    print('Total test', len(tsdf))
    cnt = 0
    for i, row in tsdf.iterrows():
        ok = False
        for go_id in row['annotations']:
            if go_id in go_set:
                ok = True
                break
        if ok:
            cnt += 1
    print('Number of testing proteins', cnt)
    print('Number of terms', tcnt)
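NAMESPACES, used by get_namespace_terms() above, maps the same keys to the GO namespace names. The values below are the standard GO namespace strings; again an assumed constant.

# Assumed definition of NAMESPACES.
NAMESPACES = {
    'cc': 'cellular_component',
    'mf': 'molecular_function',
    'bp': 'biological_process',
}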
Example 8
def main(go_file, data_file, out_terms_file, train_data_file, test_data_file,
         min_count):
    go = Ontology(go_file, with_rels=True)
    logging.info('GO loaded')

    df = pd.read_pickle(data_file)
    print("DATA FILE", len(df))

    logging.info('Processing annotations')

    cnt = Counter()
    annotations = list()
    for i, row in df.iterrows():
        for term in row['prop_annotations']:
            cnt[term] += 1

    # Filter terms with annotations more than min_count
    res = {}
    for key, val in cnt.items():
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)
    terms = []
    for key, val in res.items():
        print(key, len(val))
        terms += val

    logging.info(f'Number of terms {len(terms)}')

    # Save the list of terms
    terms_df = pd.DataFrame({'terms': terms})
    terms_df.to_pickle(out_terms_file)

    n = len(df)
    # Split train/test
    index = np.arange(n)
    train_n = int(n * 0.95)
    np.random.seed(seed=0)
    np.random.shuffle(index)
    train_df = df.iloc[index[:train_n]]
    test_df = df.iloc[index[train_n:]]

    print('Number of train proteins', len(train_df))
    train_df.to_pickle(train_data_file)

    print('Number of test proteins', len(test_df))
    test_df.to_pickle(test_data_file)
def main(go_file, hp_file, terms_file, preds_file, gene):
    go = Ontology(go_file, with_rels=True)
    print('GO loaded')
    hp = Ontology(hp_file, with_rels=True)
    print('HP loaded')

    terms_df = pd.read_pickle(terms_file)
    global terms
    terms = terms_df['terms'].values.flatten()
    labels = terms_df['labels'].values.flatten()
    print('Phenotypes', len(terms))
    global term_set
    term_set = set(terms)
    terms_dict = {v: i for i, v in enumerate(terms)}

    df = pd.read_pickle(preds_file)
    row = df.loc[df['genes'] == gene]

    with open(f'data/{gene}.deepgo_annotations.txt', 'w') as f:
        dg = [x.split('|') for x in row['deepgo_annotations'].values[0]]
        dg = sorted(dg, key=lambda x: float(x[1]), reverse=True)
        for go_id, score in dg:
            name = go.get_term(go_id)['name']
            f.write(f'{go_id}\t{name}\t{score}\n')

    with open(f'data/{gene}.go_annotations.txt', 'w') as f:
        dg = [x for x in row['go_annotations'].values[0]]
        for go_id in dg:
            name = go.get_term(go_id)['name']
            f.write(f'{go_id}\t{name}\n')

    with open(f'data/{gene}.deeppheno_annotations.txt', 'w') as f:
        dp = [(terms[i], score)
              for i, score in enumerate(row['preds'].values[0])]
        dp = sorted(dp, key=lambda x: x[1], reverse=True)
        for hp_id, score in dp:
            name = hp.get_term(hp_id)['name']
            f.write(f'{hp_id}\t{name}\t{score}\n')
            if score < 0.01:
                break
Example 10
def main(train_data_file, test_data_file, diamond_scores_file, ont):

    go_rels = Ontology('data/go.obo', with_rels=True)

    train_df = pd.read_pickle(train_data_file)
    annotations = train_df['prop_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = pd.read_pickle(test_data_file)
    test_annotations = test_df['prop_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    go_rels.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score

        blast_preds.append(annots)

    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))
    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for t in range(101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in blast_preds[i].items():
                if score >= threshold:
                    annots.add(go_id)

            new_annots = set()
            for go_id in annots:
                new_annots |= go_rels.get_anchestors(go_id)
            preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        fscore, prec, rec, s = evaluate_annotations(go_rels, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.3f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig('aupr.pdf')
    plt.show()
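evaluate_annotations() drives the Fmax and Smin figures above; a minimal sketch of the protein-centric computation it is assumed to perform (Example 17 unpacks a variant that additionally returns RU, MI and the per-protein false-positive/false-negative sets).

import math

# Minimal sketch of evaluate_annotations(); not the project's exact implementation.
def evaluate_annotations(go, real_annots, pred_annots):
    total = 0          # proteins with at least one predicted term
    p, r = 0.0, 0.0    # summed precision and recall
    ru, mi = 0.0, 0.0  # remaining uncertainty and misinformation
    for labels, preds in zip(real_annots, pred_annots):
        tp = labels.intersection(preds)
        fp = preds - tp
        fn = labels - tp
        ru += sum(go.get_ic(t) for t in fn)
        mi += sum(go.get_ic(t) for t in fp)
        r += len(tp) / len(labels) if labels else 0.0
        if preds:
            total += 1
            p += len(tp) / len(preds)
    n = len(real_annots)
    ru /= n
    mi /= n
    r /= n
    p = p / total if total > 0 else 0.0
    f = 2 * p * r / (p + r) if p + r > 0 else 0.0
    s = math.sqrt(ru * ru + mi * mi)
    return f, p, r, s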
Example 11
def main(hp_file, data_file, terms_file, gos_file, out_file, fold, batch_size,
         epochs, load_model, logger_file, threshold, device, estimators):
    gos_df = pd.read_pickle(gos_file)
    gos = gos_df['gos'].values.flatten()
    gos_dict = {v: i for i, v in enumerate(gos)}

    # cross validation settings
    out_file = f'fold{fold}_exp-' + out_file
    params = {'n_estimators': estimators}
    print('Params:', params)
    global hpo
    hpo = Ontology(hp_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    global terms
    terms = terms_df['terms'].values.flatten()
    print('Phenotypes', len(terms))
    global term_set
    term_set = set(terms)
    train_df, valid_df, test_df = load_data(data_file, terms, fold)
    terms_dict = {v: i for i, v in enumerate(terms)}
    nb_classes = len(terms)
    params['nb_classes'] = nb_classes
    print(len(terms_dict))
    test_steps = int(math.ceil(len(test_df) / batch_size))
    test_generator = DFGenerator(test_df, gos_dict, terms_dict, len(test_df))
    valid_steps = int(math.ceil(len(valid_df) / batch_size))
    train_steps = int(math.ceil(len(train_df) / batch_size))

    xy_generator = DFGenerator(train_df, gos_dict, terms_dict, len(train_df))
    x, y = xy_generator[0]
    val_generator = DFGenerator(valid_df, gos_dict, terms_dict, len(valid_df))
    val_x, val_y = val_generator[0]
    test_x, test_y = test_generator[0]
    if load_model:
        logging.info(f'Loading RandomForest_{estimators} classifier')
        clf = load(f'data/rf_{estimators}.joblib')
    else:
        logging.info('Training RandomForest classifier')
        clf = RandomForestRegressor(n_estimators=params['n_estimators'])
        clf.fit(x, y)
        dump(clf, f'data/rf_{estimators}.joblib')

    logging.info('Evaluating model')
    val_preds = clf.predict(val_x)
    # val_accuracy = accuracy_score(val_preds, val_y)
    # print('Val accuracy', val_accuracy)

    preds = clf.predict(test_x)
    # test_accuracy = accuracy_score(preds, test_y)
    # print('Test accuracy', test_accuracy)

    all_terms_df = pd.read_pickle('data/all_terms.pkl')
    all_terms = all_terms_df['terms'].values
    all_terms_dict = {v: k for k, v in enumerate(all_terms)}
    all_labels = np.zeros((len(test_df), len(all_terms)), dtype=np.int32)
    for i, row in enumerate(test_df.itertuples()):
        for hp_id in row.hp_annotations:
            if hp_id in all_terms_dict:
                all_labels[i, all_terms_dict[hp_id]] = 1

    all_preds = np.zeros((len(test_df), len(all_terms)), dtype=np.float32)
    for i in range(len(test_df)):
        for j in range(nb_classes):
            all_preds[i, all_terms_dict[terms[j]]] = preds[i, j]
    logging.info('Computing performance:')
    roc_auc = compute_roc(all_labels, all_preds)
    print('ROC AUC: %.2f' % (roc_auc, ))
    test_df['preds'] = list(preds)
    print(test_df)
    logging.info('Saving predictions')
    test_df.to_pickle(out_file)
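compute_roc() above is assumed to be a micro-averaged ROC AUC over the flattened label and prediction matrices; a minimal sketch using scikit-learn.

import numpy as np
from sklearn.metrics import auc, roc_curve

def compute_roc(labels, preds):
    # Micro-average: flatten the (samples x terms) matrices and compute a single curve.
    fpr, tpr, _ = roc_curve(np.asarray(labels).flatten(), np.asarray(preds).flatten())
    return auc(fpr, tpr)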
Example 12
def main(model_file, terms_file, annotations_file):

    go_rels = Ontology('data/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()

    df = pd.read_pickle(annotations_file)
    annotations = df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    go_rels.calculate_ic(annotations)

    # df = df[df['orgs'] == '559292']
    sl = 0

    annotations = df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    prot_ids = df['proteins'].values
    ids, data = get_data(df['sequences'])

    # Load CNN model
    model = load_model(model_file)

    preds = model.predict(data, batch_size=100, verbose=1)
    assert preds.shape[1] == len(terms)
    mf_set = go_rels.get_namespace_terms(NAMESPACES['mf'])
    # terms = ['GO:0008047']
    for l in range(len(terms)):
        # if terms[l] not in mf_set:
        #     continue
        deep_preds = {}
        for i, j in enumerate(ids):
            prot_id = prot_ids[j]
            if prot_id not in deep_preds:
                deep_preds[prot_id] = {}
            if preds[i, l] >= 0.01:  # Filter out very low scores
                if terms[l] not in deep_preds[prot_id]:
                    deep_preds[prot_id][terms[l]] = preds[i, l]
                else:
                    deep_preds[prot_id][terms[l]] = max(
                        deep_preds[prot_id][terms[l]], preds[i, l])

        go_set = set([terms[l]])
        # go_set.remove(FUNC_DICT['mf'])
        labels = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), annotations))
        bin_labels = list(map(lambda x: len(x), labels))
        pos_cnt = sum(bin_labels)
        fmax = 0.0
        tmax = 0.0
        smin = 1000
        for t in range(0, 100):
            threshold = t / 100.0
            predictions = []
            for i, row in enumerate(df.itertuples()):
                # .get avoids a KeyError for proteins that get_data() filtered out
                annots_dict = deep_preds.get(row.proteins, {})

                annots = set()
                for go_id, score in annots_dict.items():
                    if score >= threshold:
                        annots.add(go_id)
                # new_annots = set()
                # for go_id in annots:
                #     new_annots |= go_rels.get_anchestors(go_id)
                predictions.append(annots)

            # Filter classes
            predictions = list(
                map(lambda x: set(filter(lambda y: y in go_set, x)),
                    predictions))

            fscore, prec, rec, s = evaluate_annotations(
                go_rels, labels, predictions)
            # print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
            if fmax < fscore:
                fmax = fscore
                tmax = threshold
            if smin > s:
                smin = s
        print(
            f'{terms[l]} {pos_cnt} Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
        )
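get_data() above turns raw sequences into the CNN input tensor; a plausible sketch under DeepGOPlus-style assumptions (MAXLEN, the 20-letter alphabet and the padding channel at index 0 are assumptions, not the project's verbatim code).

import numpy as np

MAXLEN = 2000
AALETTER = 'ARNDCQEGHILKMFPSTWYV'
AAINDEX = {aa: i + 1 for i, aa in enumerate(AALETTER)}

def get_data(sequences):
    # Returns the positional indices of the kept sequences and their one-hot encodings.
    ids, rows = [], []
    for i, seq in enumerate(sequences):
        if len(seq) > MAXLEN:
            continue  # drop sequences that do not fit the fixed input length
        onehot = np.zeros((MAXLEN, len(AALETTER) + 1), dtype=np.float32)
        for j, aa in enumerate(seq):
            onehot[j, AAINDEX.get(aa, 0)] = 1
        ids.append(i)
        rows.append(onehot)
    return np.array(ids), np.array(rows)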
Example 13
def main(train_data_file, test_data_file, out_file, terms_file, root_class, fold):
    # Cross validation evaluation
    out_file = f'fold{fold}_' + out_file
    test_data_file = f'fold{fold}_' + test_data_file
    
    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)

    naive_annots = {}
    for i, row in train_df.iterrows():
        for hp_id in row.hp_annotations:
            if hp_id in naive_annots:
                naive_annots[hp_id] += 1
            else:
                naive_annots[hp_id] = 1
    for hp_id in naive_annots:
        naive_annots[hp_id] /= 1.0 * len(train_df)

    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32)
    for i in range(len(labels)):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = naive_annots[hp_id]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc): 
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    # NOTE: early return; the Fmax/Smin evaluation below is never executed
    return
    
    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        annots = set()
        for hp_id, score in naive_annots.items():
            if score >= threshold:
                annots.add(hp_id)
        new_annots = set()
        for hp_id in annots:
            new_annots |= hp.get_anchestors(hp_id)

        preds = []
        for i, row in enumerate(test_df.itertuples()):
            preds.append(new_annots)
        
    
        # Filter classes
        
        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            pmax = prec
            rmax = rec
            max_preds = preds
        if smin > s:
            smin = s
    test_df['hp_preds'] = max_preds
    test_df.to_pickle(out_file)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
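This script, like Example 10 before it, calls calculate_ic() before computing Smin; a minimal sketch of how an IC table can be built from the annotation sets, assuming the conditional-frequency formulation (get_parents() is a hypothetical accessor for a term's direct parents).

import math
from collections import Counter

def calculate_ic(self, annots):
    # annots: an iterable of per-protein annotation sets
    cnt = Counter()
    for terms in annots:
        cnt.update(terms)
    self.ic = {}
    for term_id, n in cnt.items():
        parents = [p for p in self.get_parents(term_id) if p in cnt]  # hypothetical helper
        min_n = min((cnt[p] for p in parents), default=n)
        # IC as the log-ratio between the parent's and the term's annotation counts
        self.ic[term_id] = math.log(min_n / n, 2)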
def main(go_file, data_file, neg_data_file, cls_embeds_file, rel_embeds_file,
         margin):
    go = Ontology(go_file, with_rels=False)

    cls_df = pd.read_pickle(cls_embeds_file)
    rel_df = pd.read_pickle(rel_embeds_file)
    nb_classes = len(cls_df)
    nb_relations = len(rel_df)
    embeds_list = cls_df['embeddings'].values
    classes = {v: k for k, v in enumerate(cls_df['classes'])}
    rembeds_list = rel_df['embeddings'].values
    relations = {v: k for k, v in enumerate(rel_df['relations'])}
    size = len(embeds_list[0])
    embeds = np.zeros((nb_classes, size), dtype=np.float32)
    for i, emb in enumerate(embeds_list):
        embeds[i, :] = emb
    rs = np.abs(embeds[:, -1])
    embeds = embeds[:, :-1]

    rsize = len(rembeds_list[0])
    rembeds = np.zeros((nb_relations, rsize), dtype=np.float32)
    for i, emb in enumerate(rembeds_list):
        rembeds[i, :] = emb

    data, _, _, _ = load_data(data_file, neg_data_file, index=False)

    print(relations)
    # Evaluate normal form 1 axioms
    n = 0
    s = 0
    for c, d in data['nf1']:
        if c not in classes or d not in classes:
            continue
        n += 1
        c, d = classes[c], classes[d]
        ec = embeds[c, :]
        rc = rs[c]
        ed = embeds[d, :]
        rd = rs[d]
        if is_inside(ec, rc, ed, rd, margin):
            s += 1
    print('Normal form 1', n, s, s / n)

    # Normal form 2 axioms
    n = 0
    s = 0
    ns = 0
    for c, d, e in data['nf2']:
        if c not in classes or d not in classes or e not in classes:
            continue
        n += 1
        c, d, e = classes[c], classes[d], classes[e]
        ec = embeds[c, :]
        rc = rs[c]
        ed = embeds[d, :]
        rd = rs[d]
        ee = embeds[e, :]
        re = rs[e]
        dst = np.linalg.norm(ec - ed) - margin
        if dst <= rc + rd and (is_inside(ec, rc, ee, re, margin)
                               or is_inside(ed, rd, ee, re, margin)):
            s += 1
        elif (dst > rc and dst > rd) and dst <= rc + rd:
            x = (dst * dst - rc * rc + rd * rd) / (2 * dst)
            rx = math.sqrt(rd * rd - x * x)
            c = x / dst
            ex = ed + (ec - ed) * c
            if is_inside(ex, rx, ee, re, margin):
                s += 1
        elif dst > rc + rd:
            ns += 1

    print('Normal form 2', n, s, s / n, ns)

    # Evaluate normal form 3 axioms
    # C subclassOf R some D
    n = 0  # len(data['nf3'])
    s = 0
    for c, r, d in data['nf3']:
        if c not in classes or d not in classes or r not in relations:
            continue
        c, r, d = classes[c], relations[r], classes[d]
        if r not in [0, 1, 3, 7, 9, 10, 15]:
            continue
        n += 1
        ec = embeds[c, :]
        rc = rs[c]
        ed = embeds[d, :]
        rd = rs[d]
        er = rembeds[r, :]
        ec = ec + er
        if is_inside(ec, rc, ed, rd, margin):
            s += 1
    print('Normal form 3', n, s, s / n)

    # Evaluate normal form 4 axioms
    # R some C subclassOf D
    n = 0
    s = 0
    for r, c, d in data['nf4']:
        if c not in classes or d not in classes or r not in relations:
            continue
        n += 1
        r, c, d = relations[r], classes[c], classes[d]
        ec = embeds[c, :]
        rc = rs[c]
        ed = embeds[d, :]
        rd = rs[d]
        er = rembeds[r, :]
        ec = ec - er
        if is_intersect(ec, rc, ed, rd, margin):
            s += 1
    print('Normal form 4', n, s, s / n)

    # Disjointness axioms
    n = len(data['disjoint'])
    s = 0
    for c, d, e in data['disjoint']:
        c, d = classes[c], classes[d]
        ec = embeds[c, :]
        rc = rs[c]
        ed = embeds[d, :]
        rd = rs[d]

        if not is_intersect(ec, rc, ed, rd):
            s += 1
    print('Disjointness', n, s, s / n)

    # plot_embeddings(embeds, rs, classes)
    # NOTE: early return; the subsumption-inference experiment below is never executed
    return
    g = {}
    for i in range(len(embeds)):
        g[i] = []
    for c, d in data['nf1']:
        g[d].append(c)

    sub_n = 1000
    labels = np.zeros((sub_n, len(embeds)), dtype=np.int8)

    print('Building labels')

    for i in range(sub_n):
        q = deque()
        for ch in g[i]:
            q.append(ch)
        while len(q) > 0:
            c = q.popleft()
            for ch in g[c]:
                q.append(ch)
            labels[i, c] = 1

    print('Running inference')
    preds = np.zeros((sub_n, len(embeds)), dtype=np.int8)
    for i in range(sub_n):
        c = embeds[i, :]
        rc = rs[i]
        dst = np.linalg.norm(embeds - c, axis=1)
        dst = dst + rs - margin
        subs = (dst <= rc).astype(np.int8)
        preds[i, :] = subs

    tp = np.sum((labels == 1) & (preds == 1))
    fp = np.sum((labels == 0) & (preds == 1))
    fn = np.sum((labels == 1) & (preds == 0))
    precision = tp / (fp + tp)
    recall = tp / (fn + tp)
    f = 2 * precision * recall / (precision + recall)
    print(f, precision, recall)
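The geometric tests is_inside() and is_intersect() used above are assumed; minimal sketches consistent with how they are called, treating each class as an n-ball with centre e and radius r and margin as a slack term.

import numpy as np

def is_inside(ec, rc, ed, rd, margin=0.0):
    # Ball (ec, rc) lies within ball (ed, rd), allowing the given slack.
    return np.linalg.norm(ec - ed) + rc - margin <= rd

def is_intersect(ec, rc, ed, rd, margin=0.0):
    # The two balls overlap, allowing the given slack.
    return np.linalg.norm(ec - ed) - margin <= rc + rd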
def main(train_data_file, test_data_file, terms_file, out_file, root_class,
         fold):
    # Cross validation evaluation
    out_file = f'fold{fold}_' + out_file
    test_data_file = f'fold{fold}_' + test_data_file

    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32)
    for i, row in enumerate(test_df.itertuples()):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = row.preds[terms_dict[hp_id]]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    # NOTE: early return; the Fmax/Smin evaluation below is never executed
    return
    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            gene_id = row.genes
            annots_dict = {}

            for j, score in enumerate(row.preds):
                hp_id = terms[j]
                # score = score * (1 - alpha)
                if hp_id in annots_dict:
                    annots_dict[hp_id] += score
                else:
                    annots_dict[hp_id] = score

            annots = set()
            for hp_id, score in annots_dict.items():
                if score >= threshold:
                    annots.add(hp_id)
            new_annots = set()
            for hp_id in annots:
                new_annots |= hp.get_anchestors(hp_id)
            new_annots = new_annots.intersection(hp_set)
            preds.append(new_annots)

        # Filter classes

        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            max_preds = preds
            pmax = prec
            rmax = rec
        if smin > s:
            smin = s
    test_df['hp_preds'] = max_preds
    test_df.to_pickle(out_file)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(
        f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
    )
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
Example 16
def main(train_data_file, test_data_file, terms_file, rules_file):

    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)

    rule_annots = {}
    with open(rules_file) as f:
        for line in f:
            it = line.strip().split()
            go_id = it[0].replace('_', ':')
            hp_id = it[1].replace('_', ':')
            if go_id not in rule_annots:
                rule_annots[go_id] = set()
            rule_annots[go_id].add(hp_id)

    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)

    hp_set_anch = set()
    for hp_id in hp_set:
        hp_set_anch |= hp.get_anchestors(hp_id)

    labels = test_annotations
    # labels = list(map(lambda x: set(filter(lambda y: y in hp_set_anch, x)), labels))

    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            gene_id = row.genes
            annots = set()
            for item in row.deepgo_annotations:
                go_id, score = item.split('|')
                score = float(score)
                if score >= threshold and go_id in rule_annots:
                    annots |= rule_annots[go_id]
            new_annots = set()
            for hp_id in annots:
                new_annots |= hp.get_anchestors(hp_id)

            preds.append(new_annots)

        # Filter classes

        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            max_preds = preds
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    test_df['hp_preds'] = max_preds
    test_df.to_pickle('data/predictions_max.pkl')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
Example 17
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont,
         alpha):

    alpha /= 100.0
    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    # terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))

    # NOTE: @annotations and @test_annotations are used to compute IC scores, so they must not be pre-filtered
    go_rels.calculate_ic(annotations + test_annotations)

    go_set = go_rels.get_namespace_terms(
        NAMESPACES[ont])  # all terms of the selected namespace (MF, CC or BP)

    #### ? filter terms to have only mf ?
    # terms = [t for t in terms if t in go_set]
    # print ('number of terms kept from terms_file {}'.format(len(terms)))

    # Print IC values of terms
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)
    # Save the IC value table for reuse
    pickle.dump(ics, open("data-cafa/ICsValueTable.pickle", "wb"))

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    ####
    # BLAST similarity (Diamond); the existing Diamond output file is reused as-is
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            ind = np.argsort(-sim)
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    ####
    # DeepGOPlus

    # go_set = go_rels.get_namespace_terms(NAMESPACES[ont]) #? consider all the MF or CC or BP
    go_set.remove(FUNC_DICT[ont])
    labels = test_df['annotations'].values
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)),
                      labels))  ##! filter true labels by @go_set
    print("total labels {}".format(len(go_set)))

    deep_preds = []
    # alphas = {NAMESPACES['mf']: 0.55, NAMESPACES['bp']: 0.59, NAMESPACES['cc']: 0.46}
    for i, row in enumerate(
            test_df.itertuples()):  #! read in prediction of neural net
        annots_dict = {}
        # annots_dict = blast_preds[i].copy() #! copy blast score
        # for go_id in annots_dict: # * set 0 for all @blast_prediction
        #     annots_dict[go_id] = 0 # *= alphas[go_rels.get_namespace(go_id)] #! scale down blast score.
        for j, score in enumerate(row.preds):  #! prediction of @test_df
            go_id = terms[j]
            # if go_id not in go_set: #? faster filter of labels because we don't add ancestor anyway
            #     continue
            # score *= 1 - alphas[go_rels.get_namespace(go_id)] # x *= 1-0.5 --> x = x * (1-0.5)
            # if go_id in annots_dict: #? should not need this line??
            #     annots_dict[go_id] += score #! add into blast score
            # else: #! are we going to see error??
            annots_dict[go_id] = score  #! replace blast score
        deep_preds.append(annots_dict)  #! later on, we use only @deep_preds

    # print('AUTHOR DeepGOPlus')
    # print('MODEL 1')
    # print('KEYWORDS sequence alignment.')
    # for i, row in enumerate(test_df.itertuples()):
    #     prot_id = row.proteins
    #     for go_id, score in deep_preds[i].items():
    #         print(f'{prot_id}\t{go_id}\t{score:.2f}')
    # print('END')
    # return

    # Propagate scores
    # deepgo_preds = []
    # for annots_dict in deep_preds:
    #     annots = {}
    #     for go_id, score in annots_dict.items():
    #         for a_id in go_rels.get_anchestors(go_id):
    #             if a_id in annots:
    #                 annots[a_id] = max(annots[a_id], score)
    #             else:
    #                 annots[a_id] = score
    #     deepgo_preds.append(annots)

    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    rus = []
    mis = []

    print('\nontology {}\n'.format(ont))

    ####

    for threshold in np.arange(0.005, .4, .01):  # np.arange(0.005,1,.01)
        # threshold = t / 100.0
        print('\n')
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in deep_preds[i].items():
                if go_id not in go_set:  # filter early; ancestors are not propagated anyway
                    continue
                if score >= threshold:
                    annots.add(go_id)

            preds.append(annots)

            # Optionally propagate to ancestor terms (disabled):
            # new_annots = set()
            # for go_id in annots:
            #     new_annots |= go_rels.get_anchestors(go_id)
            # preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        # print ('see 1 prediction')
        # print (preds[10])
        # print ('see 1 label')
        # print (labels[10])

        fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
            go_rels, labels, preds)
        avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
        avg_ic = sum(
            map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                fps)) / len(fps)
        print(f'{avg_fp} {avg_ic}')
        precisions.append(prec)
        recalls.append(rec)
        print(
            f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}'
        )
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'\nFmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
Example 18
def main(train_data_file, test_data_file, ont):

  go_rels = Ontology('data/go.obo', with_rels=True)
  # terms_df = pd.read_pickle('data-deepgo/' + ont + '.pkl')
  # terms = terms_df['functions'].values.flatten()
  # terms_dict = {v: i for i, v in enumerate(terms)}

  train_df = pd.read_pickle(train_data_file)
  annotations = train_df['annotations'].values
  annotations = list(map(lambda x: set(x), annotations))

  test_df = pd.read_pickle(test_data_file)
  test_annotations = test_df['annotations'].values
  test_annotations = list(map(lambda x: set(x), test_annotations))

  go_rels.calculate_ic(annotations + test_annotations)

  go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
  go_set.remove(FUNC_DICT[ont])

  annotations = list(map(lambda x: set(filter(lambda y: y in go_set, x)), annotations))

  cnt = Counter()
  max_n = 0
  for x in annotations:
    cnt.update(x)
  print(cnt.most_common(10))
  max_n = cnt.most_common(1)[0][1]
  print(max_n)
  scores = {}
  for go_id, n in cnt.items():
    score = n / max_n
    scores[go_id] = score  # relative annotation frequency, used as the naive prediction score

  prot_index = {}
  for i, row in enumerate(train_df.itertuples()):
    prot_index[row.proteins] = i


  labels = test_annotations
  labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
  print(len(go_set))
  fmax = 0.0
  tmax = 0.0
  smin = 1000.0
  precisions = []
  recalls = []
  for threshold in np.arange(0.005,.5,.01): # 
    # threshold = t / 100.0
    preds = []
    annots = set()
    for go_id, score in scores.items():
      if score >= threshold:
        annots.add(go_id)
      # new_annots = set()
      # for go_id in annots:
      #     new_annots |= go_rels.get_anchestors(go_id)
      # new_annots = set(filter(lambda y: y in go_set, new_annots))
    for i, row in enumerate(test_df.itertuples()):
      preds.append(annots.copy())

    fscore, prec, rec, s = evaluate_annotations(go_rels, labels, preds)
    precisions.append(prec)
    recalls.append(rec)
    print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
    if fmax < fscore:
      fmax = fscore
      tmax = threshold
    if smin > s:
      smin = s
  print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
  precisions = np.array(precisions)
  recalls = np.array(recalls)
  sorted_index = np.argsort(recalls)
  recalls = recalls[sorted_index]
  precisions = precisions[sorted_index]
  aupr = np.trapz(precisions, recalls)
  print(f'AUPR: {aupr:0.3f}')
  plt.figure()
  lw = 2
  plt.plot(recalls, precisions, color='darkorange',
       lw=lw, label=f'AUPR curve (area = {aupr:0.3f})')
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.05])
  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.title('Area Under the Precision-Recall curve')
  plt.legend(loc="lower right")
  plt.savefig('aupr.pdf')
  plt.show()
def main(benchmark_file, train_data_file, hpo_file, terms_file, root_class):

    hp = Ontology(hpo_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    noknowledge_prots = set()
    with open('data-cafa/noknowledge_targets.txt') as f:
        for line in f:
            noknowledge_prots.add(line.strip())

    bench_annots = {}
    with open(benchmark_file) as f:
        for line in f:
            it = line.strip().split('\t')
            t_id = it[0]
            if t_id not in noknowledge_prots:
                continue
            hp_id = it[1]
            if t_id not in bench_annots:
                bench_annots[t_id] = set()
            bench_annots[t_id] |= hp.get_anchestors(hp_id)

    train_df = pd.read_pickle(train_data_file)
    naive_annots = {}
    for i, row in train_df.iterrows():
        for hp_id in row.hp_annotations:
            if hp_id in naive_annots:
                naive_annots[hp_id] += 1
            else:
                naive_annots[hp_id] = 1
    for hp_id in naive_annots:
        naive_annots[hp_id] /= 1.0 * len(train_df)

    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = []
    for t_id, hps in bench_annots.items():
        labels.append(hps)
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(bench_annots), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(bench_annots), len(hp_set)), dtype=np.int32)
    for i in range(len(labels)):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = naive_annots[hp_id]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    # NOTE: early return; the Fmax/Smin evaluation below is never executed
    return

    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        annots = set()
        for hp_id, score in naive_annots.items():
            if score >= threshold:
                annots.add(hp_id)
        new_annots = set()
        for hp_id in annots:
            new_annots |= hp.get_anchestors(hp_id)
        preds = []
        for t_id, hps in bench_annots.items():
            preds.append(new_annots)

        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            pmax = prec
            rmax = rec
            tmax = threshold
            max_preds = preds
        if smin > s:
            smin = s
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(
        f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
    )
Example 20
def main(go_file, train_data_file, test_data_file, terms_file, model_file,
         out_file, split, batch_size, epochs, load, logger_file, threshold,
         device, params_index):
    params = {
        'max_kernel': 129,
        'initializer': 'glorot_normal',
        'dense_depth': 0,
        'nb_filters': 512,
        'optimizer': Adam(lr=3e-4),
        'loss': 'binary_crossentropy'
    }
    # SLURM JOB ARRAY INDEX
    pi = params_index
    if params_index != -1:
        kernels = [33, 65, 129, 257, 513]
        dense_depths = [0, 1, 2]
        nb_filters = [32, 64, 128, 256, 512]
        params['max_kernel'] = kernels[pi % 5]
        pi //= 5
        params['dense_depth'] = dense_depths[pi % 3]
        pi //= 3
        params['nb_filters'] = nb_filters[pi % 5]
        pi //= 5
        out_file = f'data/predictions_{params_index}.pkl'
        logger_file = f'data/training_{params_index}.csv'
        model_file = f'data/model_{params_index}.h5'
    print('Params:', params)

    go = Ontology(go_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()

    train_df, valid_df = load_data(train_data_file, terms, split)
    test_df = pd.read_pickle(test_data_file)
    terms_dict = {v: i for i, v in enumerate(terms)}
    nb_classes = len(terms)
    with tf.device('/' + device):
        test_steps = int(math.ceil(len(test_df) / batch_size))
        test_generator = DFGenerator(test_df, terms_dict, nb_classes,
                                     batch_size)
        if load:
            logging.info('Loading pretrained model')
            model = load_model(model_file)
        else:
            logging.info('Creating a new model')
            model = create_model(nb_classes, params)

            logging.info("Training data size: %d" % len(train_df))
            logging.info("Validation data size: %d" % len(valid_df))
            checkpointer = ModelCheckpoint(filepath=model_file,
                                           verbose=1,
                                           save_best_only=True)
            earlystopper = EarlyStopping(monitor='val_loss',
                                         patience=6,
                                         verbose=1)
            logger = CSVLogger(logger_file)

            logging.info('Starting training the model')

            valid_steps = int(math.ceil(len(valid_df) / batch_size))
            train_steps = int(math.ceil(len(train_df) / batch_size))
            train_generator = DFGenerator(train_df, terms_dict, nb_classes,
                                          batch_size)
            valid_generator = DFGenerator(valid_df, terms_dict, nb_classes,
                                          batch_size)

            model.summary()
            model.fit(train_generator,
                      steps_per_epoch=train_steps,
                      epochs=epochs,
                      validation_data=valid_generator,
                      validation_steps=valid_steps,
                      max_queue_size=batch_size,
                      workers=12,
                      callbacks=[logger, checkpointer, earlystopper])
            logging.info('Loading best model')
            model = load_model(model_file)

        logging.info('Evaluating model')
        loss = model.evaluate(test_generator, steps=test_steps)
        logging.info('Test loss %f' % loss)
        logging.info('Predicting')
        test_generator.reset()
        preds = model.predict(test_generator, steps=test_steps)

        # valid_steps = int(math.ceil(len(valid_df) / batch_size))
        # valid_generator = DFGenerator(valid_df, terms_dict,
        #                               nb_classes, batch_size)
        # logging.info('Predicting')
        # valid_generator.reset()
        # preds = model.predict_generator(valid_generator, steps=valid_steps)
        # valid_df.reset_index()
        # valid_df['preds'] = list(preds)
        # train_df.to_pickle('data/train_data_train.pkl')
        # valid_df.to_pickle('data/train_data_valid.pkl')

    test_labels = np.zeros((len(test_df), nb_classes), dtype=np.int32)
    for i, row in enumerate(test_df.itertuples()):
        for go_id in row.prop_annotations:
            if go_id in terms_dict:
                test_labels[i, terms_dict[go_id]] = 1
    logging.info('Computing performance:')
    roc_auc = compute_roc(test_labels, preds)
    logging.info('ROC AUC: %.2f' % (roc_auc, ))
    test_df['labels'] = list(test_labels)
    test_df['preds'] = list(preds)

    logging.info('Saving predictions')
    test_df.to_pickle(out_file)
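DFGenerator feeds the Keras model above; a plausible sketch of the sequence-encoding variant used in this example (Example 11 uses a different, feature-vector variant). MAXLEN, the alphabet and the no-op reset() are assumptions made to match the calls above.

import math
import numpy as np
from tensorflow.keras.utils import Sequence

MAXLEN = 2000
AAINDEX = {aa: i + 1 for i, aa in enumerate('ARNDCQEGHILKMFPSTWYV')}

class DFGenerator(Sequence):
    # Yields batches of one-hot encoded sequences and multi-label GO targets.
    def __init__(self, df, terms_dict, nb_classes, batch_size):
        self.df = df
        self.terms_dict = terms_dict
        self.nb_classes = nb_classes
        self.batch_size = batch_size

    def __len__(self):
        return int(math.ceil(len(self.df) / self.batch_size))

    def reset(self):
        # No internal cursor in this sketch; kept only because the code above calls it.
        pass

    def __getitem__(self, idx):
        batch = self.df.iloc[idx * self.batch_size:(idx + 1) * self.batch_size]
        data = np.zeros((len(batch), MAXLEN, len(AAINDEX) + 1), dtype=np.float32)
        labels = np.zeros((len(batch), self.nb_classes), dtype=np.int32)
        for i, row in enumerate(batch.itertuples()):
            for j, aa in enumerate(row.sequences[:MAXLEN]):
                data[i, j, AAINDEX.get(aa, 0)] = 1
            for go_id in row.prop_annotations:
                if go_id in self.terms_dict:
                    labels[i, self.terms_dict[go_id]] = 1
        return data, labels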
Esempio n. 21
0
def main(train_data_file, test_data_file, terms_file, diamond_scores_file,
         ont):

    go_rels = Ontology('data/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    print("Length of test set: " + str(len(test_df)))

    annotations = train_df['prop_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['prop_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    go_rels.calculate_ic(annotations + test_annotations)

    # Collect IC values of terms (not used further below)
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    #print('Diamond preds')
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            ind = np.argsort(-sim)
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    last_release_metadata = 'metadata/last_release.json'

    with open(last_release_metadata, 'r') as f:
        last_release_data = json.load(f)

    last_release_data['alphas'][ont] = find_alpha(ont, test_df, blast_preds,
                                                  go_rels, terms)

    with open(last_release_metadata, 'w') as f:
        json.dump(last_release_data, f)
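The BlastKNN loop above scores each GO term by the summed Diamond bit scores of the hits annotated with that term, normalized by the total bit score of all hits. A standalone sketch of that step (the helper name and signature are illustrative, not part of the original code):

def blast_knn_scores(sim_prots, annotations, prot_index):
    """Score GO terms for one query protein from its Diamond hits.

    sim_prots: dict of hit protein id -> bit score.
    annotations: list of GO-term sets for the training proteins.
    prot_index: dict of training protein id -> index into annotations.
    """
    scores = {}
    total_score = sum(sim_prots.values())
    if total_score == 0:
        return scores
    for p_id, bitscore in sim_prots.items():
        for go_id in annotations[prot_index[p_id]]:
            scores[go_id] = scores.get(go_id, 0.0) + bitscore
    # Normalize so each score is the fraction of total similarity supporting the term.
    return {go_id: s / total_score for go_id, s in scores.items()}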
Esempio n. 22
0
def main(in_file, outdirectory, go_file, model_file, terms_file,
         annotations_file, chunk_size, diamond_file, threshold, batch_size,
         alpha):

    global GO, Terms, annotations, diamond_predictions

    Path(outdirectory).mkdir(exist_ok=True, parents=True)

    # Load GO and read list of all terms
    GO = Ontology(go_file, with_rels=True)
    Terms = pd.read_pickle(terms_file).terms.values.flatten()

    # Read known experimental annotations
    annotations = _collate_experimental_annotations(annotations_file)
    print("{:<20s} ✓".format("Annotations"))

    # Parse diamond prediction file
    diamond_predictions = _collate_diamond_predictions(diamond_file,
                                                       annotations)
    print("{:<20s} ✓".format("Diamond predictions"))
    Ns = len(diamond_predictions)

    # Load CNN model
    model = load_model(model_file)
    print("{:<20s} ✓".format("CNN model"))
    total_seq = 0
    start_time = time.time()

    display = "{i} ({start}, {end}) | time={time}"
    calc_start = lambda i: i * chunk_size
    calc_end = lambda i: i * chunk_size + chunk_size - 1

    # Create a multiprocessing queue for depositing predictions and handing
    # them off to the tabulating worker processes (and, optionally, a writer)
    manager = multiprocessing.Manager()
    write_q = manager.Queue()

    worker = partial(_process_predictions, alpha=alpha, threshold=threshold)
    pool = multiprocessing.Pool(processes=25)
    # create the writer
    #pool.apply_async(_write_predictions, (write_q, outdirectory))

    results = []
    for i, (prot_ids, sequences) in enumerate(read_fasta(in_file, chunk_size)):
        total_seq += len(prot_ids)
        ids, data = get_data(sequences)

        start = datetime.now()
        # make a prediction
        preds = model.predict(data, batch_size=batch_size)
        assert preds.shape[1] == len(Terms)
        # display the time it took and other helpful data
        elapsed = _calc_elapsed(start, fmt=True)
        print(
            display.format(i=i,
                           start=calc_start(i),
                           end=calc_end(i),
                           time=_calc_elapsed(start)))

        # fire up a prediction tabulator
        #result = worker(ids, preds, prot_ids)
        result = pool.apply_async(worker, (ids, preds, prot_ids))

        results.append(result)

    for annot in chain.from_iterable(map(lambda result: result.get(),
                                         results)):
        #for annot in result.get():
        filename = Path(outdirectory) / f"{annot['accession']}.json"
        _write_result(annot, filename)

    pool.close()
    pool.join()
    total_time = time.time() - start_time
    print('Total prediction time for %d sequences is %d seconds' %
          (total_seq, total_time))
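read_fasta(in_file, chunk_size) is assumed to yield (protein ids, sequences) in chunks of at most chunk_size records; a minimal self-contained sketch under that assumption, with no external dependencies:

def read_fasta(in_file, chunk_size):
    # Hypothetical sketch: yield parallel lists of ids and sequences, chunk_size records at a time.
    ids, seqs = [], []
    cur_id, cur_seq = None, []
    with open(in_file) as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if cur_id is not None:
                    ids.append(cur_id)
                    seqs.append(''.join(cur_seq))
                    if len(ids) == chunk_size:
                        yield ids, seqs
                        ids, seqs = [], []
                cur_id, cur_seq = line[1:].split()[0], []
            else:
                cur_seq.append(line)
    if cur_id is not None:
        ids.append(cur_id)
        seqs.append(''.join(cur_seq))
    if ids:
        yield ids, seqs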
Esempio n. 23
0
def main(go_file, train_data_file, valid_data_file, test_data_file,
         sim_score_file):
    go = Ontology(go_file, with_rels=False)

    with open(sim_score_file, 'r') as f:
        proteins = next(f).strip().split('\t')
        prots_dict = {v: k for k, v in enumerate(proteins)}
        sim = np.zeros((len(proteins), len(proteins)), dtype=np.float32)
        i = 0
        for line in f:
            line = line.replace('null', '0.0')
            s = line.strip().split('\t')
            s = np.array(list(map(float, s)), dtype=np.float32)
            sim[i, :] = s
            i += 1
    train_data = load_data(train_data_file, prots_dict)
    valid_data = load_data(valid_data_file, prots_dict)
    trlabels = np.ones((len(proteins), len(proteins)), dtype=np.int32)
    for c, d in train_data:
        trlabels[c, d] = 0
    for c, d in valid_data:
        trlabels[c, d] = 0

    test_data = load_data(test_data_file, prots_dict)
    top10 = 0
    top100 = 0
    mean_rank = 0
    ftop10 = 0
    ftop100 = 0
    fmean_rank = 0
    n = len(test_data)
    labels = np.zeros((len(proteins), len(proteins)), dtype=np.int32)
    ranks = {}
    franks = {}
    with ck.progressbar(test_data) as prog_data:
        for c, d in prog_data:
            labels[c, d] = 1
            index = rankdata(-sim[c, :], method='average')
            rank = index[d]
            if rank <= 10:
                top10 += 1
            if rank <= 100:
                top100 += 1
            mean_rank += rank
            if rank not in ranks:
                ranks[rank] = 0
            ranks[rank] += 1

            # Filtered rank
            fil = sim[c, :] * (labels[c, :] | trlabels[c, :])
            index = rankdata(-fil, method='average')
            rank = index[d]
            if rank <= 10:
                ftop10 += 1
            if rank <= 100:
                ftop100 += 1
            fmean_rank += rank
            if rank not in franks:
                franks[rank] = 0
            franks[rank] += 1

        print()
        top10 /= n
        top100 /= n
        mean_rank /= n
        ftop10 /= n
        ftop100 /= n
        fmean_rank /= n

        rank_auc = compute_rank_roc(ranks, len(proteins))
        frank_auc = compute_rank_roc(franks, len(proteins))
        print(f'{top10:.2f} {top100:.2f} {mean_rank:.2f} {rank_auc:.2f}')
        print(f'{ftop10:.2f} {ftop100:.2f} {fmean_rank:.2f} {frank_auc:.2f}')
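compute_rank_roc is also not shown. A sketch, assuming it integrates the cumulative fraction of test pairs found at or below each rank and normalizes the area by the total number of proteins, so that a perfect ranker approaches 1:

import numpy as np


def compute_rank_roc(ranks, n_proteins):
    # Hypothetical sketch: area under the cumulative rank distribution, normalized to [0, 1].
    xs = sorted(ranks.keys())
    total = sum(ranks.values())
    ys, hits = [], 0
    for x in xs:
        hits += ranks[x]
        ys.append(hits / total)
    # Close the curve at the worst possible rank.
    xs = xs + [n_proteins]
    ys.append(1.0)
    return np.trapz(ys, xs) / n_proteins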
Esempio n. 24
0
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont,
         alpha):

    alpha /= 100.0
    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    go_rels.calculate_ic(annotations + test_annotations)

    # Collect IC values of terms (not used further below)
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            ind = np.argsort(-sim)
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    # DeepGOPlus
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_df['annotations'].values
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    # print(len(go_set))
    deep_preds = []
    # Fixed per-namespace weights for combining BLAST-kNN and CNN scores
    alphas = {
        NAMESPACES['mf']: 0.55,
        NAMESPACES['bp']: 0.59,
        NAMESPACES['cc']: 0.46
    }
    for i, row in enumerate(test_df.itertuples()):
        annots_dict = blast_preds[i].copy()
        for go_id in annots_dict:
            annots_dict[go_id] *= alphas[go_rels.get_namespace(go_id)]
        for j, score in enumerate(row.preds):
            go_id = terms[j]
            score *= 1 - alphas[go_rels.get_namespace(go_id)]
            if go_id in annots_dict:
                annots_dict[go_id] += score
            else:
                annots_dict[go_id] = score
        deep_preds.append(annots_dict)
    print('AUTHOR DeepGOPlus')
    print('MODEL 1')
    print('KEYWORDS sequence alignment.')
    for i, row in enumerate(test_df.itertuples()):
        prot_id = row.proteins
        for go_id, score in deep_preds[i].items():
            print(f'{prot_id}\t{go_id}\t{score:.2f}')
    print('END')
    return  # NOTE: the evaluation code below (threshold sweep, Fmax/Smin, AUPR plot) is unreachable due to this early return
    # Propagate scores
    # deepgo_preds = []
    # for annots_dict in deep_preds:
    #     annots = {}
    #     for go_id, score in annots_dict.items():
    #         for a_id in go_rels.get_anchestors(go_id):
    #             if a_id in annots:
    #                 annots[a_id] = max(annots[a_id], score)
    #             else:
    #                 annots[a_id] = score
    #     deepgo_preds.append(annots)

    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    rus = []
    mis = []
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in deep_preds[i].items():
                if score >= threshold:
                    annots.add(go_id)

            new_annots = set()
            for go_id in annots:
                new_annots |= go_rels.get_anchestors(go_id)
            preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
            go_rels, labels, preds)
        avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
        avg_ic = sum(
            map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                fps)) / len(fps)
        print(f'{avg_fp} {avg_ic}')
        precisions.append(prec)
        recalls.append(rec)
        print(
            f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}'
        )
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
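evaluate_annotations is not included in these snippets. A minimal sketch, assuming it returns the protein-centric F-measure components together with the IC-weighted semantic distance S (remaining uncertainty ru, misinformation mi) and the per-protein false positive and false negative sets, in the order the callers above unpack:

import math


def evaluate_annotations(go, real_annots, pred_annots):
    # Hypothetical sketch of the Fmax/Smin-style evaluation used by the examples above.
    total = 0            # proteins with at least one true annotation
    p_total = 0          # proteins with at least one predicted annotation
    p_sum, r_sum = 0.0, 0.0
    ru, mi = 0.0, 0.0
    fps, fns = [], []
    for real, pred in zip(real_annots, pred_annots):
        if len(real) == 0:
            continue
        total += 1
        tp = real & pred
        fp = pred - tp
        fn = real - tp
        fps.append(fp)
        fns.append(fn)
        # IC-weighted remaining uncertainty and misinformation.
        ru += sum(go.get_ic(go_id) for go_id in fn)
        mi += sum(go.get_ic(go_id) for go_id in fp)
        r_sum += len(tp) / len(real)
        if len(pred) > 0:
            p_total += 1
            p_sum += len(tp) / len(pred)
    ru /= total
    mi /= total
    rec = r_sum / total
    prec = p_sum / p_total if p_total > 0 else 0.0
    s = math.sqrt(ru * ru + mi * mi)
    f = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    return f, prec, rec, s, ru, mi, fps, fns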
Esempio n. 25
0
def main(train_data_file, valid_data_file, terms_file, diamond_scores_file,
         ont):

    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    valid_df = pd.read_pickle(valid_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    valid_annotations = valid_df['annotations'].values
    valid_annotations = list(map(lambda x: set(x), valid_annotations))
    go_rels.calculate_ic(annotations + valid_annotations)
    # Collect IC values of terms (not used further below)
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(valid_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            ind = np.argsort(-sim)
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    # DeepGOPlus
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = valid_df['annotations'].values
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))
    best_fmax = 0.0
    best_alpha = 0.0
    for alpha in range(44, 70):
        alpha /= 100.0
        deep_preds = []
        for i, row in enumerate(valid_df.itertuples()):
            annots_dict = blast_preds[i].copy()
            for go_id in annots_dict:
                annots_dict[go_id] *= alpha
            for j, score in enumerate(row.preds):
                go_id = terms[j]
                score *= 1 - alpha
                if go_id in annots_dict:
                    annots_dict[go_id] += score
                else:
                    annots_dict[go_id] = score
            deep_preds.append(annots_dict)

        fmax = 0.0
        tmax = 0.0
        precisions = []
        recalls = []
        smin = 1000000.0
        rus = []
        mis = []
        for t in range(14, 20):
            threshold = t / 100.0
            preds = []
            for i, row in enumerate(valid_df.itertuples()):
                annots = set()
                for go_id, score in deep_preds[i].items():
                    if score >= threshold:
                        annots.add(go_id)

                new_annots = set()
                for go_id in annots:
                    new_annots |= go_rels.get_anchestors(go_id)
                preds.append(new_annots)

            # Filter classes
            preds = list(
                map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

            fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
                go_rels, labels, preds)
            avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
            avg_ic = sum(
                map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                    fps)) / len(fps)
            print(
                f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}'
            )
            if fmax < fscore:
                fmax = fscore
                tmax = threshold
            if smin > s:
                smin = s
        if best_fmax < fmax:
            best_fmax = fmax
            best_alpha = alpha
        print(
            f'Alpha: {alpha} Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
        )
    print(f'{best_alpha} {best_fmax}')
Esempio n. 26
0
def main(go_file, train_data_file, valid_data_file, test_data_file,
         cls_embeds_file, rel_embeds_file, margin, params_array_index):
    embedding_size = 50
    reg_norm = 1
    org = 'human'
    go = Ontology(go_file, with_rels=False)
    pai = params_array_index
    if params_array_index != -1:
        orgs = ['human', 'yeast']
        sizes = [50, 100, 200, 400]
        margins = [-0.1, -0.01, 0.0, 0.01, 0.1]
        reg_norms = [1,]
        reg_norm = reg_norms[0]
        # params_array_index //= 2
        margin = margins[params_array_index % 5]
        params_array_index //= 5
        embedding_size = sizes[params_array_index % 4]
        params_array_index //= 4
        org = orgs[params_array_index % 2]
        print('Params:', org, embedding_size, margin, reg_norm)
        if org == 'human':
            train_data_file = f'data/data-train/9606.protein.links.v10.5.txt'
            valid_data_file = f'data/data-valid/9606.protein.links.v10.5.txt'
            test_data_file = f'data/data-test/9606.protein.links.v10.5.txt'
        cls_embeds_file = f'data/{org}_{pai}_{embedding_size}_{margin}_{reg_norm}_cls.pkl'
        rel_embeds_file = f'data/{org}_{pai}_{embedding_size}_{margin}_{reg_norm}_rel.pkl'
        loss_file = f'data/{org}_{pai}_{embedding_size}_{margin}_{reg_norm}_loss.csv'
        if os.path.exists(loss_file):
            df = pd.read_csv(loss_file)
            print('Loss:', df['loss'].values[-1])


    cls_df = pd.read_pickle(cls_embeds_file)
    rel_df = pd.read_pickle(rel_embeds_file)
    nb_classes = len(cls_df)
    nb_relations = len(rel_df)
    embeds_list = cls_df['embeddings'].values
    classes = {v: k for k, v in enumerate(cls_df['classes'])}
    rembeds_list = rel_df['embeddings'].values
    relations = {v: k for k, v in enumerate(rel_df['relations'])}
    size = len(embeds_list[0])
    embeds = np.zeros((nb_classes, size), dtype=np.float32)
    for i, emb in enumerate(embeds_list):
        embeds[i, :] = emb
    proteins = {}
    for k, v in classes.items():
        if not k.startswith('<http://purl.obolibrary.org/obo/GO_'):
            proteins[k] = v
    rs = np.abs(embeds[:, -1]).reshape(-1, 1)
    embeds = embeds[:, :-1]
    prot_index = list(proteins.values())
    prot_rs = rs[prot_index, :]
    prot_embeds = embeds[prot_index, :]
    prot_dict = {v: k for k, v in enumerate(prot_index)}
    
    rsize = len(rembeds_list[0])
    rembeds = np.zeros((nb_relations, rsize), dtype=np.float32)
    for i, emb in enumerate(rembeds_list):
        rembeds[i, :] = emb
    train_data = load_data(train_data_file, classes, relations)
    valid_data = load_data(valid_data_file, classes, relations)
    trlabels = {}
    for c, r, d in train_data:
        c, r, d = prot_dict[classes[c]], relations[r], prot_dict[classes[d]]
        if r not in trlabels:
            trlabels[r] = np.ones((len(prot_embeds), len(prot_embeds)), dtype=np.int32)
        trlabels[r][c, d] = 1000
    # for c, r, d in valid_data:
    #     c, r, d = prot_dict[classes[c]], relations[r], prot_dict[classes[d]]
    #     if r not in trlabels:
    #         trlabels[r] = np.ones((len(prot_embeds), len(prot_embeds)), dtype=np.int32)
    #     trlabels[r][c, d] = 1000

    test_data = load_data(test_data_file, classes, relations)
    top1 = 0
    top10 = 0
    top100 = 0
    mean_rank = 0
    ftop1 = 0
    ftop10 = 0
    ftop100 = 0
    fmean_rank = 0
    labels = {}
    preds = {}
    ranks = {}
    franks = {}
    eval_data = test_data
    n = len(eval_data)
    with ck.progressbar(eval_data) as prog_data:
        for c, r, d in prog_data:
            c, r, d = prot_dict[classes[c]], relations[r], prot_dict[classes[d]]
            if r not in labels:
                labels[r] = np.zeros((len(prot_embeds), len(prot_embeds)), dtype=np.int32)
            if r not in preds:
                preds[r] = np.zeros((len(prot_embeds), len(prot_embeds)), dtype=np.float32)
            labels[r][c, d] = 1
            ec = prot_embeds[c, :]
            rc = prot_rs[c, :]
            er = rembeds[r, :]
            ec = ec + er  # out-of-place add: ec is a view into prot_embeds, so += would overwrite the stored embedding

            dst = np.linalg.norm(prot_embeds - ec.reshape(1, -1), axis=1)
            dst = dst.reshape(-1, 1)
            # if rc > 0:
            #     overlap = np.maximum(0, (2 * rc - np.maximum(dst + rc - prot_rs - margin, 0)) / (2 * rc))
            # else:
            #     overlap = (np.maximum(dst - prot_rs - margin, 0) == 0).astype('float32')
            
            # edst = np.maximum(0, dst - rc - prot_rs - margin)
            # res = (overlap + 1 / np.exp(edst)) / 2
            res = np.maximum(0, dst - rc - prot_rs - margin)
            res = res.flatten()
            preds[r][c, :] = res
            index = rankdata(res, method='average')
            rank = index[d]
            if rank == 1:
                top1 += 1
            if rank <= 10:
                top10 += 1
            if rank <= 100:
                top100 += 1
            mean_rank += rank
            if rank not in ranks:
                ranks[rank] = 0
            ranks[rank] += 1

            # Filtered rank
            index = rankdata((res * trlabels[r][c, :]), method='average')
            rank = index[d]
            if rank == 1:
                ftop1 += 1
            if rank <= 10:
                ftop10 += 1
            if rank <= 100:
                ftop100 += 1
            fmean_rank += rank

            if rank not in franks:
                franks[rank] = 0
            franks[rank] += 1
        top1 /= n
        top10 /= n
        top100 /= n
        mean_rank /= n
        ftop1 /= n
        ftop10 /= n
        ftop100 /= n
        fmean_rank /= n

    rank_auc = compute_rank_roc(ranks, len(proteins))
    frank_auc = compute_rank_roc(franks, len(proteins))
    
    print(f'{org} {embedding_size} {margin} {reg_norm} {top10:.2f} {top100:.2f} {mean_rank:.2f} {rank_auc:.2f}')
    print(f'{org} {embedding_size} {margin} {reg_norm} {ftop10:.2f} {ftop100:.2f} {fmean_rank:.2f} {frank_auc:.2f}')
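The evaluation loop above ranks candidate tails with scipy's rankdata on a distance-like score and computes a filtered rank in which known training pairs are pushed to the bottom by a large multiplier (the trlabels entries set to 1000). A tiny illustration with toy numbers:

import numpy as np
from scipy.stats import rankdata

# Toy distance-like scores from one head entity to four candidate tails (lower is better).
res = np.array([0.9, 0.2, 0.5, 0.4], dtype=np.float32)
true_tail = 2

raw_rank = rankdata(res, method='average')[true_tail]  # 3.0

# Filtered rank: candidate 1 is a known training pair, so its score is
# multiplied by a large constant and no longer competes with the true tail.
filter_mask = np.ones_like(res)
filter_mask[1] = 1000
filtered_rank = rankdata(res * filter_mask, method='average')[true_tail]  # 2.0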
Esempio n. 27
0
def main(in_file, out_file, go_file, model_file, terms_file, annotations_file,
         chunk_size, diamond_file, threshold, batch_size, alpha):
    # Load GO and read list of all terms
    go = Ontology(go_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()

    # Read known experimental annotations
    annotations = {}
    df = pd.read_pickle(annotations_file)
    for row in df.itertuples():
        annotations[row.proteins] = set(row.prop_annotations)

    go.calculate_ic(annotations.values())

    diamond_preds = {}
    mapping = {}
    with gzip.open(diamond_file, 'rt') as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in mapping:
                mapping[it[0]] = {}
            mapping[it[0]][it[1]] = float(it[2])
    for prot_id, sim_prots in mapping.items():
        annots = {}
        allgos = set()
        total_score = 0.0
        for p_id, score in sim_prots.items():
            allgos |= annotations[p_id]
            total_score += score
        allgos = list(sorted(allgos))
        sim = np.zeros(len(allgos), dtype=np.float32)
        for j, go_id in enumerate(allgos):
            s = 0.0
            for p_id, score in sim_prots.items():
                if go_id in annotations[p_id]:
                    s += score
            sim[j] = s / total_score
        for go_id, score in zip(allgos, sim):
            annots[go_id] = score
        diamond_preds[prot_id] = annots
    
    # Load CNN model
    model = load_model(model_file)
    # Alphas for the latest model
    alphas = {NAMESPACES['mf']: 0.55, NAMESPACES['bp']: 0.59, NAMESPACES['cc']: 0.46}
    # Alphas for the cafa2 model
    # alphas = {NAMESPACES['mf']: 0.63, NAMESPACES['bp']: 0.68, NAMESPACES['cc']: 0.48}
    
    start_time = time.time()
    total_seq = 0
    w = gzip.open(out_file, 'wt')
    for prot_ids, sequences in read_fasta(in_file, chunk_size):
        total_seq += len(prot_ids)
        deep_preds = {}
        ids, data = get_data(sequences)

        preds = model.predict(data, batch_size=batch_size)
        assert preds.shape[1] == len(terms)
        for i, j in enumerate(ids):
            prot_id = prot_ids[j]
            if prot_id not in deep_preds:
                deep_preds[prot_id] = {}
            for l in range(len(terms)):
                if preds[i, l] >= 0.01: # Filter out very low scores
                    if terms[l] not in deep_preds[prot_id]:
                        deep_preds[prot_id][terms[l]] = preds[i, l]
                    else:
                        deep_preds[prot_id][terms[l]] = max(
                            deep_preds[prot_id][terms[l]], preds[i, l])
        # Combine diamond preds and deepgo
        for prot_id in prot_ids:
            annots = {}
            if prot_id in diamond_preds:
                for go_id, score in diamond_preds[prot_id].items():
                    annots[go_id] = score * alphas[go.get_namespace(go_id)]
            for go_id, score in deep_preds[prot_id].items():
                if go_id in annots:
                    annots[go_id] += (1 - alphas[go.get_namespace(go_id)]) * score
                else:
                    annots[go_id] = (1 - alphas[go.get_namespace(go_id)]) * score
            # Propagate scores with ontology structure
            gos = list(annots.keys())
            for go_id in gos:
                for g_id in go.get_anchestors(go_id):
                    if g_id in annots:
                        annots[g_id] = max(annots[g_id], annots[go_id])
                    else:
                        annots[g_id] = annots[go_id]
                
            sannots = sorted(annots.items(), key=lambda x: x[1], reverse=True)
            for go_id, score in sannots:
                if score >= threshold:
                    w.write(f"{prot_id}\t{go_id}\t{go.get_term(go_id)['name']}"
                            f"\t{go.get_ic(go_id):.2f}\t{score:.3f}\n")
            w.write('\n')
    w.close()
    total_time = time.time() - start_time
    print('Total prediction time for %d sequences is %d seconds' % (total_seq, total_time))
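get_data(sequences) is assumed to turn raw amino-acid sequences into the fixed-size tensor the CNN expects and to return the indices of the sequences it kept. A minimal sketch under that assumption; the 20-letter alphabet and the 2000-residue limit are illustrative values, not taken from the original code:

import numpy as np

AALETTERS = 'ACDEFGHIKLMNPQRSTVWY'   # assumed amino-acid alphabet
AAINDEX = {aa: i + 1 for i, aa in enumerate(AALETTERS)}
MAXLEN = 2000                        # assumed maximum sequence length


def get_data(sequences):
    # Hypothetical sketch: one-hot encode sequences that fit MAXLEN, skip the rest.
    ids = []
    data = np.zeros((len(sequences), MAXLEN, len(AALETTERS) + 1), dtype=np.float32)
    for i, seq in enumerate(sequences):
        if len(seq) > MAXLEN:
            continue
        ids.append(i)
        for j, aa in enumerate(seq.upper()):
            data[i, j, AAINDEX.get(aa, 0)] = 1.0
    return np.array(ids), data[ids]  # indices into the original chunk, encoded rows only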
Esempio n. 28
0
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont,
         alpha):

    alpha /= 100.0
    mp = Ontology('data/mp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['mp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['mp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    mp.calculate_ic(annotations)
    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # GO2HP preds
    rules = {}
    with open('data/go2hp.txt') as f:
        for line in f:
            it = line.strip().split('\t')
            go_id = it[0].replace('_', ':')
            mp_ids = list(map(lambda x: x.replace('_', ':'), it[1:]))
            if go_id not in rules:
                rules[go_id] = []
            rules[go_id] = mp_ids
    pheno2go_preds = {}
    for i, row in enumerate(test_df.itertuples()):
        prot_id = row.proteins
        if prot_id not in pheno2go_preds:
            pheno2go_preds[prot_id] = {}
        for item in row.deepgo_annotations:
            go_id, score = item.split('|')
            if go_id in rules:
                for mp_id in rules[go_id]:
                    pheno2go_preds[prot_id][mp_id] = max(
                        float(score), pheno2go_preds[prot_id].get(mp_id, 0))

    labels = test_annotations
    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    for t in range(101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            prot_id = row.proteins
            annots_dict = {}  #pheno2go_preds[prot_id]
            for j, score in enumerate(row.preds):
                mp_id = terms[j]
                annots_dict[mp_id] = max(score, annots_dict.get(mp_id, 0))

            annots = set()
            for mp_id, score in annots_dict.items():
                if score >= threshold:
                    annots.add(mp_id)
            new_annots = set()
            for mp_id in annots:
                new_annots |= mp.get_anchestors(mp_id)
            preds.append(new_annots)

        # Filter classes

        fscore, prec, rec, s = evaluate_annotations(mp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
Esempio n. 29
0
def main(hp_file, train_data_file, terms_file, dis_phenotypes, omim_file,
         predictions_file, gene_annots_file, dis_annots_file, fold):

    hp = Ontology(hp_file, with_rels=True)
    print('HP loaded')
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    predictions_file = f'fold{fold}_exp-' + predictions_file
    gene_annots_file = f'fold{fold}_exp-' + gene_annots_file
    dis_annots_file = f'fold{fold}_exp-' + dis_annots_file
    real_annots_file = f'fold{fold}_exp-data/gene_annotations_real.tab'

    diseases = set()
    genes = set()
    with open(omim_file, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            it = line.strip().split('\t')
            omim_id = it[0].split(', ')[-1].split()[0]
            gene_symbols = it[1].split(', ')
            genes |= set(gene_symbols)
            diseases.add('OMIM:' + omim_id)
    print(len(diseases), len(genes))
    dis_annots = {}
    with open(dis_phenotypes) as f:
        for line in f:
            it = line.strip().split('\t')
            dis_id = it[0] + ':' + it[1]
            if dis_id not in diseases:
                continue
            hp_id = it[4]
            if not hp.has_term(hp_id):
                continue
            if dis_id not in dis_annots:
                dis_annots[dis_id] = set()
            dis_annots[dis_id].add(hp_id)

    with open(dis_annots_file, 'w') as w:
        for dis_id, annots in dis_annots.items():
            w.write(dis_id)
            for hp_id in annots:
                w.write('\t' + hp_id)
            w.write('\n')

    df = pd.read_pickle(predictions_file)
    with open(gene_annots_file, 'w') as w:
        for i, row in df.iterrows():
            w.write(row['genes'])
            for hp_id in row['hp_preds']:
                w.write('\t' + hp_id)
            w.write('\n')

    with open(real_annots_file, 'w') as w:
        for i, row in df.iterrows():
            w.write(row['genes'])
            for hp_id in row['hp_annotations']:
                w.write('\t' + hp_id)
            w.write('\n')
Esempio n. 30
0
def main(go_file, mp_file, mp_annots_file, deepgo_annots_file, id_mapping_file,
         data_file, out_data_file, out_terms_file, min_count):
    go = Ontology(go_file, with_rels=True)
    logging.info('GO loaded')
    mp = Ontology(mp_file, with_rels=True)
    logging.info('MP loaded')

    logging.info('Loading gene to UniProt mapping')
    gene2prot = {}  # MGI gene id -> list of protein accessions
    with open(id_mapping_file) as f:
        next(f)
        for line in f:
            it = line.strip().split('\t')
            if it[0] not in gene2prot:
                gene2prot[it[0]] = []
            gene2prot[it[0]] += list(it[6].split())
    logging.info('Loading MP annotations')
    mp_annots = {}
    df = pd.read_pickle(data_file)
    acc2prot = {}
    for row in df.itertuples():
        p_id = row.proteins
        acc_ids = row.accessions.split('; ')
        for acc_id in acc_ids:
            acc2prot[acc_id] = p_id
    with open(mp_annots_file) as f:
        next(f)
        for line in f:
            it = line.strip().split('\t')
            for mgi in it[6].split('|'):
                if mgi not in gene2prot:
                    continue
                prot_ids = gene2prot[mgi]
                mp_id = it[4]
                for prot_id in prot_ids:
                    if prot_id not in acc2prot:
                        continue
                    prot_id = acc2prot[prot_id]
                    if prot_id not in mp_annots:
                        mp_annots[prot_id] = set()
                    # Propagate every MP term, not only those seen for a newly added protein
                    if mp.has_term(mp_id):
                        mp_annots[prot_id] |= mp.get_anchestors(mp_id)
    print('MP Annotations', len(mp_annots))
    dg_annots = {}
    gos = set()
    with open(deepgo_annots_file) as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            annots = []
            for item in it[1:]:
                go_id, score = item.split('|')
                score = float(score)
                annots.append(go_id)
            dg_annots[prot_id] = it[1:]
            gos |= set(annots)
    print('DeepGO Annotations', len(dg_annots))
    print('Number of GOs', len(gos))
    go_df = pd.DataFrame({'gos': list(gos)})
    go_df.to_pickle('data/gos.pkl')

    logging.info('Processing annotations')

    cnt = Counter()
    annotations = list()
    for prot_id, annots in mp_annots.items():
        for term in annots:
            cnt[term] += 1

    deepgo_annots = []
    go_annots = []
    mpos = []
    prots = []
    sequences = []
    for row in df.itertuples():
        p_id = row.proteins
        if p_id in mp_annots:
            prots.append(p_id)
            mpos.append(mp_annots[p_id])
            go_annots.append(row.annotations)
            deepgo_annots.append(dg_annots[p_id])
            sequences.append(row.sequences)

    prots_set = set(prots)
    for key, val in mp_annots.items():
        if key not in prots_set:
            print(key)

    df = pd.DataFrame({
        'proteins': prots,
        'mp_annotations': mpos,
        'go_annotations': go_annots,
        'deepgo_annotations': deepgo_annots,
        'sequences': sequences
    })
    df.to_pickle(out_data_file)
    print(f'Number of proteins {len(df)}')

    # Filter terms with annotations more than min_count
    res = {}
    for key, val in cnt.items():
        if key == 'MP:0000001':
            continue
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)
    terms = []
    for key, val in res.items():
        print(key, len(val))
        terms += val

    logging.info(f'Number of terms {len(terms)}')

    # Save the list of terms
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)
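As a usage note, the two pickles written above are what the training and evaluation scripts earlier in this collection read back. A short example of consuming them to build a multi-label target matrix; the file names are placeholders, not the actual invocation:

import numpy as np
import pandas as pd

terms_df = pd.read_pickle('data/terms.pkl')   # placeholder for out_terms_file
data_df = pd.read_pickle('data/data.pkl')     # placeholder for out_data_file

terms = terms_df['terms'].values.flatten()
terms_dict = {t: i for i, t in enumerate(terms)}

labels = np.zeros((len(data_df), len(terms)), dtype=np.int32)
for i, row in enumerate(data_df.itertuples()):
    for mp_id in row.mp_annotations:
        if mp_id in terms_dict:
            labels[i, terms_dict[mp_id]] = 1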