Example #1
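These snippets appear to come from the DeepPheno/DeepGOPlus family of data-preparation and evaluation scripts and share a common set of imports. A minimal sketch of what the examples rely on; `Ontology`, `NAMESPACES`, `FUNC_DICT`, and `evaluate_annotations` are assumed to come from the repository's own modules (note that `get_anchestors` is the repository's own spelling of the ancestor lookup):

import logging
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Assumed project-local imports; names inferred from how the examples use them
# (evaluate_annotations is typically defined in the same evaluation script).
from utils import Ontology, NAMESPACES, FUNC_DICT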
def main(hp_file, train_data_file, terms_file, dis_phenotypes, omim_file,
         predictions_file, gene_annots_file, dis_annots_file, fold):

    hp = Ontology(hp_file, with_rels=True)
    print('HP loaded')
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    predictions_file = f'fold{fold}_exp-' + predictions_file
    gene_annots_file = f'fold{fold}_exp-' + gene_annots_file
    dis_annots_file = f'fold{fold}_exp-' + dis_annots_file
    real_annots_file = f'fold{fold}_exp-data/gene_annotations_real.tab'

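    # Parse the OMIM gene map: collect disease IDs and their associated gene symbols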
    diseases = set()
    genes = set()
    with open(omim_file, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            it = line.strip().split('\t')
            omim_id = it[0].split(', ')[-1].split()[0]
            gene_symbols = it[1].split(', ')
            genes |= set(gene_symbols)
            diseases.add('OMIM:' + omim_id)
    print(len(diseases), len(genes))
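    # Collect HP term annotations per OMIM disease from the phenotype file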
    dis_annots = {}
    with open(dis_phenotypes) as f:
        for line in f:
            it = line.strip().split('\t')
            dis_id = it[0] + ':' + it[1]
            if dis_id not in diseases:
                continue
            hp_id = it[4]
            if not hp.has_term(hp_id):
                continue
            if dis_id not in dis_annots:
                dis_annots[dis_id] = set()
            dis_annots[dis_id].add(hp_id)

    with open(dis_annots_file, 'w') as w:
        for dis_id, annots in dis_annots.items():
            w.write(dis_id)
            for hp_id in annots:
                w.write('\t' + hp_id)
            w.write('\n')

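    # Write predicted and real HP annotations for the fold's genes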
    df = pd.read_pickle(predictions_file)
    with open(gene_annots_file, 'w') as w:
        for i, row in df.iterrows():
            w.write(row['genes'])
            for hp_id in row['hp_preds']:
                w.write('\t' + hp_id)
            w.write('\n')

    with open(real_annots_file, 'w') as w:
        for i, row in df.iterrows():
            w.write(row['genes'])
            for hp_id in row['hp_annotations']:
                w.write('\t' + hp_id)
            w.write('\n')
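In the source repository these `main` functions are presumably Click commands fed by CLI options; a minimal sketch of calling this one directly, with entirely hypothetical file paths:

main(hp_file='data/hp.obo',                          # hypothetical paths throughout
     train_data_file='data/train_data.pkl',
     terms_file='data/terms.pkl',
     dis_phenotypes='data/phenotype_annotations.tab',
     omim_file='data/genemap.txt',
     predictions_file='predictions.pkl',
     gene_annots_file='data/gene_annotations.tab',
     dis_annots_file='data/dis_annotations.tab',
     fold=1)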
Example #2
def main(go_file, mp_file, mp_annots_file, deepgo_annots_file, id_mapping_file,
         data_file, out_data_file, out_terms_file, min_count):
    go = Ontology(go_file, with_rels=True)
    logging.info('GO loaded')
    mp = Ontology(mp_file, with_rels=True)
    logging.info('MP loaded')

    logging.info('Loading gene-to-protein (MGI to UniProt) mapping')
    gene2prot = {}
    with open(id_mapping_file) as f:
        next(f)  # skip the header line
        for line in f:
            it = line.strip().split('\t')
            # column 0 holds the gene ID, column 6 the UniProt accessions
            if it[0] not in gene2prot:
                gene2prot[it[0]] = []
            gene2prot[it[0]] += list(it[6].split())
    logging.info('Loading MP annotations')
    mp_annots = {}
    df = pd.read_pickle(data_file)
    acc2prot = {}
    for row in df.itertuples():
        p_id = row.proteins
        acc_ids = row.accessions.split('; ')
        for acc_id in acc_ids:
            acc2prot[acc_id] = p_id
    with open(mp_annots_file) as f:
        next(f)
        for line in f:
            it = line.strip().split('\t')
            for mgi in it[6].split('|'):
                if mgi not in gene2prot:
                    continue
                prot_ids = gene2prot[mgi]
                mp_id = it[4]
                for prot_id in prot_ids:
                    if prot_id not in acc2prot:
                        continue
                    prot_id = acc2prot[prot_id]
                    if prot_id not in mp_annots:
                        mp_annots[prot_id] = set()
                    if mp.has_term(mp_id):
                        mp_annots[prot_id] |= mp.get_anchestors(mp_id)
    print('MP Annotations', len(mp_annots))
    dg_annots = {}
    gos = set()
    with open(deepgo_annots_file) as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            annots = []
            for item in it[1:]:
                go_id, _score = item.split('|')  # score is not used here
                annots.append(go_id)
            # keep the raw 'GO:xxxx|score' strings; gos below collects bare IDs
            dg_annots[prot_id] = it[1:]
            gos |= set(annots)
    print('DeepGO Annotations', len(dg_annots))
    print('Number of GOs', len(gos))
    go_df = pd.DataFrame({'gos': list(gos)})
    go_df.to_pickle('data/gos.pkl')

    logging.info('Processing annotations')

    cnt = Counter()
    for prot_id, annots in mp_annots.items():
        for term in annots:
            cnt[term] += 1

    deepgo_annots = []
    go_annots = []
    mpos = []
    prots = []
    sequences = []
    for row in df.itertuples():
        p_id = row.proteins
        if p_id in mp_annots:
            prots.append(p_id)
            mpos.append(mp_annots[p_id])
            go_annots.append(row.annotations)
            deepgo_annots.append(dg_annots[p_id])
            sequences.append(row.sequences)

    prots_set = set(prots)
    for key, val in mp_annots.items():
        if key not in prots_set:
            print(key)

    df = pd.DataFrame({
        'proteins': prots,
        'mp_annotations': mpos,
        'go_annotations': go_annots,
        'deepgo_annotations': deepgo_annots,
        'sequences': sequences
    })
    df.to_pickle(out_data_file)
    print(f'Number of proteins {len(df)}')

    # Keep terms with at least min_count annotations (the MP root is skipped)
    res = {}
    for key, val in cnt.items():
        if key == 'MP:0000001':
            continue
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)
    terms = []
    for key, val in res.items():
        print(key, len(val))
        terms += val

    logging.info(f'Number of terms {len(terms)}')

    # Save the list of terms
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)
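The closing filter groups surviving terms by their ID prefix; a toy run of the same logic with made-up counts, to make the min_count cutoff concrete:

from collections import Counter

cnt = Counter({'MP:0000001': 50, 'MP:0001146': 12, 'GO:0008150': 3})
min_count = 10
res = {}
for key, val in cnt.items():
    if key == 'MP:0000001':  # the MP root is always skipped
        continue
    if val >= min_count:
        res.setdefault(key.split(':')[0], []).append(key)
# res == {'MP': ['MP:0001146']}: only terms seen at least min_count times survive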
Example #3
def helper(train_df, test_df, ont):
    go = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle('data-cafa/' + ont + '.pkl')
    terms = terms_df['functions'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = train_df.rename(columns={"gos": "annotations"})
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = test_df.rename(columns={"gos": "annotations"})

    # Propagate each test protein's annotations up to their ontology ancestors
    test_annotations = []
    for i, row in enumerate(test_df.itertuples()):
        annots = set()
        for go_id in row.annotations:
            if go.has_term(go_id):
                annots |= go.get_anchestors(go_id)
        test_annotations.append(annots)
    go.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # DeepGO
    go_set = go.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))
    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for t in range(1, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for j, score in enumerate(row.predictions):
                if score >= threshold:
                    annots.add(terms[j])

            new_annots = set()
            for go_id in annots:
                new_annots |= go.get_anchestors(go_id)
            preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        fscore, prec, rec, s = evaluate_annotations(go, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print('Fscore: {}, S: {}, threshold: {}'.format(fscore, s, threshold))
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print('Fmax: {:0.3f}, Smin: {:0.3f}, threshold: {}'.format(
        fmax, smin, tmax))
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print('AUPR: {:0.3f}'.format(aupr))

    return [recalls, precisions, aupr]
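Because `helper` returns the raw recall/precision points along with the AUPR, a caller can overlay curves for several ontologies; a sketch assuming hypothetical pickled data frames and the usual 'mf'/'bp'/'cc' ontology keys:

train_df = pd.read_pickle('data-cafa/train_data.pkl')     # hypothetical path
test_df = pd.read_pickle('data-cafa/predictions_mf.pkl')  # hypothetical path
recalls, precisions, aupr = helper(train_df, test_df, 'mf')
plt.plot(recalls, precisions, label=f'MF (AUPR = {aupr:0.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='lower right')
plt.show()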
Example #4
def main(train_data_file, preds_file, ont):

    go = Ontology('data/go.obo', with_rels=True)
    terms_df = pd.read_pickle('data-deepgo/' + ont + '.pkl')
    terms = terms_df['functions'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = pd.read_pickle(preds_file)
    # Propagate each test protein's GO annotations (row.gos) up to their ancestors
    test_annotations = []
    for i, row in enumerate(test_df.itertuples()):
        annots = set()
        for go_id in row.gos:
            if go.has_term(go_id):
                annots |= go.get_anchestors(go_id)
        test_annotations.append(annots)
    go.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # DeepGO
    go_set = go.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))
    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for threshold in np.arange(0.005, 1, .01):
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for j, score in enumerate(row.predictions):
                if score >= threshold:
                    annots.add(terms[j])

            new_annots = set()
            for go_id in annots:
                new_annots |= go.get_anchestors(go_id)
            preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        fscore, prec, rec, s = evaluate_annotations(go, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.3f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig('aupr.pdf')
    plt.show()
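A direct invocation sketch for this variant, again with hypothetical paths; note the threshold grid here is np.arange(0.005, 1, .01) rather than Example #3's fixed 0.01 steps:

main(train_data_file='data-deepgo/train_data.pkl',   # hypothetical paths
     preds_file='data-deepgo/predictions_mf.pkl',
     ont='mf')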
Example #5
def main(go_file, hp_file, hp_annots_file, deepgo_annots_file, data_file,
         expressions_file, out_data_file, out_terms_file, min_count):
    go = Ontology(go_file, with_rels=True)
    print('GO loaded')
    hp = Ontology(hp_file, with_rels=True)
    print('HP loaded')

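    # Collect experimental and electronic (IEA) GO annotations plus sequences
    # for human proteins (orgs == '9606')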
    iea_annots = {}
    go_annots = {}
    seqs = {}
    df = pd.read_pickle(data_file)
    df = df[df['orgs'] == '9606']

    acc2prot = {}
    name2prot = {}
    for i, row in df.iterrows():
        accs = row['accessions'].split('; ')
        names = row['gene_names']
        p_id = row['proteins']
        for acc in accs:
            acc2prot[acc] = p_id
        for name in names:
            name = name.upper()
            if name not in name2prot:
                name2prot[name] = set()
            name2prot[name].add(p_id)
        if p_id not in go_annots:
            go_annots[p_id] = set()
        if p_id not in iea_annots:
            iea_annots[p_id] = set()
        go_annots[p_id] |= set(row.exp_annotations)
        iea_annots[p_id] |= set(row.iea_annotations)
        seqs[p_id] = row.sequences

    print('GO Annotations', len(go_annots))

    print('Loading HP annotations')
    hp_annots = {}
    unrev = set()
    with open(hp_annots_file) as f:
        next(f)
        for line in f:
            it = line.strip().split('\t')
            acc_id = it[0]
            hp_id = it[1]
            if acc_id not in acc2prot:
                unrev.add(acc_id)
                continue
            p_id = acc2prot[acc_id]
            if p_id not in hp_annots:
                hp_annots[p_id] = set()
            if hp.has_term(hp_id):
                hp_annots[p_id] |= hp.get_anchestors(hp_id)

    print('HP Annotations', len(hp_annots))
    dg_annots = {}
    gos = set()
    with open(deepgo_annots_file) as f:
        for line in f:
            it = line.strip().split('\t')
            p_id = it[0]
            annots = dg_annots.get(p_id, {})
            for item in it[1:]:
                go_id, score = item.split('|')
                score = float(score)
                annots[go_id] = max(score, annots.get(go_id, 0))
            dg_annots[p_id] = annots
            gos |= set(annots.keys())
    print('DeepGO Annotations', len(dg_annots))
    deepgo_annots = {}
    for g_id, annots in dg_annots.items():
        deepgo_annots[g_id] = [
            go_id + '|' + str(score) for go_id, score in annots.items()
        ]
    print('Number of GOs', len(gos))
    df = pd.DataFrame({'gos': list(gos)})
    #df.to_pickle('data-cafa/gos.pkl')

    logging.info('Processing annotations')

    cnt = Counter()
    for p_id, annots in hp_annots.items():
        for term in annots:
            cnt[term] += 1

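    # Parse per-gene expression profiles (53 tissue columns, presumably GTEx)
    # and normalise each profile by its maximum value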
    gene_exp = {}
    max_val = 0
    with open(expressions_file) as f:
        for line in f:
            if line.startswith('#') or line.startswith('Gene'):
                continue
            it = line.strip().split('\t')
            gene_name = it[1].upper()
            if gene_name in name2prot:
                exp = np.zeros((53, ), dtype=np.float32)
                for i in range(len(it[2:])):
                    exp[i] = float(it[2 + i]) if it[2 + i] != '' else 0.0
                for p_id in name2prot[gene_name]:
                    gene_exp[p_id] = exp / np.max(exp)

    print('Expression values', len(gene_exp))

    deepgo_annotations = []
    go_annotations = []
    iea_annotations = []
    hpos = []
    proteins = []
    sequences = []
    expressions = []
    mis_exp = 0
    for p_id, phenos in hp_annots.items():
        if p_id not in dg_annots:
            continue
        proteins.append(p_id)
        hpos.append(phenos)
        go_annotations.append(go_annots[p_id])
        iea_annotations.append(iea_annots[p_id])
        deepgo_annotations.append(deepgo_annots[p_id])
        sequences.append(seqs[p_id])
        if p_id in gene_exp:
            expressions.append(gene_exp[p_id])
        else:
            expressions.append(np.zeros((53, ), dtype=np.float32))
            mis_exp += 1

    print('Missing expressions', mis_exp)
    df = pd.DataFrame({
        'proteins': proteins,
        'hp_annotations': hpos,
        'go_annotations': go_annotations,
        'iea_annotations': iea_annotations,
        'deepgo_annotations': deepgo_annotations,
        'sequences': sequences,
        'expressions': expressions
    })
    df.to_pickle(out_data_file)
    print(f'Number of proteins {len(df)}')

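    # Build the CAFA test set: benchmark targets whose proteins have no
    # HP annotations in the training data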
    test_annots = {}
    tar2prot = {}
    with open('data-cafa/tar2prot.txt') as f:
        for line in f:
            # skip the first character (presumably a '>' prefix) before splitting
            it = line[1:].strip().split()
            tar2prot[it[0]] = it[1]

    unknown_prots = set()
    with open('data-cafa/benchmark/groundtruth/leafonly_HPO.txt') as f:
        for line in f:
            it = line.strip().split()
            p_id = tar2prot[it[0]]
            if p_id in hp_annots:
                continue
            unknown_prots.add(it[0])
            hp_id = it[1]
            if p_id not in test_annots:
                test_annots[p_id] = set()
            if hp.has_term(hp_id):
                test_annots[p_id] |= hp.get_anchestors(hp_id)
    with open('data-cafa/noknowledge_targets.txt', 'w') as f:
        for t_id in unknown_prots:
            f.write(t_id + '\n')

    deepgo_annotations = []
    go_annotations = []
    iea_annotations = []
    hpos = []
    proteins = []
    sequences = []
    expressions = []
    mis_exp = 0
    for p_id, phenos in test_annots.items():
        if p_id not in dg_annots:
            continue
        proteins.append(p_id)
        hpos.append(phenos)
        go_annotations.append(go_annots[p_id])
        iea_annotations.append(iea_annots[p_id])
        deepgo_annotations.append(deepgo_annots[p_id])
        sequences.append(seqs[p_id])
        if p_id in gene_exp:
            expressions.append(gene_exp[p_id])
        else:
            expressions.append(np.zeros((53, ), dtype=np.float32))
            mis_exp += 1

    df = pd.DataFrame({
        'proteins': proteins,
        'hp_annotations': hpos,
        'go_annotations': go_annotations,
        'iea_annotations': iea_annotations,
        'deepgo_annotations': deepgo_annotations,
        'sequences': sequences,
        'expressions': expressions
    })
    print('Missing expressions test', mis_exp)
    df.to_pickle('data-cafa/human_test.pkl')
    print(f'Number of test proteins {len(df)}')

    # Keep terms with at least min_count annotations (the HP root is skipped)
    terms_set = set()
    all_terms = []
    for key, val in cnt.items():
        if key == 'HP:0000001':
            continue
        all_terms.append(key)
        if val >= min_count:
            terms_set.add(key)
    terms = []
    for t_id in hp.get_ordered_terms():
        if t_id in terms_set:
            terms.append(t_id)

    logging.info(f'Number of terms {len(terms)}')
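The example is cut off here; judging from the otherwise unused out_terms_file parameter and the parallel ending of Example #2, the ordered term list is presumably saved next:

    # Save the list of terms (presumably, mirroring Example #2's ending)
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)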