Example #1
0
from itertools import combinations
import sys
import predict.preprocess as prep


if __name__ == "__main__":
    # CLI entry point: filter an interactome predictions file down to rows
    # involving a user-supplied protein/gene list, above a probability
    # threshold (continuation of this script is not visible in this chunk).
    # NOTE(review): the guard only requires 5 argv entries, but sys.argv[5]
    # (symbol_type) is read below, so a 5-argument invocation that passes
    # this check still crashes with IndexError. The usage string also omits
    # the script name and the symbol-type argument — confirm the intended CLI.
    if len(sys.argv) < 5:
        print("Usage: python <Input predictions file> <Output results file> <Protein/Gene list> <Probability threshold>")
        sys.exit(0)

    interactome_file = sys.argv[1]  # predictions file to filter
    out_file = sys.argv[2]          # destination for filtered results
    protein_list = sys.argv[3]      # file with one protein/gene symbol per line
    threshold = float(sys.argv[4])  # minimum prediction probability to keep
    symbol_type = sys.argv[5]       # see NOTE above: not covered by the argc check
    # Label set and training uniprot accessions, presumably used by the
    # filtering logic further down — continuation not visible here.
    labels = sorted(prep.get_labels_from_file('data/labels.tsv'))
    train, _ = prep.prep_data_frames(selection=[])
    uniprots = list(train['uniprot'].values)
    use_protein = True

    try:
        interactome_fp = open(interactome_file, 'r')
    except IOError:
        print("Could not open supplied file {}.".format(interactome_file))
        sys.exit(0)

    try:
        protein_fp = open(protein_list, 'r')
        proteins = set()
        # Symbols are upper-cased so later membership tests are
        # case-insensitive.
        for line in protein_fp:
            xs = line.strip().upper()
Example #2
0
import pickle
import predict.preprocess as prep
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

if __name__ == "__main__":
    print("Loading datasets...")

    def _load_pickle(path):
        """Load one pickled object from *path*, closing the file afterwards.

        Pickle streams are binary data: the file must be opened in 'rb'
        mode — text mode ('r') makes pickle.load fail on Python 3 and can
        corrupt protocol >= 1 streams on Windows. Using a context manager
        also guarantees the handle is closed instead of leaked.
        """
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    df_kegg = _load_pickle("tmp/train_df.pkl")
    df_hprd = _load_pickle("tmp/test_df.pkl")
    df_test = _load_pickle("tmp/interactome_df.pkl")
    # Annotation columns whose comma-separated terms are concatenated per row.
    selection = ['ipr', 'pfam', 'induced_go_cc', 'induced_go_mf', 'induced_go_bp']
    labels = prep.get_labels_from_file("data/labels.tsv")
    def get_selection(row):
        """Concatenate the selected feature columns of *row* into a single
        comma-separated string, dropping empty/whitespace-only terms."""
        joined = ','.join(row[col] for col in selection)
        kept = [term for term in joined.split(',') if term.strip()]
        return ','.join(kept)

    # Build the combined 'terms' column for the interactome frame; the
    # development/test frames get theirs from prep_data_frames below.
    df_test['terms'] = df_test.apply(get_selection, axis=1)
    # NOTE(review): this rebinds df_hprd, discarding the frame unpickled
    # from tmp/test_df.pkl above — confirm that earlier load is still needed.
    df_train, df_hprd = prep.prep_data_frames(selection)

    # Bag-of-words over annotation terms; 'go' and empty tokens are treated
    # as stop words so bare ontology prefixes don't become features.
    vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True)
    vectorizer.fit(df_train['terms'].values)

    print("Transforming features...")
    # Feature selection over the vectorised training terms (call continues
    # beyond this chunk).
    x_train, y_train, feature_names, selector = prep.select_features(
        df = df_train,
        vectorizer=vectorizer,
        feature_col='terms',
Example #3
0
        # (continuation of an experiment-config dict whose opening brace and
        # the definitions of these values lie outside this chunk)
        'balanced': balanced,
        'induce': induce,
        'iteration': iterations,
        'cv_folds': cv_folds,
        'selection': selection,
        'ontologies': ontologies,
        'vectorizer_method': vectorizer_method,
        'permuted': permute,
        'scale': scale
    }
    pretty_print_dict(config)

    # ----------------------------- LOAD DATA ----------------------------------- #
    # Fixed seed so splits/permutations are reproducible across runs.
    np.random.seed(42)
    developement_df, testing_df = prep.prep_data_frames(selection, load_interactome=False)
    labels = get_labels_from_file('data/labels.tsv')

    # Per-label positive counts in each split (the frames appear to carry one
    # indicator column per label — confirm against prep_data_frames).
    n = len(labels)
    split_train = {l:0 for l in labels}
    for l in labels:
        split_train[l] = sum(developement_df[l].values)

    split_test = {l:0 for l in labels}
    for l in labels:
        split_test[l] = sum(testing_df[l].values)

    n_samples_train = len(developement_df)
    n_samples_test = len(testing_df)

    # Create the appropriate statistics container for the whole experiment.
    training_stats = Statistics()
from __future__ import division

import predict.plotting as plot
import predict.preprocess as prep
import numpy as np
import sys

if __name__ == "__main__":
    # For thresholds p in [0, 1] (step 0.05), print the fraction of rows in a
    # tab-separated predictions file whose maximum per-label probability is
    # >= p, then plot the curve (plot call is truncated in this chunk).
    # NOTE(review): fp is never closed in the visible code — consider `with`.
    fp = open(sys.argv[1], 'r')
    header = fp.readline().split('\t')
    labels = [elem.lower() for elem in prep.get_labels_from_file('data/labels.tsv')]
    # Column span covered by the label-probability columns.
    # NOTE(review): slicing min(idx)..max(idx) assumes the label columns are
    # contiguous in the header — any non-label column interleaved among them
    # would be swept into the probabilities. Confirm the file layout.
    idx = [header.index(elem) for elem in header if elem.lower() in labels]
    idx_start = min(idx)
    idx_end = max(idx)
    over_p = []

    ps = np.arange(0, 1.05, 0.05)
    probas = []
    for line in fp:
        xs = line.strip().split('\t')
        nums = [float(x) for x in xs[idx_start: idx_end + 1]]
        probas.append(nums)
    n = len(probas)  # NOTE(review): a header-only file gives n == 0 and divides by zero below

    for p in ps:
        # Count rows whose best label probability clears the threshold.
        n_over_p = sum([max(ls) >= p for ls in probas])
        over_p.append(n_over_p / n)

    for (x, y) in zip(ps, over_p):
        print('{}\t{}'.format(x, y))
    # Plot call continues beyond this chunk.
    plot.plot(