from itertools import combinations import sys import predict.preprocess as prep if __name__ == "__main__": if len(sys.argv) < 5: print("Usage: python <Input predictions file> <Output results file> <Protein/Gene list> <Probability threshold>") sys.exit(0) interactome_file = sys.argv[1] out_file = sys.argv[2] protein_list = sys.argv[3] threshold = float(sys.argv[4]) symbol_type = sys.argv[5] labels = sorted(prep.get_labels_from_file('data/labels.tsv')) train, _ = prep.prep_data_frames(selection=[]) uniprots = list(train['uniprot'].values) use_protein = True try: interactome_fp = open(interactome_file, 'r') except IOError: print("Could not open supplied file {}.".format(interactome_file)) sys.exit(0) try: protein_fp = open(protein_list, 'r') proteins = set() for line in protein_fp: xs = line.strip().upper()
import pickle import predict.preprocess as prep import numpy as np from sklearn.feature_extraction.text import CountVectorizer if __name__ == "__main__": print("Loading datasets...") df_kegg = pickle.load(open("tmp/train_df.pkl", 'r')) df_hprd = pickle.load(open("tmp/test_df.pkl", 'r')) df_test = pickle.load(open("tmp/interactome_df.pkl", 'r')) selection = ['ipr', 'pfam', 'induced_go_cc', 'induced_go_mf', 'induced_go_bp'] labels = prep.get_labels_from_file("data/labels.tsv") def get_selection(row): terms = [] for col in selection: terms += [row[col]] terms = [t for t in ','.join(terms).split(',') if t.strip() != ''] return ','.join(terms) df_test['terms'] = df_test.apply(get_selection, axis=1) df_train, df_hprd = prep.prep_data_frames(selection) vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True) vectorizer.fit(df_train['terms'].values) print("Transforming features...") x_train, y_train, feature_names, selector = prep.select_features( df = df_train, vectorizer=vectorizer, feature_col='terms',
# (continuation of the experiment `config` dict opened in an earlier chunk)
'balanced': balanced,
'induce': induce,
'iteration': iterations,
'cv_folds': cv_folds,
'selection': selection,
'ontologies': ontologies,
'vectorizer_method': vectorizer_method,
'permuted': permute,
'scale': scale
}
# Echo the full experiment configuration for the log.
pretty_print_dict(config)

# ----------------------------- LOAD DATA ----------------------------------- #
# Fixed seed so the train/test preparation below is reproducible.
np.random.seed(42)
developement_df, testing_df = prep.prep_data_frames(selection, load_interactome=False)
labels = get_labels_from_file('data/labels.tsv')
n = len(labels)  # number of labels; presumably used further below -- TODO confirm

# Per-label positive counts in the development (training) split.
split_train = {l:0 for l in labels}
for l in labels:
    split_train[l] = sum(developement_df[l].values)
# Per-label positive counts in the held-out testing split.
split_test = {l:0 for l in labels}
for l in labels:
    split_test[l] = sum(testing_df[l].values)

n_samples_train = len(developement_df)
n_samples_test = len(testing_df)

# Create the appropriate statistics container for the whole experiment.
training_stats = Statistics()
from __future__ import division import predict.plotting as plot import predict.preprocess as prep import numpy as np import sys if __name__ == "__main__": fp = open(sys.argv[1], 'r') header = fp.readline().split('\t') labels = [elem.lower() for elem in prep.get_labels_from_file('data/labels.tsv')] idx = [header.index(elem) for elem in header if elem.lower() in labels] idx_start = min(idx) idx_end = max(idx) over_p = [] ps = np.arange(0, 1.05, 0.05) probas = [] for line in fp: xs = line.strip().split('\t') nums = [float(x) for x in xs[idx_start: idx_end + 1]] probas.append(nums) n = len(probas) for p in ps: n_over_p = sum([max(ls) >= p for ls in probas]) over_p.append(n_over_p / n) for (x, y) in zip(ps, over_p): print('{}\t{}'.format(x, y)) plot.plot(