        # Stack the per-label outputs into (n_samples, n_labels) matrices and
        # score each split with the multi-label evaluator.
        y = np.vstack(zip(*label_y_train))
        y_prob = np.vstack(zip(*label_pred_proba_train))
        training_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

        y = np.vstack(zip(*label_y_valid))
        y_prob = np.vstack(zip(*label_pred_proba_valid))
        validation_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

        y = np.vstack(zip(*label_y_test))
        y_prob = np.vstack(zip(*label_pred_proba_test))
        testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

        return training_stats_i_f, validation_stats_i_f, testing_stats_i_f

    # For each iteration, batch the folds into parallel jobs and pool the
    # resulting per-fold statistics.
    statistics_objects_i = parallel_map(do_fold, range(cv_folds), n_jobs)
    for (train, val, test) in statistics_objects_i:
        training_stats_i.merge(train)
        validation_stats_i.merge(val)
        testing_stats_i.merge(test)

    log.write('Iteration {}\n'.format(i))
    log.write('Training {}\n'.format(i))
    training_stats_i.write(log, 'a')
    log.write('Validation {}\n'.format(i))
    validation_stats_i.write(log, 'a')
    log.write('Testing {}\n'.format(i))
    testing_stats_i.write(log, 'a')

    statistics_objects.append([training_stats_i, validation_stats_i, testing_stats_i])
    # return training_stats_i, validation_stats_i, testing_stats_i
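
# NOTE: `parallel_map` is used throughout these scripts but its definition is
# not part of this excerpt. A minimal sketch of such a helper built on joblib
# follows; it is an assumption about the helper's behaviour, not the project's
# actual implementation.
from joblib import Parallel, delayed

def parallel_map(func, iterable, n_jobs=1):
    """Apply `func` to every item of `iterable` using `n_jobs` parallel workers."""
    return Parallel(n_jobs=n_jobs)(delayed(func)(item) for item in iterable)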
            p,
            ','.join(data[UniProt.GO]),
            ','.join(data[UniProt.IPR]),
            ','.join(data[UniProt.PFAM]),
            ','.join(data[UniProt.GO_EVD])
        )
    )
fp.close()

# ----------------------------- RUN SEMANTIC SIM IF NEEDED ---------------------------- #
if update or (not os.path.isfile(SIMILARITY_MAP)):
    print("Computing semantic similarities...")
    # Pool the interactome with the training and testing sets, drop symmetric
    # duplicates, and score every unique PPI in parallel chunks.
    all_container = interactome.merge(train).merge(test)
    all_ppis = all_container.remove_symmetric().get_ppis()
    chunk_size = len(all_ppis) // n_jobs
    sem_sim_tuples = parallel_map(compute_ss, chunks(all_ppis, chunk_size), n_jobs=n_jobs)
    sem_sim_tuples = reduce(lambda x, y: x + y, sem_sim_tuples)

    # Cache the (cc, bp, mf) similarity triples to a tsv for later runs.
    semantic_sim_map = {}
    fp = open(SIMILARITY_MAP, 'w')
    fp.write("uniprot_a\tuniprot_b\tcc_ss\tbp_ss\tmf_ss\n")
    for p1, p2, cc_ss, bp_ss, mf_ss in sem_sim_tuples:
        assert (p1, p2) not in semantic_sim_map
        assert (p2, p1) not in semantic_sim_map
        semantic_sim_map[(p1, p2)] = np.asarray([cc_ss, bp_ss, mf_ss])
        fp.write('{}\t{}\t{}\t{}\t{}\n'.format(p1, p2, cc_ss, bp_ss, mf_ss))
    fp.close()
else:
    print("Loading semantic similarities...")
    fp = open(SIMILARITY_MAP, 'r')
    fp.readline()  # skip the header line
    semantic_sim_map = {}
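
# NOTE: `chunks` above is assumed to split the ppi list into fixed-size
# batches for the workers; its definition is not part of this excerpt. A
# minimal sketch of such a helper (hypothetical, not the project's own
# definition):
def chunks(items, size):
    """Yield successive `size`-sized slices of the list `items`."""
    size = max(1, size)
    for start in range(0, len(items), size):
        yield items[start:start + size]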
        )
        testing_stats_i_j = evaluate_model(
            y=y_hprd,
            y_pred=y_pred_hprd,
            y_pred_prob=y_proba_hprd,
            label=l,
            statistics=testing_stats_i_j,
            verbose=0
        )
        validation_stats_i.merge(validation_stats_i_j)
        testing_stats_i.merge(testing_stats_i_j)
    return validation_stats_i, testing_stats_i

containers = parallel_map(do_iteration, zip(range(iterations), seeds), n_jobs=n_jobs)
valid_containers = [containers[i][0] for i in range(iterations)]
test_containers = [containers[i][1] for i in range(iterations)]
for container in valid_containers:
    validation_stats.merge(container)
for container in test_containers:
    testing_stats.merge(container)

# --------------------- FINAL RESULTS ---------------------------- #
direc = tempfile.mkdtemp(prefix='LLDA-{}-'.format(date), dir='results/')
pickle.dump(
    (validation_stats, testing_stats, config),
    open(direc + '/LLDA-statistics.pkl', 'w')
)
results = open(direc + '/LLDA-results.txt', 'w')
results.write("\nRun Settings: \n")
        stop_words=['go', '', ' '],
        binary=binary,
        lowercase=True
    )
    vectorizer.fit(training_fold['terms'].values)

    # Fit and evaluate the performance of the classifier.
    x_train = vectorizer.transform(training_fold['terms'].values)
    y_train = np.asarray(training_fold[label].values, dtype=int)
    x_valid = vectorizer.transform(validation_fold['terms'].values)
    y_valid = np.asarray(validation_fold[label].values, dtype=int)
    estimator.fit(x_train, y_train)

    # Sweep the decision thresholds and record precision/recall/F1 at each one.
    for t in thresholds:
        y_pred = [int(p[1] >= t) for p in estimator.predict_proba(x_valid)]
        precision = precision_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
        recall = recall_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
        f1 = f1_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
        statistics_l.update_statistics(label=t, s_type='Precision', data=precision)
        statistics_l.update_statistics(label=t, s_type='Recall', data=recall)
        statistics_l.update_statistics(label=t, s_type='F1-Score', data=f1)

    statistics_l.frame()['reaction'] = label
    return statistics_l

containers = parallel_map(pr_curve, range(len(labels)), n_jobs=n_jobs)
dataframe = pd.concat([c.frame() for c in containers], ignore_index=True)
print(dataframe, len(dataframe))
dataframe.to_csv('analysis/pr_rc_curves.tsv', sep='\t', index=False)
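
# NOTE: scikit-learn can also trace the full precision-recall curve directly
# from the predicted probabilities instead of sweeping a fixed threshold grid.
# A small self-contained sketch of that approach on toy data (illustrative
# only; not part of the original pipeline):
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve

rng = np.random.RandomState(0)
toy_x = rng.rand(100, 5)
toy_y = (toy_x[:, 0] > 0.5).astype(int)
toy_probas = LogisticRegression().fit(toy_x, toy_y).predict_proba(toy_x)[:, 1]
toy_precision, toy_recall, toy_thresholds = precision_recall_curve(toy_y, toy_probas)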
def build_data_frame(ppi_file, obo_file, accession_to_feature_file, induce,
                     fill_na, cache, n_jobs):
    """
    Loads each tsv file containing a feature set (such as float similarity
    scores or accessions) into a pandas dataframe and then attempts to create
    binary vector/bag-of-words representations of textual accession features.
    Finally, combines each binary/numerical vector into a single feature
    vector along with its label.

    @param ppi_file: Directory to look for feature files.
    @param obo_file: Path to obo file.
    @param accession_to_feature_file: Path to accession-feature map stored in tsv.
    @param induce: True to induce GO terms.
    @param fill_na: Value to fill NA with. Best to use np.NaN.
    @param cache: File to save dataframe to.
    @param n_jobs: Number of parallel jobs used to parse the ppi lines.
    @return: DataFrame.
    """
    print("Building dataframe...")
    dag = ontology.load_go_dag(obo_file)

    # Create the blank dataframe to which we will attach our data.
    labels = get_labels_from_file('data/labels.tsv')
    od = Od({
        'uniprot': [],
        'uniprot_a': [],
        'uniprot_b': [],
        'go': [],
        'go_cc': [],
        'go_bp': [],
        'go_mf': [],
        'induced_go': [],
        'induced_go_cc': [],
        'induced_go_bp': [],
        'induced_go_mf': [],
        'ipr': [],
        'pfam': [],
        'sim': [],
        'label': []
    })
    columns = od.keys() + labels

    # These columns give quick access to the binary labels used by the BR methods.
    # Initialise each label to a null value.
    for l in labels:
        od[l] = -1

    # Iterate through each ppi in the supplied file.
    fp = open(ppi_file, 'r')
    fp.readline()  # assumes a header exists for the internal format

    def do_line(line):
        xs = line.strip().split('\t')
        p1 = xs[0].strip()
        p2 = xs[1].strip()
        reaction_type = xs[2].strip()
        reaction_types = [x.lower() for x in reaction_type.split(',')]

        # Maybe use resnik or something.
        cc_ss = float(xs[3].strip())
        bp_ss = float(xs[4].strip())
        mf_ss = float(xs[5].strip())

        terms = compute_features([p1, p2], induce, accession_to_feature_file,
                                 fill_na, dag)
        od = Od({
            'uniprot': [(p1, p2)],
            'uniprot_a': [p1],
            'uniprot_b': [p2],
            'go': [terms['go']],
            'go_cc': [terms['go_cc']],
            'go_bp': [terms['go_bp']],
            'go_mf': [terms['go_mf']],
            'induced_go': [terms['induced_go']],
            'induced_go_cc': [terms['induced_go_cc']],
            'induced_go_bp': [terms['induced_go_bp']],
            'induced_go_mf': [terms['induced_go_mf']],
            'ipr': [terms['ipr']],
            'pfam': [terms['pfam']],
            'sim': [csr_matrix([cc_ss, bp_ss, mf_ss])],
            'label': [reaction_type]
        })

        # Iterate and check which labels are present in reaction_type.
        # Order of traversal is important here.
        for l in labels:
            if l.lower() in reaction_types:
                od[l] = 1
            else:
                od[l] = 0

        # Build a one-row dataframe for this interaction.
        df_new = pd.DataFrame(od, dtype='object', columns=columns)
        return df_new

    try:
        df_rows = parallel_map(do_line, fp, n_jobs=n_jobs)
    except KeyboardInterrupt:
        sys.exit(0)

    df = pd.concat(df_rows, ignore_index=True)
    df = df.reset_index()
    del df['index']
    pickle.dump(df, open(cache, 'w'))
    return df
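
# Example call for `build_data_frame`; the file paths below are placeholders
# for illustration, not the project's actual data locations:
#
#     df = build_data_frame(
#         ppi_file='data/ppis.tsv',
#         obo_file='data/go.obo',
#         accession_to_feature_file='data/accession_features.tsv',
#         induce=True,
#         fill_na=np.NaN,
#         cache='data/ppi_dataframe.pkl',
#         n_jobs=4
#     )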
        df_valid=validation_fold,
        df_test=test_df,
        binary=binary,
        connectivity='full',
        vectorizer_method=vectorizer_method
    )
    validation_stats_i.merge(stats_valid)
    testing_stats_i.merge(stats_test)

    log.write('Iteration {}\n'.format(i))
    validation_stats_i.write(log, 'a')
    testing_stats_i.write(log, 'a')
    return validation_stats_i, testing_stats_i

containers = parallel_map(do_iteration, range(iterations), n_jobs=n_jobs)
valid_containers = [containers[i][0] for i in range(iterations)]
test_containers = [containers[i][1] for i in range(iterations)]
for container in valid_containers:
    validation_stats.merge(container)
for container in test_containers:
    testing_stats.merge(container)

# --------------------- FINAL RESULTS ---------------------------- #
direc = tempfile.mkdtemp(prefix='{}-{}-'.format('CRF', date), dir='results/')
pickle.dump(
    (validation_stats, testing_stats, config),
    open(direc + '/{}-statistics.pkl'.format('CRF'), 'w')
)
results = open(direc + '/{}-results.txt'.format('CRF'), 'w')
results.write("\nRun Settings: \n")
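
# The pickled statistics can be reloaded later for further analysis, e.g.
# (a usage sketch; `run_directory` stands in for whatever directory
# `tempfile.mkdtemp` generated for the run):
#
#     validation_stats, testing_stats, config = pickle.load(
#         open(os.path.join(run_directory, 'CRF-statistics.pkl'), 'r')
#     )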