Esempio n. 1
0
            y = np.vstack(zip(*label_y_train))
            y_prob = np.vstack(zip(*label_pred_proba_train))
            training_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            y = np.vstack(zip(*label_y_valid))
            y_prob = np.vstack(zip(*label_pred_proba_valid))
            validation_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            y = np.vstack(zip(*label_y_test))
            y_prob = np.vstack(zip(*label_pred_proba_test))
            testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            return training_stats_i_f, validation_stats_i_f, testing_stats_i_f

        # For each iteration, batch the folds into parallel jobs
        statistics_objects_i = parallel_map(do_fold, range(cv_folds), n_jobs)
        for (train, val, test) in statistics_objects_i:
            training_stats_i.merge(train)
            validation_stats_i.merge(val)
            testing_stats_i.merge(test)

        log.write('Iteration {}\n'.format(i))
        log.write('Training {}\n'.format(i))
        training_stats_i.write(log, 'a')
        log.write('Validation {}\n'.format(i))
        validation_stats_i.write(log, 'a')
        log.write('Testing {}\n'.format(i))
        testing_stats_i.write(log, 'a')

        statistics_objects.append([training_stats_i, validation_stats_i, testing_stats_i])
        # return training_stats_i, validation_stats_i, testing_stats_i
Esempio n. 2
0
                    p,
                    ','.join(data[UniProt.GO]),
                    ','.join(data[UniProt.IPR]),
                    ','.join(data[UniProt.PFAM]),
                    ','.join(data[UniProt.GO_EVD])
                )
            )
        fp.close()

    # ----------------------------- RUN SEMANTIC SIM IF NEEDED ---------------------------- #
    if update or (not os.path.isfile(SIMILARITY_MAP)):
        print("Computing semantic similarities...")
        all_container = interactome.merge(train).merge(test)
        all_ppis = all_container.remove_symmetric().get_ppis()
        chunk_size = len(all_ppis) // n_jobs
        sem_sim_tuples = parallel_map(compute_ss, chunks(all_ppis, chunk_size), n_jobs=n_jobs)
        sem_sim_tuples = reduce(lambda x, y: x + y, sem_sim_tuples)
        semantic_sim_map = {}
        fp = open(SIMILARITY_MAP, 'w')
        fp.write("uniprot_a\tuniprot_b\tcc_ss\tbp_ss\tmf_ss\n")
        for p1, p2, cc_ss, bp_ss, mf_ss in sem_sim_tuples:
            assert (p1, p2) not in semantic_sim_map
            assert (p2, p1) not in semantic_sim_map
            semantic_sim_map[(p1, p2)] = np.asarray([cc_ss, bp_ss, mf_ss])
            fp.write('{}\t{}\t{}\t{}\t{}\n'.format(p1, p2, cc_ss, bp_ss, mf_ss))
        fp.close()
    else:
        print("Loading semantic similarities...")
        fp = open(SIMILARITY_MAP, 'r')
        fp.readline()
        semantic_sim_map = {}
Esempio n. 3
0
                )

                testing_stats_i_j = evaluate_model(
                    y=y_hprd,
                    y_pred=y_pred_hprd,
                    y_pred_prob=y_proba_hprd,
                    label=l,
                    statistics=testing_stats_i_j,
                    verbose=0
                )
            validation_stats_i.merge(validation_stats_i_j)
            testing_stats_i.merge(testing_stats_i_j)

        return validation_stats_i, testing_stats_i

    containers = parallel_map(do_iteration, zip(range(iterations), seeds), n_jobs=n_jobs)
    valid_containers = [containers[i][0] for i in range(iterations)]
    test_containers = [containers[i][1] for i in range(iterations)]

    for container in valid_containers:
        validation_stats.merge(container)

    for container in test_containers:
        testing_stats.merge(container)

    # --------------------- FINAL RESULTS ---------------------------- #
    direc = tempfile.mkdtemp(prefix='LLDA-{}-'.format(date), dir='results/')
    pickle.dump((validation_stats, testing_stats, config), open(direc + '/LLDA-statistics.pkl', 'w'))
    results = open(direc + '/LLDA-results.txt', 'w')

    results.write("\nRun Settings: \n")
Esempio n. 4
0
                    stop_words=['go', '', ' '], binary=binary, lowercase=True
                )
                vectorizer.fit(training_fold['terms'].values)

            # Fit and evaluate the performance of the classifier.
            x_train = vectorizer.transform(training_fold['terms'].values)
            y_train = np.asarray(training_fold[label].values, dtype=int)

            x_valid = vectorizer.transform(validation_fold['terms'].values)
            y_valid = np.asarray(validation_fold[label].values, dtype=int)

            estimator.fit(x_train, y_train)

            for t in thresholds:
                y_pred = [int(p[1] >= t) for p in estimator.predict_proba(x_valid)]
                precision = precision_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
                recall = recall_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
                f1 = f1_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
                statistics_l.update_statistics(label=t, s_type='Precision', data=precision)
                statistics_l.update_statistics(label=t, s_type='Recall', data=recall)
                statistics_l.update_statistics(label=t, s_type='F1-Score', data=f1)

        statistics_l.frame()['reaction'] = label
        return statistics_l

    containers = parallel_map(pr_curve, range(len(labels)), n_jobs=n_jobs)
    dataframe = pd.concat([c.frame() for c in containers], ignore_index=True)
    print(dataframe, len(dataframe))
    dataframe.to_csv('analysis/pr_rc_curves.tsv', sep='\t', index=False)

Esempio n. 5
0
def build_data_frame(ppi_file, obo_file, accession_to_feature_file, induce, fill_na, cache, n_jobs,
                     labels_file='data/labels.tsv'):
    """
    Builds a dataframe with one row per protein-protein interaction (PPI) listed in `ppi_file`
    and pickles the result to `cache`.

    Each non-blank line of `ppi_file` (after the header line, which is skipped) is split on tabs
    and read as: accession of the first protein, accession of the second protein, a
    comma-separated list of reaction types, followed by three float similarity scores
    (read here as cc, bp and mf). The textual features of the pair are obtained from
    `compute_features`; the three scores are stored together as a single sparse row vector in
    the 'sim' column. For every label read from `labels_file` an extra column holds 1 when the
    label (case-insensitively) appears in the line's reaction types and 0 otherwise.

    @param ppi_file: Path to the tab-separated PPI file. The first line is assumed to be a header.
    @param obo_file: Path to obo file.
    @param accession_to_feature_file: Path to accession-feature map stored in tsv.
    @param induce: True to induce GO terms.
    @param fill_na: Value to fill NA with. Best to use np.NaN.
    @param cache: File to save the pickled dataframe to.
    @param n_jobs: Number of jobs handed to `parallel_map` when processing the lines.
    @param labels_file: Path to the file the label names are read from.

    @return: DataFrame. Empty (but with all columns present) when `ppi_file` has no data lines.
    """
    print("Building dataframe...")

    dag = ontology.load_go_dag(obo_file)

    # Column layout of the resulting dataframe: the fixed feature columns first, followed by
    # one binary indicator column per label (used for quick access by BR methods).
    labels = list(get_labels_from_file(labels_file))
    base_columns = [
        'uniprot',
        'uniprot_a',
        'uniprot_b',
        'go',
        'go_cc',
        'go_bp',
        'go_mf',
        'induced_go',
        'induced_go_cc',
        'induced_go_bp',
        'induced_go_mf',
        'ipr',
        'pfam',
        'sim',
        'label',
    ]
    columns = base_columns + labels

    def do_line(line):
        # Turns a single tab-separated PPI line into a one-row dataframe.
        xs = line.strip().split('\t')
        p1 = xs[0].strip()
        p2 = xs[1].strip()
        reaction_type = xs[2].strip()
        reaction_types = [x.lower() for x in reaction_type.split(',')]

        cc_ss = float(xs[3].strip())
        bp_ss = float(xs[4].strip())
        mf_ss = float(xs[5].strip())

        terms = compute_features([p1, p2], induce, accession_to_feature_file, fill_na, dag)

        row = {
            'uniprot': [(p1, p2)],
            'uniprot_a': [p1],
            'uniprot_b': [p2],
            'go': [terms['go']],
            'go_cc': [terms['go_cc']],
            'go_bp': [terms['go_bp']],
            'go_mf': [terms['go_mf']],
            'induced_go': [terms['induced_go']],
            'induced_go_cc': [terms['induced_go_cc']],
            'induced_go_bp': [terms['induced_go_bp']],
            'induced_go_mf': [terms['induced_go_mf']],
            'ipr': [terms['ipr']],
            'pfam': [terms['pfam']],
            'sim': [csr_matrix([cc_ss, bp_ss, mf_ss])],
            'label': [reaction_type]
        }

        # Flag which of the known labels are present in this line's reaction types.
        for l in labels:
            row[l] = 1 if l.lower() in reaction_types else 0

        # `columns` fixes the column order, so the ordering of `row` itself is irrelevant.
        return pd.DataFrame(row, dtype='object', columns=columns)

    # Read every data line up front so the file handle is closed before the parallel work
    # starts. Blank lines (e.g. a trailing newline at the end of the file) are skipped because
    # they cannot be parsed into a PPI.
    with open(ppi_file, 'r') as fp:
        fp.readline()  # assumes header exists for internal format
        lines = [line for line in fp if line.strip()]

    if lines:
        try:
            df_rows = parallel_map(do_line, lines, n_jobs=n_jobs)
        except KeyboardInterrupt:
            # Preserve the existing behaviour of exiting quietly when interrupted.
            sys.exit(0)
        # ignore_index=True already yields a fresh 0..n-1 index, so no reset is needed.
        df = pd.concat(df_rows, ignore_index=True)
    else:
        # pd.concat raises on an empty sequence; return an empty frame with the same columns.
        df = pd.DataFrame(columns=columns, dtype='object')

    # Pickle writes bytes, so the cache file must be opened in binary mode.
    with open(cache, 'wb') as cache_fp:
        pickle.dump(df, cache_fp)
    return df
Esempio n. 6
0
                df_valid=validation_fold,
                df_test=test_df,
                binary=binary,
                connectivity='full',
                vectorizer_method=vectorizer_method
            )

            validation_stats_i.merge(stats_valid)
            testing_stats_i.merge(stats_test)

        log.write('Iteration {}\n'.format(i))
        validation_stats_i.write(log, 'a')
        testing_stats_i.write(log, 'a')
        return validation_stats_i, testing_stats_i

    containers = parallel_map(do_iteration, range(iterations), n_jobs=n_jobs)
    valid_containers = [containers[i][0] for i in range(iterations)]
    test_containers = [containers[i][1] for i in range(iterations)]

    for container in valid_containers:
        validation_stats.merge(container)

    for container in test_containers:
        testing_stats.merge(container)

    # --------------------- FINAL RESULTS ---------------------------- #
    direc = tempfile.mkdtemp(prefix='{}-{}-'.format('CRF', date), dir='results/')
    pickle.dump((validation_stats, testing_stats, config), open(direc + '/{}-statistics.pkl'.format('CRF'), 'w'))
    results = open(direc + '/{}-results.txt'.format('CRF'), 'w')

    results.write("\nRun Settings: \n")