Example #1
0
        unique_accessions.add(p1)
        unique_accessions.add(p2)

    for (p1, p2), _ in test.iteritems():
        unique_accessions.add(p1)
        unique_accessions.add(p2)

    fp = open(UNIQUE_ACCESSION_FILE, 'w+')
    for p in unique_accessions:
        fp.write("{}\n".format(p))
    fp.close()

    # Construct uniprot handler to get data:
    if update or (not os.path.isfile(ACCESSION_FEATURES_FILE)):
        print("Building accession to feature map...")
        uniprot = UniProt(sprot_cache=SPROT_FILE, trembl_cache=TREMBL_FILE, organism='h**o sapien')
        uniprot_features = uniprot.get_batch_accession_data(
            unique_accessions, [UniProt.GO, UniProt.IPR, UniProt.PFAM, UniProt.GO_EVD]
        )

        # Write features to file
        fp = open(ACCESSION_FEATURES_FILE, 'w')
        fp.write('uniprot\tgo\tipr\tpfam\tgoe\n')
        for p, data in uniprot_features.iteritems():
            fp.write('{0}\t{1}\t{2}\t{3}\n'.format(
                    p,
                    ','.join(data[UniProt.GO]),
                    ','.join(data[UniProt.IPR]),
                    ','.join(data[UniProt.PFAM]),
                    ','.join(data[UniProt.GO_EVD])
                )
        df=dataframe,
        vectorizer=vectorizer,
        selector=selector,
        feature_col=x,
        label_col=y,
        continuous_cols=None
    )
    return estimator.predict_proba(x_numpy)


# Script entry point. Reads two positional command-line arguments:
# sys.argv[1] is stored as `method` and sys.argv[2] as `outfile`.
# An IndexError is raised here if fewer than two arguments are supplied.
if __name__ == "__main__":
    method = sys.argv[1]
    outfile = sys.argv[2]
    # Column names handed to prep_data_frames as its `selection` argument.
    columns = ['go_cc', 'go_mf', 'go_bp']
    # Supporting resources loaded from fixed relative paths / cache constants.
    # NOTE(review): these paths are relative, so the script presumably has to
    # be run from the project root -- confirm.
    labels = get_labels_from_file('data/labels.tsv')
    uniprot = UniProt(sprot_cache=SPROT_FILE, trembl_cache=TREMBL_FILE)
    dag = load_go_dag('data/gene_ontology.1_2.obo')

    # Load the training data.
    # prep_data_frames is asked for the interactome as well
    # (load_interactome=True) and its result is unpacked into three frames.
    train, test, interactome_df = prep_data_frames(selection=columns, load_interactome=True)
    # Combine the train and test frames into a single frame.
    # NOTE(review): assuming `pd` is pandas, ignore_index=True discards the
    # original row labels and renumbers the result -- confirm.
    training_df = pd.concat([train, test], ignore_index=True)

    # One compute_corpus call per feature column on the combined training frame.
    # NOTE(review): the GO corpora read 'induced_go_*' columns, whereas
    # `columns` above lists 'go_*' names -- confirm prep_data_frames supplies
    # the induced columns.
    training_corpus_pf = compute_corpus(training_df, ['pfam'])
    training_corpus_ipr = compute_corpus(training_df, ['ipr'])
    training_corpus_bp = compute_corpus(training_df, ['induced_go_bp'])
    training_corpus_cc = compute_corpus(training_df, ['induced_go_cc'])
    training_corpus_mf = compute_corpus(training_df, ['induced_go_mf'])

    # The same per-column compute_corpus calls, applied to the interactome frame.
    pina_corpus_pf = compute_corpus(interactome_df, ['pfam'])
    pina_corpus_ipr = compute_corpus(interactome_df, ['ipr'])
    pina_corpus_bp = compute_corpus(interactome_df, ['induced_go_bp'])