Beispiel #1
0
    # Collect every accession that appears in either position of a pair key;
    # the value associated with each pair is not needed here.
    for (p1, p2), _ in test.iteritems():
        unique_accessions.add(p1)
        unique_accessions.add(p2)

    # Write the accessions one per line. The context manager guarantees the
    # handle is closed even if a write raises part-way through.
    with open(UNIQUE_ACCESSION_FILE, 'w+') as fp:
        for p in unique_accessions:
            fp.write("{}\n".format(p))

    # Construct uniprot handler to get data:
    # The feature map is (re)built when an update is requested or when the
    # features file does not exist yet.
    if update or (not os.path.isfile(ACCESSION_FEATURES_FILE)):
        print("Building accession to feature map...")
        # NOTE(review): the organism literal looks mangled; confirm the exact
        # value the UniProt handler expects before relying on it.
        uniprot = UniProt(sprot_cache=SPROT_FILE, trembl_cache=TREMBL_FILE, organism='h**o sapien')
        uniprot_features = uniprot.get_batch_accession_data(
            unique_accessions, [UniProt.GO, UniProt.IPR, UniProt.PFAM, UniProt.GO_EVD]
        )

        # Write features to file
        # One tab-separated row per accession; each feature list is flattened
        # to a comma-separated string. The row template carries five fields so
        # that it matches the five-column header (the GO evidence column was
        # previously dropped because the template had only four placeholders).
        with open(ACCESSION_FEATURES_FILE, 'w') as fp:
            fp.write('uniprot\tgo\tipr\tpfam\tgoe\n')
            for p, data in uniprot_features.iteritems():
                fp.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                        p,
                        ','.join(data[UniProt.GO]),
                        ','.join(data[UniProt.IPR]),
                        ','.join(data[UniProt.PFAM]),
                        ','.join(data[UniProt.GO_EVD])
                    )
                )
    # Build one corpus per induced GO column of the training frame.
    # (compute_corpus is defined outside this excerpt.)
    training_corpus_bp = compute_corpus(training_df, ['induced_go_bp'])
    training_corpus_cc = compute_corpus(training_df, ['induced_go_cc'])
    training_corpus_mf = compute_corpus(training_df, ['induced_go_mf'])

    # Same for the interactome frame, additionally covering the 'pfam' and
    # 'ipr' columns.
    pina_corpus_pf = compute_corpus(interactome_df, ['pfam'])
    pina_corpus_ipr = compute_corpus(interactome_df, ['ipr'])
    pina_corpus_bp = compute_corpus(interactome_df, ['induced_go_bp'])
    pina_corpus_cc = compute_corpus(interactome_df, ['induced_go_cc'])
    pina_corpus_mf = compute_corpus(interactome_df, ['induced_go_mf'])

    # Store the two values returned by depths() for the 'terms' column as new
    # columns on the interactome frame.
    mean, std = depths(interactome_df, 'terms')
    interactome_df['depth_mu'] = mean
    interactome_df['depth_std'] = std

    # De-duplicated accessions drawn from both the uniprot_a and uniprot_b
    # columns.
    accessions = list(set(list(interactome_df.uniprot_a.values) + list(interactome_df.uniprot_b.values)))
    gene_keys = uniprot.get_batch_accession_data(accessions, data_types=[UniProt.GENE])
    # Keep only the first space-separated token of each GENE entry.
    # NOTE(review): assumes the GENE value is a string -- confirm against the
    # UniProt handler.
    gene_keys = {k: v[uniprot.GENE].split(' ')[0] for k,v in gene_keys.iteritems()}

    predictions = {}
    # Fit the vocabulary on the training 'terms' column; the tokens 'go', ''
    # and ' ' are passed as stop words.
    vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True)
    vectorizer.fit(training_df['terms'].values)
    # The fitted vocabulary is handed to generate_selectors, whose result is in
    # turn passed to make_classifiers (both defined outside this excerpt).
    selectors = generate_selectors(columns, vectorizer.get_feature_names(), dag)
    estimators = make_classifiers(method, 'balanced', labels, selectors, columns, None)
    # Empty frame with 'label' and 'feature' columns.
    best_features = pd.DataFrame(data={'label': [], 'feature': []})

    def rename(x):
        """Upper-case a feature name, adding a 'GO:' prefix unless it contains 'pf' or 'ipr'."""
        upper = x.upper()
        # The substring test runs on the name as given (before upper-casing),
        # so it is case-sensitive.
        if 'pf' in x or 'ipr' in x:
            return upper
        return 'GO:' + upper