# Collect every accession that appears in either position of a pair key of
# ``test``; the values mapped to those keys are not needed here.
for (p1, p2), _ in test.iteritems():
    unique_accessions.add(p1)
    unique_accessions.add(p2)

# Write the collected accessions, one per line. The context manager makes
# sure the handle is closed even if a write raises.
with open(UNIQUE_ACCESSION_FILE, 'w+') as fp:
    for p in unique_accessions:
        fp.write("{}\n".format(p))

# Construct uniprot handler to get data:
# The feature map is rebuilt only when ``update`` is truthy or the features
# file does not exist yet.
if update or (not os.path.isfile(ACCESSION_FEATURES_FILE)):
    print("Building accession to feature map...")
    # NOTE(review): the organism string below looks garbled ('h**o sapien').
    # It is kept verbatim here -- confirm the value UniProt expects.
    uniprot = UniProt(
        sprot_cache=SPROT_FILE,
        trembl_cache=TREMBL_FILE,
        organism='h**o sapien'
    )
    uniprot_features = uniprot.get_batch_accession_data(
        unique_accessions,
        [UniProt.GO, UniProt.IPR, UniProt.PFAM, UniProt.GO_EVD]
    )

    # Write features to file
    with open(ACCESSION_FEATURES_FILE, 'w') as fp:
        fp.write('uniprot\tgo\tipr\tpfam\tgoe\n')
        for p, data in uniprot_features.iteritems():
            # Five tab-separated fields to match the five header columns.
            # The previous four-placeholder format string silently dropped
            # the GO_EVD ('goe') value because str.format ignores surplus
            # positional arguments.
            fp.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                p,
                ','.join(data[UniProt.GO]),
                ','.join(data[UniProt.IPR]),
                ','.join(data[UniProt.PFAM]),
                ','.join(data[UniProt.GO_EVD])
            ))
# Build one corpus per annotation column. The generator expressions are
# consumed by the tuple unpacking, so compute_corpus is called once per
# column, in the same order as the assignment targets are listed.
training_corpus_bp, training_corpus_cc, training_corpus_mf = (
    compute_corpus(training_df, [column])
    for column in ('induced_go_bp', 'induced_go_cc', 'induced_go_mf')
)
(pina_corpus_pf, pina_corpus_ipr,
 pina_corpus_bp, pina_corpus_cc, pina_corpus_mf) = (
    compute_corpus(interactome_df, [column])
    for column in (
        'pfam', 'ipr', 'induced_go_bp', 'induced_go_cc', 'induced_go_mf'
    )
)

# Store the two values returned by depths() as new interactome columns.
mean, std = depths(interactome_df, 'terms')
interactome_df['depth_mu'] = mean
interactome_df['depth_std'] = std

# De-duplicated accessions drawn from both uniprot columns.
accessions = list(set(
    list(interactome_df.uniprot_a.values)
    + list(interactome_df.uniprot_b.values)
))

# Fetch the GENE entry for every accession, then keep only the text before
# the first space of each entry.
gene_keys = uniprot.get_batch_accession_data(
    accessions, data_types=[UniProt.GENE]
)
gene_keys = {
    accession: record[uniprot.GENE].split(' ')[0]
    for accession, record in gene_keys.iteritems()
}

predictions = {}

# Fit the vocabulary on the training terms; its feature names are handed to
# generate_selectors below.
vectorizer = CountVectorizer(
    stop_words=['go', '', ' '],
    binary=False,
    lowercase=True
)
vectorizer.fit(training_df['terms'].values)

selectors = generate_selectors(
    columns, vectorizer.get_feature_names(), dag
)
estimators = make_classifiers(
    method, 'balanced', labels, selectors, columns, None
)
best_features = pd.DataFrame(data={'label': [], 'feature': []})


def rename(x):
    """Return ``x`` upper-cased, prefixed with 'GO:' unless it contains
    'pf' or 'ipr'."""
    if 'pf' in x or 'ipr' in x:
        return x.upper()
    return 'GO:' + x.upper()