unique_accessions.add(p1) unique_accessions.add(p2) for (p1, p2), _ in test.iteritems(): unique_accessions.add(p1) unique_accessions.add(p2) fp = open(UNIQUE_ACCESSION_FILE, 'w+') for p in unique_accessions: fp.write("{}\n".format(p)) fp.close() # Construct uniprot handler to get data: if update or (not os.path.isfile(ACCESSION_FEATURES_FILE)): print("Building accession to feature map...") uniprot = UniProt(sprot_cache=SPROT_FILE, trembl_cache=TREMBL_FILE, organism='h**o sapien') uniprot_features = uniprot.get_batch_accession_data( unique_accessions, [UniProt.GO, UniProt.IPR, UniProt.PFAM, UniProt.GO_EVD] ) # Write features to file fp = open(ACCESSION_FEATURES_FILE, 'w') fp.write('uniprot\tgo\tipr\tpfam\tgoe\n') for p, data in uniprot_features.iteritems(): fp.write('{0}\t{1}\t{2}\t{3}\n'.format( p, ','.join(data[UniProt.GO]), ','.join(data[UniProt.IPR]), ','.join(data[UniProt.PFAM]), ','.join(data[UniProt.GO_EVD]) )
df=dataframe, vectorizer=vectorizer, selector=selector, feature_col=x, label_col=y, continuous_cols=None ) return estimator.predict_proba(x_numpy) if __name__ == "__main__": method = sys.argv[1] outfile = sys.argv[2] columns = ['go_cc', 'go_mf', 'go_bp'] labels = get_labels_from_file('data/labels.tsv') uniprot = UniProt(sprot_cache=SPROT_FILE, trembl_cache=TREMBL_FILE) dag = load_go_dag('data/gene_ontology.1_2.obo') # Load the training data. train, test, interactome_df = prep_data_frames(selection=columns, load_interactome=True) training_df = pd.concat([train, test], ignore_index=True) training_corpus_pf = compute_corpus(training_df, ['pfam']) training_corpus_ipr = compute_corpus(training_df, ['ipr']) training_corpus_bp = compute_corpus(training_df, ['induced_go_bp']) training_corpus_cc = compute_corpus(training_df, ['induced_go_cc']) training_corpus_mf = compute_corpus(training_df, ['induced_go_mf']) pina_corpus_pf = compute_corpus(interactome_df, ['pfam']) pina_corpus_ipr = compute_corpus(interactome_df, ['ipr']) pina_corpus_bp = compute_corpus(interactome_df, ['induced_go_bp'])