def filter_in_IEA(handle):
    """Copy the IEA-evidence records of a GAF file to ``<handle.name>.IEA``.

    :param handle: open, named GAF file handle. It is read through
        ``upg.gafiterator`` and its ``name`` is used to build the output path.
    """
    # The context manager closes the output file even if parsing or writing
    # fails part-way through.
    with open(handle.name + ".IEA", "w") as outhandle:
        outhandle.write('!gaf-version: 2.0\n')
        for inrec in upg.gafiterator(handle):
            if inrec['Evidence'] == 'IEA':
                upg.writerec(inrec, outhandle)
def extract_gaf(rec, outfile, GAFFIELDS, record, sp_id, taxon):
    """Write ``rec`` to ``outfile`` and collect it as a flat tuple.

    Records whose ``DB_Object_ID`` is not a key of ``sp_id`` are ignored.

    :param rec: one GAF record (dict keyed by GAF field name).
    :param outfile: handle passed on to ``GOAParser.writerec``.
    :param GAFFIELDS: field list of the GAF flavour in use; its length
        (15 or 17) selects which columns go into the tuple.
    :param record: list that the tuple is appended to (mutated in place).
    :param sp_id: container of accepted ``DB_Object_ID`` values.
    :param taxon: unused by this function; kept for interface compatibility.
    :returns: the same ``record`` list that was passed in.
    """
    # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
    if rec['DB_Object_ID'] not in sp_id:
        return record

    GOAParser.writerec(rec, outfile, GAFFIELDS)

    # Stays empty when GAFFIELDS has neither 15 nor 17 entries.
    t = ()
    if len(GAFFIELDS) == 15:
        t = (rec['DB'],
             rec['DB_Object_ID'],
             rec['DB_Object_Symbol'],
             ('|'.join(rec['Qualifier'])),
             rec['GO_ID'],
             ('|'.join(rec['DB:Reference'])),
             rec['Evidence'],
             ('|'.join(rec['With'])),
             rec['Aspect'],
             rec['DB_Object_Name'],
             ('|'.join(rec['Synonym'])),
             rec['DB_Object_Type'],
             ('|'.join(rec['Taxon_ID'])),
             rec['Date'],
             rec['Assigned_By'])
    elif len(GAFFIELDS) == 17:
        # In this flavour DB_Object_Name is joined like the other list fields.
        t = (rec['DB'],
             rec['DB_Object_ID'],
             rec['DB_Object_Symbol'],
             ('|'.join(rec['Qualifier'])),
             rec['GO_ID'],
             ('|'.join(rec['DB:Reference'])),
             rec['Evidence'],
             ('|'.join(rec['With'])),
             rec['Aspect'],
             ('|'.join(rec['DB_Object_Name'])),
             ('|'.join(rec['Synonym'])),
             rec['DB_Object_Type'],
             ('|'.join(rec['Taxon_ID'])),
             rec['Date'],
             rec['Assigned_By'],
             rec['Annotation_Extension'],
             rec['Gene_Product_Form_ID'])
    # NOTE(review): the tuple is collected only for accepted records; confirm
    # this matches the original nesting of the append.
    record.append(t)
    return record
def filter_in_experimental(handle):
    """Copy records with experimental evidence to ``<handle.name>.exp_evidence``.

    A record is kept when ``upg.record_has`` matches its ``Evidence`` field
    against the module-level ``GO_EXP_EC`` collection.

    :param handle: open, named GAF file handle.
    """
    # ``with`` guarantees the output file is closed if iteration raises.
    with open(handle.name + ".exp_evidence", "w") as outhandle:
        outhandle.write('!gaf-version: 2.0\n')
        for inrec in upg.gafiterator(handle):
            if upg.record_has(inrec, {'Evidence': GO_EXP_EC}):
                upg.writerec(inrec, outhandle)
def all_hasnt_experimental(handle):
    """Write all records of proteins lacking experimental evidence.

    Records are grouped per protein by ``upg.gafbyproteiniterator``; a group
    is written to ``<handle.name>.noexp`` when ``has_experimental`` is false
    for it.

    :param handle: open, named GAF file handle.
    """
    # ``with`` guarantees the output file is closed if iteration raises.
    with open(handle.name + ".noexp", "w") as outhandle:
        outhandle.write('!gaf-version: 2.0\n')
        for protrec in upg.gafbyproteiniterator(handle):
            if not has_experimental(protrec):
                for outrec in protrec:
                    upg.writerec(outrec, outhandle)
def all_exclusive_IEA(handle):
    """Write all records of proteins for which ``exclusive_IEA`` is true.

    Records are grouped per protein by ``upg.gafbyproteiniterator`` and the
    selected groups are written to ``<handle.name>.exclusive_IEA``.

    :param handle: open, named GAF file handle.
    """
    # ``with`` guarantees the output file is closed if iteration raises.
    with open(handle.name + ".exclusive_IEA", "w") as outhandle:
        outhandle.write('!gaf-version: 2.0\n')
        for protrec in upg.gafbyproteiniterator(handle):
            if exclusive_IEA(protrec):
                for outrec in protrec:
                    upg.writerec(outrec, outhandle)
def extract_taxon(handle, in_taxid):
    """Create a GAF file from a single taxon.

    Records whose first ``Taxon_ID`` entry carries ``in_taxid`` after the
    colon are written to ``<handle.name>.taxon.<taxid>``.

    :param handle: open, named GAF file handle.
    :param in_taxid: taxon identifier, either an int or a string
        (surrounding whitespace of a string is stripped).
    """
    header = "!gaf-version: 2.0\n"
    # Previously the int branch was immediately overwritten by an
    # unconditional in_taxid.strip(), which raised AttributeError for ints.
    if isinstance(in_taxid, int):
        taxid = str(in_taxid)
    else:
        taxid = in_taxid.strip()
    with open("%s.taxon.%s" % (handle.name, taxid), 'w') as outfile:
        outfile.write(header)
        for inrec in upg.gafiterator(handle):
            if inrec['Taxon_ID'][0].split(':')[1] == taxid:
                upg.writerec(inrec, outfile)
def build_clusters(species):
    """Build GO clusters from a species gene association file and pickle them.

    Three mappings are built from the PMID references of every record in
    ``GOA_Files/gene_association.goa_<species>`` (relative to ``CURR_PATH``):

    * GO term -> ``{'proteins': Set, 'papers': Set}``
    * PMID -> Set of GO terms
    * PMID -> Set of protein ids

    Each mapping is handed to ``pickle_data`` with a path under
    ``Pickled_Data/``.

    @param species: suffix of the gene association file name; also used as
        the suffix of the pickle file names.
    """
    go_clusters = {}
    pmid_go = {}
    pmid_prot = {}
    gaf_path = os.path.join(CURR_PATH,
                            "GOA_Files/gene_association.goa_" + species)
    # The context manager closes the association file, which was previously
    # left open.
    with open(gaf_path) as unigoa_file:
        for inrec in GOA.gafiterator(unigoa_file):
            go_id = inrec['GO_ID']
            prot_id = inrec['DB_Object_ID']
            for dbref in inrec['DB:Reference']:
                if dbref[:4] != 'PMID':
                    continue
                # Drop the "PMID:" prefix.
                pmid = dbref[5:]
                if go_id not in go_clusters:
                    go_clusters[go_id] = {'proteins': Set([prot_id]),
                                          'papers': Set([pmid])}
                else:
                    go_clusters[go_id]['proteins'].add(prot_id)
                    go_clusters[go_id]['papers'].add(pmid)
                if pmid not in pmid_go:
                    pmid_go[pmid] = Set([go_id])
                else:
                    pmid_go[pmid].add(go_id)
                if pmid not in pmid_prot:
                    pmid_prot[pmid] = Set([prot_id])
                else:
                    pmid_prot[pmid].add(prot_id)
    pickle_data(go_clusters,
                os.path.join(CURR_PATH, "Pickled_Data/go_clusters_" + species))
    pickle_data(pmid_go,
                os.path.join(CURR_PATH, "Pickled_Data/pmid_go_" + species))
    pickle_data(pmid_prot,
                os.path.join(CURR_PATH, "Pickled_Data/pmid_prot_" + species))
def pmids_from_gaf(gaf_file):
    """Collect the PMIDs cited in a GAF file with their GO terms and proteins.

    @param gaf_file: path of a Uniprot_GOA association file in gaf format.
    @return: a 3-tuple ``(pmids, pmid_go, pmid_prot)`` where ``pmids`` is the
        list of distinct PMIDs, ``pmid_go`` maps each PMID to the list of
        distinct GO ids annotated with it, and ``pmid_prot`` maps each PMID
        to the list of distinct protein ids annotated with it.
    """
    pmid_go = {}
    pmids = {}
    pmid_prot = {}
    # The context manager closes the input file, which was previously leaked.
    with open(gaf_file) as unigoa_file:
        for inrec in GOA.gafiterator(unigoa_file):
            for dbref in inrec['DB:Reference']:
                if dbref[:4] != 'PMID':
                    continue
                # Drop the "PMID:" prefix.
                pmid = dbref[5:]
                pmids[pmid] = None
                if pmid not in pmid_go:
                    pmid_go[pmid] = [inrec['GO_ID']]
                elif inrec['GO_ID'] not in pmid_go[pmid]:
                    pmid_go[pmid].append(inrec['GO_ID'])
                if pmid not in pmid_prot:
                    pmid_prot[pmid] = [inrec['DB_Object_ID']]
                elif inrec['DB_Object_ID'] not in pmid_prot[pmid]:
                    pmid_prot[pmid].append(inrec['DB_Object_ID'])
    return list(pmids.keys()), pmid_go, pmid_prot
def get_ebi(uri):
    '''
    Fetch a gzipped GOA file from the EBI FTP server and parse it.

    The file is stored under ``<cwd>/data`` using the last path component of
    ``uri`` as its name; it is downloaded only when not already present.

    :param uri: path of the file on ``ftp.ebi.ac.uk``.
    :returns: dict mapping each ``DB_Object_ID`` to one annotation record
        (the record with ``DB_Object_ID`` removed). When an id occurs in
        several records, the last one read wins.
    '''
    data_folder = os.getcwd() + '/data'
    fn = uri.split('/')[-1]

    gaf = os.path.join(data_folder, fn)
    if not os.path.isfile(gaf):
        # The download used to fail when the data folder did not exist yet.
        os.makedirs(data_folder, exist_ok=True)
        try:
            # The FTP context manager logs out/closes the connection even
            # when the transfer fails.
            with FTP('ftp.ebi.ac.uk') as ebi_ftp:
                ebi_ftp.login()  # Logs in anonymously
                with open(gaf, 'wb') as fp:
                    ebi_ftp.retrbinary(f'RETR {uri}', fp.write)
        except Exception:
            # Do not leave a truncated file behind: the next call would
            # mistake it for a complete cached download.
            if os.path.isfile(gaf):
                os.remove(gaf)
            raise

    # The file is gzip-compressed, so it has to be opened through gzip.
    with gzip.open(gaf, 'rt') as gaf_fp:
        funcs = {}
        for entry in GOA.gafiterator(gaf_fp):
            uniprot_id = entry.pop('DB_Object_ID')
            funcs[uniprot_id] = entry
    return funcs
def extract_taxa(handle, taxalist):
    """Create one GAF file per taxon from a multi-taxon GAF file.

    For every taxid in ``taxalist`` a file ``<handle.name>.taxon.<taxid>`` is
    created; each input record is written to the file matching the part
    after the colon of its first ``Taxon_ID`` entry.

    :param handle: open, named GAF file handle.
    :param taxalist: list of taxid strings. Don't use a list of int.
    """
    outfiles = {}
    header = "!gaf-version: 2.0\n"
    try:
        for taxid in taxalist:
            if taxid in outfiles:
                # A repeated taxid used to reopen the file and leak the
                # first handle.
                continue
            outfiles[taxid] = open("%s.taxon.%s" % (handle.name, taxid), 'w')
            outfiles[taxid].write(header)
        for inrec in upg.gafiterator(handle):
            cur_taxid = inrec['Taxon_ID'][0].split(':')[1]
            # Dict lookup instead of scanning the list for every record.
            if cur_taxid in outfiles:
                upg.writerec(inrec, outfiles[cur_taxid])
    finally:
        # Close whatever was opened, even if parsing failed part-way.
        for outfile in outfiles.values():
            outfile.close()
def go_enrichment_study(self):
    """Return the cached ``GOEnrichmentStudy``, building it on first use.

    On the first call the human GAF file is read, a gene symbol -> GO id
    association table is built, and a ``GOEnrichmentStudy`` is created for
    the population returned by ``self.gene_symbols()``. Later calls return
    the stored object unchanged.
    """
    if self._go_enrichment_study is None:
        # Collect every GO id annotated to a symbol. Previously records were
        # first stored in a dict keyed by symbol, so each later record
        # replaced the earlier one and every gene kept a single GO term.
        associations = {}
        with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
            for entry in GOA.gafiterator(gaf):
                symbol = entry['DB_Object_Symbol']
                associations.setdefault(symbol, set()).add(str(entry['GO_ID']))

        # Our population is the set of genes we are analysing
        population = self.gene_symbols()
        print("We have %d genes in our population" % len(population))

        self._go_enrichment_study = \
            GOEnrichmentStudy(population, associations, self._gene_ontology,
                              propagate_counts=True,
                              alpha=0.01,
                              methods=[self.method])
    return self._go_enrichment_study
def perform_gene_enrichment_analysis(self, metagene_matrix, method='fdr'):
    """Run a GO enrichment study per component and save the merged results.

    :param metagene_matrix: matrix whose second dimension is the number of
        components; it is passed to ``self.ranked_genes_by_component``.
    :param method: multiple-testing method handed to ``GOEnrichmentStudy``.

    The merged per-component results are written as a tab-separated file to
    ``self.cache_dir + '<prefix>_gea_all.tsv'``. Nothing is returned.
    """
    n_comps = metagene_matrix.shape[1]

    # Download ontology and annotations, if necessary
    self.download_and_cache_resources()
    gene_ontology = obo_parser.GODag('../DownloadedResources/go-basic.obo')

    # Collect every GO id annotated to a symbol. Previously records were
    # first stored in a dict keyed by symbol, so each later record replaced
    # the earlier one and every gene kept a single GO term.
    associations = {}
    with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
        for entry in GOA.gafiterator(gaf):
            symbol = entry['DB_Object_Symbol']
            associations.setdefault(symbol, set()).add(str(entry['GO_ID']))

    # Our population is the set of genes we are analysing
    population = self.gene_symbols()
    print("We have %d genes in our population" % len(population))

    gea = GOEnrichmentStudy(population, associations, gene_ontology,
                            propagate_counts=True,
                            alpha=0.05,
                            methods=[method])
    gea_results_by_component = {}
    rankings = self.ranked_genes_by_component(metagene_matrix)
    for ci in range(n_comps):
        study_genes = rankings[ci]
        print('\nComp. %d: %s...' % (ci, str(study_genes[:10])))
        gea_results_by_component[ci] = gea.run_study(study_genes)

    # Get results into a dataframe per component; components for which the
    # helper returns None are left out.
    gea_results_df_by_component = []
    for ci in range(n_comps):
        ge_df = self._perform_gene_enrichment_analysis_one_component(
            ci, gea_results_by_component, gea)
        if ge_df is not None:
            gea_results_df_by_component.append(ge_df)

    # Merge the per-component dataframes into a single one. DataFrame.append
    # was removed in pandas 2.0; pd.concat rejects an empty list, so that
    # case falls back to an empty frame as before.
    if gea_results_df_by_component:
        gea_all_sig_results_df = pd.concat(gea_results_df_by_component)
    else:
        gea_all_sig_results_df = pd.DataFrame()
    gea_all_sig_results_df.to_csv(
        self.cache_dir + '%s_gea_all.tsv' % self.prefix, sep='\t')
def test_gpi_iterator(self):
    """Check GOA.gpi_iterator on the gpi-version 1.1 yeast sample file."""
    with open("UniProt/gp_information.goa_yeast.28.gpi") as handle:
        recs = list(GOA.gpi_iterator(handle))
    self.assertEqual(len(recs), 300)

    first = recs[0]
    self.assertEqual(sorted(first.keys()), sorted(GOA.GPI11FIELDS))

    # Expected field values of the first record, checked in this order.
    expected_fields = (
        ("DB_Object_ID", "A2P2R3"),
        ("DB_Object_Symbol", "YMR084W"),
        (
            "DB_Object_Name",
            [
                "Putative glutamine--fructose"
                "-6-phosphate aminotransferase"
                " [isomerizing]"
            ],
        ),
        ("DB_Object_Synonym", ["YM084_YEAST", "YMR084W"]),
        ("DB_Object_Type", "protein"),
        ("Taxon", "taxon:559292"),
        ("Parent_Object_ID", ""),
        ("DB_Xref", [""]),
        ("Gene_Product_Properties", ["db_subset=Swiss-Prot"]),
    )
    for field, value in expected_fields:
        self.assertEqual(first[field], value)
def has_experimental(goa_reclist):
    """Tell whether any record in ``goa_reclist`` has experimental evidence.

    A record counts when ``upg.record_has`` matches its ``Evidence`` field
    against the module-level ``GO_EXP_EC`` collection. Scanning stops at the
    first match.

    :param goa_reclist: iterable of GAF records.
    :returns: ``True`` if at least one record matches, else ``False``.
    """
    return any(
        upg.record_has(rec, {'Evidence': GO_EXP_EC}) for rec in goa_reclist
    )
def build_clusters(species):
    """Build PMID/GO/protein mappings from a species GAF file and pickle them.

    For every PMID reference of every record in
    ``GOA_Files/gene_association.goa_<species>`` (relative to ``CURR_PATH``)
    three kinds of mapping are filled through ``add_to_pmid``:
    PMID -> GO id, PMID -> protein id and GO id -> protein id. Each kind is
    kept once per aspect (``F`` -> mf, ``P`` -> bp, ``C`` -> cc) and once for
    all aspects together. All twelve mappings are handed to ``pickle_data``
    with paths under ``Pickled_Data/<species>/``.

    @param species: suffix of the gene association file name; also used for
        the output folder and pickle file names.
    """
    pmid_go_mf = OrderedDict()
    pmid_go_bp = OrderedDict()
    pmid_go_cc = OrderedDict()
    pmid_go = OrderedDict()
    pmid_prot_mf = OrderedDict()
    pmid_prot_bp = OrderedDict()
    pmid_prot_cc = OrderedDict()
    pmid_prot = OrderedDict()
    go_prot_mf = OrderedDict()
    go_prot_bp = OrderedDict()
    go_prot_cc = OrderedDict()
    go_prot = OrderedDict()

    # Aspect code -> (pmid->GO, pmid->protein, GO->protein) for that aspect.
    by_aspect = {
        'P': (pmid_go_bp, pmid_prot_bp, go_prot_bp),
        'F': (pmid_go_mf, pmid_prot_mf, go_prot_mf),
        'C': (pmid_go_cc, pmid_prot_cc, go_prot_cc),
    }

    gaf_path = os.path.join(CURR_PATH,
                            "GOA_Files/gene_association.goa_" + species)
    # The context manager closes the association file, which was previously
    # left open.
    with open(gaf_path) as unigoa_file:
        for inrec in GOA.gafiterator(unigoa_file):
            go_id = inrec['GO_ID']
            prot_id = inrec['DB_Object_ID']
            for dbref in inrec['DB:Reference']:
                if dbref[:4] != 'PMID':
                    continue
                # Drop the "PMID:" prefix.
                pmid = dbref[5:]
                # Records with any other aspect code only reach the
                # all-aspect mappings below.
                if inrec['Aspect'] in by_aspect:
                    asp_pmid_go, asp_pmid_prot, asp_go_prot = \
                        by_aspect[inrec['Aspect']]
                    add_to_pmid(asp_pmid_go, pmid, go_id)
                    add_to_pmid(asp_pmid_prot, pmid, prot_id)
                    add_to_pmid(asp_go_prot, go_id, prot_id)
                add_to_pmid(pmid_go, pmid, go_id)
                add_to_pmid(pmid_prot, pmid, prot_id)
                add_to_pmid(go_prot, go_id, prot_id)

    # (file name prefix, mapping), pickled in this order.
    outputs = (
        ("pmid_go_mf_", pmid_go_mf),
        ("pmid_go_cc_", pmid_go_cc),
        ("pmid_go_bp_", pmid_go_bp),
        ("pmid_go_", pmid_go),
        ("pmid_prot_mf_", pmid_prot_mf),
        ("pmid_prot_cc_", pmid_prot_cc),
        ("pmid_prot_bp_", pmid_prot_bp),
        ("pmid_prot_", pmid_prot),
        ("go_prot_mf_", go_prot_mf),
        ("go_prot_cc_", go_prot_cc),
        ("go_prot_bp_", go_prot_bp),
        ("go_prot_", go_prot),
    )
    for prefix, mapping in outputs:
        pickle_data(mapping,
                    os.path.join(CURR_PATH,
                                 "Pickled_Data/" + species + "/" + prefix + species))
def pmids_from_gaf(unigoa_file):
    """Return the distinct PMIDs referenced in a GAF file as a list.

    :param unigoa_file: open GAF file handle, read via ``GOA.gafiterator``.
    :returns: list of PMID strings (without the ``PMID:`` prefix).
    """
    # A dict is used as a set of seen PMIDs.
    seen = {}
    for record in GOA.gafiterator(unigoa_file):
        for reference in record['DB:Reference']:
            if reference.startswith('PMID'):
                seen[reference[5:]] = None
    # A real list is returned because a dict key view is not subscriptable.
    return list(seen)
def get_GO_genes_info(self):
    """Load the GAF file at ``self.gene_ontology_file_path`` into a DataFrame.

    :returns: ``pandas.DataFrame`` built from the records yielded by
        ``GOA.gafiterator`` (one row per record).
    """
    with open(self.gene_ontology_file_path) as gaf_handle:
        records = list(GOA.gafiterator(gaf_handle))
    return pd.DataFrame(records)
def became_experimental(handle_iea, handle_exp):
    """Identify electronically annotated proteins that later became
    experimentally annotated.

    The experimental records of every protein that also appears in the IEA
    file are written to ``<handle_exp.name>.became_exp``.

    handle_iea: gaf file with proteins exclusively IEA annotated
    handle_exp: gaf file with proteins experimentally annotated
    Note: files should contain only one ontology (either MFO, BPO or CCO)
    to make sense.
    """
    expdict = {}
    # ``with`` guarantees the output file is closed if parsing raises.
    with open(handle_exp.name + ".became_exp", "w") as outhandle:
        outhandle.write('!gaf-version: 2.0\n')
        # First read experimental into memory, keyed by protein id
        for exprec in upg.gafbyproteiniterator(handle_exp):
            expdict[exprec[0]['DB_Object_ID']] = exprec
        # Now read in the non-experimental
        for iearec in upg.gafbyproteiniterator(handle_iea):
            prot_id = iearec[0]['DB_Object_ID']
            if prot_id in expdict:
                upg.writebyproteinrec(expdict[prot_id], outhandle)
def read_gaf_write_tab(gaf_file, include_mfo, outfile):
    """Write ``protein<TAB>GO id`` lines for selected GAF records.

    A record is selected when ``GOA.record_has`` matches both the aspect
    filter and the evidence filter below.

    :param gaf_file: path of the input GAF file.
    :param include_mfo: when true, aspects ``P`` and ``F`` are accepted;
        otherwise only ``P``.
    :param outfile: path of the tab-separated output file.
    :returns: number of lines written.
    """
    # This dict was previously assigned twice with the same value.
    Evidence = {'Evidence': set(['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP',
                                 'TAS', 'IC'])}
    if include_mfo:
        Aspect = {'Aspect': set(['P', 'F'])}
    else:
        Aspect = {'Aspect': set(['P'])}

    counter = 0
    # Both files are closed even if parsing raises. The output is opened
    # first, as before.
    with open(outfile, 'w') as outhandle, open(gaf_file, 'r') as ingafhandle:
        for rec in GOA.gafiterator(ingafhandle):
            if GOA.record_has(rec, Aspect) and GOA.record_has(rec, Evidence):
                outhandle.write("%s\t%s\n" % (rec['DB_Object_ID'], rec['GO_ID']))
                counter += 1
    return counter
def test_gaf_iterator(self):
    """Check GOA.gafiterator on a GAF 2.0 and a GAF 2.1 sample file."""
    # GAF 2.0
    with open("UniProt/goa_yeast.gaf") as handle:
        recs = list(GOA.gafiterator(handle))
    # Number of records
    self.assertEqual(len(recs), 587)
    first = recs[0]
    # Keys must equal the predefined field list
    self.assertEqual(sorted(first.keys()), sorted(GOA.GAF20FIELDS))
    # Values of the first record
    expected_20 = (
        ("DB", "UniProtKB"),
        ("DB_Object_ID", "A0A023PXA5"),
        ("DB_Object_Symbol", "YAL019W-A"),
        ("Qualifier", [""]),
        ("GO_ID", "GO:0003674"),
        ("DB:Reference", ["GO_REF:0000015"]),
        ("Evidence", "ND"),
        ("With", [""]),
    )
    for field, value in expected_20:
        self.assertEqual(first[field], value)

    # GAF 2.1 has the same fields as GAF 2.0
    with open("UniProt/gene_association.goa_yeast.1.gaf") as handle:
        recs = list(GOA.gafiterator(handle))
    # Number of records
    self.assertEqual(len(recs), 300)
    first = recs[0]
    # Keys must equal the predefined field list
    self.assertEqual(sorted(first.keys()), sorted(GOA.GAF20FIELDS))
    # Values of the first record
    expected_21 = (
        ("DB", "UniProtKB"),
        ("DB_Object_ID", "P17536"),
        ("DB_Object_Symbol", "TPM1"),
        ("Qualifier", [""]),
        ("GO_ID", "GO:0000001"),
        ("DB:Reference", ["PMID:10652251"]),
        ("Evidence", "TAS"),
        ("With", [""]),
    )
    for field, value in expected_21:
        self.assertEqual(first[field], value)
def test_gaf_iterator(self):
    """Check GOA.gafiterator on a GAF 2.0 and a GAF 2.1 sample file."""
    # (sample file, expected record count, expected first-record values);
    # GAF 2.1 has the same fields as GAF 2.0.
    cases = (
        ('UniProt/goa_yeast.gaf', 587, (
            ('DB', 'UniProtKB'),
            ('DB_Object_ID', 'A0A023PXA5'),
            ('DB_Object_Symbol', 'YAL019W-A'),
            ('Qualifier', ['']),
            ('GO_ID', 'GO:0003674'),
            ('DB:Reference', ['GO_REF:0000015']),
            ('Evidence', 'ND'),
            ('With', ['']),
        )),
        ('UniProt/gene_association.goa_yeast.1.gaf', 300, (
            ('DB', 'UniProtKB'),
            ('DB_Object_ID', 'P17536'),
            ('DB_Object_Symbol', 'TPM1'),
            ('Qualifier', ['']),
            ('GO_ID', 'GO:0000001'),
            ('DB:Reference', ['PMID:10652251']),
            ('Evidence', 'TAS'),
            ('With', ['']),
        )),
    )
    for path, count, first_values in cases:
        with open(path, 'r') as handle:
            recs = list(GOA.gafiterator(handle))
        # Number of records
        self.assertEqual(len(recs), count)
        head = recs[0]
        # Keys must equal the predefined field list
        self.assertEqual(sorted(head.keys()), sorted(GOA.GAF20FIELDS))
        # Values of the first record
        for field, value in first_values:
            self.assertEqual(head[field], value)
def load_gaf(filename, start=collection.count({})):
    """Load the annotations of a GAF file into ``collection`` as flat documents.

    Each record is upserted under ``_id`` equal to its zero-based position in
    the file. Records with a position below ``start`` are skipped, as are
    records whose evidence code is not in ``exp_codes`` when ``args.exp`` is
    set.

    :param filename: path of the GAF file.
    :param start: position of the first record to load. The default is
        evaluated once, when the function is defined.
    """
    print("Loading %s" % filename)
    for indexed_field in ("DB_Object_ID", "DB", "GO_ID", "Evidence",
                          "Aspect", "Date", "DB_Object_Symbol"):
        collection.create_index(indexed_field)

    with open(filename, 'r') as handler:
        for i, data in enumerate(GOA.gafiterator(handler)):
            if i % 100 == 0:
                sys.stdout.write("\rProcessed annotations\t%s" % i)
            wanted = i >= start and not (
                args.exp and data['Evidence'] not in exp_codes)
            if not wanted:
                continue
            # "%Y%m%d" carries no time part, so this is midnight of that day.
            annotation_date = datetime.datetime.strptime(data['Date'], "%Y%m%d")
            document = {
                "DB_Object_ID": data['DB_Object_ID'],
                "DB_Object_Symbol": data['DB_Object_Symbol'],
                "With": data['With'],
                "Assigned_By": data['Assigned_By'],
                "Annotation_Extension": data['Annotation_Extension'],
                "Gene_Product_Form_ID": data['Gene_Product_Form_ID'],
                "DB:Reference": data['DB:Reference'],
                "GO_ID": data['GO_ID'],
                "Qualifier": data['Qualifier'],
                "Date": annotation_date,
                "DB": data['DB'],
                "created_at": datetime.datetime.utcnow(),
                "DB_Object_Name": data['DB_Object_Name'],
                "DB_Object_Type": data['DB_Object_Type'],
                "Evidence": data['Evidence'],
                "Taxon_ID": data['Taxon_ID'],
                "Aspect": data['Aspect']
            }
            collection.update_one({"_id": i}, {'$set': document}, upsert=True)
    print("\nFinished!")
def get_pmids_from_gaf(gaf_file):
    """Return the distinct PMIDs of the papers cited in a GAF file.

    @param gaf_file: path of a Uniprot_GOA association file in gaf format.
    @return: list of PMID strings (without the ``PMID:`` prefix).
    """
    # A dict is used as a set of seen PMIDs.
    pmids = {}
    # The context manager closes the input file, which was previously leaked.
    with open(gaf_file) as unigoa_file:
        for inrec in GOA.gafiterator(unigoa_file):
            for dbref in inrec['DB:Reference']:
                if dbref[:4] == 'PMID':
                    pmids[dbref[5:]] = None
    return list(pmids.keys())
def load_entire_goa(src_gpa, start=db.goa_uniprot.count({})):
    """Load the annotations of a GPA file into ``db.goa_uniprot``.

    The file is read twice: once to count the records (for the progress
    bar) and once to upsert each record under ``_id`` equal to its
    zero-based position. Records with a position below ``start`` are parsed
    but not written.

    :param src_gpa: path of the GPA file.
    :param start: position of the first record to write. The default is
        evaluated once, when the function is defined.
    """
    logger.info("Countig GeneOntology annotations ...")
    with open(src_gpa, 'r') as handler:
        numannots = sum(1 for _ in GOA._gpa11iterator(handler))
    logger.info("\nLoading %s GO annotations to %s ...\n" % (numannots, dbname))

    with open(src_gpa, 'r') as handler:
        progress = tqdm(range(numannots), desc="annotations already processed")
        # zip advances the progress bar first, then pulls the next record.
        for i, data in zip(progress, GOA._gpa11iterator(handler)):
            # The date is parsed before the skip check, so a malformed date
            # raises even for skipped records.
            annotation_date = datetime.datetime.strptime(data['Date'], "%Y%m%d")
            if i < start:
                continue
            document = {
                "DB_Object_ID": data["DB_Object_ID"],
                "Annotation_Properties": data['Annotation_Properties'],
                "With": data['With'],
                "Interacting_taxon_ID": data['Interacting_taxon_ID'],
                "DB:Reference": data['DB:Reference'],
                "Annotation_Extension": data['Annotation Extension'],
                "Assigned_by": data['Assigned_by'],
                "GO_ID": data['GO_ID'],
                "ECO_Evidence_code": data['ECO_Evidence_code'],
                "Qualifier": data['Qualifier'],
                "Date": annotation_date,
                "DB": data['DB'],
                "created_at": datetime.datetime.utcnow()
            }
            db.goa_uniprot.update_one({"_id": i}, {'$set': document}, upsert=True)
def test_selection_writing(self):
    """Test record_has, and writerec.

    Records matching three filters are selected, written with writerec and
    read back; the round-tripped records must equal the selected ones.

    Adapted from Bio.UniProt.GOA.py by Iddo Friedberg [email protected].
    """
    # Filters, applied in this order
    criteria = (
        {"Taxon_ID": {"taxon:559292"}},
        {"Evidence": {"ND"}},
        {"Synonym": {"YA19A_YEAST", "YAL019W-A"}},
    )

    # Temporary file used to exercise writerec
    descriptor, tmp_path = tempfile.mkstemp()
    os.close(descriptor)

    # Read the sample file and keep the records passing every filter
    with open("UniProt/goa_yeast.gaf", "r") as handle:
        all_recs = list(GOA.gafiterator(handle))
    selected = [
        rec for rec in all_recs
        if all(GOA.record_has(rec, criterion) for criterion in criteria)
    ]
    self.assertEqual(len(selected), 3)

    # Write the selection, preceded by the file header
    with open(tmp_path, "w") as handle:
        handle.write("!gaf-version: 2.1 \n")
        for rec in selected:
            GOA.writerec(rec, handle)

    # Read the written records back
    with open(tmp_path, "r") as handle:
        round_tripped = list(GOA.gafiterator(handle))

    # Delete the temporary file before the final comparison
    os.remove(tmp_path)

    self.assertEqual(selected, round_tripped)
def main():
    '''
    Get a GAF, Convert, Calculate, and output.

    Reads the OBO and GAF paths via read_config(), sets up the graphs,
    converts the GAF records with GAFtoDICT() and passes the result to
    WyattClarkIC(). Progress messages are printed along the way.
    '''
    # Use the assessment configuration file to grab the OBO file.
    obo_path, gaf_path = read_config()
    # print() with a single argument behaves the same on Python 2 and 3;
    # the bare print statements used before are a syntax error on Python 3.
    print("Read the config")
    ICHelper.setupGraphs(obo_path)
    print("Graphs are done")
    # The GAF iterator reads lazily, so the file stays open until all
    # processing is done and is then closed (it used to be leaked).
    with open(gaf_path) as gaf_handle:
        gaf = GOA._gaf20iterator(gaf_handle)
        print("I have made the GAF generator")
        data = GAFtoDICT(gaf)
        print("I have made the dictionary")
        WyattClarkIC(data)
    print("IC values created")
def load(self, filename: str, organism_name: str, annotation_level: str):
    """Import data from a GAF file into a Chado database.

    Every record of the file is passed to ``self._load_gaf_record`` and the
    session is committed once at the end.

    :param filename: path of the GAF file.
    :param organism_name: name passed to ``self._load_organism`` to obtain
        the default organism.
    :param annotation_level: forwarded unchanged to ``_load_gaf_record``.
    """
    organism = self._load_organism(organism_name)
    # One set shared by all records of this run (presumably tracks features
    # that already received a product; confirm in _load_gaf_record).
    seen_features = set()

    with open(filename) as gaf_handle:
        for record in GOA.gafiterator(gaf_handle):
            self._load_gaf_record(record, organism, annotation_level,
                                  seen_features)

    self.session.commit()
def GOTermCounter(file, ontology, term):
    """Count GO ids or PMID references among the records of one ontology.

    Records whose GO id is in ``root_terms`` or whose evidence code is not
    in ``EC`` are ignored, as are records of another aspect.

    :param file: GAF file handle, read via ``goa.gafiterator``.
    :param ontology: aspect code a record must have to be counted.
    :param term: ``'GO term'`` to count GO ids, ``'PMID'`` to count the
        references starting with ``PMID``; any other value counts nothing.
    :returns: dict mapping each counted item to its number of occurrences.
    """
    counts = {}
    for entry in goa.gafiterator(file):
        if entry['GO_ID'] in root_terms or entry['Evidence'] not in EC:
            continue
        if entry['Aspect'] != ontology:
            continue
        if term == 'GO term':
            go_id = entry['GO_ID']
            counts[go_id] = counts.get(go_id, 0) + 1
        elif term == 'PMID':
            for ref in entry['DB:Reference']:
                if re.match("PMID", ref):
                    counts[ref] = counts.get(ref, 0) + 1
    return counts
def test_gpa_iterator(self):
    """Check GOA.gpa_iterator on the yeast GPA sample file."""
    with open("UniProt/goa_yeast.gpa.59.gpa") as handle:
        recs = list(GOA.gpa_iterator(handle))
    self.assertEqual(len(recs), 300)

    first = recs[0]
    self.assertEqual(sorted(first.keys()), sorted(GOA.GPA11FIELDS))

    # Expected field values of the first record, checked in this order.
    expected_fields = (
        ("DB", "UniProtKB"),
        ("DB_Object_ID", "A0A023PXA5"),
        ("Qualifier", ["enables"]),
        ("GO_ID", "GO:0003674"),
        ("DB:Reference", ["GO_REF:0000015"]),
        ("ECO_Evidence_code", "ECO:0000307"),
        ("With", [""]),
        ("Interacting_taxon_ID", ""),
        ("Date", "20030730"),
        ("Assigned_by", "SGD"),
        ("Annotation Extension", [""]),
        ("Annotation_Properties", "go_evidence=ND"),
    )
    for field, value in expected_fields:
        self.assertEqual(first[field], value)
def test_gpi_iterator_one_two(self):
    """Check GOA.gpi_iterator on the gpi-version 1.2 human sample file."""
    with open("UniProt/goa_human_sample.gpi") as handle:
        recs = list(GOA.gpi_iterator(handle))
    self.assertEqual(len(recs), 9)

    first = recs[0]
    self.assertEqual(sorted(first.keys()), sorted(GOA.GPI12FIELDS))

    # Expected field values of the first record, checked in this order.
    expected_fields = (
        ("DB", "UniProtKB"),
        ("DB_Object_ID", "A0A024R1R8"),
        ("DB_Object_Symbol", "hCG_2014768"),
        ("DB_Object_Name", ["HCG2014768, isoform CRA_a"]),
        ("DB_Object_Synonym", ["hCG_2014768"]),
        ("DB_Object_Type", "protein"),
        ("Taxon", "taxon:9606"),
        ("Parent_Object_ID", ""),
        ("DB_Xref", [""]),
        ("Gene_Product_Properties", ["db_subset=TrEMBL"]),
    )
    for field, value in expected_fields:
        self.assertEqual(first[field], value)
def test_gpa_iterator(self):
    """Check GOA.gpa_iterator on the yeast GPA sample file."""
    with open('UniProt/goa_yeast.gpa.59.gpa', 'r') as handle:
        parsed = [rec for rec in GOA.gpa_iterator(handle)]
    self.assertEqual(len(parsed), 300)

    head = parsed[0]
    self.assertEqual(sorted(head.keys()), sorted(GOA.GPA11FIELDS))

    # Field values of the first record
    check = self.assertEqual
    check(head['DB'], 'UniProtKB')
    check(head['DB_Object_ID'], 'A0A023PXA5')
    check(head['Qualifier'], ['enables'])
    check(head['GO_ID'], 'GO:0003674')
    check(head['DB:Reference'], ['GO_REF:0000015'])
    check(head['ECO_Evidence_code'], 'ECO:0000307')
    check(head['With'], [''])
    check(head['Interacting_taxon_ID'], '')
    check(head['Date'], '20030730')
    check(head['Assigned_by'], 'SGD')
    check(head['Annotation Extension'], [''])
    check(head['Annotation_Properties'], 'go_evidence=ND')
def load_dataframe(self, file_resources, npartitions=None):
    """Build one annotation table from all ``.gaf`` entries of ``file_resources``.

    Every entry whose key contains ``".gaf"`` is parsed with
    ``GOA.gafiterator`` and the resulting frames are concatenated. Three
    columns (``go_name``, ``namespace``, ``is_a``) are then added by mapping
    ``GO_ID`` through the node attributes of ``self.network``.

    :param file_resources: mapping of file name to a handle readable by
        ``GOA.gafiterator``.
    :param npartitions: not used by this method.
    :returns: the concatenated ``pandas.DataFrame``.
    """
    frames = [
        pd.DataFrame(list(GOA.gafiterator(file_resources[name])))
        for name in file_resources
        if ".gaf" in name
    ]
    go_annotations = pd.concat(frames)

    go_terms = pd.DataFrame.from_dict(self.network.nodes,
                                      orient="index",
                                      dtype="object")
    # (new column, node attribute) pairs, added in this order.
    for column, attribute in (("go_name", "name"),
                              ("namespace", "namespace"),
                              ("is_a", "is_a")):
        go_annotations[column] = go_annotations["GO_ID"].map(go_terms[attribute])
    return go_annotations
def parse_gpi(infile, taxon=''):
    """Collect the Swiss-Prot protein ids of one taxon from a GPI file.

    The keys of the first record are printed; if that record lacks the
    ``Gene_Product_Properties`` field, a message is printed and the program
    exits with status 1.

    :param infile: path of the gp information (GPI) file.
    :param taxon: taxon id (the part after ``taxon:``) to select.
    :returns: mapping whose keys are the ``DB_Object_ID`` of every record
        whose first gene product property value starts with ``Swiss-Prot``
        and whose taxon id equals ``taxon``; all values are ``1``.
    """
    sp_id = defaultdict()
    # The context manager closes the input file, which was previously leaked.
    with open(infile, 'r') as infile_handle:
        format_checked = False
        for rec in GOAParser.gpi_iterator(infile_handle):
            if not format_checked:
                format_checked = True
                print(list(rec.keys()))
                # ``in`` replaces dict.has_key(), removed in Python 3.
                if 'Gene_Product_Properties' not in rec:
                    print("This version of the gp information file does not contain all required information")
                    sys.exit(1)
            # The first record is now processed like any other; it used to
            # be consumed by the format check and silently dropped.
            taxid = rec['Taxon'].split(':')[1].strip()
            db = rec['Gene_Product_Properties'][0].split('=')[1].strip()
            if db.startswith('Swiss-Prot') and taxon == taxid:
                sp_id[rec['DB_Object_ID']] = 1
    return sp_id
def split_to_ontologies(handle):
    """Split a GAF file into three ontology files.

    Records are routed by their ``Aspect`` field: ``F`` to
    ``<handle.name>.MFO``, ``P`` to ``<handle.name>.BPO`` and ``C`` to
    ``<handle.name>.CCO``.

    :param handle: open, named GAF file handle.
    :raises ValueError: if a record has any other aspect.
    """
    header = "!gaf-version: 2.0\n"
    # All three outputs are closed even when an unknown aspect aborts the
    # split.
    with open("%s.MFO" % handle.name, 'w') as out_mfo, \
            open("%s.BPO" % handle.name, 'w') as out_bpo, \
            open("%s.CCO" % handle.name, 'w') as out_cco:
        out_bpo.write(header)
        out_mfo.write(header)
        out_cco.write(header)
        for inrec in upg.gafiterator(handle):
            if inrec['Aspect'] == 'F':
                upg.writerec(inrec, out_mfo)
            elif inrec['Aspect'] == 'P':
                upg.writerec(inrec, out_bpo)
            elif inrec['Aspect'] == 'C':
                upg.writerec(inrec, out_cco)
            else:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError('unknown ontology aspect %s' % inrec['Aspect'])
def test_gpi_iterator(self):
    """Check GOA.gpi_iterator on the yeast GPI sample file."""
    with open('UniProt/gp_information.goa_yeast.28.gpi', 'r') as handle:
        parsed = [rec for rec in GOA.gpi_iterator(handle)]
    self.assertEqual(len(parsed), 300)

    head = parsed[0]
    self.assertEqual(sorted(head.keys()), sorted(GOA.GPI11FIELDS))

    # Field values of the first record
    check = self.assertEqual
    check(head['DB_Object_ID'], 'A2P2R3')
    check(head['DB_Object_Symbol'], 'YMR084W')
    check(head['DB_Object_Name'], [
        'Putative glutamine--fructose'
        '-6-phosphate aminotransferase'
        ' [isomerizing]'
    ])
    check(head['DB_Object_Synonym'], ['YM084_YEAST', 'YMR084W'])
    check(head['DB_Object_Type'], 'protein')
    check(head['Taxon'], 'taxon:559292')
    check(head['Parent_Object_ID'], '')
    check(head['DB_Xref'], [''])
    check(head['Gene_Product_Properties'], ['db_subset=Swiss-Prot'])
import Bio.UniProt.GOA as goa import sys import Bio.Entrez as ez """"" Retrieving protein references from the yeast association file in GAF 2.0 format according to different criteria """ """" Retrieve all references cited to annotate proteins with Experimental Evidence Codes """ handle = open("gene_association.goa_yeast") # open the association gene file of the yeast proteins = goa.gafiterator(handle) # read all records in the file Evidences = {"Evidence":set(["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"])} print ("GO-annotated proteins supported by Experimental Evidence Code") for protein in proteins: if goa.record_has(protein, Evidences): print(protein['DB:Reference']) """" Retrieve all references cited to annotate proteins with Experimental Evidence Codes in the Molecular Function aspect of GO """ handle = open("gene_association.goa_yeast") proteins = goa.gafiterator(handle) Evi_Aspect = {"Evidence":set(["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"]), "Aspect":set(["F"])} print ("GO-annotated proteins supported by Experimental Evidence Code in the Molecular Function Ontology") for protein in proteins: if goa.record_has(protein, Evi_Aspect):
def parseGAF(database='PDB', **kwargs):
    """Parse a GO Association File (GAF) corresponding to
    a particular database collection into a dictionary
    for ease of querying.

    The file is looked up in *data_folder*; when it is missing or empty it
    is downloaded from ``ftp.ebi.ac.uk``, decompressed and stored there.

    See `GAF`_ for more information on the file format

    .. _GAF: http://geneontology.org/docs/go-annotation-file-gaf-format-20/

    :arg database: name of the database of interest
        default is PDB. Others include UNIPROT and
        common names of many organisms.
    :type database: str

    :arg filename: filename for the gaf of interest
        default is goa_ and the database name in lower case
        and .gaf (``_all.gaf.gz`` for UNIPROT)
    :type filename: str

    :arg data_folder: folder in which the file is stored and looked up
        default is the current working directory
    :type data_folder: str

    :returns: dictionary mapping each ``DB_Object_ID`` to the list of its
        annotation entries (each entry without its ``DB_Object_ID`` key)

    :raises TypeError: if *database* is not a string
    :raises ValueError: if the file cannot be retrieved from the server
    """
    # Validate before importing so a bad argument is reported as such.
    if not isinstance(database, str):
        raise TypeError('database should be a string')

    import Bio.UniProt.GOA as GOA

    database = database.upper()
    filename = kwargs.get('filename', None)
    if filename is None:
        if database == 'UNIPROT':
            filename = 'goa_' + database.lower() + '_all.gaf.gz'
        else:
            filename = 'goa_' + database.lower() + '.gaf'

    data_folder = kwargs.get('data_folder', os.getcwd())

    # If the file doesn't already exist, download it
    gaf = os.path.join(data_folder, filename)
    if not(os.path.exists(gaf) and os.path.getsize(gaf) > 0):
        LOGGER.info('Downloading file {0} to {1}'.format(filename, gaf))

        # Request the compressed file; do not append a second ".gz" when
        # the name already carries one (as the UNIPROT default does).
        if filename.endswith('.gz'):
            remote_name = filename
        else:
            remote_name = filename + '.gz'

        data_stream = BytesIO()
        ftp_host = 'ftp.ebi.ac.uk'
        ftp = FTP(ftp_host)
        try:
            ftp.login()
            try:
                ftp.cwd('pub/databases/GO/goa')
                ftp.cwd(database)
                ftp.retrbinary('RETR {0}'.format(remote_name),
                               data_stream.write)
            except Exception:
                # Exception (not a bare except) so that KeyboardInterrupt
                # and SystemExit are not turned into a ValueError.
                raise ValueError('Cannot find the requested GO association file')
            # Logout from FTP server
            ftp.quit()
        finally:
            # Releases the connection on failure; harmless after quit().
            ftp.close()

        zip_data = data_stream.getvalue()
        data_stream.close()

        rawdata = gunzip(zip_data)
        if PY3K:
            rawdata = rawdata.decode()

        # Store the file where it is looked up above. It used to be written
        # to (and read from) the bare filename in the working directory, so
        # a custom data_folder was never honoured.
        if not os.path.isdir(data_folder):
            os.makedirs(data_folder)
        with open(gaf, 'w') as gaf_fp:
            gaf_fp.write(rawdata)

        LOGGER.info('Download completed for file {0}'.format(filename))

    with open(gaf, 'rt') as gaf_fp:
        funcs = defaultdict(list)  # Initialise the dictionary of functions

        # Iterate on each function using Bio.UniProt.GOA library.
        LOGGER.info('Iterating through entries in {0}'.format(gaf))
        for entry in GOA.gafiterator(gaf_fp):
            protein_id = entry.pop('DB_Object_ID')
            funcs[protein_id].append(entry)

    return funcs
""" Returns the pmids of the papers this paper cites """ cites_list = [] handle = ez.efetch("pubmed", id=pmid, retmode="xml") pubmed_rec = ez.parse(handle).__next__() for ref in pubmed_rec['MedlineCitation']['CommentsCorrectionsList']: if ref.attributes['RefType'] == 'Cites': cites_list.append(str(ref['PMID'])) return cites_list f = open ("papers and citations.txt","w") st = "GO-annotated proteins supported by IGI evidence (Inferred from Genetic Interaction)\n" handle = open("gene_association.goa_yeast") proteins = goa.gafiterator(handle) Evi_Aspect = {"Evidence":set(["IGI"])} for protein in proteins: if goa.record_has(protein, Evi_Aspect): for p in protein['DB:Reference']: if p[:4] == "PMID": st += "Main PubMed reference: "+ p +"\n" citations = get_citations(p[5:]) for cit in citations: st += cit + " " st += "\n" f.write(st) f.close()