def test_selection_writing(self):
    """Test record_has, and writerec.

    Adapted from Bio.UniProt.GOA.py by Iddo Friedberg [email protected].
    """
    recs = []
    filtered = []

    # Fields to filter
    evidence = {"Evidence": {"ND"}}
    synonym = {"Synonym": {"YA19A_YEAST", "YAL019W-A"}}
    taxon_id = {"Taxon_ID": {"taxon:559292"}}

    # Temporary file to test writerec
    f_number, f_filtered = tempfile.mkstemp()
    os.close(f_number)

    # Open a file and select records as per filter
    with open("UniProt/goa_yeast.gaf", "r") as handle:
        for rec in GOA.gafiterator(handle):
            recs.append(rec)
            # Filtering
            if (
                GOA.record_has(rec, taxon_id)
                and GOA.record_has(rec, evidence)
                and GOA.record_has(rec, synonym)
            ):
                filtered.append(rec)

    # Check number of filtered records
    self.assertEqual(len(filtered), 3)

    # Write the filtered records to a file using writerec
    with open(f_filtered, "w") as handle:
        handle.write("!gaf-version: 2.1 \n")  # Add the file header
        for rec in filtered:
            GOA.writerec(rec, handle)

    # Open and read the file containing the filtered records
    recs_ff = []  # Records from the filtered file
    with open(f_filtered, "r") as handle:
        for rec in GOA.gafiterator(handle):
            recs_ff.append(rec)

    # Delete test file
    os.remove(f_filtered)

    # Compare the records saved by writerec with the filtered records
    self.assertEqual(filtered, recs_ff)
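# A minimal standalone sketch of the GOA.record_has semantics used in the
# test above: record_has returns True if ANY of the given fields matches the
# allowed set, which is why the test combines several single-field calls
# with "and". The example record below is made up for illustration.
from Bio.UniProt import GOA

rec = {"Evidence": "ND", "Taxon_ID": ["taxon:559292"]}
print(GOA.record_has(rec, {"Evidence": {"ND"}}))           # True: value in set
print(GOA.record_has(rec, {"Taxon_ID": {"taxon:9606"}}))   # False: no overlap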
def filter_in_experimental(handle):
    outhandle = open(handle.name + ".exp_evidence", "w")
    outhandle.write('!gaf-version: 2.0\n')
    for inrec in upg.gafiterator(handle):
        if upg.record_has(inrec, {'Evidence': GO_EXP_EC}):
            upg.writerec(inrec, outhandle)
    outhandle.close()
def filter_in_IEA(handle):
    outhandle = open(handle.name + ".IEA", "w")
    outhandle.write('!gaf-version: 2.0\n')
    for inrec in upg.gafiterator(handle):
        if inrec['Evidence'] == 'IEA':
            upg.writerec(inrec, outhandle)
    outhandle.close()
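# Hedged usage sketch for the two filters above. The "upg" alias matches the
# snippets, but GO_EXP_EC is not defined in them; the set below is an assumed
# definition of the experimental evidence codes, as is the input filename.
from Bio.UniProt import GOA as upg

GO_EXP_EC = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP'}  # assumed definition

if __name__ == '__main__':
    with open('gene_association.goa_yeast') as handle:
        filter_in_experimental(handle)  # writes gene_association.goa_yeast.exp_evidence
    with open('gene_association.goa_yeast') as handle:
        filter_in_IEA(handle)           # writes gene_association.goa_yeast.IEA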
def go_enrichment_study(self):
    if self._go_enrichment_study is None:
        # Load the human annotations
        c = 0
        with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
            funcs = {}
            for entry in GOA.gafiterator(gaf):
                c += 1
                uniprot_id = entry.pop('DB_Object_Symbol')
                funcs[uniprot_id] = entry

        # Our population is the set of genes we are analysing
        population = self.gene_symbols()
        print("We have %d genes in our population" % len(population))

        # Build associations from the functional annotations we got from the GAF file
        associations = {}
        for x in funcs:
            if x not in associations:
                associations[x] = set()
            associations[x].add(str(funcs[x]['GO_ID']))

        self._go_enrichment_study = \
            GOEnrichmentStudy(population,
                              associations,
                              self._gene_ontology,
                              propagate_counts=True,
                              alpha=0.01,
                              methods=[self.method])
    return self._go_enrichment_study
def pmids_from_gaf(gaf_file):
    """Get the papers cited in the UniProt-GOA file by their PMID, plus the
    GO terms and proteins associated with each paper.

    @param gaf_file: UniProt-GOA association file in GAF format.
    """
    pmid_go = {}
    unigoa_file = open(gaf_file)
    pmids = {}
    pmid_prot = {}
    for inrec in GOA.gafiterator(unigoa_file):
        for dbref in inrec['DB:Reference']:
            if dbref[:4] == 'PMID':
                pmid = dbref[5:]
                pmids[pmid] = None
                if pmid not in pmid_go:
                    pmid_go[pmid] = [inrec['GO_ID']]
                elif inrec['GO_ID'] not in pmid_go[pmid]:
                    pmid_go[pmid].append(inrec['GO_ID'])
                if pmid not in pmid_prot:
                    pmid_prot[pmid] = [inrec['DB_Object_ID']]
                elif inrec['DB_Object_ID'] not in pmid_prot[pmid]:
                    pmid_prot[pmid].append(inrec['DB_Object_ID'])
    return list(pmids.keys()), pmid_go, pmid_prot
def get_ebi(uri):
    '''
    Fetches the GOA file for a species from the UniProt-GOA FTP site
    (ftp.ebi.ac.uk) using Biopython.
    Returns the annotations keyed by DB_Object_ID.
    '''
    data_folder = os.getcwd() + '/data'
    fn = uri.split('/')[-1]
    # Check if the file exists already
    gaf = os.path.join(data_folder, fn)
    if not os.path.isfile(gaf):
        # Log in to the FTP server
        ebi_ftp = FTP('ftp.ebi.ac.uk')
        ebi_ftp.login()  # Logs in anonymously
        # Download
        with open(gaf, 'wb') as fp:
            ebi_ftp.retrbinary(f'RETR {uri}', fp.write)
        # Log out from the FTP server
        ebi_ftp.quit()

    # The file is gzip-compressed, so we need to open it accordingly
    with gzip.open(gaf, 'rt') as gaf_fp:
        funcs = {}  # Initialise the dictionary of functions
        # Iterate over each annotation using the Bio.UniProt.GOA library.
        for entry in GOA.gafiterator(gaf_fp):
            uniprot_id = entry.pop('DB_Object_ID')
            funcs[uniprot_id] = entry
    return funcs
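# Hedged usage sketch for get_ebi above. The FTP path follows the public
# UniProt-GOA layout on ftp.ebi.ac.uk, and the function assumes a ./data
# directory already exists; both are assumptions, not from the snippet.
yeast_uri = '/pub/databases/GO/goa/YEAST/goa_yeast.gaf.gz'
yeast_funcs = get_ebi(yeast_uri)
print('%d annotated gene products' % len(yeast_funcs))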
def perform_gene_enrichment_analysis(self, metagene_matrix, method='fdr'):
    # Load the Gene Ontology
    n_comps = metagene_matrix.shape[1]
    self.download_and_cache_resources()  # Download ontology and annotations, if necessary
    gene_ontology = obo_parser.GODag('../DownloadedResources/go-basic.obo')

    # Load the human annotations
    c = 0
    with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
        funcs = {}
        for entry in GOA.gafiterator(gaf):
            c += 1
            uniprot_id = entry.pop('DB_Object_Symbol')
            funcs[uniprot_id] = entry

    # Our population is the set of genes we are analysing
    population = self.gene_symbols()
    print("We have %d genes in our population" % len(population))

    # Build associations from the functional annotations we got from the GAF file
    associations = {}
    for x in funcs:
        if x not in associations:
            associations[x] = set()
        associations[x].add(str(funcs[x]['GO_ID']))

    gea = GOEnrichmentStudy(population,
                            associations,
                            gene_ontology,
                            propagate_counts=True,
                            alpha=0.05,
                            methods=[method])
    gea_results_by_component = {}
    rankings = self.ranked_genes_by_component(metagene_matrix)
    for ci in range(n_comps):
        study_genes = rankings[ci]
        print('\nComp. %d: %s...' % (ci, str(study_genes[:10])))
        gea_results_by_component[ci] = gea.run_study(study_genes)

    # Get results into a dataframe per component.  Easiest way is to use a
    # routine to write a .tsv file, then read it back and filter.
    gea_results_df_by_component = []
    for ci in range(n_comps):
        ge_df = self._perform_gene_enrichment_analysis_one_component(
            ci, gea_results_by_component, gea)
        if ge_df is not None:
            gea_results_df_by_component += [ge_df]

    # Merge the per-component dataframes into a single one
    gea_all_sig_results_df = pd.DataFrame()
    gea_all_sig_results_df = gea_all_sig_results_df.append(
        gea_results_df_by_component)

    gea_all_sig_results_df.to_csv(self.cache_dir + '%s_gea_all.tsv' % self.prefix,
                                  sep='\t')
def build_clusters(species):
    """Build GO clusters from a species gene association file.

    Each cluster contains a representative GO term, the proteins annotated
    to this term and all the papers in which those proteins appear.

    @param species: species suffix of the UniProt-GOA association file
        (GAF format) under GOA_Files/.
    """
    go_clusters = {}
    pmid_go = {}
    pmid_prot = {}
    unigoa_file = open(os.path.join(CURR_PATH, "GOA_Files/gene_association.goa_" + species))
    for inrec in GOA.gafiterator(unigoa_file):
        for dbref in inrec['DB:Reference']:
            if dbref[:4] == 'PMID':
                pmid = dbref[5:]
                # Using the built-in set type (the original Python 2 "Set"
                # from the removed "sets" module does not exist in Python 3)
                if inrec['GO_ID'] not in go_clusters:
                    go_clusters[inrec['GO_ID']] = {'proteins': set([inrec['DB_Object_ID']]),
                                                   'papers': set([pmid])}
                else:
                    go_clusters[inrec['GO_ID']]['proteins'].add(inrec['DB_Object_ID'])
                    go_clusters[inrec['GO_ID']]['papers'].add(pmid)
                if pmid not in pmid_go:
                    pmid_go[pmid] = set([inrec['GO_ID']])
                else:
                    pmid_go[pmid].add(inrec['GO_ID'])
                if pmid not in pmid_prot:
                    pmid_prot[pmid] = set([inrec['DB_Object_ID']])
                else:
                    pmid_prot[pmid].add(inrec['DB_Object_ID'])

    pickle_data(go_clusters, os.path.join(CURR_PATH, "Pickled_Data/go_clusters_" + species))
    pickle_data(pmid_go, os.path.join(CURR_PATH, "Pickled_Data/pmid_go_" + species))
    pickle_data(pmid_prot, os.path.join(CURR_PATH, "Pickled_Data/pmid_prot_" + species))
def build_clusters(species):
    """Build GO clusters from a species gene association file.

    Builds PMID-to-GO, PMID-to-protein and GO-to-protein mappings, both
    overall and split by GO aspect (MF, BP, CC), and pickles each mapping.

    @param species: species suffix of the UniProt-GOA association file
        (GAF format) under GOA_Files/.
    """
    pmid_go_mf = OrderedDict()
    pmid_go_bp = OrderedDict()
    pmid_go_cc = OrderedDict()
    pmid_go = OrderedDict()
    pmid_prot_mf = OrderedDict()
    pmid_prot_bp = OrderedDict()
    pmid_prot_cc = OrderedDict()
    pmid_prot = OrderedDict()
    go_prot_mf = OrderedDict()
    go_prot_bp = OrderedDict()
    go_prot_cc = OrderedDict()
    go_prot = OrderedDict()
    unigoa_file = open(os.path.join(CURR_PATH, "GOA_Files/gene_association.goa_" + species))
    for inrec in GOA.gafiterator(unigoa_file):
        for dbref in inrec['DB:Reference']:
            if dbref[:4] == 'PMID':
                pmid = dbref[5:]
                if inrec['Aspect'] == 'P':
                    add_to_pmid(pmid_go_bp, pmid, inrec['GO_ID'])
                    add_to_pmid(pmid_prot_bp, pmid, inrec['DB_Object_ID'])
                    add_to_pmid(go_prot_bp, inrec['GO_ID'], inrec['DB_Object_ID'])
                elif inrec['Aspect'] == 'F':
                    add_to_pmid(pmid_go_mf, pmid, inrec['GO_ID'])
                    add_to_pmid(pmid_prot_mf, pmid, inrec['DB_Object_ID'])
                    add_to_pmid(go_prot_mf, inrec['GO_ID'], inrec['DB_Object_ID'])
                elif inrec['Aspect'] == 'C':
                    add_to_pmid(pmid_go_cc, pmid, inrec['GO_ID'])
                    add_to_pmid(pmid_prot_cc, pmid, inrec['DB_Object_ID'])
                    add_to_pmid(go_prot_cc, inrec['GO_ID'], inrec['DB_Object_ID'])
                add_to_pmid(pmid_go, pmid, inrec['GO_ID'])
                add_to_pmid(pmid_prot, pmid, inrec['DB_Object_ID'])
                add_to_pmid(go_prot, inrec['GO_ID'], inrec['DB_Object_ID'])

    pickle_data(pmid_go_mf, os.path.join(CURR_PATH, "Pickled_Data/" + species + "/pmid_go_mf_" + species))
    pickle_data(pmid_go_cc, os.path.join(CURR_PATH, "Pickled_Data/" + species + "/pmid_go_cc_" + species))
    pickle_data(pmid_go_bp, os.path.join(CURR_PATH, "Pickled_Data/" + species + "/pmid_go_bp_" + species))
    pickle_data(pmid_go, os.path.join(CURR_PATH, "Pickled_Data/" + species + "/pmid_go_" + species))
    pickle_data(pmid_prot_mf, os.path.join(CURR_PATH, "Pickled_Data/" + species + "/pmid_prot_mf_" + species))
    pickle_data(pmid_prot_cc, os.path.join(CURR_PATH, "Pickled_Data/" + species + "/pmid_prot_cc_" + species))
    pickle_data(pmid_prot_bp, os.path.join(CURR_PATH, "Pickled_Data/" + species + "/pmid_prot_bp_" + species))
    pickle_data(pmid_prot, os.path.join(CURR_PATH, "Pickled_Data/" + species + "/pmid_prot_" + species))
    pickle_data(go_prot_mf, os.path.join(CURR_PATH, "Pickled_Data/" + species + "/go_prot_mf_" + species))
    pickle_data(go_prot_cc, os.path.join(CURR_PATH, "Pickled_Data/" + species + "/go_prot_cc_" + species))
    pickle_data(go_prot_bp, os.path.join(CURR_PATH, "Pickled_Data/" + species + "/go_prot_bp_" + species))
    pickle_data(go_prot, os.path.join(CURR_PATH, "Pickled_Data/" + species + "/go_prot_" + species))
def get_GO_genes_info(self):
    lines = []
    with open(self.gene_ontology_file_path) as file:
        for line in GOA.gafiterator(file):
            lines.append(line)
    go_df = pd.DataFrame(lines)
    return go_df
def pmids_from_gaf(unigoa_file):
    pmids = {}
    for inrec in GOA.gafiterator(unigoa_file):
        for dbref in inrec['DB:Reference']:
            if dbref[:4] == 'PMID':
                pmid = dbref[5:]
                pmids[pmid] = None
    # Cast to list because a dict_keys view is not subscriptable
    return list(pmids.keys())
def test_gaf_iterator(self):
    """Test GOA GAF file iterator."""
    # Test GAF 2.0
    recs = []
    with open("UniProt/goa_yeast.gaf") as handle:
        for rec in GOA.gafiterator(handle):
            recs.append(rec)

    # Check number of records
    self.assertEqual(len(recs), 587)
    # Check keys are the same as the predefined fields
    self.assertEqual(sorted(recs[0].keys()), sorted(GOA.GAF20FIELDS))
    # Check values of the first record
    self.assertEqual(recs[0]["DB"], "UniProtKB")
    self.assertEqual(recs[0]["DB_Object_ID"], "A0A023PXA5")
    self.assertEqual(recs[0]["DB_Object_Symbol"], "YAL019W-A")
    self.assertEqual(recs[0]["Qualifier"], [""])
    self.assertEqual(recs[0]["GO_ID"], "GO:0003674")
    self.assertEqual(recs[0]["DB:Reference"], ["GO_REF:0000015"])
    self.assertEqual(recs[0]["Evidence"], "ND")
    self.assertEqual(recs[0]["With"], [""])

    # Test GAF 2.1; it has the same fields as GAF 2.0
    recs = []
    with open("UniProt/gene_association.goa_yeast.1.gaf") as handle:
        for rec in GOA.gafiterator(handle):
            recs.append(rec)

    # Check number of records
    self.assertEqual(len(recs), 300)
    # Check keys are the same as the predefined fields
    self.assertEqual(sorted(recs[0].keys()), sorted(GOA.GAF20FIELDS))
    # Check values of the first record
    self.assertEqual(recs[0]["DB"], "UniProtKB")
    self.assertEqual(recs[0]["DB_Object_ID"], "P17536")
    self.assertEqual(recs[0]["DB_Object_Symbol"], "TPM1")
    self.assertEqual(recs[0]["Qualifier"], [""])
    self.assertEqual(recs[0]["GO_ID"], "GO:0000001")
    self.assertEqual(recs[0]["DB:Reference"], ["PMID:10652251"])
    self.assertEqual(recs[0]["Evidence"], "TAS")
    self.assertEqual(recs[0]["With"], [""])
def load_gaf(filename, start=collection.count({})):
    # Load GOA into a flat structure.
    # Note: the default for "start" is evaluated once, at definition time.
    print("Loading %s" % filename)
    collection.create_index("DB_Object_ID")
    collection.create_index("DB")
    collection.create_index("GO_ID")
    collection.create_index("Evidence")
    collection.create_index("Aspect")
    collection.create_index("Date")
    collection.create_index("DB_Object_Symbol")
    with open(filename, 'r') as handler:
        goa_iterator = GOA.gafiterator(handler)
        for i, data in enumerate(goa_iterator):
            if i % 100 == 0:
                sys.stdout.write("\rProcessed annotations\t%s" % i)
            if i < start or (args.exp and data['Evidence'] not in exp_codes):
                continue
            date = datetime.datetime.strptime(data['Date'], "%Y%m%d").date()
            json = {
                "DB_Object_ID": data['DB_Object_ID'],
                "DB_Object_Symbol": data['DB_Object_Symbol'],
                "With": data['With'],
                "Assigned_By": data['Assigned_By'],
                "Annotation_Extension": data['Annotation_Extension'],
                "Gene_Product_Form_ID": data['Gene_Product_Form_ID'],
                "DB:Reference": data['DB:Reference'],
                "GO_ID": data['GO_ID'],
                "Qualifier": data['Qualifier'],
                "Date": datetime.datetime.fromordinal(date.toordinal()),
                "DB": data['DB'],
                "created_at": datetime.datetime.utcnow(),
                "DB_Object_Name": data['DB_Object_Name'],
                "DB_Object_Type": data['DB_Object_Type'],
                "Evidence": data['Evidence'],
                "Taxon_ID": data['Taxon_ID'],
                "Aspect": data['Aspect'],
            }
            collection.update_one({"_id": i}, {'$set': json}, upsert=True)
    print("\nFinished!")
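# The loader above relies on module-level names that are not shown in the
# snippet (collection, args, exp_codes); a plausible setup might look like
# the following, where the MongoDB database/collection names and the
# evidence-code set are all assumptions:
import argparse
import datetime
import sys

import pymongo
from Bio.UniProt import GOA

client = pymongo.MongoClient()             # local MongoDB instance
collection = client['goa']['annotations']  # assumed database/collection names
exp_codes = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP'}

parser = argparse.ArgumentParser()
parser.add_argument('--exp', action='store_true',
                    help='keep only experimentally supported annotations')
args = parser.parse_args()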
def extract_taxon(handle, in_taxid):
    """Create a GAF file for a single taxon."""
    header = "!gaf-version: 2.0\n"
    if isinstance(in_taxid, int):
        taxid = str(in_taxid)
    else:
        taxid = in_taxid.strip()
    outfile = open("%s.taxon.%s" % (handle.name, taxid), 'w')
    outfile.write(header)
    for inrec in upg.gafiterator(handle):
        if inrec['Taxon_ID'][0].split(':')[1] == taxid:
            upg.writerec(inrec, outfile)
    outfile.close()
def get_pmids_from_gaf(gaf_file):
    """Get the papers cited in the UniProt-GOA file by their PMID.

    @param gaf_file: UniProt-GOA association file in GAF format.
    """
    unigoa_file = open(gaf_file)
    pmids = {}
    for inrec in GOA.gafiterator(unigoa_file):
        for dbref in inrec['DB:Reference']:
            if dbref[:4] == 'PMID':
                pmid = dbref[5:]
                pmids[pmid] = None
    return list(pmids.keys())
def extract_taxa(handle, taxalist):
    """Create one GAF file per taxon from a multi-taxon GAF file.

    taxalist must be a list of taxid strings, not ints.
    """
    outfiles = {}
    header = "!gaf-version: 2.0\n"
    for taxid in taxalist:
        outfiles[taxid] = open("%s.taxon.%s" % (handle.name, taxid), 'w')
        outfiles[taxid].write(header)
    for inrec in upg.gafiterator(handle):
        cur_taxid = inrec['Taxon_ID'][0].split(':')[1]
        if cur_taxid in taxalist:
            upg.writerec(inrec, outfiles[cur_taxid])
    for i in outfiles:
        outfiles[i].close()
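# Hedged usage sketch for extract_taxon/extract_taxa above; the input
# filename and taxon IDs are assumptions, not taken from the snippets.
if __name__ == '__main__':
    with open('gene_association.goa_uniprot') as handle:
        extract_taxa(handle, ['9606', '10090'])  # human and mouse; note: strings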
def load(self, filename: str, organism_name: str, annotation_level: str):
    """Import data from a GAF file into a Chado database"""
    # Load dependencies
    default_organism = self._load_organism(organism_name)
    features_with_product = set()

    # Loop over all records in the GAF file
    with open(filename) as f:
        for gaf_record in GOA.gafiterator(f):
            # Import this record into the database
            self._load_gaf_record(gaf_record, default_organism,
                                  annotation_level, features_with_product)

    # Commit changes
    self.session.commit()
def GOTermCounter(file, ontology, term):
    D = {}
    gaf = goa.gafiterator(file)
    for entry in gaf:
        # Skip root terms and entries without an accepted evidence code
        if entry['GO_ID'] in root_terms or entry['Evidence'] not in EC:
            continue
        if entry['Aspect'] == ontology and term == 'GO term':
            if entry['GO_ID'] not in D:
                D[entry['GO_ID']] = 1
            else:
                D[entry['GO_ID']] += 1
        if entry['Aspect'] == ontology and term == 'PMID':
            for refs in entry['DB:Reference']:
                if re.match("PMID", refs):
                    if refs not in D:
                        D[refs] = 1
                    else:
                        D[refs] += 1
    return D
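# GOTermCounter above relies on two module-level names that are not shown in
# the snippet; plausible definitions (assumptions, not from the source):
root_terms = {'GO:0003674', 'GO:0008150', 'GO:0005575'}       # MF, BP and CC root terms
EC = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'IC'}  # accepted evidence codes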
def read_gaf_write_tab(gaf_file, include_mfo, outfile):
    Evidence = {'Evidence': set(['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'IC'])}
    if include_mfo:
        Aspect = {'Aspect': set(['P', 'F'])}
    else:
        Aspect = {'Aspect': set(['P'])}
    outhandle = open(outfile, 'w')
    ingafhandle = open(gaf_file, 'r')
    counter = 0
    for rec in GOA.gafiterator(ingafhandle):
        if GOA.record_has(rec, Aspect) and GOA.record_has(rec, Evidence):
            prot = rec['DB_Object_ID']
            go = rec['GO_ID']
            outhandle.write("%s\t%s\n" % (prot, go))
            counter += 1
    ingafhandle.close()
    outhandle.close()
    return counter
def split_to_ontologies(handle):
    """Split a GAF file into three ontology-specific files."""
    header = "!gaf-version: 2.0\n"
    out_mfo = open("%s.MFO" % handle.name, 'w')
    out_bpo = open("%s.BPO" % handle.name, 'w')
    out_cco = open("%s.CCO" % handle.name, 'w')
    out_bpo.write(header)
    out_mfo.write(header)
    out_cco.write(header)
    for inrec in upg.gafiterator(handle):
        if inrec['Aspect'] == 'F':
            upg.writerec(inrec, out_mfo)
        elif inrec['Aspect'] == 'P':
            upg.writerec(inrec, out_bpo)
        elif inrec['Aspect'] == 'C':
            upg.writerec(inrec, out_cco)
        else:
            raise ValueError('unknown ontology aspect %s' % inrec['Aspect'])
    out_mfo.close()
    out_bpo.close()
    out_cco.close()
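# Hedged usage sketch for split_to_ontologies above; the input filename is
# an assumption.
if __name__ == '__main__':
    with open('gene_association.goa_yeast') as handle:
        split_to_ontologies(handle)  # writes .MFO, .BPO and .CCO files next to the input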
def load_dataframe(self, file_resources, npartitions=None):
    go_annotation_dfs = []
    for file in file_resources:
        if ".gaf" in file:
            go_lines = []
            for line in GOA.gafiterator(file_resources[file]):
                go_lines.append(line)
            go_annotation_dfs.append(pd.DataFrame(go_lines))
    go_annotations = pd.concat(go_annotation_dfs)

    go_terms = pd.DataFrame.from_dict(self.network.nodes, orient="index",
                                      dtype="object")
    go_annotations["go_name"] = go_annotations["GO_ID"].map(go_terms["name"])
    go_annotations["namespace"] = go_annotations["GO_ID"].map(go_terms["namespace"])
    go_annotations["is_a"] = go_annotations["GO_ID"].map(go_terms["is_a"])
    return go_annotations
def ProteinGafRDF(files, map_ds, output_file):
    assoc_line = 0
    rdf_buffer = ''
    previous_obj_id = ''
    list_records = list()
    uniq_obj_id = {}
    pp = pprint.PrettyPrinter(indent=4)

    # if flag == 'protein' or flag == 'gene' or flag == 'qtl':
    #     output_file += flag + "_associations.ttl"
    outputWriter = open(output_file, "w")

    # Print the prefixes
    outputWriter.write(base + "\t" + "<" + base_uri + "> .\n")
    outputWriter.write(pr + "\t" + rdf_ns + "<" + rdf + "> .\n")
    outputWriter.write(pr + "\t" + rdfs_ns + "<" + rdfs + "> .\n")
    outputWriter.write(pr + "\t" + owl_ns + "<" + owl + "> .\n")
    outputWriter.write(pr + "\t" + xsd_ns + "<" + xsd + "> .\n")
    outputWriter.write(pr + "\t" + base_vocab_ns + "<" + base_vocab_uri + "> .\n")
    outputWriter.write(pr + "\t" + obo_ns + "<" + obo_uri + "> .\n")
    outputWriter.write(pr + "\t" + sio_ns + "<" + sio_uri + "> .\n")
    # outputWriter.write(pr + "\t" + ncbi_tax_ns + "<" + ncbi_tax_uri + "> .\n")
    outputWriter.write(pr + "\t" + gr_assoc_ns + "<" + gr_assoc + "> .\n")
    outputWriter.write(pr + "\t" + goa_ns + "<" + goa_uri + "> .\n")
    outputWriter.write(pr + "\t" + up_ns + "<" + uniprot + "> .\n\n")
    # Add the prefix for the data release
    outputWriter.write(pr + "\t" + res_ns + "<" + resource + "> .\n\n")

    # Slurp all the GAF records into the list_records list
    for infile in files:
        print(infile)
        opener = open(infile, "r")
        gaf_objs = GOA.gafiterator(opener)
        for record in gaf_objs:
            list_records.append(record)
        opener.close()

    list_records.sort(key=lambda x: x['DB_Object_ID'])
    # pp.pprint(list_records)

    # Access the individual associations
    for inline in list_records:
        taxon = ''.join(inline['Taxon_ID'])
        tax_id = taxon.lstrip('taxon:')
        date = inline['Date'][:4] + "-" + inline['Date'][4:6] + "-" + inline['Date'][6:]
        if tax_id not in taxon_ids:
            continue
        assoc_line += 1
        ont_term = inline['GO_ID'].replace(":", "_")
        current_obj_id = inline['DB_Object_ID']
        aspect = inline['Aspect']
        go_pattern = re.match(r'^GO', ont_term)
        evidence_code = inline['Evidence']

        # Flush
        if previous_obj_id and current_obj_id not in previous_obj_id:
            rdf_buffer = re.sub(' ;$', ' .', rdf_buffer)
            outputWriter.write(rdf_buffer)
            rdf_buffer = ''

        if current_obj_id not in uniq_obj_id:
            rdf_buffer += up_ns + current_obj_id + "\n"
            rdf_buffer += "\t" + rdf_ns + "type" + "\t" + res_ns + "Protein" + " ;\n"
            # rdf_buffer += "\t" + rdf_ns + "type" + "\t" + owl_ns + "Class" + " ;\n"
            # rdf_buffer += "\t" + rdfs_ns + "subClassOf" + "\t" + obo_ns + protein_term + " ;\n"
            rdf_buffer += "\t" + rdfs_ns + "label" + "\t" + '"%s"' % (inline['DB_Object_Symbol']) + " ;\n"
            rdf_buffer += "\t" + base_vocab_ns + "description" + "\t" + '"%s"' % (inline['DB_Object_Name']) + " ;\n"
            for synonym in inline['Synonym']:
                if synonym:
                    rdf_buffer += "\t" + base_vocab_ns + "has_synonym" + "\t" + '"%s"' % (synonym) + " ;\n"
            rdf_buffer += "\t" + base_vocab_ns + "taxon" + "\t" + obo_ns + "NCBITaxon_" + tax_id + " ;\n"
            uniq_obj_id[current_obj_id] = 1
            previous_obj_id = current_obj_id

        # Reification
        if go_pattern:
            # outputWriter.write(goa_ns + current_obj_id + "_" + ont_term + "\n")
            outputWriter.write(goa_ns + current_obj_id + "\n")
        else:
            outputWriter.write(gr_assoc_ns + current_obj_id + "_" + ont_term + "\n")
        # outputWriter.write(base_ns + "triple_" + current_obj_id + "_" + ont_term + "_" + str(assoc_line) + "\n")
        outputWriter.write("\t" + rdf_ns + "type" + "\t" + rdf_ns + "Statement" + " ;\n")
        outputWriter.write("\t" + rdfs_ns + "subClassOf" + "\t" + sio_ns + sio_term + " ;\n")
        outputWriter.write("\t" + rdf_ns + "subject" + "\t" + up_ns + current_obj_id + " ;\n")
        outputWriter.write("\t" + rdf_ns + "predicate" + "\t" + base_vocab_ns + ont_aspects[aspect] + " ;\n")
        outputWriter.write("\t" + rdf_ns + "object" + "\t" + obo_ns + ont_term + " ;\n")
        if evidence_code in map_ds:
            for db_ref in inline['DB:Reference']:
                if db_ref in map_ds[evidence_code]:
                    eco_id = map_ds[evidence_code][db_ref].replace(":", "_")
                    outputWriter.write("\t" + base_vocab_ns + "evidence" + "\t" + obo_ns + eco_id + " ;\n")
                    outputWriter.write("\t" + base_vocab_ns + "evidence_code" + "\t" + '"%s"' % (evidence_code) + " ;\n")
                else:
                    eco_id = map_ds[evidence_code]['Default'].replace(":", "_")
                    outputWriter.write("\t" + base_vocab_ns + "evidence" + "\t" + obo_ns + eco_id + " ;\n")
                    outputWriter.write("\t" + base_vocab_ns + "evidence_code" + "\t" + '"%s"' % (evidence_code) + " ;\n")
        else:
            outputWriter.write("\t" + base_vocab_ns + "evidence_code" + "\t" + '"%s"' % (evidence_code) + " ;\n")
        # outputWriter.write("\t" + base_vocab_ns + "evidence" + "\t" + '"%s"' % (inline['Evidence']) + " ;\n")
        outputWriter.write("\t" + base_vocab_ns + "assigned_by" + "\t" + '"%s"' % (inline['Assigned_By']) + " ;\n")
        outputWriter.write("\t" + base_vocab_ns + "date" + "\t" + '"%s"' % (date) + "^^" + xsd_ns + "date" + " .\n")

        # Flushing
        if current_obj_id == previous_obj_id:
            rdf_buffer += "\t" + base_vocab_ns + ont_aspects[aspect] + "\t" + obo_ns + ont_term + " ;\n"
            if go_pattern:
                rdf_buffer += "\t" + base_vocab_ns + "has_annotation" + "\t" + goa_ns + current_obj_id + " ;\n"
            else:
                rdf_buffer += "\t" + base_vocab_ns + "has_annotation" + "\t" + gr_assoc_ns + current_obj_id + "_" + ont_term + " ;\n"
        previous_obj_id = current_obj_id

    # Last flush
    if previous_obj_id:
        rdf_buffer = re.sub(' ;$', ' .', rdf_buffer)
        outputWriter.write(rdf_buffer)
    outputWriter.close()
    print("Total number of associations: %s\n" % (str(assoc_line)))
import Bio.UniProt.GOA as goa
import sys
import Bio.Entrez as ez

"""
Retrieve protein references from the yeast association file in GAF 2.0
format according to different criteria.
"""

# Retrieve all references cited to annotate proteins with Experimental
# Evidence Codes
handle = open("gene_association.goa_yeast")  # open the yeast gene association file
proteins = goa.gafiterator(handle)  # read all records in the file
Evidences = {"Evidence": set(["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"])}
print("GO-annotated proteins supported by an Experimental Evidence Code")
for protein in proteins:
    if goa.record_has(protein, Evidences):
        print(protein['DB:Reference'])

# Retrieve all references cited to annotate proteins with Experimental
# Evidence Codes in the Molecular Function aspect of GO
handle = open("gene_association.goa_yeast")
proteins = goa.gafiterator(handle)
Evi_Aspect = {"Evidence": set(["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"]),
              "Aspect": set(["F"])}
print("GO-annotated proteins supported by an Experimental Evidence Code in the Molecular Function Ontology")
for protein in proteins:
    if goa.record_has(protein, Evi_Aspect):
        print(protein['DB:Reference'])  # body truncated in the source; this mirrors the first loop
#!/usr/bin/env python
import sys
import argparse
import target_prep as tp
from Bio.UniProt import GOA as upg

if __name__ == '__main__':
    # parser = argparse.ArgumentParser(description='Filter by field')
    # parser.add_argument('-o', '--output')
    # parser.add_argument('-f', '--field')
    outhandle = sys.stdout
    if len(sys.argv) == 5:
        outhandle = open(sys.argv[4], "w")
    outhandle.write('!gaf-version: 2.0\n')
    goodvals = {sys.argv[1]: set(sys.argv[2].split(','))}
    for inrec in upg.gafiterator(open(sys.argv[3])):
        if upg.record_has(inrec, goodvals):
            upg.writerec(inrec, outhandle)
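# Example invocation of the filter script above (the script filename is an
# assumption; the argv layout follows the code: field name, comma-separated
# allowed values, input GAF, optional output file):
#
#   python filter_gaf_by_field.py Evidence EXP,IDA,IMP input.gaf filtered.gaf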
if __name__ == '__main__':
    gaf_file = sys.argv[1]
    gpi_file = sys.argv[2]
    taxon = sys.argv[3]
    outfile = open("filtered_gaf_file_with_only_sp_ids_for_" + taxon + ".gaf", 'w')
    gaf_handle = open(gaf_file, 'r')
    record = []
    sp_id = parse_gpi(gpi_file, taxon)
    parser = GOAParser.gafiterator(gaf_handle)
    # Detect the GAF version from the field count of the first record.
    # Note that this consumes the first record, so the main loop below
    # starts from the second one.
    for rec in parser:
        if len(rec) == 15:
            GAFFIELDS = GOAParser.GAF10FIELDS
            break
        elif len(rec) == 17:
            GAFFIELDS = GOAParser.GAF20FIELDS
            break
    for rec in parser:
        record = extract_gaf(rec, outfile, GAFFIELDS, record, sp_id, taxon)
    new_record = tuple(record)
    insert_into_db(new_record, taxon, GAFFIELDS)
import pandas as pd
import Bio.UniProt.GOA as GOA
from goatools import obo_parser  # needed for obo_parser.GODag below


def take_my_Y(synonym_list):
    for gene in synonym_list:
        if gene.startswith('Y'):
            return gene


GO_file = '/home/sergio/workspace_Eclipse/Lucky_GOGO/Results/Python_Srcipts/go-basic.obo'
sc_GAF_file = '/home/sergio/workspace_Eclipse/Lucky_GOGO/Results/Python_Srcipts/sgd.gaf'
go_dag = obo_parser.GODag(GO_file)

with open(sc_GAF_file, 'rt') as fp:
    sc_gaf = pd.DataFrame(annotation for annotation in GOA.gafiterator(fp))

sc_gaf = sc_gaf[sc_gaf['Evidence'].isin(['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP'])]
sc_gaf = sc_gaf[sc_gaf['Aspect'] == 'F']
sc_gaf['Yeast_ID'] = [take_my_Y(gene) for gene in sc_gaf['Synonym']]
sergio_df = sc_gaf[['Yeast_ID', 'GO_ID']].drop_duplicates()

new_sergio_df = pd.DataFrame(columns=['Yeast_ID', 'GO_ID'])
for _, row in sergio_df.iterrows():
    new_sergio_df = new_sergio_df.append(row)
    parents = pd.DataFrame([
        (row.Yeast_ID, parent_id)
        for parent_id in go_dag[row.GO_ID].get_all_parents()
        if go_dag[parent_id].namespace == 'molecular_function'
# Check if the file exists already
if not os.path.isfile(data_folder + '/go-basic.obo'):
    go_obo = wget.download(go_obo_url, data_folder + '/go-basic.obo')
else:
    go_obo = data_folder + '/go-basic.obo'
go = obo_parser.GODag(go_obo)

methods = ["bonferroni", "fdr"]

assoc = {}
with gzip.open(arab_gaf, 'rt') as arab_gaf_fp:
    arab_funcs = {}  # Initialise the dictionary of functions
    # Iterate over each annotation using the Bio.UniProt.GOA library.
    for entry in GOA.gafiterator(arab_gaf_fp):
        uniprot_id = entry.pop('DB_Object_ID')
        arab_funcs[uniprot_id] = entry

pop = arab_funcs.keys()
for x in arab_funcs:
    if x not in assoc:
        assoc[x] = set()
    assoc[x].add(str(arab_funcs[x]['GO_ID']))

target_gene = ["DNAJC19"]
gene_names = [
    'DLX6', 'MBTD1', 'TRHDE', 'NAALAD2', 'CD82', 'AURKA', 'TEKT2', 'PYCARD',
    'TULP2', 'DLX5', 'QPCT', 'PCDH17', 'DNAJC15', 'CCRL2', 'CTCFL', 'EML2',
    'RIPK3', 'ACY3', 'BTF3L4', 'MSI1', 'LACRT', 'SLC46A3', 'NOVA1', 'DMRTB1',
    'ANKRD31', 'SDK1', 'NAPRT', 'CRB2', 'LRRC4C',
def parseGAF(database='PDB', **kwargs):
    """Parse a GO Association File (GAF) corresponding to a particular
    database collection into a dictionary for ease of querying.

    See `GAF`_ for more information on the file format

    .. _GAF: http://geneontology.org/docs/go-annotation-file-gaf-format-20/

    :arg database: name of the database of interest
        default is PDB. Others include UNIPROT and
        common names of many organisms.
    :type database: str

    :arg filename: filename for the gaf of interest
        default is goa_ and the database name in lower case and .gaf.gz
    :type filename: str
    """
    import Bio.UniProt.GOA as GOA

    if not isinstance(database, str):
        raise TypeError('database should be a string')

    database = database.upper()

    filename = kwargs.get('filename', None)
    if filename is None:
        if database == 'UNIPROT':
            filename = 'goa_' + database.lower() + '_all.gaf.gz'
        else:
            filename = 'goa_' + database.lower() + '.gaf'

    data_folder = kwargs.get('data_folder', os.getcwd())

    # If the file doesn't already exist, download it
    gaf = os.path.join(data_folder, filename)
    if not (os.path.exists(gaf) and os.path.getsize(gaf) > 0):
        LOGGER.info('Downloading file {0} to {1}'.format(filename, gaf))

        data_stream = BytesIO()
        ftp_host = 'ftp.ebi.ac.uk'
        ftp = FTP(ftp_host)
        ftp.login()
        try:
            ftp.cwd('pub/databases/GO/goa')
            ftp.cwd(database)
            ftp.retrbinary('RETR {}.gz'.format(filename), data_stream.write)
        except:
            raise ValueError('Cannot find the requested GO association file')

        # Log out from the FTP server
        ftp.quit()

        zip_data = data_stream.getvalue()
        data_stream.close()

        rawdata = gunzip(zip_data)
        if PY3K:
            rawdata = rawdata.decode()

        with open(filename, 'w') as gaf_fp:
            gaf_fp.write(rawdata)

        LOGGER.info('Download completed for file {0}'.format(filename))

    with open(filename, 'rt') as gaf_fp:
        funcs = defaultdict(list)  # Initialise the dictionary of functions
        # Iterate over each entry using the Bio.UniProt.GOA library.
        LOGGER.info('Iterating through entries in {0}'.format(gaf))
        for entry in GOA.gafiterator(gaf_fp):
            id = entry.pop('DB_Object_ID')
            funcs[id].append(entry)

    return funcs
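# Hedged usage sketch for parseGAF above (it comes from ProDy, so LOGGER,
# gunzip and PY3K are assumed to be available in that module's scope; the
# first call may trigger an FTP download of the PDB association file):
pdb_funcs = parseGAF('PDB')
print('%d annotated PDB entries' % len(pdb_funcs))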
""" Returns the pmids of the papers this paper cites """ cites_list = [] handle = ez.efetch("pubmed", id=pmid, retmode="xml") pubmed_rec = ez.parse(handle).__next__() for ref in pubmed_rec['MedlineCitation']['CommentsCorrectionsList']: if ref.attributes['RefType'] == 'Cites': cites_list.append(str(ref['PMID'])) return cites_list f = open ("papers and citations.txt","w") st = "GO-annotated proteins supported by IGI evidence (Inferred from Genetic Interaction)\n" handle = open("gene_association.goa_yeast") proteins = goa.gafiterator(handle) Evi_Aspect = {"Evidence":set(["IGI"])} for protein in proteins: if goa.record_has(protein, Evi_Aspect): for p in protein['DB:Reference']: if p[:4] == "PMID": st += "Main PubMed reference: "+ p +"\n" citations = get_citations(p[5:]) for cit in citations: st += cit + " " st += "\n" f.write(st) f.close()
DATA_DIR = "/data/dd-analysis" #LOAD DATABASE ANNOTATIONS refseq_genes = pd.read_csv("/data/genomes/annotations/refseq_genes_export.csv", delimiter="\t") entrez = pd.read_csv( "/data/genomes/annotations/Homo_sapiens.GRCh38.95.entrez.tsv", delimiter="\t") refseq_xref = pd.read_csv( "/data/genomes/annotations/Homo_sapiens.GRCh38.95.refseq.tsv", delimiter="\t") #LOAD GENE ONTOLOGIES FOR HG38 from Bio.UniProt import GOA fopen = open("/data/genomes/annotations/goa_human.gaf") itr = GOA.gafiterator(fopen) records = list(itr) ontologies = pd.DataFrame.from_dict(records) def init_go_terms(tmpfolder, dataset): dsname = dataset["dataset"] print(tmpfolder, dataset) #READ TRANSCRIPT ALIGNMENTS samfile = pysam.AlignmentFile( "/data/dd-analysis/datasets/{}/tophat/accepted_hits.bam".format( dsname), "rb") all_alignments = [a for a in samfile] names = [e.reference_name for e in all_alignments]