def load_gaf(filename, collection, start=0):
    """Load the records of a GAF file into a document collection.

    Every record produced by ``gafiterator`` is upserted into ``collection``
    with ``_id`` set to the record's zero-based position in the iterator, so
    loading the same file again overwrites the existing documents instead of
    duplicating them.

    A record is skipped (but still counted on the progress bar) when any of
    the following holds:

    * its position is below ``start``;
    * ``args.noiea`` is truthy and the record's evidence code is ``'IEA'``;
    * ``args.exp`` is truthy and the evidence code is not in ``exp_codes``.

    ``args``, ``exp_codes``, ``count_lines``, ``tqdm`` and ``gafiterator``
    are resolved from the enclosing module.

    Args:
        filename: Path of the GAF file, opened in text mode.
        collection: Object exposing ``create_index`` and ``update_one`` with
            ``$set``/``upsert`` semantics (presumably a pymongo collection;
            verify against the caller).
        start: Position of the first record to load; earlier ones are skipped.
    """
    print("Loading: %s" % filename)

    for field in ("DB_Object_ID", "DB", "GO_ID", "Evidence", "Aspect", "Date"):
        collection.create_index(field)

    # The count from count_lines is used only to size the progress bar.
    n = count_lines(filename, sep=bytes('\n', 'ascii'))
    pbar = tqdm(range(n), desc="annotations loaded")
    try:
        with open(filename, 'r') as handler:
            for i, data in enumerate(gafiterator(handler)):
                skip = (
                    i < start
                    or (args.noiea and data['Evidence'] == 'IEA')
                    or (args.exp and data['Evidence'] not in exp_codes)
                )
                if not skip:
                    document = {
                        "DB_Object_ID": data['DB_Object_ID'],
                        "DB_Object_Symbol": data['DB_Object_Symbol'],
                        "With": data['With'],
                        "Assigned_By": data['Assigned_By'],
                        "Annotation_Extension": data['Annotation_Extension'],
                        "Gene_Product_Form_ID": data['Gene_Product_Form_ID'],
                        "DB:Reference": data['DB:Reference'],
                        "GO_ID": data['GO_ID'],
                        "Qualifier": data['Qualifier'],
                        # strptime with a date-only format already yields a
                        # datetime at midnight, so no date round trip is
                        # needed to obtain a datetime value.
                        "Date": datetime.datetime.strptime(
                            data['Date'], "%Y%m%d"),
                        "DB": data['DB'],
                        "created_at": datetime.datetime.utcnow(),
                        "DB_Object_Name": data['DB_Object_Name'],
                        "DB_Object_Type": data['DB_Object_Type'],
                        "Evidence": data['Evidence'],
                        "Taxon_ID": data['Taxon_ID'],
                        "Aspect": data['Aspect'],
                    }
                    collection.update_one(
                        {"_id": i}, {'$set': document}, upsert=True)
                pbar.update(1)
    finally:
        # Release the progress bar even when parsing or the database fails.
        pbar.close()

    print("\nFinished!")
def go_annotations(filepath: str):
    """Handles GO annotation file io.

    This is a generator function: the file is not opened until the returned
    generator is first advanced. It yields exactly one item, the object
    returned by ``gafiterator`` for the open file. The file is closed when
    the generator is advanced again, closed or garbage collected, so the
    yielded iterator must be consumed before that happens.

    # Arguments
        filepath: str, filepath of GO annotation file

    # Yields
        the iterator returned by ``gafiterator`` for the opened file

    # Raises
        FileNotFoundError: on the first advance, if ``filepath`` does not
            exist
    """
    # A missing file propagates FileNotFoundError unchanged; no handler is
    # needed just to re-raise it.
    with open(filepath) as f:
        yield gafiterator(f)
def parse_gaf(fn):
    """Build gene <-> GO term lookup tables from a GAF file.

    Args:
        fn: Path of the GAF file. A ``.gz`` extension (case-insensitive)
            selects ``gzip.open``; anything else is read with ``open``.

    Returns:
        A ``(gene_by_go_term, go_term_by_gene)`` tuple of plain dicts:
        ``gene_by_go_term`` maps each ``GO_ID`` to the list of
        ``DB_Object_Symbol`` values annotated with it, and
        ``go_term_by_gene`` maps each symbol to its list of ``GO_ID``
        values. Lists keep file order and are not de-duplicated.
    """
    is_gzipped = os.path.splitext(fn)[1].lower() == '.gz'
    open_func = gzip.open if is_gzipped else open

    gene_by_go_term = {}
    go_term_by_gene = {}

    # Text mode is required: assuming gafiterator is Biopython's
    # Bio.UniProt.GOA.gafiterator, it compares the "!gaf-version" header
    # line against str values, which never matches the bytes read from a
    # binary handle on Python 3.
    with open_func(fn, 'rt') as f:
        for rec in gafiterator(f):
            gs = rec['DB_Object_Symbol']
            go_id = rec['GO_ID']
            gene_by_go_term.setdefault(go_id, []).append(gs)
            go_term_by_gene.setdefault(gs, []).append(go_id)

    return gene_by_go_term, go_term_by_gene
def getAnnotations():
    # Counts how many annotation records in the UniProt GOA dump refer to
    # each GO term that is a node of the module-level graph G, and returns
    # a dictionary mapping GO ID -> number of annotations.
    # GO IDs found in the file but absent from G are printed, not counted.
    annotDict = dict.fromkeys(G.nodes(), 0)

    # Hard-coded record total used only for the crude progress marker below.
    expected_records = 424606000
    seen_since_marker = 0

    with gzip.open('data/goa_uniprot_all.gaf.gz', 'rt') as fp:
        for annotation in gafiterator(fp):
            # Print a marker each time more than 5% of the expected total
            # has been read since the previous marker.
            seen_since_marker += 1
            if float(seen_since_marker) * 100 / expected_records > 5:
                print('hi')
                seen_since_marker = 0

            # NOTE(review): `unicode` is a Python 2 builtin; on Python 3 it
            # must be defined elsewhere in this module -- confirm.
            goID = unicode(annotation['GO_ID'])
            if goID in annotDict:
                annotDict[goID] += 1
            else:
                print(goID)

    return annotDict
# Keep the Ensembl IDs that pass the membership test against
# UP_ID['EnsemblID'], then translate them to UniProt IDs.
# NOTE(review): if UP_ID is a pandas DataFrame, `in` on a Series tests the
# index labels rather than the values -- confirm this is what is intended.
newEnsemblIDs = [i for i in EnsemblIDs if i in UP_ID['EnsemblID']]
C_int_UP = [ensembl_to_up[i] for i in newEnsemblIDs]

#Enrichment Analysis
go = obo.GODag('/disks/strw13/DBDM/A4_2/go-basic.obo')

# funcs: UniProt ID -> the last GAF record seen for that ID (with the
#        'DB_Object_ID' key popped off).
# assoc: UniProt ID -> set of every GO ID annotated to it. It is filled
#        while parsing because funcs keeps only one record per protein;
#        deriving assoc from funcs afterwards would drop all but the last
#        GO term of each protein.
funcs = {}
assoc = {}
with gzip.open('goa_human.gaf.gz', 'rt') as fp:
    for entry in gafiterator(fp):
        uniprot_id = entry.pop('DB_Object_ID')
        assoc.setdefault(uniprot_id, set()).add(str(entry['GO_ID']))
        funcs[uniprot_id] = entry

# Background population: every protein present in the annotation file.
pop = funcs.keys()

# Records restricted to the proteins of interest.
dictionary = {x: funcs[x] for x in funcs if x in C_int_UP}
#handle = "goa_cow.gaf" #handle = "goa_dog.gaf" #handle = "goa_pig.gaf" #handle = "goa_fly.gaf" #handle = "goa_worm.gaf" #handle = "goa_yeast.gaf" #handle = open(handle) genes = defaultdict(Gene) reg_targets = defaultdict(int) annext_cnt = 0 handle = open(gaffile) for rec in gafiterator(handle): if rec["Annotation_Extension"] and rec["DB"] == "UniProtKB": # print rec["Annotation_Extension"] protid = rec["DB_Object_ID"] genes[protid].add_annotation(rec) annext_cnt += 1 print("%d annotations has extensions" % annext_cnt) dbs = {"UniProtKB": []} #%% Genes has_regulation_target for gene_id, gene in genes.items(): # print("gene", gene_id)
import gzip

# GO graph loaded from the OBO file; indexed by GO ID further down.
g = GODag(obo_file='data/go.obo')
""" 'DB': the protein database; 'DB_Object_ID': protein ID; 'Qualifier': annotation qualifier (such as NOT); 'GO_ID': GO term; 'Evidence': evidence code """
# filename = <LOCATION OF GAF FILE>
filename = 'data/goa_human.gaf.gz'

# x maps protein accession -> GO ID. A protein that appears in several
# records keeps only the GO ID of its last record in the file.
with gzip.open(filename, 'rt') as fp:
    x = dict(
        (annotation['DB_Object_ID'], annotation['GO_ID'])
        for annotation in gafiterator(fp)
    )
""" To get the GO entry for a particular protein's accession number: g[x['O43707']] and use .name etc, to get terms of that GO entry """

# NOTE: pickle.load can execute arbitrary code stored in the file; only
# load result files that come from a trusted source.
f = 'output/result.pkl'
with open(f, 'rb') as file:
    df = pickle.load(file)

# For each accession in df['uniprot'], the matching GO entry from g, or ''
# when the accession has no annotation in x.
go_hits = [g[x[acc]] if acc in x else '' for acc in df['uniprot']]
# Names of those GO entries, with '' carried through for misses.
name_hits = [hit.name if hit != '' else '' for hit in go_hits]