def pull_ncbigene_labels_and_synonyms():
    # File format described here: https://ftp.ncbi.nih.gov/gene/DATA/README
    ifname = make_local_name('gene_info.gz', subpath='NCBIGene')
    labelname = make_local_name('labels', subpath='NCBIGene')
    synname = make_local_name('synonyms', subpath='NCBIGene')
    bad_gene_types = set(['biological-region', 'other', 'unknown'])
    with gzip.open(ifname, 'r') as inf, open(labelname, 'w') as labelfile, open(synname, 'w') as synfile:
        h = inf.readline()
        for line in inf:
            sline = line.decode('utf-8')
            x = sline.strip().split('\t')
            gene_id = f'NCBIGene:{x[1]}'
            symbol = x[2]
            gene_type = x[9]
            if gene_type in bad_gene_types:
                continue
            labelfile.write(f'{gene_id}\t{symbol}\n')
            syns = set(x[4].split('|'))
            syns.add(symbol)
            description = x[8]
            syns.add(description)
            authoritative_symbol = x[10]
            syns.add(authoritative_symbol)
            authoritative_full_name = x[11]
            syns.add(authoritative_full_name)
            others = set(x[13].split('|'))
            syns.update(others)
            for syn in syns:
                synfile.write(f'{gene_id}\t{syn}\n')
def pull_hgnc_labels_and_synonyms(infile):
    with open(infile, 'r') as data:
        hgnc_json = json.load(data)
    lname = make_local_name('labels', subpath='HGNC')
    sname = make_local_name('synonyms', subpath='HGNC')
    with open(lname, 'w') as lfile, open(sname, 'w') as sfile:
        for gene in hgnc_json['response']['docs']:
            hgnc_id = gene['hgnc_id']
            symbol = gene['symbol']
            lfile.write(f'{hgnc_id}\t{symbol}\n')
            name = gene['name']
            sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{name}\n')
            if 'alias_symbol' in gene:
                alias_symbols = gene['alias_symbol']
                for asym in alias_symbols:
                    sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')
            if 'alias_name' in gene:
                alias_names = gene['alias_name']
                for asym in alias_names:
                    sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')
def pull_umls():
    """Run through MRCONSO.RRF creating label and synonym files for UMLS and SNOMEDCT."""
    mrcon = os.path.join('input_data', 'MRCONSO.RRF')
    rows = defaultdict(list)
    priority = read_umls_priority()
    snomed_label_name = make_local_name('labels', subpath='SNOMEDCT')
    snomed_syn_name = make_local_name('synonyms', subpath='SNOMEDCT')
    with open(mrcon, 'r') as inf, open(snomed_label_name, 'w') as snolabels, open(snomed_syn_name, 'w') as snosyns:
        for line in inf:
            x = line.strip().split('|')
            cui = x[0]
            lang = x[1]
            # Only keep English terms
            if lang != 'ENG':
                continue
            # Only keep unsuppressed rows
            suppress = x[16]
            if suppress in ('O', 'E'):
                continue
            source = x[11]
            termtype = x[12]
            term = x[14]
            # While we're here, if this row comes from SNOMED, capture it as well
            if source == 'SNOMEDCT_US':
                snomed_id = f'SNOMEDCT:{x[15]}'
                if termtype == 'PT':
                    snolabels.write(f'{snomed_id}\t{term}\n')
                snosyns.write(f'{snomed_id}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{term}\n')
            # UMLS is a collection of sources. They pick one of the names from those sources for a concept,
            # based on a priority that they define. Here we look up the priority for each term so we
            # can pick the right one for the label.
            pkey = (source, termtype, suppress)
            try:
                pri = priority[pkey]
            except KeyError:
                pri = 1000000
            rows[cui].append((pri, term, line))
    lname = make_local_name('labels', subpath='UMLS')
    sname = make_local_name('synonyms', subpath='UMLS')
    with open(lname, 'w') as labels, open(sname, 'w') as synonyms:
        for cui, crows in rows.items():
            crows.sort()
            labels.write(f'{UMLS}:{cui}\t{crows[0][1]}\n')
            syns = set([crow[1] for crow in crows])
            for s in syns:
                synonyms.write(f'{UMLS}:{cui}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{s}\n')
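# The priority lookup in pull_umls() comes from read_umls_priority(), which is defined elsewhere.
# Below is a minimal sketch of the idea only: it assumes a precedence file whose name
# ('input_data/umls_precedence.txt') and layout (tab-separated source, term type, suppress, ordered
# from most to least preferred, MRRANK-style) are hypothetical. The real implementation may differ.
def read_umls_priority_sketch(precedence_file='input_data/umls_precedence.txt'):
    priority = {}
    with open(precedence_file, 'r') as inf:
        for rank, line in enumerate(inf):
            source, termtype, suppress = line.strip().split('\t')
            # Lower rank means higher precedence, so the preferred term sorts first
            # when rows[cui] is sorted in pull_umls().
            priority[(source, termtype, suppress)] = rank
    return priority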
def pull_prot(which, refresh):
    if refresh:
        swissname = pull_via_urllib('ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/', f'uniprot_{which}.fasta.gz')
    else:
        swissname = make_local_name(f'uniprot_{which}.fasta')
    swissprot_labels = {}
    nlines = 0
    with open(swissname, 'r') as inf:
        for line in inf:
            nlines += 1
            if line.startswith('>'):
                # Example FASTA header:
                # >sp|Q6GZX4|001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1
                x = line.split('|')
                uniprotid = f'UniProtKB:{x[1]}'
                name = x[2].split(' OS=')[0]
                swissprot_labels[uniprotid] = f'{name} ({which})'
    print('num lines read:', nlines)
    print('num labels:', len(swissprot_labels))
    swissies = [(k,) for k in swissprot_labels.keys()]
    print('num ids:', len(swissies))
    return swissies, swissprot_labels
def pull_prots(refresh_swiss=False, refresh_trembl=False):
    swiss, labels = pull_prot('sprot', refresh_swiss)
    fname = make_local_name('labels', subpath='UNIPROTKB')
    with open(fname, 'w') as labelfile:
        for k, v in labels.items():
            labelfile.write(f'{k}\t{v}\n')
        # Keep the file handle open so the TrEMBL labels land in the same file.
        tremb, tlabels = pull_prot('trembl', refresh_trembl)
        for k, v in tlabels.items():
            labelfile.write(f'{k}\t{v}\n')
def pull_pubchem_labels():
    print('LABEL PUBCHEM')
    f_name = 'CID-Title.gz'
    cname = pull_via_ftp('ftp.ncbi.nlm.nih.gov', '/pubchem/Compound/Extras/', f_name, outfilename=f_name)
    fname = make_local_name('labels', subpath='PUBCHEM.COMPOUND')
    with open(fname, 'w') as outf, gzip.open(cname, mode='rt', encoding='latin-1') as inf:
        for line in inf:
            x = line.strip().split('\t')
            outf.write(f'PUBCHEM.COMPOUND:{x[0]}\t{x[1]}\n')
def __init__(self):
    ifname = make_local_name('mesh.nt', subpath='MESH')
    from datetime import datetime as dt
    print('loading mesh.nt')
    start = dt.now()
    self.m = pyoxigraph.MemoryStore()
    with open(ifname, 'rb') as inf:
        self.m.load(inf, 'application/n-triples')
    end = dt.now()
    print('loading complete')
    print(f'took {end - start}')
def pull_hgnc():
    data = pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json')
    hgnc_json = loads(data)
    lname = make_local_name('labels', subpath='HGNC')
    sname = make_local_name('synonyms', subpath='HGNC')
    with open(lname, 'w') as lfile, open(sname, 'w') as sfile:
        for gene in hgnc_json['response']['docs']:
            hgnc_id = gene['hgnc_id']
            symbol = gene['symbol']
            lfile.write(f'{hgnc_id}\t{symbol}\n')
            name = gene['name']
            sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{name}\n')
            if 'alias_symbol' in gene:
                alias_symbols = gene['alias_symbol']
                for asym in alias_symbols:
                    sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')
            if 'alias_name' in gene:
                alias_names = gene['alias_name']
                for asym in alias_names:
                    sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')
def __init__(self):
    ifname = make_local_name('rhea.rdf', subpath='RHEA')
    from datetime import datetime as dt
    print('loading rhea')
    start = dt.now()
    self.m = pyoxigraph.MemoryStore()
    with open(ifname, 'rb') as inf:
        self.m.load(inf, 'application/rdf+xml')
    end = dt.now()
    print('loading complete')
    print(f'took {end - start}')
def pull_pubchem_synonyms():
    f_name = 'CID-Synonym-filtered.gz'
    sname = pull_via_ftp('ftp.ncbi.nlm.nih.gov', '/pubchem/Compound/Extras/', f_name, outfilename=f_name)
    fname = make_local_name('synonyms', subpath='PUBCHEM.COMPOUND')
    with open(fname, 'w') as outf, gzip.open(sname, mode='rt', encoding='latin-1') as inf:
        for line in inf:
            x = line.strip().split('\t')
            if x[1].startswith('CHEBI'):
                continue
            if x[1].startswith('SCHEMBL'):
                continue
            outf.write(f'PUBCHEM.COMPOUND:{x[0]}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{x[1]}\n')
def readlabels(which):
    swissname = make_local_name(f'UniProtKB/uniprot_{which}.fasta')
    swissprot_labels = {}
    with open(swissname, 'r') as inf:
        for line in inf:
            if line.startswith('>'):
                # Example FASTA header:
                # >sp|Q6GZX4|001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1
                x = line.split('|')
                uniprotid = f'UniProtKB:{x[1]}'
                name = x[2].split(' OS=')[0]
                swissprot_labels[uniprotid] = f'{name} ({which})'
    return swissprot_labels
def pull_uber_labels(expected):
    uber = UberGraph()
    labels = uber.get_all_labels()
    ldict = defaultdict(set)
    for unit in labels:
        iri = unit['iri']
        p = iri.split(':')[0]
        ldict[p].add((iri, unit['label']))
    for p in ldict:
        if p not in ['http', 'ro'] and not p.startswith('t') and '#' not in p:
            fname = make_local_name('labels', subpath=p)
            with open(fname, 'w') as outf:
                for unit in ldict[p]:
                    outf.write(f'{unit[0]}\t{unit[1]}\n')
def pull_uber_synonyms(expected):
    uber = UberGraph()
    synonyms = uber.get_all_synonyms()
    ldict = defaultdict(set)
    for unit in synonyms:
        iri = unit[0]
        p = iri.split(':')[0]
        ldict[p].add(unit)
    # There are some ontologies that we don't get synonyms for. That makes snakemake unhappy, so
    # we write zero-length files for them.
    for p in expected:
        if p not in ['http', 'ro'] and not p.startswith('t') and '#' not in p:
            fname = make_local_name('synonyms', subpath=p)
            with open(fname, 'w') as outf:
                for unit in ldict[p]:
                    outf.write(f'{unit[0]}\t{unit[1]}\t{unit[2]}\n')
def pull_mesh_labels(self):
    s = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
        PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
        SELECT DISTINCT ?term ?label
        WHERE { ?term rdfs:label ?label }
        ORDER BY ?term
    """
    ofname = make_local_name('labels', subpath='MESH')
    qres = self.m.query(s)
    with open(ofname, 'w', encoding='utf8') as outf:
        for row in list(qres):
            iterm = str(row['term'])
            ilabel = str(row['label'])
            # str(term) is the N-Triples form, e.g. <http://id.nlm.nih.gov/mesh/D000001>;
            # drop the trailing '>' and take the last path segment to get the MeSH id.
            meshid = iterm[:-1].split('/')[-1]
            # str(label) is a quoted literal such as "Calcimycin"@en; take the text between the quotes.
            label = ilabel.strip().split('"')[1]
            outf.write(f'{MESH}:{meshid}\t{label}\n')
def pull_ensembl(complete_file):
    f = find_datasets()
    cols = set(["ensembl_gene_id", "ensembl_peptide_id", "description", "external_gene_name",
                "external_gene_source", "external_synonym", "chromosome_name", "source",
                "gene_biotype", "entrezgene_id", "zfin_id_id", "mgi_id", "rgd_id",
                "flybase_gene_id", "sgd_gene", "wormbase_gene"])
    for ds in f['Dataset_ID']:
        print(ds)
        outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
        # Really, we should let snakemake handle this, but then we would need to put a list of all 200+ datasets
        # in our config and keep it up to date. Maybe you could have a job that gets the datasets and writes a
        # dataset file, but then updates the config? That sounds bogus.
        if os.path.exists(outfile):
            continue
        atts = find_attributes(ds)
        existingatts = set(atts['Attribute_ID'].to_list())
        attsIcanGet = cols.intersection(existingatts)
        df = query(attributes=attsIcanGet, filters={}, dataset=ds)
        df.to_csv(outfile, index=False, sep='\t')
    with open(complete_file, 'w') as outf:
        outf.write(f'Downloaded gene sets for {len(f)} data sets.')