Example #1
def pull_ncbigene_labels_and_synonyms():
    #File format described here: https://ftp.ncbi.nih.gov/gene/DATA/README
    ifname = make_local_name('gene_info.gz', subpath='NCBIGene')
    labelname = make_local_name('labels', subpath='NCBIGene')
    synname = make_local_name('synonyms', subpath='NCBIGene')
    bad_gene_types = set(['biological-region', 'other', 'unknown'])
    with gzip.open(ifname, 'r') as inf, \
            open(labelname, 'w') as labelfile, \
            open(synname, 'w') as synfile:
        h = inf.readline()  #skip the header line
        for line in inf:
            sline = line.decode('utf-8')
            x = sline.strip().split('\t')
            gene_id = f'NCBIGene:{x[1]}'  #column 2: GeneID
            symbol = x[2]  #column 3: Symbol
            gene_type = x[9]  #column 10: type_of_gene
            if gene_type in bad_gene_types:
                continue
            labelfile.write(f'{gene_id}\t{symbol}\n')
            syns = set(x[4].split('|'))  #column 5: bar-delimited Synonyms
            syns.add(symbol)
            description = x[8]  #column 9: description
            syns.add(description)
            authoritative_symbol = x[10]  #column 11: Symbol_from_nomenclature_authority
            syns.add(authoritative_symbol)
            authoritative_full_name = x[11]  #column 12: Full_name_from_nomenclature_authority
            syns.add(authoritative_full_name)
            others = set(x[13].split('|'))  #column 14: Other_designations
            syns.update(others)
            for syn in syns:
                synfile.write(f'{gene_id}\t{syn}\n')
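
Every example in this listing leans on a make_local_name helper from the surrounding module, which is not shown. A minimal sketch of the behavior the examples appear to assume (a filename resolved under a download root, with the subdirectory created on demand); the DOWNLOAD_DIR constant is hypothetical:

import os

DOWNLOAD_DIR = 'downloads'  #hypothetical root; the real module configures its own

def make_local_name(fname, subpath=None):
    #Sketch only: resolve fname under the download root, creating
    # the subpath directory if it does not exist yet.
    if subpath is None:
        return os.path.join(DOWNLOAD_DIR, fname)
    odir = os.path.join(DOWNLOAD_DIR, subpath)
    os.makedirs(odir, exist_ok=True)
    return os.path.join(odir, fname)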
Example #2
def pull_hgnc_labels_and_synonyms(infile):
    with open(infile, 'r') as data:
        hgnc_json = json.load(data)
    lname = make_local_name('labels', subpath='HGNC')
    sname = make_local_name('synonyms', subpath='HGNC')
    with open(lname, 'w') as lfile, open(sname, 'w') as sfile:
        for gene in hgnc_json['response']['docs']:
            hgnc_id = gene['hgnc_id']
            symbol = gene['symbol']
            lfile.write(f'{hgnc_id}\t{symbol}\n')
            name = gene['name']
            sfile.write(
                f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{name}\n'
            )
            if 'alias_symbol' in gene:
                alias_symbols = gene['alias_symbol']
                for asym in alias_symbols:
                    sfile.write(
                        f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n'
                    )
            if 'alias_name' in gene:
                alias_names = gene['alias_name']
                for aname in alias_names:
                    sfile.write(
                        f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{aname}\n'
                    )
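
This function assumes the layout of HGNC's hgnc_complete_set.json: a Solr-style envelope whose response.docs list holds one record per gene, with alias_symbol and alias_name present only on some records. An illustrative record, trimmed to the fields the code reads:

hgnc_json = {
    'response': {
        'docs': [
            {
                'hgnc_id': 'HGNC:5',
                'symbol': 'A1BG',
                'name': 'alpha-1-B glycoprotein',
                'alias_symbol': ['A1B', 'ABG'],  #optional
                #alias_name is likewise optional
            }
        ]
    }
}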
Example #3
def pull_umls():
    """Run through MRCONSO.RRF creating label and synonym files for UMLS and SNOMEDCT"""
    mrcon = os.path.join('input_data', 'MRCONSO.RRF')
    rows = defaultdict(list)
    priority = read_umls_priority()
    snomed_label_name = make_local_name('labels', subpath='SNOMEDCT')
    snomed_syn_name = make_local_name('synonyms', subpath='SNOMEDCT')
    with open(mrcon, 'r') as inf, \
            open(snomed_label_name, 'w') as snolabels, \
            open(snomed_syn_name, 'w') as snosyns:
        for line in inf:
            x = line.strip().split('|')
            cui = x[0]
            lang = x[1]
            #Only keep english terms
            if lang != 'ENG':
                continue
            #only keep unsuppressed rows
            suppress = x[16]
            if suppress == 'O' or suppress == 'E':
                continue
            source = x[11]
            termtype = x[12]
            term = x[14]
            #While we're here, if this row is SNOMED, let's grab it
            if source == 'SNOMEDCT_US':
                snomed_id = f'SNOMEDCT:{x[15]}'
                if termtype == 'PT':
                    snolabels.write(f'{snomed_id}\t{term}\n')
                snosyns.write(
                    f'{snomed_id}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{term}\n'
                )
            #UMLS is a collection of sources. They pick one of the names from these sources for a
            # concept, based on a priority that they define. Here we look up the priority for each
            # term so we can pick the right one for the label.
            pkey = (source, termtype, suppress)
            pri = priority.get(pkey, 1000000)  #unknown combinations sort last
            rows[cui].append((pri, term, line))
    lname = make_local_name('labels', subpath='UMLS')
    sname = make_local_name('synonyms', subpath='UMLS')
    with open(lname, 'w') as labels, open(sname, 'w') as synonyms:
        for cui, crows in rows.items():
            crows.sort()
            labels.write(f'{UMLS}:{cui}\t{crows[0][1]}\n')
            syns = set([crow[1] for crow in crows])
            for s in syns:
                synonyms.write(
                    f'{UMLS}:{cui}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{s}\n'
                )
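
read_umls_priority is not shown here. Its (source, termtype, suppress) key matches the columns of the UMLS MRRANK.RRF file, so a sketch consistent with the lookup above could look like the following; the input_data/MRRANK.RRF path mirrors the MRCONSO.RRF path above but is an assumption:

import os

def read_umls_priority():
    #Sketch only. MRRANK.RRF rows are RANK|SAB|TTY|SUPPRESS, where a higher
    # RANK means a more preferred name. pull_umls wants the opposite
    # convention (smaller number sorts first), so negate the rank.
    priority = {}
    with open(os.path.join('input_data', 'MRRANK.RRF'), 'r') as inf:
        for line in inf:
            x = line.strip().split('|')
            priority[(x[1], x[2], x[3])] = -int(x[0])
    return priority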
Example #4
def pull_prot(which, refresh):
    #swissname = pull_via_ftplib('ftp.uniprot.org','/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',decompress_data=True,outfilename=f'uniprot_{which}.fasta')
    if refresh:
        swissname = pull_via_urllib('ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz')
    else:
        swissname = make_local_name(f'uniprot_{which}.fasta')
    swissprot_labels = {}
    nlines = 0
    with open(swissname,'r') as inf:
        for line in inf:
            nlines += 1
            if line.startswith('>'):
                #example fasta line:
                #>sp|Q6GZX4|001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1
                x = line.split('|')
                uniprotid = f'UniProtKB:{x[1]}'
                name = x[2].split(' OS=')[0]
                swissprot_labels[uniprotid] = f'{name} ({which})'
    print(f'read {nlines} lines')
    print(f'parsed {len(swissprot_labels)} labels')
    swissies = [(k,) for k in swissprot_labels]
    return swissies, swissprot_labels
Example #5
def pull_prots(refresh_swiss=False, refresh_trembl=False):
    swiss, labels = pull_prot('sprot', refresh_swiss)
    fname = make_local_name('labels', subpath='UNIPROTKB')
    with open(fname, 'w') as labelfile:
        #Write the SwissProt and TrEMBL labels into a single file
        for k, v in labels.items():
            labelfile.write(f'{k}\t{v}\n')
        tremb, tlabels = pull_prot('trembl', refresh_trembl)
        for k, v in tlabels.items():
            labelfile.write(f'{k}\t{v}\n')
Example #6
def pull_pubchem_labels():
    print('LABEL PUBCHEM')
    f_name = 'CID-Title.gz'
    cname = pull_via_ftp('ftp.ncbi.nlm.nih.gov', '/pubchem/Compound/Extras/', f_name, outfilename=f_name)
    fname = make_local_name('labels', subpath='PUBCHEM.COMPOUND')
    with open(fname, 'w') as outf, gzip.open(cname, mode='rt', encoding='latin-1') as inf:
        for line in inf:
            x = line.strip().split('\t')
            outf.write(f'PUBCHEM.COMPOUND:{x[0]}\t{x[1]}\n')
Example #7
    def __init__(self):
        ifname = make_local_name('mesh.nt', subpath='MESH')
        from datetime import datetime as dt
        print('loading mesh.nt')
        start = dt.now()
        self.m = pyoxigraph.MemoryStore()
        with open(ifname, 'rb') as inf:
            self.m.load(inf, 'application/n-triples')
        end = dt.now()
        print('loading complete')
        print(f'took {end - start}')
Example #8
def pull_hgnc():
    data = pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json')
    hgnc_json = loads(data)
    lname = make_local_name('labels', subpath='HGNC')
    sname = make_local_name('synonyms', subpath='HGNC')
    with open(lname, 'w') as lfile, open(sname, 'w') as sfile:
        for gene in hgnc_json['response']['docs']:
            hgnc_id = gene['hgnc_id']
            symbol = gene['symbol']
            lfile.write(f'{hgnc_id}\t{symbol}\n')
            name = gene['name']
            sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{name}\n')
            if 'alias_symbol' in gene:
                alias_symbols = gene['alias_symbol']
                for asym in alias_symbols:
                    sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')
            if 'alias_name' in gene:
                alias_names = gene['alias_name']
                for aname in alias_names:
                    sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{aname}\n')
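
pull_via_ftp appears in two modes across these examples: the PubChem functions pass outfilename and use the returned local path, while pull_hgnc omits it and parses the returned bytes. A sketch covering both behaviors on top of the standard-library ftplib; the exact signature of the real helper is an assumption:

from ftplib import FTP
from io import BytesIO

def pull_via_ftp(ftpsite, ftpdir, ftpfile, outfilename=None):
    #Sketch only: fetch ftpdir/ftpfile from ftpsite via anonymous FTP.
    # With outfilename, write the payload under the download root and
    # return the local path; without it, return the raw bytes.
    ftp = FTP(ftpsite)
    ftp.login()
    ftp.cwd(ftpdir)
    buf = BytesIO()
    ftp.retrbinary(f'RETR {ftpfile}', buf.write)
    ftp.quit()
    data = buf.getvalue()
    if outfilename is None:
        return data
    oname = make_local_name(outfilename)
    with open(oname, 'wb') as outf:
        outf.write(data)
    return oname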
Example #9
    def __init__(self):
        ifname = make_local_name('rhea.rdf', subpath='RHEA')
        from datetime import datetime as dt
        print('loading rhea')
        start = dt.now()
        self.m = pyoxigraph.MemoryStore()
        with open(ifname, 'rb') as inf:
            self.m.load(inf, 'application/rdf+xml')
        end = dt.now()
        print('loading complete')
        print(f'took {end - start}')
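
Once loaded, the store answers SPARQL queries the same way the MESH store does in pull_mesh_labels below. A purely illustrative probe, assuming the class wrapping this __init__ has been instantiated as rhea:

qres = rhea.m.query(
    'SELECT ?s ?label WHERE { ?s <http://www.w3.org/2000/01/rdf-schema#label> ?label } LIMIT 5')
for row in qres:
    print(row['s'], row['label'])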
Example #10
def pull_pubchem_synonyms():
    f_name = 'CID-Synonym-filtered.gz'
    sname = pull_via_ftp('ftp.ncbi.nlm.nih.gov', '/pubchem/Compound/Extras/', f_name, outfilename=f_name)
    fname = make_local_name('synonyms', subpath='PUBCHEM.COMPOUND')
    with open(fname, 'w') as outf, gzip.open(sname, mode='rt', encoding='latin-1') as inf:
        for line in inf:
            x = line.strip().split('\t')
            #Skip rows that are really cross-references into other vocabularies
            if x[1].startswith('CHEBI'):
                continue
            if x[1].startswith('SCHEMBL'):
                continue
            outf.write(f'PUBCHEM.COMPOUND:{x[0]}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{x[1]}\n')
Example #11
def readlabels(which):
    swissname = make_local_name(f'UniProtKB/uniprot_{which}.fasta')
    swissprot_labels = {}
    with open(swissname, 'r') as inf:
        for line in inf:
            if line.startswith('>'):
                #example fasta line:
                #>sp|Q6GZX4|001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1
                x = line.split('|')
                uniprotid = f'UniProtKB:{x[1]}'
                name = x[2].split(' OS=')[0]
                swissprot_labels[uniprotid] = f'{name} ({which})'
    return swissprot_labels
Example #12
def pull_uber_labels(expected):
    uber = UberGraph()
    labels = uber.get_all_labels()
    ldict = defaultdict(set)
    for unit in labels:
        iri = unit['iri']
        p = iri.split(':')[0]
        ldict[p].add((unit['iri'], unit['label']))
    for p in ldict:
        if p not in ['http', 'ro'] and not p.startswith('t') and '#' not in p:
            fname = make_local_name('labels', subpath=p)
            with open(fname, 'w') as outf:
                for unit in ldict[p]:
                    outf.write(f'{unit[0]}\t{unit[1]}\n')
Example #13
def pull_uber_synonyms(expected):
    uber = UberGraph()
    labels = uber.get_all_synonyms()
    ldict = defaultdict(set)
    for unit in labels:
        iri = unit[0]
        p = iri.split(':')[0]
        ldict[p].add(unit)
    #There are some ontologies that we don't get synonyms for, but missing files make
    # snakemake unhappy, so we create zero-length files for them.
    for p in expected:
        if p not in ['http', 'ro'] and not p.startswith('t') and '#' not in p:
            fname = make_local_name('synonyms', subpath=p)
            with open(fname, 'w') as outf:
                for unit in ldict[p]:
                    outf.write(f'{unit[0]}\t{unit[1]}\t{unit[2]}\n')
Example #14
    def pull_mesh_labels(self):
        s = """   PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
                PREFIX mesh: <http://id.nlm.nih.gov/mesh/>

                SELECT DISTINCT ?term ?label
                WHERE { ?term rdfs:label ?label }
                ORDER BY ?term
        """
        ofname = make_local_name('labels', subpath='MESH')
        qres = self.m.query(s)
        with open(ofname, 'w', encoding='utf8') as outf:
            for row in list(qres):
                iterm = str(row['term'])  #e.g. <http://id.nlm.nih.gov/mesh/D012345>
                ilabel = str(row['label'])  #e.g. "a label"@en
                meshid = iterm[:-1].split('/')[-1]  #drop the trailing '>' and keep the last path element
                label = ilabel.strip().split('"')[1]  #keep the lexical form between the quotes
                outf.write(f'{MESH}:{meshid}\t{label}\n')
Example #15
def pull_ensembl(complete_file):
    f = find_datasets()
    cols = set([
        "ensembl_gene_id", "ensembl_peptide_id", "description",
        "external_gene_name", "external_gene_source", "external_synonym",
        "chromosome_name", "source", "gene_biotype", "entrezgene_id",
        "zfin_id_id", 'mgi_id', 'rgd_id', 'flybase_gene_id', 'sgd_gene',
        'wormbase_gene'
    ])
    for ds in f['Dataset_ID']:
        print(ds)
        outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
        #Really, we should let snakemake handle this, but then we would need to put a list of all the 200+ sets in our
        # config, and keep it up to date.  Maybe you could have a job that gets the datasets and writes a dataset file,
        # but then updates the config? That sounds bogus.
        if os.path.exists(outfile):
            continue
        atts = find_attributes(ds)
        existing_atts = set(atts['Attribute_ID'].to_list())
        atts_to_get = cols.intersection(existing_atts)
        df = query(attributes=atts_to_get, filters={}, dataset=ds)
        df.to_csv(outfile, index=False, sep='\t')
    with open(complete_file, 'w') as outf:
        outf.write(f'Downloaded gene sets for {len(f)} data sets.')