Ejemplo n.º 1
0
def pull_pantherfamily():
    """Fetch the PANTHER human sequence classification file.

    Requests ``PTHR16.0_human`` from ``ftp.pantherdb.org`` through
    ``pull_via_ftp`` and asks for it to be stored as
    ``{PANTHERFAMILY}/family.csv``.
    """
    host = 'ftp.pantherdb.org'
    remote_dir = '/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/'
    remote_name = 'PTHR16.0_human'
    pull_via_ftp(host,
                 remote_dir,
                 remote_name,
                 outfilename=f'{PANTHERFAMILY}/family.csv')
Ejemplo n.º 2
0
def pull_pubchem():
    """Fetch the PubChem compound extras used downstream.

    Each of the three remote files is requested from
    ``/pubchem/Compound/Extras`` on ``ftp.ncbi.nlm.nih.gov`` and stored
    under ``PUBCHEM/`` with its remote name.
    """
    host = 'ftp.ncbi.nlm.nih.gov'
    remote_dir = '/pubchem/Compound/Extras'
    for remote_name in ('CID-MeSH', 'CID-Synonym-filtered.gz', 'CID-Title.gz'):
        pull_via_ftp(host, remote_dir, remote_name,
                     outfilename=f'PUBCHEM/{remote_name}')
Ejemplo n.º 3
0
def pull_ncbigene(filenames):
    """Fetch the given files from the NCBI Gene ``DATA`` directory.

    Args:
        filenames: iterable of file names located in ``/gene/DATA`` on
            ``ftp.ncbi.nih.gov``.

    Every file is requested with ``decompress_data=False`` and an
    ``outfilename`` of ``NCBIGene/<name>``.
    """
    # The former unused local holding the HTTPS URL of the same directory
    # was dropped; nothing read it.
    for fn in filenames:
        pull_via_ftp('ftp.ncbi.nih.gov',
                     '/gene/DATA',
                     fn,
                     decompress_data=False,
                     outfilename=f'NCBIGene/{fn}')
Ejemplo n.º 4
0
def pull_chebi():
    """Fetch the two ChEBI source files from the EBI FTP server.

    * ``ChEBI_complete.sdf.gz`` is requested with ``decompress_data=True``
      and stored as ``CHEBI/ChEBI_complete.sdf``.
    * ``database_accession.tsv`` is requested without the
      ``decompress_data`` argument and stored as
      ``CHEBI/database_accession.tsv``.
    """
    ebi_host = 'ftp.ebi.ac.uk'

    # Structure file.
    pull_via_ftp(
        ebi_host,
        '/pub/databases/chebi/SDF/',
        'ChEBI_complete.sdf.gz',
        decompress_data=True,
        outfilename='CHEBI/ChEBI_complete.sdf',
    )

    # Cross-reference table.
    pull_via_ftp(
        ebi_host,
        '/pub/databases/chebi/Flat_file_tab_delimited/',
        'database_accession.tsv',
        outfilename='CHEBI/database_accession.tsv',
    )
Ejemplo n.º 5
0
def pull_chembl(moleculefilename):
    """Fetch the latest ChEMBL RDF molecule file and the ``cco.ttl`` file.

    Args:
        moleculefilename: path whose final ``/``-separated component is
            used as the local file name (under ``CHEMBLCOMPOUND/``) for
            the molecule file.

    Does nothing when ``get_latest_chembl_name()`` returns ``None``.
    Both files are requested with ``decompress_data=True``.
    """
    fname = get_latest_chembl_name()
    if fname is None:
        return
    # fname should be like chembl_28.0_molecule.ttl.gz
    # pull_via_ftp is going to add the download_dir, so this is a hack
    # until pull_via_ftp is nicer: keep only the base name of the target.
    oname = 'CHEMBLCOMPOUND/' + moleculefilename.split('/')[-1]
    pull_via_ftp('ftp.ebi.ac.uk',
                 '/pub/databases/chembl/ChEMBL-RDF/latest/',
                 fname,
                 decompress_data=True,
                 outfilename=oname)
    pull_via_ftp('ftp.ebi.ac.uk',
                 '/pub/databases/chembl/ChEMBL-RDF/latest/',
                 'cco.ttl.gz',
                 decompress_data=True,
                 outfilename='CHEMBL/cco.ttl')
Ejemplo n.º 6
0
def pull_pubchem_labels():
    """Build the PUBCHEM.COMPOUND labels file from ``CID-Title.gz``.

    The gzipped title file is fetched with ``pull_via_ftp``; the value it
    returns is opened as a gzip text file (latin-1). For every line, the
    first two tab-separated fields are written out as
    ``PUBCHEM.COMPOUND:<field0>\\t<field1>``.
    """
    print('LABEL PUBCHEM')
    remote_name = 'CID-Title.gz'
    gz_path = pull_via_ftp('ftp.ncbi.nlm.nih.gov',
                           '/pubchem/Compound/Extras/',
                           remote_name,
                           outfilename=remote_name)
    labels_path = make_local_name('labels', subpath='PUBCHEM.COMPOUND')
    with open(labels_path, 'w') as sink:
        with gzip.open(gz_path, mode='rt', encoding='latin-1') as source:
            for raw in source:
                fields = raw.strip().split('\t')
                cid, title = fields[0], fields[1]
                sink.write(f'PUBCHEM.COMPOUND:{cid}\t{title}\n')
Ejemplo n.º 7
0
def test_pull_gzip_to_memory():
    """Fetch a gzipped file with decompression and check the returned text."""
    text = pull_via_ftp(
        'ftp.ncbi.nlm.nih.gov',
        'gene/DATA/',
        'gene_group.gz',
        decompress_data=True,
    )
    rows = text.split('\n')
    assert len(rows) > 1000
    # The first row is expected to be the column header.
    header = rows[0]
    assert header.startswith('#tax_id')
Ejemplo n.º 8
0
def test_pull_gzip_to_compressed_file():
    """Pull a gzipped file to a local file and read it back through gzip."""
    local_path = pull_via_ftp(
        'ftp.ncbi.nlm.nih.gov',
        'gene/DATA/',
        'gene_group.gz',
        outfilename='test_gz.gz',
    )
    # The returned path is opened with gzip, so the file on disk must
    # still be compressed.
    with gzip.open(local_path, 'rt') as handle:
        content = handle.read()
    rows = content.split('\n')
    assert len(rows) > 1000
    assert rows[0].startswith('#tax_id')
Ejemplo n.º 9
0
def test_pull_text_to_file():
    """Fetch a plain-text file to a local file and check its contents."""
    local_path = pull_via_ftp(
        'ftp.ncbi.nlm.nih.gov',
        'gene/DATA/',
        'stopwords_gene',
        outfilename='test_text',
    )
    with open(local_path, 'r') as handle:
        content = handle.read()
    rows = content.split('\n')
    assert len(rows) > 100
    assert rows[0] == 'a'
Ejemplo n.º 10
0
def pull_pubchem_synonyms():
    """Build the PUBCHEM.COMPOUND synonyms file from the filtered synonym dump.

    ``CID-Synonym-filtered.gz`` is fetched with ``pull_via_ftp``; the value
    it returns is opened as a gzip text file (latin-1). For every line the
    first two tab-separated fields are taken. Rows whose second field
    starts with ``CHEBI`` or ``SCHEMBL`` are skipped; the rest are written
    as ``hasRelatedSynonym`` triples.
    """
    remote_name = 'CID-Synonym-filtered.gz'
    gz_path = pull_via_ftp('ftp.ncbi.nlm.nih.gov',
                           '/pubchem/Compound/Extras/',
                           remote_name,
                           outfilename=remote_name)
    synonyms_path = make_local_name('synonyms', subpath='PUBCHEM.COMPOUND')
    skipped_prefixes = ('CHEBI', 'SCHEMBL')
    with open(synonyms_path, 'w') as sink:
        with gzip.open(gz_path, mode='rt', encoding='latin-1') as source:
            for raw in source:
                fields = raw.strip().split('\t')
                if fields[1].startswith(skipped_prefixes):
                    continue
                sink.write(f'PUBCHEM.COMPOUND:{fields[0]}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{fields[1]}\n')
Ejemplo n.º 11
0
def pull_hgnc():
    """Write HGNC label and synonym files from the complete-set JSON.

    The JSON document is fetched with ``pull_via_ftp`` and parsed with
    ``loads``. For every entry in ``response.docs``:

    * ``<hgnc_id>\\t<symbol>`` goes to the labels file;
    * the ``name`` goes to the synonyms file as ``hasExactSynonym``;
    * each value of ``alias_symbol`` and then ``alias_name`` (when the key
      is present) goes to the synonyms file as ``hasRelatedSynonym``.
    """
    raw_json = pull_via_ftp('ftp.ebi.ac.uk',
                            '/pub/databases/genenames/new/json',
                            'hgnc_complete_set.json')
    parsed = loads(raw_json)
    labels_path = make_local_name('labels', subpath='HGNC')
    synonyms_path = make_local_name('synonyms', subpath='HGNC')
    with open(labels_path, 'w') as lfile, open(synonyms_path, 'w') as sfile:
        for record in parsed['response']['docs']:
            hgnc_id = record['hgnc_id']
            symbol = record['symbol']
            lfile.write(f'{hgnc_id}\t{symbol}\n')
            name = record['name']
            sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{name}\n')
            # Alias symbols first, then alias names; a missing key
            # contributes nothing.
            for alias_key in ('alias_symbol', 'alias_name'):
                for asym in record.get(alias_key, ()):
                    sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')
Ejemplo n.º 12
0
def pull_ncbitaxon():
    """Fetch the NCBI taxonomy dump archive.

    ``taxdump.tar.gz`` is requested from ``/pub/taxonomy`` on
    ``ftp.ncbi.nih.gov`` with ``decompress_data=True`` and stored as
    ``{NCBITAXON}/taxdump.tar``.
    """
    target = f'{NCBITAXON}/taxdump.tar'
    pull_via_ftp(
        'ftp.ncbi.nih.gov',
        '/pub/taxonomy',
        'taxdump.tar.gz',
        decompress_data=True,
        outfilename=target,
    )
Ejemplo n.º 13
0
def pull_hgnc():
    """Fetch the HGNC complete-set JSON file.

    ``hgnc_complete_set.json`` is requested from the genenames JSON
    directory on ``ftp.ebi.ac.uk`` and stored as
    ``HGNC/hgnc_complete_set.json``.
    """
    remote_name = 'hgnc_complete_set.json'
    pull_via_ftp(
        'ftp.ebi.ac.uk',
        '/pub/databases/genenames/new/json',
        remote_name,
        outfilename='HGNC/hgnc_complete_set.json',
    )
Ejemplo n.º 14
0
def pull_hgncfamily():
    """Fetch the HGNC gene family table.

    ``family.csv`` is requested from the genenames ``genefamily_db_tables``
    directory on ``ftp.ebi.ac.uk`` and stored as
    ``{HGNCFAMILY}/family.csv``.
    """
    remote_dir = '/pub/databases/genenames/new/csv/genefamily_db_tables'
    pull_via_ftp(
        'ftp.ebi.ac.uk',
        remote_dir,
        'family.csv',
        outfilename=f'{HGNCFAMILY}/family.csv',
    )
Ejemplo n.º 15
0
def pull_mesh():
    """Fetch the MeSH N-Triples dump.

    ``mesh.nt.gz`` is requested from ``/online/mesh/rdf`` on
    ``ftp.nlm.nih.gov`` with ``decompress_data=True`` and stored as
    ``MESH/mesh.nt``.
    """
    host, remote_dir, remote_name = 'ftp.nlm.nih.gov', '/online/mesh/rdf', 'mesh.nt.gz'
    pull_via_ftp(host,
                 remote_dir,
                 remote_name,
                 decompress_data=True,
                 outfilename='MESH/mesh.nt')
Ejemplo n.º 16
0
def test_pull_text_to_memory():
    """Fetch a plain-text file without an output file and check the returned text."""
    text = pull_via_ftp(
        'ftp.ncbi.nlm.nih.gov',
        'gene/DATA/',
        'stopwords_gene',
    )
    rows = text.split('\n')
    assert len(rows) > 100
    assert rows[0] == 'a'