Ejemplo n.º 1
0
    def download_archived_releases(self, path, taxids=None):
        """
        Download the accession2geneid and removed-records files of archived releases.

        The releases come from get_list_of_archived_releases(), which is used
        here as a mapping of release -> {file kind: URL}. A file kind that is
        missing for a release is skipped.

        :param path: Local download path.
        :param taxids: Optional Taxonomy IDs. If given (and non-empty) each file
            is fetched with download_and_filter_data_file, otherwise it is
            downloaded unchanged.
        """
        archive_files = get_list_of_archived_releases()

        for _release, files in archive_files.items():
            for kind in ('accession2geneid', 'removed-records'):
                # Guard only the lookup: a KeyError raised inside the download
                # itself must not be mistaken for "file not available".
                try:
                    url = files[kind]
                except KeyError:
                    continue

                if taxids:
                    download_and_filter_data_file(url, path, taxids)
                else:
                    downloader.download_file_to_dir(url, path)
Ejemplo n.º 2
0
    def download_function(self, instance):
        """
        Download hgnc_complete_set.txt into the instance's process directory.

        :param instance: Object providing the target dir as process_instance_dir.
        """
        target_dir = instance.process_instance_dir
        urls = (
            'ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt',
        )

        for url in urls:
            downloader.download_file_to_dir(url, target_dir)
Ejemplo n.º 3
0
 def _download_latest_taxonomic_division(self, instance):
     """
     Download the uniprot_sprot and uniprot_trembl division files.

     For each of the divisions 'human' and 'rodents' the sprot file is
     fetched first, then the trembl file. URLs are built from
     self.UNIPROT_BASEURL and self.UNIPROT_CURRENT_BASEPATH.

     :param instance: Object providing the target dir as process_instance_dir.
     """
     url_templates = (
         '{}/{}knowledgebase/taxonomic_divisions/uniprot_sprot_{}.dat.gz',
         '{}/{}knowledgebase/taxonomic_divisions/uniprot_trembl_{}.dat.gz',
     )

     # download division files
     for division in ('human', 'rodents'):
         for template in url_templates:
             url = template.format(self.UNIPROT_BASEURL,
                                   self.UNIPROT_CURRENT_BASEPATH,
                                   division)
             downloader.download_file_to_dir(url,
                                             instance.process_instance_dir)
Ejemplo n.º 4
0
    def download_function(self, instance):
        """
        Download the Reactome mapping and pathway files listed below.

        :param instance: Object providing the target dir as process_instance_dir.
        """
        target_dir = instance.process_instance_dir
        urls = (
            'https://reactome.org/download/current/UniProt2Reactome_All_Levels.txt',
            'https://reactome.org/download/current/ChEBI2Reactome_All_Levels.txt',
            'https://reactome.org/download/current/Ensembl2Reactome_All_Levels.txt',
            'https://reactome.org/download/current/miRBase2Reactome_All_Levels.txt',
            'https://reactome.org/download/current/NCBI2Reactome_All_Levels.txt',
            'https://reactome.org/download/current/ReactomePathways.txt',
            'https://reactome.org/download/current/ReactomePathwaysRelation.txt',
        )

        for url in urls:
            downloader.download_file_to_dir(url, target_dir)
Ejemplo n.º 5
0
    def download_function(self, instance):
        """
        Download the SwissLipids export files.

        The download URLs do not end in a usable file name, so every URL is
        paired with the local file name it is stored under.

        :param instance: Object providing the target dir as process_instance_dir.
        """
        target_dir = instance.process_instance_dir

        # (download URL, local file name)
        sources = (
            ('https://www.swisslipids.org/api/file.php?cas=download_files&file=enzymes.tsv', 'enzymes.tsv.gz'),
            ('https://www.swisslipids.org/api/file.php?cas=download_files&file=tissues.tsv', 'tissues.tsv.gz'),
            ('https://www.swisslipids.org/api/file.php?cas=download_files&file=go.tsv', 'go.tsv'),
            ('https://www.swisslipids.org/api/file.php?cas=download_files&file=evidences.tsv', 'evidences.tsv.gz'),
            ('https://www.swisslipids.org/api/file.php?cas=download_files&file=lipids2uniprot.tsv', 'lipids2uniprot.tsv.gz'),
            ('https://www.swisslipids.org/api/file.php?cas=download_files&file=lipids.tsv', 'lipids.tsv.gz'),
        )

        for url, local_name in sources:
            downloader.download_file_to_dir(url, target_dir, filename=local_name)
Ejemplo n.º 6
0
    def download_function(self, instance):
        """
        Download the latest available version.

        First fetches the ontology tables (ontologies.jsonld and
        ontologies.yml), then hands over to download_all_obo for the
        ontology files themselves. The full table of ontologies can be
        downloaded from http://www.obofoundry.org/registry/ontologies.yml

        :param instance: Object providing the target dir as process_instance_dir.
        """
        log.debug("Download OBO Foundry")

        target_dir = instance.process_instance_dir

        # download ontology table
        registry_tables = (
            'http://www.obofoundry.org/registry/ontologies.jsonld',
            'http://www.obofoundry.org/registry/ontologies.yml',
        )
        for table_url in registry_tables:
            downloader.download_file_to_dir(table_url, target_dir)

        self.download_all_obo(target_dir)
Ejemplo n.º 7
0
    def download_function(self, instance):
        """
        Download the ChEBI ontology files and the tab-delimited flat files.

        chebi.obo and chebi.owl.gz go directly into the process directory;
        the content of Flat_file_tab_delimited/ is fetched into its 'tables'
        subdirectory.

        :param instance: Object providing the target dir as process_instance_dir.
        """
        ontology_urls = (
            'ftp://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.obo',
            'ftp://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz',
        )
        for url in ontology_urls:
            downloader.download_file_to_dir(url, instance.process_instance_dir)

        tables_dir = os.path.join(instance.process_instance_dir, 'tables')
        downloader.download_directory_from_ftp(
            'ftp://ftp.ebi.ac.uk',
            '/pub/databases/chebi/Flat_file_tab_delimited/',
            tables_dir)
Ejemplo n.º 8
0
    def download_function(self, instance):
        """
        Download the NCBI Gene data files listed below.

        :param instance: Object providing the target dir as process_instance_dir.
        """
        target_dir = instance.process_instance_dir
        urls = (
            'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz',
            'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Mus_musculus.gene_info.gz',
            'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz',
            'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2ensembl.gz',
            'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2accession.gz',
            'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_orthologs.gz',
            'ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_history.gz',
        )

        for url in urls:
            downloader.download_file_to_dir(url, target_dir)
Ejemplo n.º 9
0
    def download_function(self, instance, version):
        """
        Download the GTEx v8 annotation files and the gene median TPM table.

        The URLs are fixed to the v8 release; `version` is not used here.

        :param instance: Object providing the target dir as process_instance_dir.
        :param version: Unused.
        """
        target_dir = instance.process_instance_dir
        urls = (
            'https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDD.xlsx',
            'https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDD.xlsx',
            'https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt',
            'https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt',
            'https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz',
            # Currently not downloaded:
            # 'https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz',
            # 'https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz'
        )

        for url in urls:
            downloader.download_file_to_dir(url, target_dir)
Ejemplo n.º 10
0
    def download_all_obo(self, target_dir):
        """
        Download the obo/owl files of all active ontologies.

        Walks the table returned by self.load_ontology_table() and, for every
        ontology whose 'activity_status' is 'active', downloads each product
        whose id contains '.obo' or '.owl' into <target_dir>/<ontology id>.
        Downloading is best effort: a failing product is logged and the loop
        continues with the next one.

        :param target_dir: The target dir (i.e. instance process dir)
        """

        for ontology in self.load_ontology_table()['ontologies']:
            if ontology['activity_status'] != 'active':
                continue

            ontology_target_dir = os.path.join(target_dir, ontology['id'])

            for product in ontology['products']:
                product_id = product['id']
                if '.obo' not in product_id and '.owl' not in product_id:
                    continue

                # Resolve the PURL before the try block: looking it up again
                # inside the error handler would raise a second KeyError for a
                # product without 'ontology_purl' and abort the whole loop.
                purl = product.get('ontology_purl')
                if purl is None:
                    log.error(f"No ontology_purl for {product_id}, continue with next files.")
                    continue

                try:
                    downloader.download_file_to_dir(purl, ontology_target_dir)
                except Exception as e:
                    # Deliberately broad: one broken download must not stop
                    # the remaining ontologies.
                    log.error(f"Can't download {purl}, continue with next files.")
                    log.error(e)
Ejemplo n.º 11
0
 def download_function(self, instance, version):
     """
     Download and unpack the archives for the indices 3 to 12.

     Each index is inserted into BASE_URL, the resulting file is downloaded
     into the process directory and then passed to unzip. `version` is not
     used here.

     :param instance: Object providing the target dir as process_instance_dir.
     :param version: Unused.
     """
     for index in range(3, 13):
         archive_path = downloader.download_file_to_dir(
             BASE_URL.format(index), instance.process_instance_dir)
         unzip(archive_path)
Ejemplo n.º 12
0
    def download_function(self, instance, version):
        """
        Download and unpack the HMDB structures, metabolites and proteins archives.

        `version` is not used here; the URLs point to 'current'.

        :param instance: Object providing the target dir as process_instance_dir.
        :param version: Unused.
        """
        archive_urls = (
            'https://hmdb.ca/system/downloads/current/structures.zip',
            'https://hmdb.ca/system/downloads/current/hmdb_metabolites.zip',
            'https://hmdb.ca/system/downloads/current/hmdb_proteins.zip',
        )

        for url in archive_urls:
            archive_path = downloader.download_file_to_dir(
                url, instance.process_instance_dir)
            unzip(archive_path)
Ejemplo n.º 13
0
    def download_function(self, instance, version, taxids=None):
        """
        Download the RefSeq release catalog files for a release.

        The archived releases are fetched first via download_archived_releases.
        Afterwards the catalog, accession2geneid and removed-records files of
        release `version` are downloaded, in that order.

        :param instance: Object providing the target dir as process_instance_dir.
        :param version: Release number used in the file names.
        :param taxids: Optional Taxonomy IDs. If given (and non-empty) the files
            are fetched with download_and_filter_data_file, otherwise they are
            downloaded unchanged.
        """

        self.download_archived_releases(instance.process_instance_dir, taxids)

        catalog_path = 'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/'

        release_files = (
            posixpath.join(catalog_path,
                           f'RefSeq-release{version}.catalog.gz'),
            posixpath.join(catalog_path,
                           f'release{version}.accession2geneid.gz'),
            f'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/release{version}.removed-records.gz',
        )

        for url in release_files:
            if taxids:
                download_and_filter_data_file(url,
                                              instance.process_instance_dir,
                                              taxids)
            else:
                downloader.download_file_to_dir(url,
                                                instance.process_instance_dir)
Ejemplo n.º 14
0
    def download_function(self, instance, version):
        """
        Download the MeSH 2020 XML files (desc, qual, supp).

        The file names are fixed to 2020; `version` is not used here.

        :param instance: Object providing the target dir as process_instance_dir.
        :param version: Unused.
        """

        mesh_urls = (
            'ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/xmlmesh/desc2020.xml',
            'ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/xmlmesh/qual2020.xml',
            'ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/xmlmesh/supp2020.xml',
        )

        for url in mesh_urls:
            downloader.download_file_to_dir(url, instance.process_instance_dir)
Ejemplo n.º 15
0
def download_and_filter_data_file(url: str, path: str,
                                  taxids: List[str]) -> str:
    """
    Download a gzipped data file and keep only the records of given Taxonomy IDs.

    Most RefSeq data files start with the Taxonomy ID. This function downloads
    a gzipped, tab-separated data file, writes every line whose first column is
    one of `taxids` to a new gzipped file next to it and deletes the original
    download.

    If the download turns out to be corrupted or truncated, the error is
    logged, the lines read up to that point are kept and the original file is
    still removed.

    :param url: URL to download.
    :param path: Local download path.
    :param taxids: List of Taxonomy IDs to filter.
    :return: Path of filtered file.
    """
    taxids = set(taxids)

    downloaded_file = downloader.download_file_to_dir(url, path)

    # os.path.split also handles a bare file name or a file in the root dir,
    # where splitting on '/' by hand produced a wrong target path.
    downloaded_path, original_filename = os.path.split(downloaded_file)

    # release10.removed-records.gz -> release10.removed-records.filtered.gz
    new_filename = f"{original_filename.rsplit('.', 1)[0]}.filtered.gz"
    new_filepath = os.path.join(downloaded_path, new_filename)

    with gzip.open(new_filepath, 'wt') as output, \
            gzip.open(downloaded_file, 'rt') as source:
        try:
            for line in source:
                this_taxid = line.strip().split('\t', 1)[0]
                if this_taxid in taxids:
                    output.write(line)
        except (zlib.error, EOFError) as e:
            # zlib.error: invalid compressed data; EOFError: the gzip stream
            # ends early, which is what an interrupted download looks like.
            log.error(e)
            log.error(
                f"File {downloaded_file} not readable, corrupted download."
            )

    os.remove(downloaded_file)
    return new_filepath
Ejemplo n.º 16
0
 def download_function(self, instance, version):
     """
     Download the file of the given version.

     The URL is built by inserting str(version) into FILE_URL_FORMAT_VERSION.

     :param instance: Object providing the target dir as process_instance_dir.
     :param version: Version to insert into the URL template.
     """
     downloader.download_file_to_dir(
         FILE_URL_FORMAT_VERSION.format(str(version)),
         instance.process_instance_dir)
Ejemplo n.º 17
0
 def download_function(self, instance, version):
     """
     Download the current homologene.data file.

     `version` is not used here; the URL points to 'current'.

     :param instance: Object providing the target dir as process_instance_dir.
     :param version: Unused.
     """
     url = 'ftp://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data'
     downloader.download_file_to_dir(url, instance.process_instance_dir)
Ejemplo n.º 18
0
 def download_function(self, instance, version):
     """
     Download the file registered for the given version.

     The URL is looked up in VERSION_2_URL under str(version); an unknown
     version raises KeyError.

     :param instance: Object providing the target dir as process_instance_dir.
     :param version: Version whose URL is looked up.
     """
     url = VERSION_2_URL[str(version)]
     downloader.download_file_to_dir(url, instance.process_instance_dir)
Ejemplo n.º 19
0
    def download_function(self, instance, version):
        """
        Download the LNCipedia 5.2 hg38 GFF files (full and high-confidence set).

        The URLs are fixed to release 5.2; `version` is not used here.

        :param instance: Object providing the target dir as process_instance_dir.
        :param version: Unused.
        """
        target_dir = instance.process_instance_dir
        gff_urls = (
            'https://lncipedia.org/downloads/lncipedia_5_2/full-database/lncipedia_5_2_hg38.gff',
            'https://lncipedia.org/downloads/lncipedia_5_2/high-confidence-set/lncipedia_5_2_hc_hg38.gff',
        )

        for url in gff_urls:
            download_file_to_dir(url, target_dir)
Ejemplo n.º 20
0
 def download_function(self, instance):
     """
     Download taxdmp.zip and unpack it.

     :param instance: Object providing the target dir as process_instance_dir.
     """
     archive_url = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip'
     archive_path = downloader.download_file_to_dir(
         archive_url, instance.process_instance_dir)

     # unpack zip file
     unzip(archive_path)