Beispiel #1
0
    def _download_file(self, url, filename, compress=False, timeout=30):
        """ Download a file to the resources folder.

        Download data from `url`, save as `filename`, and optionally compress with gzip.
        If the destination file already exists, nothing is downloaded and its path is
        returned.

        Parameters
        ----------
        url : str
            URL to download data from
        filename : str
            name of file to save; if compress, '.gz' is appended when missing
        compress : bool
            compress with gzip
        timeout : int
            seconds for timeout of download request

        Returns
        -------
        str
            path to downloaded file, empty str if error

        Notes
        -----
        If a ``URLError`` occurs for an ``ftp://`` URL, the download is retried once
        with the scheme replaced by ``http://``.
        """
        if compress and not filename.endswith(".gz"):
            filename += ".gz"

        destination = os.path.join(self._resources_dir, filename)

        if not create_dir(os.path.relpath(os.path.dirname(destination))):
            return ""

        if os.path.exists(destination):
            # file has already been downloaded; reuse it
            return destination

        try:
            # http://stackoverflow.com/a/7244263
            with urllib.request.urlopen(url, timeout=timeout) as response, \
                    atomic_write(destination, mode="wb") as f:
                self._print_download_msg(destination)
                data = response.read()  # a `bytes` object

                if compress:
                    self._write_data_to_gzip(f, data)
                else:
                    f.write(data)
        except urllib.error.URLError as err:
            logger.warning(err)

            # try HTTP if an FTP error occurred; only the URL scheme is inspected and
            # rewritten so that "ftp://" appearing elsewhere in the URL (e.g., in a
            # query string) neither triggers a retry nor gets altered
            ftp_scheme = "ftp://"
            if url.startswith(ftp_scheme):
                return self._download_file(
                    "http://" + url[len(ftp_scheme):],
                    filename,
                    compress=compress,
                    timeout=timeout,
                )
            return ""
        except socket.timeout:
            logger.warning(f"Timeout downloading {url}")
            return ""

        return destination
Beispiel #2
0
    def _setup_gsa_test():
        """ Reset the loaded GSA resources and stage gzipped copies of the test maps.

        Each map under ``tests/resources`` is gzip-compressed into the ``resources``
        directory, overwriting any existing file there.
        """
        # reset resource if already loaded
        snps_obj = SNPs()
        snps_obj._resources._gsa_resources = {}

        create_dir("resources")

        # (uncompressed test fixture, gzipped destination) pairs
        map_files = (
            ("tests/resources/gsa_rsid_map.txt", "resources/gsa_rsid_map.txt.gz"),
            ("tests/resources/gsa_chrpos_map.txt", "resources/gsa_chrpos_map.txt.gz"),
        )

        for source_path, target_path in map_files:
            with open(source_path, "rb") as plain, atomic_write(
                target_path, mode="wb", overwrite=True
            ) as staged, gzip.open(staged, "wb") as zipped:
                shutil.copyfileobj(plain, zipped)
Beispiel #3
0
    def _get_path_assembly_mapping_data(self,
                                        source_assembly,
                                        target_assembly,
                                        retries=10):
        """ Return the local path to assembly mapping data, fetching it when absent.

        Parameters
        ----------
        source_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
            assembly to remap from
        target_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
            assembly to remap to
        retries : int
            number of retries per chromosome to download assembly mapping data

        Returns
        -------
        str
            path to <source_assembly>_<target_assembly>.tar.gz, empty str if the
            resources directory could not be created

        References
        ----------
        1. Ensembl, Assembly Information Endpoint,
           https://rest.ensembl.org/documentation/info/assembly_info
        2. Ensembl, Assembly Map Endpoint,
           http://rest.ensembl.org/documentation/info/assembly_map

        """
        if not create_dir(self._resources_dir):
            return ""

        # chromosomes 1-22 plus X, Y, and MT
        chromosomes = list(map(str, range(1, 23))) + ["X", "Y", "MT"]

        archive_name = source_assembly + "_" + target_assembly + ".tar.gz"
        destination = os.path.join(self._resources_dir, archive_name)

        if os.path.exists(destination):
            # archive is already present locally
            return destination

        logger.info(f"Downloading {os.path.relpath(destination)}")
        self._download_assembly_mapping_data(
            destination,
            chromosomes,
            source_assembly,
            target_assembly,
            retries,
        )

        return destination
Beispiel #4
0
""" Get a file from the openSNP datadump for debugging. """

import os

from atomicwrites import atomic_write

from snps.resources import Resources
from snps.utils import create_dir

OUTPUT_DIR = "output"
FILE = "user662_file340_yearofbirth_unknown_sex_unknown.23andme.txt"

if __name__ == "__main__":
    # make sure the output directory for this example exists
    create_dir(OUTPUT_DIR)

    # the relative resources path assumes the script is run from the examples dir
    resources = Resources(resources_dir="../../resources")

    # write the datadump file's bytes atomically into the output directory
    output_path = os.path.join(OUTPUT_DIR, FILE)
    with atomic_write(output_path, mode="wb") as out_file:
        file_bytes = resources.load_opensnp_datadump_file(FILE)
        out_file.write(file_bytes)
Beispiel #5
0
    def _find_shared_dna_output_helper(
        self,
        individuals,
        one_chrom_shared_dna,
        two_chrom_shared_dna,
        one_chrom_shared_genes,
        two_chrom_shared_genes,
    ):
        """ Plot shared DNA and save each non-empty shared DNA / genes table as CSV.

        Parameters
        ----------
        individuals : iterable
            objects providing `get_var_name()` and a `name` attribute; the former are
            joined with "_" for filenames, the latter with " / " for the plot title
        one_chrom_shared_dna : sized
            table of DNA shared on one chromosome; plotted and saved if non-empty
        two_chrom_shared_dna : sized
            table of DNA shared on two chromosomes; plotted and saved if non-empty
        one_chrom_shared_genes : sized
            table of genes shared on one chromosome; saved if non-empty
        two_chrom_shared_genes : sized
            table of genes shared on two chromosomes; saved if non-empty
        """
        cytobands = self._resources.get_cytoBand_hg19()

        # collect both labels in a single pass over the individuals
        var_names = []
        display_names = []
        for person in individuals:
            var_names.append(person.get_var_name())
            display_names.append(person.name)

        file_tag = "_".join(var_names)
        plot_title = " / ".join(display_names)

        # the plot is only produced when the output directory is available
        if create_dir(self._output_dir):
            plot_path = os.path.join(
                self._output_dir, "shared_dna_{}.png".format(file_tag)
            )
            # NOTE(review): the trailing 37 presumably identifies the build - confirm
            plot_chromosomes(
                one_chrom_shared_dna,
                two_chrom_shared_dna,
                cytobands,
                plot_path,
                "{} shared DNA".format(plot_title),
                37,
            )

        # (table, filename template, extra keyword arguments for `save_df_as_csv`)
        csv_outputs = (
            (
                one_chrom_shared_dna,
                "shared_dna_one_chrom_{}_GRCh37.csv",
                {"float_format": "%.2f"},
            ),
            (
                two_chrom_shared_dna,
                "shared_dna_two_chroms_{}_GRCh37.csv",
                {"float_format": "%.2f"},
            ),
            (one_chrom_shared_genes, "shared_genes_one_chrom_{}_GRCh37.csv", {}),
            (two_chrom_shared_genes, "shared_genes_two_chroms_{}_GRCh37.csv", {}),
        )

        for table, filename_template, extra_kwargs in csv_outputs:
            if len(table) == 0:
                # nothing to save for this table
                continue

            csv_name = filename_template.format(file_tag)
            save_df_as_csv(
                table,
                self._output_dir,
                csv_name,
                comment=self._get_csv_header(),
                prepend_info=False,
                **extra_kwargs,
            )
Beispiel #6
0
def main():
    """ Parse the openSNP datadump files, save a summary CSV, and log statistics.

    Filenames are read from the datadump via the module-level `r`, filtered to drop
    names containing "readme" or "phenotype", and passed to `load_file` through a
    `Parallelizer`. Results without a "msg" key are saved to
    "parse-opensnp-files.csv" in `OUTPUT_DIR`. When `EXTRACT_FILES` is truthy, files
    whose result carried a message (plus files where the build was not detected) are
    written into one directory per message under `OUTPUT_DIR`.
    """
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    filenames = [
        filename
        for filename in filenames
        if "readme" not in filename and "phenotype" not in filename
    ]

    # draw a sample from the observations; the fixed seed makes the order repeatable
    random.seed(1)
    # lower this (e.g., to 10) to process only a subset of the files
    sample_size = len(filenames)
    samples = random.sample(range(len(filenames)), sample_size)

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # run tasks; results is a list of dicts
    results = p(load_file, tasks)

    # get results from `load_file` where `count` was non-zero
    rows = [item for item in results if "msg" not in item]

    df = pd.DataFrame(
        rows,
        columns=["file", "source", "build", "build_detected", "chromosomes", "count"],
    )

    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")

    # log parsing statistics; guard the ratios so that an empty datadump or a run
    # where no file parsed does not raise ZeroDivisionError and abort before the
    # files with messages are extracted below
    file_count = len(filenames)
    parsed_count = len(df)
    parsed_ratio = parsed_count / file_count if file_count else 0.0
    detected_ratio = (
        len(df.loc[df.build_detected]) / parsed_count if parsed_count else 0.0
    )

    logger.info(f"{file_count} files in the openSNP datadump")
    logger.info(f"{parsed_ratio:.2%} of openSNP datadump files parsed")
    logger.info(f"build detected in {detected_ratio:.2%} of files parsed")

    # extract files from the datadump where `load_file` returned a message
    if EXTRACT_FILES:
        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
        d = {}
        for result in results:
            if "msg" in result:
                d.setdefault(result["msg"], []).append(result["file"])

        # add messages / file filters as necessary...
        if parsed_count:
            d["build not detected"] = list(df.loc[~df.build_detected].file.values)
        else:
            # no parsed files, so there is nothing to filter
            d["build not detected"] = []

        # extract files that have messages for debugging
        for msg, files in d.items():
            if len(files) == 0:
                continue

            # create a directory for each message (prefix indicates number of files)
            path = os.path.join(OUTPUT_DIR, f"{len(files):04}_{clean_str(msg)}")
            create_dir(path)
            # save each file with message into created directory
            for filename in files:
                with atomic_write(os.path.join(path, filename), mode="wb") as f:
                    f.write(r.load_opensnp_datadump_file(filename))

    logger.info("stop")
Beispiel #7
0
    def _get_path_assembly_mapping_data(self,
                                        source_assembly,
                                        target_assembly,
                                        retries=10):
        """ Locate the assembly mapping archive on disk, downloading it if missing.

        Parameters
        ----------
        source_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
            assembly to remap from
        target_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
            assembly to remap to
        retries : int
            number of retries per chromosome to download assembly mapping data

        Returns
        -------
        str
            path to <source_assembly>_<target_assembly>.tar.gz, empty str if the
            resources directory could not be created

        References
        ----------
        1. Ensembl, Assembly Information Endpoint,
           https://rest.ensembl.org/documentation/info/assembly_info
        2. Ensembl, Assembly Map Endpoint,
           http://rest.ensembl.org/documentation/info/assembly_map

        """
        if not create_dir(self._resources_dir):
            return ""

        # "1" through "22", followed by "X", "Y", and "MT"
        chroms = [str(number) for number in range(1, 23)]
        chroms += ["X", "Y", "MT"]

        archive_filename = source_assembly + "_" + target_assembly + ".tar.gz"
        destination = os.path.join(self._resources_dir, archive_filename)

        if os.path.exists(destination):
            # nothing to download; the archive already exists
            return destination

        relative_destination = os.path.relpath(destination)
        logger.info("Downloading {}".format(relative_destination))

        self._download_assembly_mapping_data(
            destination,
            chroms,
            source_assembly,
            target_assembly,
            retries,
        )

        return destination