Example 1
    def download(self, url, destination):
        """
        Downloads the specified url and saves the result to the specified
        file.
        """
        fileDownloader = utils.HttpFileDownloader(url, destination)
        fileDownloader.download()
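
utils.HttpFileDownloader is internal to the GA4GH server scripts. For readers without that module, a stand-in with the same two-call interface can be sketched from the standard library (Python 2, matching the urllib.urlencode and print-function style of the later examples; the class name here is hypothetical):

import shutil
import urllib2


class SimpleHttpFileDownloader(object):
    """Stand-in with the same interface as utils.HttpFileDownloader."""

    def __init__(self, url, destination):
        self.url = url
        self.destination = destination

    def download(self):
        # Stream the response body straight to the destination file.
        response = urllib2.urlopen(self.url)
        with open(self.destination, "wb") as outFile:
            shutil.copyfileobj(response, outFile)
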
Example 2
    def __init__(self, inputDirectory, outputDirectory):
        """
        Converts human readable dataset from compliance repository,
        and translates it into a reference-server readable filesystem
        with binary files.
        :param inputDirectory: location of
            the human readable compliance dataset
        :param outputDirectory: location of
            the file hierarchy suitable for deploying on the reference server
        """
        self.inputDirectory = inputDirectory
        self.outputDirectory = outputDirectory
        self.repoPath = os.path.join(outputDirectory, "repo.db")
        self.tempdir = None

        # If no input directory is specified, download from GitHub
        if inputDirectory is None:
            utils.log("Downloading test data...")
            self.tempdir = tempfile.mkdtemp()
            assert os.path.exists(self.tempdir)
            url = "https://github.com/ga4gh/compliance/archive/master.zip"
            filePath = os.path.join(self.tempdir, 'compliance-master.zip')
            downloader = utils.HttpFileDownloader(url, filePath)
            downloader.download()
            utils.log("Extracting test data...")
            with zipfile.ZipFile(filePath, "r") as z:
                z.extractall(self.tempdir)
            self.inputDirectory = os.path.join(
                self.tempdir, 'compliance-master', 'test-data')
        repo = datarepo.SqlDataRepository(self.repoPath)
        self.repo = repo
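
The download-and-extract branch of this constructor is self-contained enough to lift into a standalone helper. A sketch (the function name and the downloaderClass parameter are hypothetical; the latter stands in for utils.HttpFileDownloader):

import os
import tempfile
import zipfile


def fetchComplianceData(downloaderClass):
    """Download and unpack the compliance archive; return the test-data dir."""
    tempdir = tempfile.mkdtemp()
    url = "https://github.com/ga4gh/compliance/archive/master.zip"
    filePath = os.path.join(tempdir, 'compliance-master.zip')
    downloaderClass(url, filePath).download()
    # GitHub archives unpack into a top-level <repo>-<branch>/ directory.
    with zipfile.ZipFile(filePath, "r") as z:
        z.extractall(tempdir)
    return os.path.join(tempdir, 'compliance-master', 'test-data')
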
Example 3
    def __init__(self, inputDirectory, outputDirectory, force):
        """
        Converts human readable dataset from compliance repository,
        and translates it into a reference-server readable filesystem
        with binary files.
        :param inputDirectory: location of
            the human readable compliance dataset
        :param outputDirectory: location of
            the file hierarchy suitable for deploying on the reference server
        """
        self.inputDirectory = inputDirectory
        self.outputDirectory = outputDirectory
        self.repoPath = os.path.abspath(
            os.path.join(outputDirectory, "registry.db"))
        self.tempdir = None

        if os.path.exists(self.outputDirectory):
            if force:
                utils.log("Removing existing output directory at '{}'".format(
                    self.outputDirectory))
                shutil.rmtree(self.outputDirectory)
            else:
                utils.log("Output directory '{}' already exists".format(
                    self.outputDirectory))
                utils.log("Please specify an output path that does not exist")
                utils.log("Exiting...")
                exit(1)

        # If no input directory is specified, download from GitHub
        if inputDirectory is None:
            utils.log("Downloading test data...")
            self.tempdir = tempfile.mkdtemp()
            assert os.path.exists(self.tempdir)
            url = "https://github.com/ga4gh/compliance/archive/master.zip"
            filePath = os.path.join(self.tempdir, 'compliance-master.zip')
            downloader = utils.HttpFileDownloader(url, filePath)
            downloader.download()
            utils.log("Extracting test data...")
            with zipfile.ZipFile(filePath, "r") as z:
                z.extractall(self.tempdir)
            self.inputDirectory = os.path.join(self.tempdir,
                                               'compliance-master',
                                               'test-data')
        repo = datarepo.SqlDataRepository(self.repoPath)
        self.repo = repo
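
A command-line entry point feeding this constructor might look like the following. This is a hypothetical sketch; the real argument parsing lives elsewhere in the source script, and the option names are assumptions:

import argparse


def parseArgs():
    parser = argparse.ArgumentParser(
        description="Prepare compliance test data for the reference server")
    parser.add_argument(
        "--inputDirectory", default=None,
        help="location of the compliance dataset (downloaded if omitted)")
    parser.add_argument(
        "outputDirectory",
        help="where to write the server-readable file hierarchy")
    parser.add_argument(
        "--force", action="store_true",
        help="remove an existing output directory instead of exiting")
    return parser.parse_args()
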
Example 4
    def downloadFastas(self):
        # Note: on Python 2 this method needs "from __future__ import
        # print_function" for the print(..., file=...) calls below.
        dirList = [self.dirName, 'references']
        mkdirAndChdirList(dirList)
        # Assemble reference set metadata
        referenceSetMetadata = {
            "assemblyId": 'TODO',
            "description": 'TODO',
            "isDerived": False,
            "ncbiTaxonId": 9606,
            "sourceAccessions": [],
            "sourceUri": 'TODO',
        }
        referenceSetMetadataFilename = "{}.json".format(self.referenceSetName)
        dumpDictToFileAsJson(referenceSetMetadata,
                             referenceSetMetadataFilename)
        # Download chromosomes
        mkdirAndChdirList([self.referenceSetName])
        cleanDir()
        baseUrl = 'http://www.ebi.ac.uk/ena/data/view/'
        for chromosome in self.chromosomes:
            accession = self.accessions[chromosome]
            path = os.path.join(baseUrl, accession)
            maxPos = self.chromMinMax.getMaxPos(chromosome)
            minPos = 0
            if self.excludeReferenceMin:
                minPos = self.chromMinMax.getMinPos(chromosome)
            args = urllib.urlencode({
                'display': 'fasta',
                'range': '{}-{}'.format(minPos, maxPos)
            })
            # '%26' is the percent-encoded '&'; ENA's old /data/view URLs
            # join the accession and its parameters with '&' rather than '?'.
            url = '{}%26{}'.format(path, args)
            tempFileName = '{}.fa.temp'.format(chromosome)
            fileName = '{}.fa'.format(chromosome)
            downloader = utils.HttpFileDownloader(url, tempFileName)
            downloader.download()
            # We need to replace the header on the downloaded FASTA
            with open(tempFileName, "r") as inFasta,\
                    open(fileName, "w") as outFasta:
                # Write the new header
                print(">{}".format(chromosome), file=outFasta)
                # Discard the original header line
                inFasta.readline()
                for line in inFasta:
                    print(line, file=outFasta, end="")
            os.unlink(tempFileName)
            utils.log("Compressing {}".format(fileName))
            utils.runCommand("bgzip {}".format(fileName))
            compressedFileName = fileName + '.gz'
            utils.log("Indexing {}".format(compressedFileName))
            utils.runCommand("samtools faidx {}".format(compressedFileName))
            # Assemble the metadata.
            metadata = {
                "md5checksum": getReferenceChecksum(compressedFileName),
                "sourceUri": url,
                "ncbiTaxonId": 9606,
                "isDerived": False,
                "sourceDivergence": None,
                "sourceAccessions": [accession + ".subset"],
            }
            metadataFilename = "{}.json".format(chromosome)
            dumpDictToFileAsJson(metadata, metadataFilename)
        escapeDir(3)
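
For concreteness, here is roughly what one assembled URL looks like (Python 2, as in the example above). The accession and range are made-up values, and the parameter order from urllib.urlencode may vary:

import os
import urllib

baseUrl = 'http://www.ebi.ac.uk/ena/data/view/'
accession = 'CM000663'  # hypothetical chromosome accession
path = os.path.join(baseUrl, accession)
args = urllib.urlencode({'display': 'fasta', 'range': '0-1000'})
url = '{}%26{}'.format(path, args)
# e.g. http://www.ebi.ac.uk/ena/data/view/CM000663%26display=fasta&range=0-1000
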
Example 5
    def __init__(self, inputDirectory, outputDirectory):
        """
        Converts human readable dataset from compliance repository,
        and translates it into a reference-server readable filesystem
        with binary files.
        :param inputDirectory: location of
            the human readable compliance dataset
        :param outputDirectory: location of
            the file hierarchy suitable for deploying on the reference server
        """
        self.inputDirectory = inputDirectory
        self.outputDirectory = outputDirectory
        self.tempdir = None

        # If no input directory is specified, download from GitHub
        if inputDirectory is None:
            utils.log("Downloading test data...")
            self.tempdir = tempfile.mkdtemp()
            assert os.path.exists(self.tempdir)
            url = "https://github.com/ga4gh/compliance/archive/master.zip"
            filePath = os.path.join(self.tempdir, 'compliance-master.zip')
            downloader = utils.HttpFileDownloader(url, filePath)
            downloader.download()
            utils.log("Extracting test data...")
            with zipfile.ZipFile(filePath, "r") as z:
                z.extractall(self.tempdir)
            self.inputDirectory = os.path.join(
                self.tempdir, 'compliance-master', 'test-data')

        # Get all the reference files (the ones with a .fa extension)
        self.referenceFiles = map(
            os.path.basename, glob.glob(
                os.path.join(self.inputDirectory, "*.fa")))

        self.refsetsDirectory = os.path.join(
            self.outputDirectory, "referenceSets")
        self.hg37Directory = os.path.join(self.refsetsDirectory, "hg37")

        # datasets
        self.datasetsDirectory = os.path.join(self.outputDirectory, "datasets")

        readFiles = map(os.path.basename, glob.glob(
            os.path.join(self.inputDirectory, "*.sam")))
        variantFiles = map(os.path.basename, glob.glob(
            os.path.join(self.inputDirectory, "*.vcf")))
        sequenceAnnotationFiles = map(os.path.basename, glob.glob(
            os.path.join(self.inputDirectory, "*.gff3")))

        self.datasets = list(set(
            p.split('_')[0] for p in (readFiles +
                                      variantFiles +
                                      sequenceAnnotationFiles)))
        self.datasetDirs = [os.path.join(
            self.outputDirectory, ds) for ds in self.datasets]

        # Create maps of arrays of files based on dataset.
        self.datasetReads = dict()
        self.datasetVariants = dict()
        self.datasetSequenceAnnotations = dict()

        for ds in self.datasets:
            self.datasetReads[ds] = [r for r in readFiles if r.startswith(ds)]
            self.datasetSequenceAnnotations[ds] = [sa for sa in (
                sequenceAnnotationFiles) if sa.startswith(ds)]

            # Variants are further split into groups, keyed by the second
            # component of the '_'-delimited filename:
            self.datasetVariants[ds] = dict()
            # only those variants inside this dataset
            dsvlist = [v for v in variantFiles if v.startswith(ds)]
            # create nested dictionary based on group belonging
            for dsv in dsvlist:
                dsvGroup = dsv.split('_')[1]
                self.datasetVariants[ds][dsvGroup] = \
                    self.datasetVariants[ds].get(dsvGroup, []) + [dsv]
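
The dataset_group_rest filename convention that drives this grouping is easiest to see with concrete, made-up filenames:

variantFiles = ['ds1_vg1_a.vcf', 'ds1_vg1_b.vcf', 'ds1_vg2_a.vcf',
                'ds2_vg1_a.vcf']

datasetVariants = {}
for name in variantFiles:
    ds, group = name.split('_')[0], name.split('_')[1]
    datasetVariants.setdefault(ds, {}).setdefault(group, []).append(name)

# datasetVariants == {'ds1': {'vg1': ['ds1_vg1_a.vcf', 'ds1_vg1_b.vcf'],
#                             'vg2': ['ds1_vg2_a.vcf']},
#                     'ds2': {'vg1': ['ds2_vg1_a.vcf']}}

The setdefault calls are equivalent to the get(dsvGroup, []) + [dsv] pattern used in the constructor above.
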
Example 6
def downloadData():
    url = ("https://github.com/ga4gh/server/releases/"
           "download/data/ga4gh-example-data-v3.2.tar")
    # tarballPath is assumed to be defined at module scope in the source script
    fileDownloader = utils.HttpFileDownloader(url, tarballPath)
    fileDownloader.download()
    utils.log("Downloading finished")
Example 7
def downloadData():
    url = "http://www.well.ox.ac.uk/~jk/ga4gh-example-data.tar"
    # tarballPath: module-level destination path, as in the previous example
    fileDownloader = utils.HttpFileDownloader(url, tarballPath)
    fileDownloader.download()
    utils.log("Downloading finished")