def download(self, url, destination):
    """
    Downloads the specified url and saves the result to the
    specified file.
    """
    fileDownloader = utils.HttpFileDownloader(url, destination)
    fileDownloader.download()
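# A minimal usage sketch for download() above; the direct use of
# utils.HttpFileDownloader mirrors the pattern used throughout this
# module, while the URL and destination path are placeholders only.
def fetchExampleFile():
    url = "https://example.com/data/example.fa"
    destination = "/tmp/example.fa"
    utils.HttpFileDownloader(url, destination).download()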
def __init__(self, inputDirectory, outputDirectory):
    """
    Converts the human-readable dataset from the compliance
    repository into a reference-server readable file hierarchy of
    binary files.
    :param inputDirectory: location of the human-readable compliance
        dataset
    :param outputDirectory: location of the file hierarchy suitable
        for deploying on the reference server
    """
    self.inputDirectory = inputDirectory
    self.outputDirectory = outputDirectory
    self.repoPath = os.path.join(outputDirectory, "repo.db")
    self.tempdir = None
    # If no input directory is specified, download the test data
    # from GitHub.
    if inputDirectory is None:
        utils.log("Downloading test data...")
        self.tempdir = tempfile.mkdtemp()
        assert os.path.exists(self.tempdir)
        url = "https://github.com/ga4gh/compliance/archive/master.zip"
        filePath = os.path.join(self.tempdir, 'compliance-master.zip')
        downloader = utils.HttpFileDownloader(url, filePath)
        downloader.download()
        utils.log("Extracting test data...")
        with zipfile.ZipFile(filePath, "r") as z:
            z.extractall(self.tempdir)
        self.inputDirectory = os.path.join(
            self.tempdir, 'compliance-master', 'test-data')
    repo = datarepo.SqlDataRepository(self.repoPath)
    self.repo = repo
def __init__(self, inputDirectory, outputDirectory, force):
    """
    Converts the human-readable dataset from the compliance
    repository into a reference-server readable file hierarchy of
    binary files.
    :param inputDirectory: location of the human-readable compliance
        dataset
    :param outputDirectory: location of the file hierarchy suitable
        for deploying on the reference server
    :param force: if True, remove any existing output directory
        before converting
    """
    self.inputDirectory = inputDirectory
    self.outputDirectory = outputDirectory
    self.repoPath = os.path.abspath(
        os.path.join(outputDirectory, "registry.db"))
    self.tempdir = None
    if os.path.exists(self.outputDirectory):
        if force:
            utils.log("Removing existing output directory at '{}'".format(
                self.outputDirectory))
            shutil.rmtree(self.outputDirectory)
        else:
            utils.log("Output directory '{}' already exists".format(
                self.outputDirectory))
            utils.log("Please specify an output path that does not exist")
            utils.log("Exiting...")
            exit(1)
    # If no input directory is specified, download the test data
    # from GitHub.
    if inputDirectory is None:
        utils.log("Downloading test data...")
        self.tempdir = tempfile.mkdtemp()
        assert os.path.exists(self.tempdir)
        url = "https://github.com/ga4gh/compliance/archive/master.zip"
        filePath = os.path.join(self.tempdir, 'compliance-master.zip')
        downloader = utils.HttpFileDownloader(url, filePath)
        downloader.download()
        utils.log("Extracting test data...")
        with zipfile.ZipFile(filePath, "r") as z:
            z.extractall(self.tempdir)
        self.inputDirectory = os.path.join(
            self.tempdir, 'compliance-master', 'test-data')
    repo = datarepo.SqlDataRepository(self.repoPath)
    self.repo = repo
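# Sketch of driving the constructor above; the enclosing class name
# `ComplianceDataMunger` is an assumption (the snippet does not show
# it), and the output path is a placeholder.
def runConversionExample():
    munger = ComplianceDataMunger(
        inputDirectory=None,    # None triggers the GitHub download
        outputDirectory="/tmp/ga4gh-compliance-output",
        force=True)             # remove any existing output directory
    return munger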
def downloadFastas(self):
    dirList = [self.dirName, 'references']
    mkdirAndChdirList(dirList)
    # Assemble the reference set metadata.
    referenceSetMetadata = {
        "assemblyId": 'TODO',
        "description": 'TODO',
        "isDerived": False,
        "ncbiTaxonId": 9606,
        "sourceAccessions": [],
        "sourceUri": 'TODO',
    }
    referenceSetMetadataFilename = "{}.json".format(self.referenceSetName)
    dumpDictToFileAsJson(
        referenceSetMetadata, referenceSetMetadataFilename)
    # Download the chromosomes.
    mkdirAndChdirList([self.referenceSetName])
    cleanDir()
    baseUrl = 'http://www.ebi.ac.uk/ena/data/view/'
    for chromosome in self.chromosomes:
        accession = self.accessions[chromosome]
        path = os.path.join(baseUrl, accession)
        maxPos = self.chromMinMax.getMaxPos(chromosome)
        minPos = 0
        if self.excludeReferenceMin:
            minPos = self.chromMinMax.getMinPos(chromosome)
        args = urllib.urlencode({
            'display': 'fasta',
            'range': '{}-{}'.format(minPos, maxPos)})
        url = '{}%26{}'.format(path, args)
        tempFileName = '{}.fa.temp'.format(chromosome)
        fileName = '{}.fa'.format(chromosome)
        downloader = utils.HttpFileDownloader(url, tempFileName)
        downloader.download()
        # We need to replace the header on the downloaded FASTA.
        with open(tempFileName, "r") as inFasta, \
                open(fileName, "w") as outFasta:
            # Write the new header and skip the original one.
            print(">{}".format(chromosome), file=outFasta)
            inFasta.readline()
            for line in inFasta:
                print(line, file=outFasta, end="")
        os.unlink(tempFileName)
        utils.log("Compressing {}".format(fileName))
        utils.runCommand("bgzip {}".format(fileName))
        compressedFileName = fileName + '.gz'
        utils.log("Indexing {}".format(compressedFileName))
        utils.runCommand("samtools faidx {}".format(compressedFileName))
        # Assemble the metadata.
        metadata = {
            "md5checksum": getReferenceChecksum(compressedFileName),
            "sourceUri": url,
            "ncbiTaxonId": 9606,
            "isDerived": False,
            "sourceDivergence": None,
            "sourceAccessions": [accession + ".subset"],
        }
        metadataFilename = "{}.json".format(chromosome)
        dumpDictToFileAsJson(metadata, metadataFilename)
    escapeDir(3)
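# Worked sketch of the ENA URL construction used in downloadFastas();
# the accession and coordinate range below are illustrative values,
# not entries from self.accessions.
def buildEnaUrlExample():
    baseUrl = 'http://www.ebi.ac.uk/ena/data/view/'
    path = os.path.join(baseUrl, 'CM000663')
    args = urllib.urlencode({'display': 'fasta', 'range': '0-10000'})
    # As above, '%26' (a URL-encoded '&') joins the path and query.
    return '{}%26{}'.format(path, args)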
def __init__(self, inputDirectory, outputDirectory):
    """
    Converts the human-readable dataset from the compliance
    repository into a reference-server readable file hierarchy of
    binary files.
    :param inputDirectory: location of the human-readable compliance
        dataset
    :param outputDirectory: location of the file hierarchy suitable
        for deploying on the reference server
    """
    self.inputDirectory = inputDirectory
    self.outputDirectory = outputDirectory
    self.tempdir = None
    # If no input directory is specified, download the test data
    # from GitHub.
    if inputDirectory is None:
        utils.log("Downloading test data...")
        self.tempdir = tempfile.mkdtemp()
        assert os.path.exists(self.tempdir)
        url = "https://github.com/ga4gh/compliance/archive/master.zip"
        filePath = os.path.join(self.tempdir, 'compliance-master.zip')
        downloader = utils.HttpFileDownloader(url, filePath)
        downloader.download()
        utils.log("Extracting test data...")
        with zipfile.ZipFile(filePath, "r") as z:
            z.extractall(self.tempdir)
        self.inputDirectory = os.path.join(
            self.tempdir, 'compliance-master', 'test-data')
    # Get all the reference files (they have the .fa extension).
    self.referenceFiles = map(
        os.path.basename,
        glob.glob(os.path.join(self.inputDirectory, "*.fa")))
    self.refsetsDirectory = os.path.join(
        self.outputDirectory, "referenceSets")
    self.hg37Directory = os.path.join(self.refsetsDirectory, "hg37")
    # Datasets.
    self.datasetsDirectory = os.path.join(
        self.outputDirectory, "datasets")
    readFiles = map(os.path.basename, glob.glob(
        os.path.join(self.inputDirectory, "*.sam")))
    variantFiles = map(os.path.basename, glob.glob(
        os.path.join(self.inputDirectory, "*.vcf")))
    sequenceAnnotationFiles = map(os.path.basename, glob.glob(
        os.path.join(self.inputDirectory, "*.gff3")))
    self.datasets = list(set(
        p.split('_')[0] for p in
        readFiles + variantFiles + sequenceAnnotationFiles))
    self.datasetDirs = [
        os.path.join(self.outputDirectory, ds) for ds in self.datasets]
    # Create maps of arrays of files keyed by dataset.
    self.datasetReads = dict()
    self.datasetVariants = dict()
    self.datasetSequenceAnnotations = dict()
    for ds in self.datasets:
        self.datasetReads[ds] = [
            r for r in readFiles if r.startswith(ds)]
        self.datasetSequenceAnnotations[ds] = [
            sa for sa in sequenceAnnotationFiles if sa.startswith(ds)]
        # Variants themselves are split into groups, based on the
        # second part of the '_' split:
        self.datasetVariants[ds] = dict()
        # Only those variants inside this dataset.
        dsvlist = [v for v in variantFiles if v.startswith(ds)]
        # Create a nested dictionary keyed by group.
        for dsv in dsvlist:
            dsvGroup = dsv.split('_')[1]
            self.datasetVariants[ds][dsvGroup] = \
                self.datasetVariants[ds].get(dsvGroup, []) + [dsv]
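# Illustration of the filename-prefix grouping performed above: files
# are assumed to be named <dataset>_<group>_..., and the names below
# are made up for demonstration.
def groupVariantFilesExample():
    variantFiles = [
        'ds1_phase1_a.vcf', 'ds1_phase1_b.vcf', 'ds1_phase3_c.vcf']
    grouped = dict()
    for dsv in variantFiles:
        dsvGroup = dsv.split('_')[1]  # e.g. 'phase1'
        grouped[dsvGroup] = grouped.get(dsvGroup, []) + [dsv]
    # grouped == {'phase1': ['ds1_phase1_a.vcf', 'ds1_phase1_b.vcf'],
    #             'phase3': ['ds1_phase3_c.vcf']}
    return grouped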
def downloadData():
    # tarballPath is expected to be defined at module scope.
    url = ("https://github.com/ga4gh/server/releases/"
           "download/data/ga4gh-example-data-v3.2.tar")
    fileDownloader = utils.HttpFileDownloader(url, tarballPath)
    fileDownloader.download()
    utils.log("Downloading finished")
def downloadData():
    url = "http://www.well.ox.ac.uk/~jk/ga4gh-example-data.tar"
    fileDownloader = utils.HttpFileDownloader(url, tarballPath)
    fileDownloader.download()
    utils.log("Downloading finished")
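# Minimal sketch of calling either downloadData() variant above; both
# read the module-level tarballPath global, so it is set here first.
# The path is a placeholder.
def exampleDataSetup():
    global tarballPath
    tarballPath = "/tmp/ga4gh-example-data.tar"
    downloadData()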