def download_sra(job_id: int) -> None:
    """The main function for the SRA Downloader.

    Fairly straightforward, just downloads the file from SRA.
    """
    job = utils.start_job(job_id)

    original_files = job.original_files.all()
    original_file = original_files.first()
    sample = original_file.samples.first()

    downloaded_files = []
    success = None
    for original_file in original_files:
        exp_path = LOCAL_ROOT_DIR + "/" + job.accession_code
        samp_path = exp_path + "/" + sample.accession_code
        os.makedirs(exp_path, exist_ok=True)
        os.makedirs(samp_path, exist_ok=True)
        dl_file_path = samp_path + "/" + original_file.source_filename
        success = _download_file(original_file, job, dl_file_path)

        if success:
            original_file.set_downloaded(dl_file_path)

            # ENA's file-report endpoint only reports on .fastq files,
            # so we can only check expected md5/size_in_bytes for
            # those files.
            if ".fastq" in original_file.source_filename:
                md5_mismatch = (
                    original_file.expected_md5
                    and original_file.md5 != original_file.expected_md5
                )
                size_in_bytes_mismatch = (
                    original_file.expected_size_in_bytes
                    and original_file.size_in_bytes != original_file.expected_size_in_bytes
                )

                if md5_mismatch or size_in_bytes_mismatch:
                    success = False
                    job.failure_reason = "md5 or size_in_bytes didn't match"
                    logger.error(
                        job.failure_reason,
                        expected_md5=original_file.expected_md5,
                        actual_md5=original_file.md5,
                        expected_size_in_bytes=original_file.expected_size_in_bytes,
                        actual_size_in_bytes=original_file.size_in_bytes,
                    )
                    break

            downloaded_files.append(original_file)
        else:
            break

    if success:
        create_processor_job_for_original_files(downloaded_files, job)

    utils.end_downloader_job(job, success)

    return success, downloaded_files
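# The SRA variants in this section all delegate to a _download_file helper that
# is not shown here, and its argument order varies between variants. Below is a
# minimal sketch of what such a helper could look like, assuming the
# (download_url, job, target_path) ordering used by most variants and a plain
# HTTP/FTP source URL; the real helper may use a different transport and
# richer retry/error handling.
import shutil
import urllib.request


def _download_file(download_url: str, downloader_job, target_file_path: str) -> bool:
    """Download download_url to target_file_path, returning True on success.

    On failure the reason is recorded on the job, mirroring the
    job.failure_reason convention used by the callers above.
    """
    try:
        # Stream the response to disk so large .fastq/.sra files are never
        # held in memory.
        with urllib.request.urlopen(download_url) as response, \
                open(target_file_path, "wb") as target_file:
            shutil.copyfileobj(response, target_file)
    except Exception as e:
        downloader_job.failure_reason = "Exception caught while downloading file: " + str(e)
        logger.exception("Exception caught while downloading file.",
                         downloader_job=downloader_job.id,
                         download_url=download_url)
        return False

    return True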
def download_sra(job_id: int) -> None:
    """The main function for the SRA Downloader.

    Fairly straightforward, just downloads the file from SRA.
    """
    job = utils.start_job(job_id)

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(downloader_job=job)
    original_files = job.original_files.all()
    original_file = original_files[0]
    sample = original_file.samples.first()

    if _has_unmated_reads(sample.accession_code):
        original_files = _replace_dotsra_with_fastq_files(sample, job, original_file)
    else:
        # _replace_dotsra_with_fastq_files returns a list of
        # OriginalFiles so turn the queryset of
        # DownloaderJobOriginalFileAssociations into a list of
        # OriginalFiles to be consistent.
        original_files = [assoc.original_file for assoc in file_assocs]

    downloaded_files = []
    success = None
    for original_file in original_files:
        if original_file.is_downloaded:
            logger.info("File already downloaded!",
                        original_file_id=original_file.id,
                        downloader_job=job_id)
            success = True
            continue

        exp_path = LOCAL_ROOT_DIR + "/" + job.accession_code
        samp_path = exp_path + "/" + sample.accession_code
        os.makedirs(exp_path, exist_ok=True)
        os.makedirs(samp_path, exist_ok=True)
        dl_file_path = samp_path + "/" + original_file.source_filename
        success = _download_file(original_file.source_url, job, dl_file_path)

        if success:
            original_file.is_downloaded = True
            original_file.absolute_file_path = dl_file_path
            original_file.filename = original_file.source_filename
            original_file.is_archive = False
            original_file.calculate_size()
            original_file.calculate_sha1()
            original_file.save()

            downloaded_files.append(original_file)
        else:
            break

    if success:
        create_processor_job_for_original_files(downloaded_files, job)

    utils.end_downloader_job(job, success)

    return success, downloaded_files
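# _has_unmated_reads and _replace_dotsra_with_fastq_files are referenced above
# but not shown in this section. The sketch below illustrates one plausible way
# _has_unmated_reads could work, assuming it consults ENA's filereport endpoint
# (the same endpoint the md5/size comment in the first variant refers to) and
# that the requests library is available; the actual helper may differ.
import requests

ENA_FILEREPORT_URL = "https://www.ebi.ac.uk/ena/portal/api/filereport"


def _has_unmated_reads(run_accession: str) -> bool:
    """Return True if the run appears to have more than two mated .fastq files.

    A paired-end run normally reports exactly two fastq_ftp entries
    (_1 and _2); an extra entry usually holds unmated reads.
    """
    response = requests.get(
        ENA_FILEREPORT_URL,
        params={
            "accession": run_accession,
            "result": "read_run",
            "fields": "fastq_ftp",
            "format": "json",
        },
        timeout=60,
    )
    response.raise_for_status()

    report = response.json()
    if not report:
        return False

    # fastq_ftp is a ';'-separated list of FTP paths for the run.
    fastq_files = [path for path in report[0].get("fastq_ftp", "").split(";") if path]
    return len(fastq_files) > 2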
def download_sra(job_id: int) -> None:
    """The main function for the SRA Downloader.

    Fairly straightforward, just downloads the Batch's file from SRA
    and pushes it into Temporary Storage.
    """
    job = utils.start_job(job_id)
    batches = job.batches.all()
    success = True
    job_dir = utils.JOB_DIR_PREFIX + str(job_id)

    # There should only be one batch per SRA job.
    if batches.count() == 1:
        files = File.objects.filter(batch=batches[0])
        # All the files will be downloaded to the same directory
        target_directory = files[0].get_temp_dir(job_dir)
        os.makedirs(target_directory, exist_ok=True)
    elif batches.count() > 1:
        message = "More than one batch found for SRA downloader job. There should only be one."
        logger.error(message, downloader_job=job_id)
        job.failure_reason = message
        success = False
    else:
        message = "No batches found."
        logger.error(message, downloader_job=job_id)
        job.failure_reason = message
        success = False

    if success:
        for file in files:
            target_file_path = file.get_temp_pre_path(job_dir)
            success = _download_file(file, job, target_file_path)

            # If a download fails stop the job and fail gracefully.
            if not success:
                break

            try:
                file.size_in_bytes = os.path.getsize(target_file_path)
                file.save()
                file.upload_raw_file(job_dir)
            except Exception:
                logger.exception("Exception caught while uploading file.",
                                 downloader_job=job.id,
                                 batch=batches[0].id,
                                 file=file.id,
                                 file_name=file.name)
                job.failure_reason = "Exception caught while uploading file."
                success = False
                break

    if success:
        logger.debug("Files for batch %s downloaded and extracted successfully.",
                     file.download_url,
                     downloader_job=job_id)

    utils.end_job(job, batches, success)
def download_transcriptome(job_id: int) -> None:
    """The main function for the Transcriptome Index Downloader.

    Two files are needed for the Transcriptome Index Downloader: a
    fasta file and a gtf file. However each pair needs to be processed
    into two different sized indices. (See the
    processors.transcriptome_index._create_index function's docstring
    for more info.) Therefore we only download each set once, then let
    each processor find it in the same location.
    """
    job = utils.start_job(job_id)
    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(downloader_job=job)

    files_to_process = []
    for assoc in file_assocs:
        original_file = assoc.original_file

        if original_file.is_archive:
            filename_species = "".join(original_file.source_filename.split(".")[:-2])
        else:
            # Does this ever happen?
            filename_species = "".join(original_file.source_filename.split(".")[:-1])

        os.makedirs(LOCAL_ROOT_DIR + "/" + filename_species, exist_ok=True)
        dl_file_path = LOCAL_ROOT_DIR + "/" + filename_species + "/" + original_file.source_filename
        job = _download_file(original_file.source_url, dl_file_path, job)

        if not job.success:
            break

        original_file.is_downloaded = True
        original_file.absolute_file_path = dl_file_path
        original_file.filename = original_file.source_filename
        original_file.is_archive = True
        original_file.has_raw = True
        original_file.calculate_size()
        original_file.calculate_sha1()
        original_file.save()

        files_to_process.append(original_file)

    if job.success:
        logger.debug("Files downloaded successfully.", downloader_job=job_id)
        create_long_and_short_processor_jobs(files_to_process)

    utils.end_downloader_job(job, job.success)
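# A quick worked example of the filename_species derivation above, using an
# illustrative archive name (not taken from this section): for a file named
# "Danio_rerio.fa.gz",
#
#   "Danio_rerio.fa.gz".split(".")         -> ["Danio_rerio", "fa", "gz"]
#   [:-2] drops the two archive suffixes   -> ["Danio_rerio"]
#   "".join(...)                           -> "Danio_rerio"
#
# so the file is downloaded into LOCAL_ROOT_DIR/Danio_rerio/, and any other
# file of the set that reduces to the same name lands in that same directory,
# which is how each processor can find the whole set in one location.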
def download_sra(job_id: int) -> None:
    """The main function for the SRA Downloader.

    Fairly straightforward, just downloads the file from SRA.
    """
    job = utils.start_job(job_id)

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(downloader_job=job)

    downloaded_files = []
    success = None
    for assoc in file_assocs:
        original_file = assoc.original_file

        if original_file.is_downloaded:
            logger.info("File already downloaded!",
                        original_file_id=original_file.id,
                        downloader_job=job_id)
            success = True
            continue

        sample_accession_code = original_file.samples.first().accession_code
        exp_path = LOCAL_ROOT_DIR + '/' + job.accession_code
        samp_path = exp_path + '/' + sample_accession_code
        os.makedirs(exp_path, exist_ok=True)
        os.makedirs(samp_path, exist_ok=True)
        dl_file_path = samp_path + '/' + original_file.source_filename
        success = _download_file(original_file.source_url, job, dl_file_path)

        if success:
            original_file.is_downloaded = True
            original_file.absolute_file_path = dl_file_path
            original_file.filename = original_file.source_filename
            original_file.is_archive = False
            original_file.calculate_size()
            original_file.calculate_sha1()
            original_file.save()

            downloaded_files.append(original_file)
        else:
            break

    if success:
        utils.create_processor_job_for_original_files(downloaded_files, job)

    utils.end_downloader_job(job, success)

    return success, downloaded_files
def download_array_express(job_id: int) -> None:
    """The main function for the Array Express Downloader.

    Downloads a single zip file containing the .PCL files representing
    samples relating to a single experiment stored in ArrayExpress.
    Each of these files is a separate Batch, so the file is unzipped
    and then each Batch's data is stored in Temporary Storage.
    """
    job = utils.start_job(job_id)
    batches = job.batches.all()
    success = True
    job_dir = utils.JOB_DIR_PREFIX + str(job_id)

    if batches.count() > 0:
        files = File.objects.filter(batch__in=batches)
        target_directory = files[0].get_temp_dir(job_dir)
        os.makedirs(target_directory, exist_ok=True)
        target_file_path = files[0].get_temp_download_path(job_dir)
        download_url = files[0].download_url
    else:
        logger.error("No batches found.", downloader_job=job_id)
        success = False

    if success:
        try:
            _verify_batch_grouping(files, job)

            # The files for all of the batches in the grouping are
            # contained within the same zip file. Therefore only
            # download the one.
            _download_file(download_url, target_file_path, job)
            _extract_file(files, job)
        except Exception:
            # Exceptions are already logged and handled.
            # Just need to mark the job as failed.
            success = False

    if success:
        logger.debug("File %s downloaded and extracted successfully.",
                     download_url,
                     downloader_job=job_id)

    utils.end_job(job, batches, success)
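# _extract_file is referenced above but not shown in this section. A minimal
# sketch of what it could look like, assuming the archive sits at the first
# File's temp download path, that each File.name matches a member of the zip,
# and that every File should end up at its own get_temp_pre_path location; the
# real helper likely differs in details and error handling.
import shutil
import zipfile


def _extract_file(files, job) -> None:
    """Extract each File's entry out of the shared zip archive."""
    job_dir = utils.JOB_DIR_PREFIX + str(job.id)
    zip_path = files[0].get_temp_download_path(job_dir)

    try:
        with zipfile.ZipFile(zip_path) as zip_file:
            for file in files:
                # Stream each member directly to its expected pre-upload path.
                with zip_file.open(file.name) as member, \
                        open(file.get_temp_pre_path(job_dir), "wb") as target:
                    shutil.copyfileobj(member, target)
    except Exception:
        logger.exception("Exception caught while extracting %s", zip_path,
                         downloader_job=job.id)
        job.failure_reason = "Exception caught while extracting " + zip_path
        raise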
def download_sra(job_id: int) -> None:
    """The main function for the SRA Downloader.

    Fairly straightforward, just downloads the file from SRA.
    """
    job = utils.start_job(job_id)

    original_files = job.original_files.all()
    original_file = original_files.first()
    sample = original_file.samples.first()

    if _has_unmated_reads(sample.accession_code):
        original_files = _replace_dotsra_with_fastq_files(sample, job, original_file)

    downloaded_files = []
    success = None
    for original_file in original_files:
        exp_path = LOCAL_ROOT_DIR + "/" + job.accession_code
        samp_path = exp_path + "/" + sample.accession_code
        os.makedirs(exp_path, exist_ok=True)
        os.makedirs(samp_path, exist_ok=True)
        dl_file_path = samp_path + "/" + original_file.source_filename
        success = _download_file(original_file.source_url, job, dl_file_path)

        if success:
            original_file.set_downloaded(dl_file_path)
            downloaded_files.append(original_file)
        else:
            break

    if success:
        create_processor_job_for_original_files(downloaded_files, job)

    utils.end_downloader_job(job, success)

    return success, downloaded_files
def download_array_express(job_id: int) -> None:
    """The main function for the Array Express Downloader.

    Downloads a single zip file containing the .PCL files representing
    samples relating to a single experiment stored in ArrayExpress.
    """
    job = utils.start_job(job_id)
    success = True

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(downloader_job=job)
    # AE will have multiple files per DownloaderJob, but they are all
    # pieces of the same zip file so they're all referencing the same
    # URL.
    original_file = file_assocs[0].original_file
    url = original_file.source_url
    accession_code = job.accession_code

    # First, get all the unique sample archive URLs.
    # There may be more than one!
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!
    og_files = []

    # The files for all of the samples are
    # contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True)

    # Add a timestamp in milliseconds to filename to prevent multiple
    # jobs from using the same file.
    filename = url.split('/')[-1] + "." + str(int(time.time() * 1000))
    dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + filename + ".zip"
    _download_file(url, dl_file_path, job)

    extracted_files = _extract_files(dl_file_path, accession_code, job)

    for og_file in extracted_files:
        try:
            original_file = OriginalFile.objects.get(
                source_filename=og_file['filename'],
                source_url=original_file.source_url)
            original_file.is_downloaded = True
            original_file.is_archive = False
            original_file.absolute_file_path = og_file['absolute_path']
            original_file.filename = og_file['absolute_path'].split('/')[-1]
            original_file.calculate_size()
            original_file.save()
            original_file.calculate_sha1()
            og_files.append(original_file)
        except Exception:
            # The suspicion is that there are extra files related to
            # another experiment, that we don't want associated with
            # this one.
            logger.debug("Found a file we didn't have an OriginalFile for! Why did this happen?: "
                         + og_file['filename'],
                         downloader_job=job_id)
            os.remove(og_file["absolute_path"])
            continue

        sample_objects = Sample.objects.filter(originalfile=original_file).order_by('created_at')
        if sample_objects.count() > 1:
            logger.warn("Found an Array Express OriginalFile with more than one sample: %s",
                        filename,
                        downloader_job=job_id)

        # If the file is a .CEL file, it is the ultimate
        # source of truth about the sample's platform.
        sample_object = sample_objects[0]
        if og_file["filename"].upper()[-4:] == ".CEL" and sample_object.has_raw:
            cel_file_platform = None
            platform_accession_code = "UNSUPPORTED"
            try:
                cel_file_platform = microarray.get_platform_from_CEL(
                    original_file.absolute_file_path)

                for platform in get_supported_microarray_platforms():
                    if platform["platform_accession"] == cel_file_platform:
                        platform_accession_code = platform["platform_accession"]
            except Exception as e:
                platform_accession_code = "UNDETERMINABLE"
                logger.warn("Unable to determine platform from CEL file: "
                            + original_file.absolute_file_path,
                            downloader_job=job_id)

            if platform_accession_code == "UNSUPPORTED":
                logger.error("Found a raw .CEL file with an unsupported platform!",
                             file_name=original_file.absolute_file_path,
                             sample=sample_object.id,
                             downloader_job=job_id,
                             cel_file_platform=cel_file_platform)
                job.failure_reason = ("Found a raw .CEL file with an unsupported platform: "
                                      + original_file.absolute_file_path + " ("
                                      + str(cel_file_platform) + ")")
                job.no_retry = True
                success = False

                # The file is unsupported, delete it!
                original_file.delete_local_file()
                original_file.delete()
            elif platform_accession_code == "UNDETERMINABLE":
                # If we cannot determine the platform from the
                # .CEL file, the platform discovered via metadata
                # may be correct so just leave it be.
                pass
            else:
                # We determined the file was collected with a supported Affymetrix platform.
                sample_object.platform_accession_code = platform_accession_code
                sample_object.platform_name = get_readable_affymetrix_names()[
                    platform_accession_code]

            # However, if the filename contains '.CEL' we know
            # it's an Affymetrix Microarray
            sample_object.technology = "MICROARRAY"
            sample_object.manufacturer = "AFFYMETRIX"
            sample_object.save()

    if success:
        logger.debug("File downloaded and extracted successfully.",
                     url=url,
                     downloader_job=job_id)
        utils.create_processor_jobs_for_original_files(og_files, job)

    utils.end_downloader_job(job, success)
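# A small illustration of the timestamped download path built above (all
# values are made up): for a url ending in "E-MEXP-433.raw.1.zip" downloaded
# at time.time() == 1500000000.0 for accession code "E-MEXP-433",
#
#   filename     == "E-MEXP-433.raw.1.zip.1500000000000"
#   dl_file_path == LOCAL_ROOT_DIR + "/E-MEXP-433/E-MEXP-433.raw.1.zip.1500000000000.zip"
#
# so two jobs downloading the same archive concurrently never write to the
# same local file.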
def test_dharma(self):
    dlj1 = DownloaderJob()
    dlj1.accession_code = 'D1'
    dlj1.worker_id = get_instance_id()
    dlj1.start_time = datetime.datetime.now()
    dlj1.save()

    dlj2 = DownloaderJob()
    dlj2.accession_code = 'D2'
    dlj2.worker_id = get_instance_id()
    dlj2.start_time = datetime.datetime.now()
    dlj2.save()

    dlj3 = DownloaderJob()
    dlj3.accession_code = 'D3'
    dlj3.worker_id = get_instance_id()
    dlj3.save()

    original_file = OriginalFile()
    original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip"
    original_file.source_filename = "Waldhof_020604_R30_01-2753_U133A.CEL"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj3
    assoc.save()

    sample = Sample()
    sample.accession_code = 'Blahblahblah'
    sample.technology = "MICROARRAY"
    sample.manufacturer = "AFFYMETRIX"
    sample.has_raw = True
    sample.platform_accession_code = "hgu133a"
    sample.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        sample=sample, original_file=original_file)

    exited = False
    try:
        utils.start_job(dlj3.id, max_downloader_jobs_per_node=2, force_harakiri=True)
    except SystemExit as e:
        # This is supposed to happen!
        self.assertTrue(True)
        exited = True
    except Exception as e:
        # This isn't!
        self.assertTrue(False)
    self.assertTrue(exited)

    exited = False
    try:
        utils.start_job(dlj3.id, max_downloader_jobs_per_node=15, force_harakiri=True)
    except SystemExit as e:
        # This is not supposed to happen!
        self.assertTrue(False)
        exited = True
    except Exception as e:
        # This is!
        self.assertTrue(True)
    self.assertFalse(exited)
def download_geo(job_id: int) -> None:
    """The main function for the GEO Downloader.

    Downloads a single tar file containing the files representing
    samples relating to a single experiment stored in GEO.
    """
    job = utils.start_job(job_id)

    accession_code = job.accession_code
    original_file = job.original_files.first()

    if not original_file:
        job.failure_reason = "No files associated with the job."
        logger.error("No files associated with the job.", downloader_job=job_id)
        utils.end_downloader_job(job, success=False)
        return

    url = original_file.source_url
    related_samples = original_file.samples.exclude(technology="RNA-SEQ")

    # First, download the sample archive URL.
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    # The files for all of the samples are contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + "/" + accession_code, exist_ok=True)
    dl_file_path = LOCAL_ROOT_DIR + "/" + accession_code + "/" + url.split("/")[-1]

    logger.debug("Starting to download: " + url, job_id=job_id, accession_code=accession_code)
    _download_file(url, dl_file_path, job)
    original_file.absolute_file_path = dl_file_path
    original_file.is_downloaded = True
    original_file.save()

    unpacked_sample_files = []

    try:
        # enumerate all files inside the archive
        archived_files = list(ArchivedFile(dl_file_path).get_files())
    except FileExtractionError as e:
        job.failure_reason = e
        logger.exception(
            "Error occurred while extracting file.", path=dl_file_path, exception=str(e)
        )
        utils.end_downloader_job(job, success=False)
        return

    for og_file in archived_files:
        sample = og_file.get_sample()

        # We don't want RNA-Seq data from GEO:
        # https://github.com/AlexsLemonade/refinebio/issues/966
        if sample and sample.technology == "RNA-SEQ":
            logger.warn("RNA-Seq sample found in GEO downloader job.", sample=sample)
            continue

        if not sample and (
            not og_file.is_processable() or og_file.experiment_accession_code() != accession_code
        ):
            # Skip the files that we know are not processable and can't be
            # associated with a sample, and also skip the files where we
            # couldn't find a sample and they don't mention the current
            # experiment.
            continue

        potential_existing_file = OriginalFile.objects.filter(
            source_filename=original_file.source_filename,
            filename=og_file.filename,
            is_archive=False,
        ).first()
        if potential_existing_file:
            # We've already created this record, let's see if we actually
            # needed to download it or if we just got it because we needed
            # a file in the same archive.
            if potential_existing_file.needs_processing():
                if not potential_existing_file.is_downloaded:
                    potential_existing_file.is_downloaded = True
                    potential_existing_file.save()

                unpacked_sample_files.append(potential_existing_file)
            continue

        # Then this is a new file and we should create an original file for it
        actual_file = OriginalFile()
        actual_file.is_downloaded = True
        actual_file.is_archive = False
        actual_file.absolute_file_path = og_file.file_path
        actual_file.filename = og_file.filename
        actual_file.calculate_size()
        actual_file.calculate_sha1()
        actual_file.has_raw = True
        actual_file.source_url = original_file.source_url
        actual_file.source_filename = original_file.source_filename
        actual_file.save()

        # try to see if the file should be associated with a sample
        if sample:
            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.sample = sample
            original_file_sample_association.original_file = actual_file
            original_file_sample_association.save()
        else:
            # if not, we can associate this file with all samples in the experiment
            for sample in related_samples:
                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

        unpacked_sample_files.append(actual_file)

    if len(unpacked_sample_files) > 0:
        success = True
        logger.debug(
            "File downloaded and extracted successfully.",
            url=url,
            dl_file_path=dl_file_path,
            downloader_job=job_id,
        )
    else:
        success = False
        logger.info(
            "Unable to extract any files.",
            url=url,
            dl_file_path=dl_file_path,
            downloader_job=job_id,
        )
        job.failure_reason = "Failed to extract any downloaded files."

    if success:
        create_processor_jobs_for_original_files(unpacked_sample_files, job)

    if original_file.is_archive:
        original_file.delete_local_file()

    utils.end_downloader_job(job, success)

    return success
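# The loop above relies on a small ArchivedFile interface (and on
# FileExtractionError) that is not shown in this section. Reconstructed purely
# from the call sites in this function, the assumed shape is roughly the stub
# below; the real class certainly does more than this.
class ArchivedFile:
    filename: str    # basename of the unpacked member
    file_path: str   # absolute path of the unpacked member on disk

    def __init__(self, archive_path: str): ...

    def get_files(self):
        """Recursively yield an ArchivedFile for every unpacked member,
        raising FileExtractionError if the archive cannot be unpacked."""

    def get_sample(self):
        """Return the Sample this member belongs to, or None if it cannot
        be matched to one."""

    def is_processable(self) -> bool:
        """Whether the member looks like data a processor could use."""

    def experiment_accession_code(self) -> str:
        """The experiment accession this member appears to belong to."""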
def download_transcriptome(job_id: int) -> None:
    """The main function for the Transcriptome Index Downloader.

    Two files are needed for the Transcriptome Index Downloader: a
    fasta file and a gtf file. However each pair needs to be processed
    into two different sized indices. (See the
    processors.transcriptome_index._create_index function's docstring
    for more info.) Therefore we only download each set once, then let
    each processor find it in the same location.
    """
    job = utils.start_job(job_id)
    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(downloader_job=job)

    long_files_to_process = []
    short_files_to_process = []
    for assoc in file_assocs:
        long_original_file = assoc.original_file

        if long_original_file.is_archive:
            filename_species = "".join(long_original_file.source_filename.split(".")[:-2])
        else:
            # Does this ever happen?
            filename_species = "".join(long_original_file.source_filename.split(".")[:-1])

        # First download the files and make the original files for the
        # long transcriptome index.
        long_dir = os.path.join(LOCAL_ROOT_DIR, filename_species + "_long")
        os.makedirs(long_dir, exist_ok=True)
        long_dl_file_path = os.path.join(long_dir, long_original_file.source_filename)
        job = _download_file(long_original_file.source_url, long_dl_file_path, job)

        if not job.success:
            break

        long_original_file.is_downloaded = True
        long_original_file.absolute_file_path = long_dl_file_path
        long_original_file.filename = long_original_file.source_filename
        long_original_file.has_raw = True
        long_original_file.calculate_size()
        long_original_file.calculate_sha1()
        long_original_file.save()

        long_files_to_process.append(long_original_file)

        # Next copy the files to another directory and create the
        # original files for the short transcriptome index.
        short_dir = os.path.join(LOCAL_ROOT_DIR, filename_species + "_short")
        os.makedirs(short_dir, exist_ok=True)
        short_dl_file_path = os.path.join(short_dir, long_original_file.source_filename)
        shutil.copyfile(long_dl_file_path, short_dl_file_path)

        short_original_file = OriginalFile(
            source_filename=long_original_file.source_filename,
            source_url=long_original_file.source_url,
            is_downloaded=True,
            absolute_file_path=short_dl_file_path,
            filename=long_original_file.filename,
            has_raw=True,
        )
        short_original_file.calculate_size()
        short_original_file.calculate_sha1()
        short_original_file.save()

        short_files_to_process.append(short_original_file)

    if job.success:
        logger.debug("Files downloaded successfully.", downloader_job=job_id)
        create_long_and_short_processor_jobs(job, long_files_to_process, short_files_to_process)

    utils.end_downloader_job(job, job.success)
def download_geo(job_id: int) -> None:
    """The main function for the GEO Downloader.

    Downloads a single tar file containing the files representing
    samples relating to a single experiment stored in GEO.
    """
    job = utils.start_job(job_id)

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(downloader_job=job)
    original_file = file_assocs[0].original_file
    url = original_file.source_url
    accession_code = job.accession_code

    sample_assocs = OriginalFileSampleAssociation.objects.filter(original_file=original_file)
    related_samples = Sample.objects.filter(id__in=sample_assocs.values('sample_id'))

    # First, download the sample archive URL.
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    # The files for all of the samples are
    # contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True)
    dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + url.split('/')[-1]

    logger.debug("Starting to download: " + url,
                 job_id=job_id,
                 accession_code=accession_code)
    _download_file(url, dl_file_path, job)
    original_file.absolute_file_path = dl_file_path
    original_file.is_downloaded = True
    original_file.save()

    has_raw = True
    unpacked_sample_files = []

    # These files are tarred, and also subsequently gzipped
    if '.tar' in dl_file_path:
        try:
            extracted_files = _extract_tar(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occurred while extracting tar file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:
            filename = og_file['filename']
            if '_' in filename:
                sample_id = filename.split('_')[0]
            else:
                sample_id = filename.split('.')[0]

            try:
                sample = Sample.objects.get(accession_code=sample_id)
            except Exception as e:
                # We don't have this sample, but it's not a total failure. This happens.
                continue

            try:
                # Files from the GEO supplemental file are gzipped inside of the tarball. Great!
                archive_file = OriginalFile.objects.get(source_filename__contains=sample_id)
                archive_file.is_downloaded = True
                archive_file.is_archive = True
                archive_file.absolute_file_path = og_file['absolute_path']
                archive_file.calculate_size()
                archive_file.calculate_sha1()
                archive_file.save()

                if '.gz' in og_file['filename']:
                    extracted_subfile = _extract_gz(og_file['absolute_path'], accession_code)
                else:
                    extracted_subfile = [og_file]

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = extracted_subfile[0]['absolute_path']
                actual_file.filename = extracted_subfile[0]['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = True
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

                archive_file.delete_local_file()
                archive_file.is_downloaded = False
                archive_file.save()

                unpacked_sample_files.append(actual_file)
            except Exception as e:
                # TODO - is this worth failing a job for?
                logger.debug("Found a file we didn't have an OriginalFile for! Why did this happen?: "
                             + og_file['filename'],
                             exc_info=1,
                             file=og_file['filename'],
                             sample_id=sample_id,
                             accession_code=accession_code)
                # If we don't know why we have it, get rid of it.
                os.remove(og_file["absolute_path"])

    # This is a .tgz file.
    elif '.tgz' in dl_file_path:
        # If this is the MINiML file, it has been preprocessed
        if '_family.xml.tgz' in dl_file_path:
            has_raw = False

        try:
            extracted_files = _extract_tgz(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occurred while extracting tgz file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:
            if '.txt' in og_file['filename']:
                try:
                    gsm_id = og_file['filename'].split('-')[0]
                    sample = Sample.objects.get(accession_code=gsm_id)
                except Exception as e:
                    os.remove(og_file["absolute_path"])
                    continue

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = og_file['absolute_path']
                actual_file.filename = og_file['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = has_raw
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

                unpacked_sample_files.append(actual_file)

    # These files are only gzipped.
    # These are generally the _actually_ raw (rather than the non-raw data in a RAW file) data
    elif '.gz' in dl_file_path:
        try:
            extracted_files = _extract_gz(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occurred while extracting gz file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:
            filename = og_file['filename']
            sample_id = filename.split('.')[0]

            try:
                # The archive we downloaded
                archive_file = OriginalFile.objects.get(source_filename__contains=filename)
                archive_file.is_downloaded = True
                archive_file.is_archive = True
                archive_file.absolute_file_path = dl_file_path
                archive_file.calculate_size()
                archive_file.calculate_sha1()
                archive_file.save()

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = og_file['absolute_path']
                actual_file.filename = og_file['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = True
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                for sample in related_samples:
                    new_association = OriginalFileSampleAssociation()
                    new_association.original_file = actual_file
                    new_association.sample = sample
                    new_association.save()

                archive_file.delete_local_file()
                archive_file.is_downloaded = False
                archive_file.save()

                unpacked_sample_files.append(actual_file)
            except Exception as e:
                logger.debug("Found a file we didn't have an OriginalFile for! Why did this happen?: "
                             + og_file['filename'],
                             exc_info=1,
                             file=og_file['filename'],
                             sample_id=sample_id,
                             accession_code=accession_code)
                os.remove(og_file["absolute_path"])

    # This is probably just a .txt file
    else:
        filename = dl_file_path.split('/')[-1]
        sample_id = filename.split('_')[0]

        actual_file = OriginalFile()
        actual_file.is_downloaded = True
        actual_file.is_archive = False
        actual_file.absolute_file_path = dl_file_path
        actual_file.filename = filename
        actual_file.calculate_size()
        actual_file.calculate_sha1()
        actual_file.has_raw = True
        actual_file.source_url = original_file.source_url
        actual_file.source_filename = original_file.source_filename
        actual_file.save()

        for sample in related_samples:
            new_association = OriginalFileSampleAssociation()
            new_association.original_file = actual_file
            new_association.sample = sample
            new_association.save()

        unpacked_sample_files.append(actual_file)

    if len(unpacked_sample_files) > 0:
        success = True
        logger.debug("File downloaded and extracted successfully.",
                     url=url,
                     dl_file_path=dl_file_path,
                     downloader_job=job_id)
    else:
        success = False
        logger.info("Unable to extract any files.",
                    url=url,
                    dl_file_path=dl_file_path,
                    downloader_job=job_id)
        job.failure_reason = "Failed to extract any downloaded files."

    if success:
        utils.create_processor_jobs_for_original_files(unpacked_sample_files, job)

    if original_file.is_archive:
        original_file.delete_local_file()

    utils.end_downloader_job(job, success)

    return success
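# _extract_tar, _extract_tgz and _extract_gz are referenced above but not
# shown in this section. A minimal sketch of what _extract_tar could look
# like, assuming it unpacks into LOCAL_ROOT_DIR/<accession_code>/ and returns
# the list-of-dicts shape (with 'filename' and 'absolute_path' keys) that the
# loops above consume; the real helpers may differ.
import os
import tarfile


def _extract_tar(file_path: str, accession_code: str) -> list:
    """Extract a tar archive and describe each extracted member."""
    extraction_dir = LOCAL_ROOT_DIR + '/' + accession_code
    extracted_files = []

    with tarfile.open(file_path) as tar:
        tar.extractall(path=extraction_dir)
        for member in tar.getmembers():
            if not member.isfile():
                continue
            extracted_files.append({
                'filename': os.path.basename(member.name),
                'absolute_path': os.path.join(extraction_dir, member.name),
            })

    return extracted_files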
def download_transcriptome(job_id: int) -> None:
    """The main function for the Transcriptome Index Downloader.

    Two files are needed for the Transcriptome Index Downloader: a
    fasta file and a gtf file. However each pair needs to be processed
    into two different sized indices. (See the
    processors.transcriptome_index._create_index function's docstring
    for more info.) Therefore we only download each set once, then push
    it to Temporary Storage twice.
    """
    job = utils.start_job(job_id)
    batches = job.batches.all()
    success = True
    job_dir = utils.JOB_DIR_PREFIX + str(job_id)

    try:
        first_fasta_file = File.objects.get(batch=batches[0], raw_format__exact="fa.gz")
        first_gtf_file = File.objects.get(batch=batches[0], raw_format__exact="gtf.gz")
        second_fasta_file = File.objects.get(batch=batches[1], raw_format__exact="fa.gz")
        second_gtf_file = File.objects.get(batch=batches[1], raw_format__exact="gtf.gz")
        os.makedirs(first_fasta_file.get_temp_dir(job_dir), exist_ok=True)
    except Exception:
        logger.exception("Failed to retrieve all expected files from database.",
                         downloader_job=job.id)
        job.failure_reason = "Failed to retrieve all expected files from database."
        success = False

    if success:
        try:
            _verify_files(first_fasta_file, second_fasta_file, job)
            _verify_files(first_gtf_file, second_gtf_file, job)

            # The two Batches share the same fasta and gtf files, so
            # only download each one once
            _download_file(first_fasta_file.download_url,
                           first_fasta_file.get_temp_pre_path(job_dir),
                           job)
            _download_file(first_gtf_file.download_url,
                           first_gtf_file.get_temp_pre_path(job_dir),
                           job)

            # Then create symlinks so the files for the second Batch
            # can be found where they will be expected to.
            try:
                os.symlink(first_fasta_file.get_temp_pre_path(job_dir),
                           second_fasta_file.get_temp_pre_path(job_dir))
                os.symlink(first_gtf_file.get_temp_pre_path(job_dir),
                           second_gtf_file.get_temp_pre_path(job_dir))
            except Exception:
                logger.exception("Exception caught while creating symlinks.",
                                 downloader_job=job.id)
                job.failure_reason = "Exception caught while creating symlinks."
                raise

            _upload_files(job_dir,
                          [first_fasta_file, first_gtf_file, second_fasta_file, second_gtf_file],
                          job)
        except Exception:
            # Exceptions are already logged and handled.
            # Just need to mark the job as failed.
            success = False

    if success:
        logger.debug("Files %s and %s downloaded successfully.",
                     first_fasta_file,
                     first_gtf_file,
                     downloader_job=job_id)

    utils.end_job(job, batches, success)
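# _verify_files is referenced above but not shown in this section. Given the
# comment that the two Batches share the same fasta and gtf files, a plausible
# sketch is a check that the two File records really do point at the same
# source, failing the job otherwise; the real helper may check more than this.
def _verify_files(file1, file2, job) -> None:
    """Raise if the two File records do not reference the same download URL."""
    if file1.download_url != file2.download_url:
        failure_message = ("A Transcriptome Index's two batches have files with "
                           "different download URLs: {} and {}.").format(
                               file1.download_url, file2.download_url)
        logger.error(failure_message, downloader_job=job.id)
        job.failure_reason = failure_message
        raise ValueError(failure_message)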