def test_multiple_batches(self, mock_download_file):
    """download_sra fails a job that is linked to more than one batch."""
    # Stub the download so that a regression here fails fast instead of
    # waiting on a real (slow) file transfer.
    mock_download_file.return_value = True

    first_batch, _ = self.insert_objects()

    second_batch_fields = dict(
        survey_job=self.survey_job,
        source_type="SRA",
        pipeline_required="SALMON",
        platform_accession_code="IlluminaHiSeq2000",
        experiment_accession_code="DRX001564",
        experiment_title="It doesn't really matter.",
        organism_id=9031,
        organism_name="GALLUS GALLUS",
        release_date="2013-07-19",
        last_uploaded_date="2017-09-11",
        status=BatchStatuses.NEW.value,
    )
    second_batch = Batch(**second_batch_fields)
    second_batch.save()

    job = DownloaderJob.create_job_and_relationships(
        batches=[first_batch, second_batch], downloader_task="dummy")
    job.save()

    sra.download_sra(job.id)

    # Re-read the job from the database to see what the downloader stored.
    expected_reason = ("More than one batch found for SRA downloader job. "
                       "There should only be one.")
    finished_job = DownloaderJob.objects.get(id=job.id)
    self.assertFalse(finished_job.success)
    self.assertEqual(finished_job.failure_reason, expected_reason)
def test_upload_fails(self,
                      mock_download_file,
                      mock_upload_raw_file,
                      mock_getsize):
    """An exception raised while uploading marks the downloader job failed.

    The download and the size lookup are mocked so only the upload step
    can fail; the upload mock raises on its first call.
    """
    # We don't actually want to download anything and we're
    # testing this function separately anyway.
    mock_download_file.return_value = True
    mock_getsize.return_value = 1337

    def raise_exception(job_dir):
        raise Exception("We're testing that this fails.")

    mock_upload_raw_file.side_effect = raise_exception

    batch, _ = self.insert_objects()
    downloader_job = DownloaderJob.create_job_and_relationships(
        batches=[batch], downloader_task="dummy")
    downloader_job.save()

    sra.download_sra(downloader_job.id)

    downloader_job.refresh_from_db()
    self.assertFalse(downloader_job.success)
    # assertEqual rather than the deprecated assertEquals alias, which no
    # longer exists on Python 3.12+.
    self.assertEqual(downloader_job.failure_reason,
                     "Exception caught while uploading file.")
    # The upload must have been attempted exactly once before giving up.
    self.assertEqual(len(mock_upload_raw_file.mock_calls), 1)
def test_verification_failure(self,
                              _upload_files,
                              _download_file,
                              mock_send_job):
    """A mismatching download URL fails the job before any download."""
    mock_send_job.return_value = None

    batches = self.insert_objects()

    # Point one file at a different URL so that the verification step
    # rejects the grouping.
    batches[0].files[0].download_url = "https://wompwomp.com"
    batches[0].files[0].save()

    job = DownloaderJob.create_job_and_relationships(batches=batches)

    # Run the downloader under test.
    transcriptome_index.download_transcriptome(job.id)

    # Nothing past verification may have happened.
    for untouched_mock in (_download_file, _upload_files, mock_send_job):
        untouched_mock.assert_not_called()

    # The stored job must record the failure and its timing.
    expected_reason = ("A Batch's file doesn't have the same download "
                       "URL as the other batch's file.")
    stored_job = DownloaderJob.objects.get()
    self.assertFalse(stored_job.success)
    self.assertIsNotNone(stored_job.start_time)
    self.assertIsNotNone(stored_job.end_time)
    self.assertEqual(stored_job.failure_reason, expected_reason)
def handle(self, *args, **options):
    """Create dummy ArrayExpress records and dispatch a downloader job.

    Saves a SurveyJob, a Batch and a File, creates a DownloaderJob
    linked to that batch and sends it to the ARRAY_EXPRESS downloader.
    ``args`` and ``options`` are accepted but not used.
    """
    # Create all the dummy data that would have been created
    # before a downloader job could have been generated.
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-141",
        experiment_accession_code="E-GEOD-59071",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        # Taxonomy ID 9606 is Homo sapiens; the name must agree with it.
        organism_name="HOMO SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.NEW.value,
    )
    batch.save()

    cel_file = File(
        batch=batch,
        size_in_bytes=0,
        download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
        raw_format="CEL",
        processed_format="PCL",
        name="GSM1426072_CD_colon_active_2.CEL",
        internal_location="A-AFFY-141/AFFY_TO_PCL",
    )
    cel_file.save()

    downloader_job = DownloaderJob.create_job_and_relationships(
        batches=[batch])
    send_job(Downloaders["ARRAY_EXPRESS"], downloader_job.id)
def test_good_batch_grouping(self):
    """_verify_batch_grouping returns None for the unmodified fixture files."""
    batches, files = self.insert_objects()
    job = DownloaderJob.create_job_and_relationships(
        batches=batches, downloader_task="dummy")

    verification_result = array_express._verify_batch_grouping(files, job)

    self.assertIsNone(verification_result)
def test_good_file_grouping(self):
    """_verify_files returns None for two files sharing a download_url."""
    job = DownloaderJob.create_job_and_relationships(
        batches=[], downloader_task="dummy")

    first_file = File(download_url="a")
    second_file = File(download_url="a")
    verification_result = transcriptome_index._verify_files(
        first_file, second_file, job)

    self.assertIsNone(verification_result)
def test_bad_file_grouping(self):
    """_verify_files raises ValueError when the download_urls differ."""
    downloader_job = DownloaderJob.create_job_and_relationships(
        batches=[], downloader_task="dummy")

    # Only the raise is under test here. Wrapping the call in another
    # assertion would never be evaluated on the expected path, so the
    # call is made directly inside the assertRaises context.
    with self.assertRaises(ValueError):
        transcriptome_index._verify_files(File(download_url="a"),
                                          File(download_url="b"),
                                          downloader_job)
def test_bad_batch_grouping(self):
    """_verify_batch_grouping raises ValueError when one URL differs."""
    batches, files = self.insert_objects()

    # Give the second file a URL that no longer matches the others.
    odd_file = files[1]
    odd_file.download_url = "https://wompwomp.com"
    odd_file.save()

    job = DownloaderJob.create_job_and_relationships(
        batches=batches, downloader_task="dummy")

    with self.assertRaises(ValueError):
        array_express._verify_batch_grouping(files, job)
def test_create_job_and_relationships(self):
    """The created job has an integer id and is linked to both batches."""
    batches = [get_batch() for _ in range(2)]

    job = DownloaderJob.create_job_and_relationships(
        batches=batches, downloader_task="test")

    self.assertIsInstance(job.id, int)

    # Forward direction: the job knows about two batches.
    linked_batches = job.batches.all()
    self.assertEqual(len(linked_batches), 2)

    # Reverse direction: the first batch points back at exactly this job.
    job_for_first_batch = batches[0].downloaderjob_set.get()
    self.assertEqual(job_for_first_batch, job)
def test_download(self,
                  _upload_files,
                  _download_file,
                  _verify_files,
                  mock_send_job):
    """A successful transcriptome download updates batches and queues jobs."""
    index_dir = "/home/user/data_store/temp/EnsemblPlants/TRANSCRIPTOME_INDEX"

    # Start from a clean temp directory.
    shutil.rmtree(index_dir, ignore_errors=True)

    mock_send_job.return_value = None

    batches = self.insert_objects()
    downloader_job = DownloaderJob.create_job_and_relationships(
        batches=batches)

    # Run the downloader under test.
    transcriptome_index.download_transcriptome(downloader_job.id)

    job_dir_path = "{}/downloader_job_{}".format(index_dir,
                                                 str(downloader_job.id))
    expected_gtf_path = job_dir_path + "/Aegilops_tauschii_short.gtf.gz"
    expected_fasta_path = job_dir_path + "/Aegilops_tauschii_short.fa.gz"

    # Every collaborator must have been used as expected.
    self.assertEqual(_verify_files.call_count, 2)
    self.assertEqual(_download_file.call_count, 2)
    _download_file.assert_any_call(self.gtf_download_url,
                                   expected_gtf_path,
                                   downloader_job)
    _download_file.assert_any_call(self.fasta_download_url,
                                   expected_fasta_path,
                                   downloader_job)

    # The upload receives (job_dir, files, job) positionally.
    (_, uploaded_files, uploaded_job), _ = _upload_files.call_args
    self.assertEqual(set(uploaded_files),
                     set(batches[0].files + batches[1].files))
    self.assertEqual(uploaded_job.id, downloader_job.id)

    # The database must reflect the completed download.
    for stored_batch in Batch.objects.all():
        self.assertEqual(stored_batch.status,
                         BatchStatuses.DOWNLOADED.value)

    stored_job = DownloaderJob.objects.get()
    self.assertTrue(stored_job.success)
    self.assertIsNotNone(stored_job.start_time)
    self.assertIsNotNone(stored_job.end_time)

    processor_jobs = ProcessorJob.objects.all()
    self.assertEqual(len(processor_jobs), 2)
    expected_sends = [
        call(ProcessorPipeline.TRANSCRIPTOME_INDEX, processor_jobs[0].id),
        call(ProcessorPipeline.TRANSCRIPTOME_INDEX, processor_jobs[1].id),
    ]
    mock_send_job.assert_has_calls(expected_sends)
def test_zero_batches(self, mock_download_file):
    """download_sra fails a job that has no batches attached."""
    # Stub the download so that a regression here fails fast instead of
    # waiting on a real (slow) file transfer.
    mock_download_file.return_value = True

    job = DownloaderJob.create_job_and_relationships(
        batches=[], downloader_task="dummy")
    job.save()

    sra.download_sra(job.id)

    # Re-read the job from the database to see what the downloader stored.
    finished_job = DownloaderJob.objects.get(id=job.id)
    self.assertFalse(finished_job.success)
    self.assertEqual(finished_job.failure_reason, "No batches found.")
def test_download_file(self, mock_urlopen):
    """_download_file returns a falsy value and records why on URLError."""

    def raise_url_error(url):
        raise URLError("We're testing that {} is unavailable".format(url))

    # Every attempt to open the URL fails.
    mock_urlopen.side_effect = raise_url_error

    batch, files = self.insert_objects()
    downloader_job = DownloaderJob.create_job_and_relationships(
        batches=[batch], downloader_task="dummy")

    self.assertFalse(
        sra._download_file(files[0],
                           downloader_job,
                           "target_file_path",
                           force_ftp=True))
    # Identity check against None instead of an (in)equality comparison;
    # this also makes the former lint suppression unnecessary.
    self.assertIsNotNone(downloader_job.failure_reason)
def test_happy_path(self,
                    mock_download_file,
                    mock_send_job,
                    mock_upload_raw_file,
                    mock_getsize):
    """A successful SRA download sizes, uploads and queues processing.

    Download, upload, size lookup and job dispatch are all mocked; the
    test checks the job outcome, the stored file sizes, the target paths
    handed to the download mock and the single processor job dispatch.
    """
    mock_send_job.return_value = None

    # We don't actually want to download anything and we're
    # testing this function separately anyway.
    mock_download_file.return_value = True
    mock_getsize.return_value = 1337

    batch, files = self.insert_objects()
    downloader_job = DownloaderJob.create_job_and_relationships(
        batches=[batch], downloader_task="dummy")
    downloader_job.save()

    sra.download_sra(downloader_job.id)

    downloader_job.refresh_from_db()
    self.assertTrue(downloader_job.success)

    # assertEqual rather than the deprecated assertEquals alias, which no
    # longer exists on Python 3.12+.
    for downloaded_file in files:
        downloaded_file.refresh_from_db()
        self.assertEqual(downloaded_file.size_in_bytes, 1337)

    processor_job = ProcessorJob.objects.get()

    target_path_template = "/home/user/data_store/temp/IlluminaHiSeq2000/SALMON/downloader_job_{}/DRR002116_{}.fastq.gz"  # noqa
    target_path_1 = target_path_template.format(downloader_job.id, 1)
    target_path_2 = target_path_template.format(downloader_job.id, 2)

    # Impossible to match the exact File and DownloaderJob
    # objects, so rather than trying to do so, just pull them out
    # from the calls and test the path it was called with:
    first_call = mock_download_file.call_args_list[0][0]
    second_call = mock_download_file.call_args_list[1][0]
    mock_download_file.assert_has_calls([
        call(first_call[0], first_call[1], target_path_2),
        call(second_call[0], second_call[1], target_path_1),
    ])

    mock_send_job.assert_called_once_with(ProcessorPipeline.SALMON,
                                          processor_job.id)

    self.assertEqual(len(mock_upload_raw_file.mock_calls), 2)
def test_download(self,
                  _extract_file,
                  _download_file,
                  _verify_batch_grouping,
                  mock_send_job):
    """A successful ArrayExpress download updates batches and queues jobs."""
    mock_send_job.return_value = None

    batches, files = self.insert_objects()
    downloader_job = DownloaderJob.create_job_and_relationships(
        batches=batches)

    # Run the downloader under test.
    array_express.download_array_express(downloader_job.id)

    job_dir_path = (
        "/home/user/data_store/temp/A-AFFY-1/AFFY_TO_PCL/downloader_job_{}"
    ).format(str(downloader_job.id))
    expected_zip_path = job_dir_path + "/E-GEOD-59071.raw.3.zip"

    # Every collaborator must have been used as expected.
    self.assertEqual(_verify_batch_grouping.call_count, 1)

    expected_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"  # noqa
    _download_file.assert_called_with(expected_url,
                                      expected_zip_path,
                                      downloader_job)

    # The extraction receives (files, job) positionally.
    (extracted_files, extracting_job), _ = _extract_file.call_args
    self.assertEqual(list(extracted_files), files)
    self.assertEqual(extracting_job.id, downloader_job.id)

    # The database must reflect the completed download.
    for stored_batch in Batch.objects.all():
        self.assertEqual(stored_batch.status,
                         BatchStatuses.DOWNLOADED.value)

    stored_job = DownloaderJob.objects.get()
    self.assertTrue(stored_job.success)
    self.assertIsNotNone(stored_job.start_time)
    self.assertIsNotNone(stored_job.end_time)

    processor_jobs = ProcessorJob.objects.all()
    self.assertEqual(len(processor_jobs), 2)
    expected_sends = [
        call(ProcessorPipeline.AFFY_TO_PCL, processor_jobs[0].id),
        call(ProcessorPipeline.AFFY_TO_PCL, processor_jobs[1].id),
    ]
    mock_send_job.assert_has_calls(expected_sends)
def queue_downloader_jobs(self, batches: List[Batch]):
    """Create one downloader job covering `batches` and dispatch it.

    With no batches, nothing is created and that fact is only logged.
    If dispatching raises, the job that was just created is deleted and
    the error propagates to the caller.
    """
    if not batches:
        logger.info("Survey job found no new Batches.",
                    survey_job=self.survey_job.id)
        return

    task = self.downloader_task()

    with transaction.atomic():
        job = DownloaderJob.create_job_and_relationships(
            batches=batches, downloader_task=task.value)

    logger.info("Queuing downloader job.",
                survey_job=self.survey_job.id,
                downloader_job=job.id)

    try:
        send_job(task, job.id)
    except BaseException:
        # Deliberately catches everything: a job whose task never got
        # sent must not be left floating, whatever interrupted the send.
        job.delete()
        raise
def requeue_downloader_job(last_job: DownloaderJob) -> None:
    """Replace `last_job` with a newly queued downloader job.

    The replacement covers the same batches and downloader task, with
    num_retries set to last_job.num_retries + 1. Once it has been sent,
    `last_job` is marked as retried and unsuccessful, linked to the
    replacement and saved.
    """
    retried_batches = list(last_job.batches.all())
    replacement = DownloaderJob.create_job_and_relationships(
        num_retries=last_job.num_retries + 1,
        batches=retried_batches,
        downloader_task=last_job.downloader_task)

    logger.info(
        "Requeuing Downloader Job which had ID %d with a new Downloader Job with ID %d.",
        last_job.id,
        replacement.id)

    task = Downloaders[last_job.downloader_task]
    send_job(task, replacement.id)

    # Only after a successful send is the old job updated.
    last_job.retried_job = replacement
    last_job.retried = True
    last_job.success = False
    last_job.save()
def test_extraction_failure(self, _download_file, mock_send_job):
    """A failed extraction fails the job and names the zip file."""
    mock_send_job.return_value = None

    batches, files = self.insert_objects()
    job = DownloaderJob.create_job_and_relationships(batches=batches)

    # Run the downloader under test.
    array_express.download_array_express(job.id)

    mock_send_job.assert_not_called()

    # The stored job must record the failure and its timing.
    stored_job = DownloaderJob.objects.get()
    self.assertFalse(stored_job.success)
    self.assertIsNotNone(stored_job.start_time)
    self.assertIsNotNone(stored_job.end_time)

    # The failure reason embeds the temp path of the first file's archive.
    job_dir = utils.JOB_DIR_PREFIX + str(stored_job.id)
    zip_path = files[0].get_temp_download_path(job_dir)
    expected_reason = "Exception caught while extracting " + zip_path
    self.assertEqual(stored_job.failure_reason, expected_reason)
def test_download_failure(self, _extract_file, _open, mock_send_job):
    """An exception from the patched `_open` fails the ArrayExpress job."""
    # Configure the mocks: dispatch is a no-op, opening always raises.
    _open.side_effect = Exception()
    mock_send_job.return_value = None

    batches, _ = self.insert_objects()
    job = DownloaderJob.create_job_and_relationships(batches=batches)

    # Run the downloader under test.
    array_express.download_array_express(job.id)

    # Nothing past the download may have happened.
    for untouched_mock in (_extract_file, mock_send_job):
        untouched_mock.assert_not_called()

    # The stored job must record the failure and its timing.
    stored_job = DownloaderJob.objects.get()
    self.assertFalse(stored_job.success)
    self.assertIsNotNone(stored_job.start_time)
    self.assertIsNotNone(stored_job.end_time)
    self.assertEqual(stored_job.failure_reason,
                     "Exception caught while downloading batch")
def test_download_failure(self, _upload_files, _open, mock_send_job):
    """An exception from the patched `_open` fails the transcriptome job."""
    # Configure the mocks: dispatch is a no-op, opening always raises.
    _open.side_effect = Exception()
    mock_send_job.return_value = None

    batches = self.insert_objects()
    job = DownloaderJob.create_job_and_relationships(batches=batches)

    # Run the downloader under test.
    transcriptome_index.download_transcriptome(job.id)

    # Nothing past the download may have happened.
    for untouched_mock in (_upload_files, mock_send_job):
        untouched_mock.assert_not_called()

    # The stored job must record the failure and its timing.
    stored_job = DownloaderJob.objects.get()
    self.assertFalse(stored_job.success)
    self.assertIsNotNone(stored_job.start_time)
    self.assertIsNotNone(stored_job.end_time)

    # The failure reason embeds the URL of the first batch's first file.
    expected_reason = "Exception caught while downloading file from: {}".format(
        batches[0].files[0].download_url)
    self.assertEqual(stored_job.failure_reason, expected_reason)
def create_batch_and_downloader_job(self) -> DownloaderJob:
    """Insert one batch and return a new ARRAY_EXPRESS job covering it.

    The job is created with num_retries set to 0.
    """
    inserted_batch = self.insert_batch()
    job = DownloaderJob.create_job_and_relationships(
        num_retries=0,
        batches=[inserted_batch],
        downloader_task="ARRAY_EXPRESS")
    return job