Example #1
0
    def test_multiple_batches(self, mock_download_file):
        """An SRA downloader job linked to two batches is recorded as failed."""
        # The download is mocked so that, should this test ever regress,
        # it fails quickly instead of attempting a slow real download.
        mock_download_file.return_value = True

        first_batch, _ = self.insert_objects()
        second_batch = Batch(
            survey_job=self.survey_job,
            source_type="SRA",
            pipeline_required="SALMON",
            platform_accession_code="IlluminaHiSeq2000",
            experiment_accession_code="DRX001564",
            experiment_title="It doesn't really matter.",
            organism_id=9031,
            organism_name="GALLUS GALLUS",
            release_date="2013-07-19",
            last_uploaded_date="2017-09-11",
            status=BatchStatuses.NEW.value,
        )
        second_batch.save()

        job = DownloaderJob.create_job_and_relationships(
            batches=[first_batch, second_batch], downloader_task="dummy")
        job.save()

        sra.download_sra(job.id)

        # Re-read the job from the database to see what the downloader
        # stored on it.
        expected_reason = ("More than one batch found for SRA downloader job. "
                           "There should only be one.")
        reloaded_job = DownloaderJob.objects.get(id=job.id)
        self.assertFalse(reloaded_job.success)
        self.assertEqual(reloaded_job.failure_reason, expected_reason)
Example #2
0
    def test_upload_fails(self, mock_download_file, mock_upload_raw_file,
                          mock_getsize):
        """An exception raised while uploading marks the SRA job as failed.

        The three mock arguments are presumably injected by patch
        decorators outside this view.
        """
        # We don't actually want to download anything and we're
        # testing this function separately anyway.
        mock_download_file.return_value = True

        mock_getsize.return_value = 1337

        # An exception instance as side_effect makes the mock raise it
        # on every call, regardless of the arguments it is called with.
        mock_upload_raw_file.side_effect = Exception(
            "We're testing that this fails.")

        batch, _ = self.insert_objects()
        downloader_job = DownloaderJob.create_job_and_relationships(
            batches=[batch], downloader_task="dummy")
        downloader_job.save()

        sra.download_sra(downloader_job.id)

        downloader_job.refresh_from_db()
        self.assertFalse(downloader_job.success)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(downloader_job.failure_reason,
                         "Exception caught while uploading file.")

        # The upload must have been attempted exactly once.
        self.assertEqual(len(mock_upload_raw_file.mock_calls), 1)
    def test_verification_failure(self, _upload_files, _download_file,
                                  mock_send_job):
        """A mismatched download URL fails the job before any download."""
        mock_send_job.return_value = None

        batches = self.insert_objects()
        # Give one file a different download URL so that verification
        # of the job's files is expected to fail.
        batches[0].files[0].download_url = "https://wompwomp.com"
        batches[0].files[0].save()
        job = DownloaderJob.create_job_and_relationships(batches=batches)

        # Run the downloader under test.
        transcriptome_index.download_transcriptome(job.id)

        # Nothing may be downloaded, uploaded or queued.
        for untouched_mock in (_download_file, _upload_files, mock_send_job):
            untouched_mock.assert_not_called()

        # Check what was persisted for the only job in the database.
        expected_reason = ("A Batch's file doesn't have the same download "
                           "URL as the other batch's file.")
        stored_job = DownloaderJob.objects.get()
        self.assertFalse(stored_job.success)
        self.assertIsNotNone(stored_job.start_time)
        self.assertIsNotNone(stored_job.end_time)
        self.assertEqual(stored_job.failure_reason, expected_reason)
Example #4
0
    def handle(self, *args, **options):
        """Create dummy ArrayExpress records and queue a downloader job.

        Saves a SurveyJob, a Batch and a File, creates a DownloaderJob
        for that batch and sends it with send_job using the
        "ARRAY_EXPRESS" entry of Downloaders. ``args`` and ``options``
        are accepted but not used.
        """
        # Create all the dummy data that would have been created
        # before a downloader job could have been generated.
        survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
        survey_job.save()

        batch = Batch(survey_job=survey_job,
                      source_type="ARRAY_EXPRESS",
                      pipeline_required="AFFY_TO_PCL",
                      platform_accession_code="A-AFFY-141",
                      experiment_accession_code="E-GEOD-59071",
                      experiment_title="It doesn't really matter.",
                      organism_id=9606,
                      # Taxonomy id 9606 is Homo sapiens; the name had
                      # been mangled to "H**O SAPIENS".
                      organism_name="HOMO SAPIENS",
                      release_date="2017-05-05",
                      last_uploaded_date="2017-05-05",
                      status=BatchStatuses.NEW.value)
        batch.save()

        file = File(
            batch=batch,
            size_in_bytes=0,
            download_url=
            "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
            raw_format="CEL",
            processed_format="PCL",
            name="GSM1426072_CD_colon_active_2.CEL",
            internal_location="A-AFFY-141/AFFY_TO_PCL")
        file.save()

        # Create the job for the single batch and hand it to the queue.
        downloader_job = DownloaderJob.create_job_and_relationships(
            batches=[batch])
        send_job(Downloaders["ARRAY_EXPRESS"], downloader_job.id)
Example #5
0
    def test_good_batch_grouping(self):
        """_verify_batch_grouping returns None for the default fixtures.

        The files created by insert_objects presumably all share one
        download_url -- confirm against that helper.
        """
        batches, files = self.insert_objects()
        job = DownloaderJob.create_job_and_relationships(
            batches=batches, downloader_task="dummy")

        verification_result = array_express._verify_batch_grouping(files, job)
        self.assertIsNone(verification_result)
    def test_good_file_grouping(self):
        """_verify_files returns None for two files sharing a download_url."""
        job = DownloaderJob.create_job_and_relationships(
            batches=[], downloader_task="dummy")
        first_file = File(download_url="a")
        second_file = File(download_url="a")

        outcome = transcriptome_index._verify_files(
            first_file, second_file, job)
        self.assertIsNone(outcome)
    def test_bad_file_grouping(self):
        """_verify_files raises ValueError when the download_urls differ."""
        downloader_job = DownloaderJob.create_job_and_relationships(
            batches=[], downloader_task="dummy")

        # The call is expected to raise, so there is no return value to
        # check; only the exception type is asserted.
        with self.assertRaises(ValueError):
            transcriptome_index._verify_files(File(download_url="a"),
                                              File(download_url="b"),
                                              downloader_job)
Example #8
0
    def test_bad_batch_grouping(self):
        """_verify_batch_grouping raises ValueError on a differing URL."""
        batches, files = self.insert_objects()

        # Make the second file's download_url differ from the others.
        mismatched_url = "https://wompwomp.com"
        files[1].download_url = mismatched_url
        files[1].save()

        job = DownloaderJob.create_job_and_relationships(
            batches=batches, downloader_task="dummy")

        with self.assertRaises(ValueError):
            array_express._verify_batch_grouping(files, job)
Example #9
0
    def test_create_job_and_relationships(self):
        """The created job has an integer id and is linked to both batches."""
        first_batch = get_batch()
        second_batch = get_batch()

        job = DownloaderJob.create_job_and_relationships(
            batches=[first_batch, second_batch], downloader_task="test")
        self.assertIsInstance(job.id, int)

        # Forward direction: the job knows about both batches.
        linked_batches = job.batches.all()
        self.assertEqual(len(linked_batches), 2)

        # Reverse direction: the first batch points back at the job.
        self.assertEqual(first_batch.downloaderjob_set.get(), job)
    def test_download(self, _upload_files, _download_file, _verify_files,
                      mock_send_job):
        """A transcriptome download succeeds and queues two processor jobs.

        The mock arguments are presumably injected by patch decorators
        outside this view.
        """
        temp_dir = (
            "/home/user/data_store/temp/EnsemblPlants/TRANSCRIPTOME_INDEX")

        # Start from a clean temp directory.
        shutil.rmtree(temp_dir, ignore_errors=True)

        mock_send_job.return_value = None

        batches = self.insert_objects()
        job = DownloaderJob.create_job_and_relationships(batches=batches)

        # Run the downloader under test.
        transcriptome_index.download_transcriptome(job.id)

        job_dir_path = "{}/downloader_job_{}".format(temp_dir, str(job.id))
        expected_gtf_path = job_dir_path + "/Aegilops_tauschii_short.gtf.gz"
        expected_fasta_path = job_dir_path + "/Aegilops_tauschii_short.fa.gz"

        # Both files must have been verified and downloaded.
        self.assertEqual(_verify_files.call_count, 2)
        self.assertEqual(_download_file.call_count, 2)
        _download_file.assert_any_call(
            self.gtf_download_url, expected_gtf_path, job)
        _download_file.assert_any_call(
            self.fasta_download_url, expected_fasta_path, job)

        # The last upload call must have received exactly three
        # positional arguments; the first one is not inspected.
        positional_args, _ = _upload_files.call_args
        _, uploaded_files, uploaded_job = positional_args
        expected_files = set(batches[0].files + batches[1].files)
        self.assertEqual(set(uploaded_files), expected_files)
        self.assertEqual(uploaded_job.id, job.id)

        # Check what was persisted in the database.
        for stored_batch in Batch.objects.all():
            self.assertEqual(stored_batch.status,
                             BatchStatuses.DOWNLOADED.value)

        stored_job = DownloaderJob.objects.get()
        self.assertTrue(stored_job.success)
        self.assertIsNotNone(stored_job.start_time)
        self.assertIsNotNone(stored_job.end_time)

        processor_jobs = ProcessorJob.objects.all()
        self.assertEqual(len(processor_jobs), 2)

        expected_calls = [
            call(ProcessorPipeline.TRANSCRIPTOME_INDEX, processor_jobs[0].id),
            call(ProcessorPipeline.TRANSCRIPTOME_INDEX, processor_jobs[1].id),
        ]
        mock_send_job.assert_has_calls(expected_calls)
Example #11
0
    def test_zero_batches(self, mock_download_file):
        """An SRA downloader job with no batches is recorded as failed."""
        # The download is mocked so that, should this test ever regress,
        # it fails quickly instead of attempting a slow real download.
        mock_download_file.return_value = True

        job = DownloaderJob.create_job_and_relationships(
            batches=[], downloader_task="dummy")
        job.save()

        sra.download_sra(job.id)

        # Re-read the job from the database to see what was stored.
        reloaded_job = DownloaderJob.objects.get(id=job.id)
        self.assertFalse(reloaded_job.success)
        self.assertEqual(reloaded_job.failure_reason, "No batches found.")
Example #12
0
    def test_download_file(self, mock_urlopen):
        """_download_file reports failure when opening the URL raises.

        ``mock_urlopen`` is presumably injected by a patch decorator
        outside this view; it is configured to raise URLError.
        """
        def raise_url_error(url):
            raise URLError("We're testing that {} is unavailable".format(url))

        mock_urlopen.side_effect = raise_url_error

        batch, files = self.insert_objects()
        downloader_job = DownloaderJob.create_job_and_relationships(
            batches=[batch], downloader_task="dummy")

        self.assertFalse(
            sra._download_file(files[0],
                               downloader_job,
                               "target_file_path",
                               force_ftp=True))
        # A failure reason must have been recorded on the job object.
        self.assertIsNotNone(downloader_job.failure_reason)
Example #13
0
    def test_happy_path(self, mock_download_file, mock_send_job,
                        mock_upload_raw_file, mock_getsize):
        """A single-batch SRA job succeeds and queues one SALMON job.

        The mock arguments are presumably injected by patch decorators
        outside this view.
        """
        mock_send_job.return_value = None

        # We don't actually want to download anything and we're
        # testing this function separately anyway.
        mock_download_file.return_value = True

        mock_getsize.return_value = 1337

        batch, files = self.insert_objects()
        downloader_job = DownloaderJob.create_job_and_relationships(
            batches=[batch], downloader_task="dummy")
        downloader_job.save()

        sra.download_sra(downloader_job.id)

        downloader_job.refresh_from_db()
        self.assertTrue(downloader_job.success)
        # Every file should now carry the size reported by the mocked
        # getsize. assertEqual replaces the deprecated assertEquals alias.
        for batch_file in files:
            batch_file.refresh_from_db()
            self.assertEqual(batch_file.size_in_bytes, 1337)

        processor_job = ProcessorJob.objects.get()

        target_path_template = "/home/user/data_store/temp/IlluminaHiSeq2000/SALMON/downloader_job_{}/DRR002116_{}.fastq.gz"  # noqa
        target_path_1 = target_path_template.format(downloader_job.id, 1)
        target_path_2 = target_path_template.format(downloader_job.id, 2)

        # Impossible to match the exact File and DownloaderJob
        # objects, so rather than trying to do so, just pull them out
        # from the calls and test the path it was called with:
        first_call = mock_download_file.call_args_list[0][0]
        second_call = mock_download_file.call_args_list[1][0]
        mock_download_file.assert_has_calls([
            call(first_call[0], first_call[1], target_path_2),
            call(second_call[0], second_call[1], target_path_1)
        ])

        mock_send_job.assert_called_once_with(ProcessorPipeline.SALMON,
                                              processor_job.id)

        # One upload per downloaded file.
        self.assertEqual(len(mock_upload_raw_file.mock_calls), 2)
Example #14
0
    def test_download(self, _extract_file, _download_file,
                      _verify_batch_grouping, mock_send_job):
        """An ArrayExpress download succeeds and queues two processor jobs.

        The mock arguments are presumably injected by patch decorators
        outside this view.
        """
        mock_send_job.return_value = None

        batches, files = self.insert_objects()
        job = DownloaderJob.create_job_and_relationships(batches=batches)

        # Run the downloader under test.
        array_express.download_array_express(job.id)

        expected_path = (
            "/home/user/data_store/temp/A-AFFY-1/AFFY_TO_PCL/downloader_job_{}"
            "/E-GEOD-59071.raw.3.zip").format(str(job.id))
        expected_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"  # noqa

        # Grouping is verified once and the archive is downloaded to
        # the expected location.
        self.assertEqual(_verify_batch_grouping.call_count, 1)
        _download_file.assert_called_with(expected_url, expected_path, job)

        # The last extraction call must have received exactly two
        # positional arguments: the files and the job.
        positional_args, _ = _extract_file.call_args
        extracted_files, extracted_job = positional_args
        self.assertEqual(list(extracted_files), files)
        self.assertEqual(extracted_job.id, job.id)

        # Check what was persisted in the database.
        for stored_batch in Batch.objects.all():
            self.assertEqual(stored_batch.status,
                             BatchStatuses.DOWNLOADED.value)

        stored_job = DownloaderJob.objects.get()
        self.assertTrue(stored_job.success)
        self.assertIsNotNone(stored_job.start_time)
        self.assertIsNotNone(stored_job.end_time)

        processor_jobs = ProcessorJob.objects.all()
        self.assertEqual(len(processor_jobs), 2)

        expected_calls = [
            call(ProcessorPipeline.AFFY_TO_PCL, processor_jobs[0].id),
            call(ProcessorPipeline.AFFY_TO_PCL, processor_jobs[1].id),
        ]
        mock_send_job.assert_has_calls(expected_calls)
Example #15
0
    def queue_downloader_jobs(self, batches: List[Batch]):
        """Create one DownloaderJob for ``batches`` and send it to the queue.

        When ``batches`` is empty only an informational message is
        logged. If sending the job raises, the freshly created
        DownloaderJob is deleted and the exception is re-raised.
        """
        if len(batches) == 0:
            logger.info("Survey job found no new Batches.",
                        survey_job=self.survey_job.id)
            return

        task = self.downloader_task()

        # The job is created inside a single atomic block.
        with transaction.atomic():
            job = DownloaderJob.create_job_and_relationships(
                batches=batches, downloader_task=task.value)

        logger.info("Queuing downloader job.",
                    survey_job=self.survey_job.id,
                    downloader_job=job.id)
        try:
            send_job(task, job.id)
        except BaseException:
            # A job that never reached the queue must not be left
            # floating in the database.
            job.delete()
            raise
Example #16
0
def requeue_downloader_job(last_job: DownloaderJob) -> None:
    """Create and send a retry of ``last_job``.

    The new DownloaderJob reuses the batches and downloader_task of
    ``last_job`` and has num_retries one greater than
    ``last_job.num_retries``. After it has been sent, ``last_job`` is
    saved as retried and unsuccessful, pointing at the new job.
    """
    retry_job = DownloaderJob.create_job_and_relationships(
        num_retries=last_job.num_retries + 1,
        batches=list(last_job.batches.all()),
        downloader_task=last_job.downloader_task)

    logger.info(
        "Requeuing Downloader Job which had ID %d with a new Downloader Job with ID %d.",
        last_job.id, retry_job.id)

    # NOTE(review): if send_job raises, retry_job has already been
    # created and last_job is not marked as retried.
    task = Downloaders[last_job.downloader_task]
    send_job(task, retry_job.id)

    last_job.retried = True
    last_job.success = False
    last_job.retried_job = retry_job
    last_job.save()
Example #17
0
    def test_extraction_failure(self, _download_file, mock_send_job):
        """A failed extraction marks the job as failed and queues nothing."""
        mock_send_job.return_value = None

        batches, files = self.insert_objects()
        job = DownloaderJob.create_job_and_relationships(batches=batches)

        # Run the downloader under test.
        array_express.download_array_express(job.id)

        mock_send_job.assert_not_called()

        # Check what was persisted for the only job in the database.
        stored_job = DownloaderJob.objects.get()
        self.assertFalse(stored_job.success)
        self.assertIsNotNone(stored_job.start_time)
        self.assertIsNotNone(stored_job.end_time)

        # The failure reason names the temp path of the first file.
        job_dir = utils.JOB_DIR_PREFIX + str(stored_job.id)
        zip_path = files[0].get_temp_download_path(job_dir)
        expected_reason = "Exception caught while extracting " + zip_path
        self.assertEqual(stored_job.failure_reason, expected_reason)
Example #18
0
    def test_download_failure(self, _extract_file, _open, mock_send_job):
        """A failing download stops the job before extraction and queuing."""
        # Configure the mocks: the patched open call always raises.
        _open.side_effect = Exception()
        mock_send_job.return_value = None

        batches, _ = self.insert_objects()
        job = DownloaderJob.create_job_and_relationships(batches=batches)

        # Run the downloader under test.
        array_express.download_array_express(job.id)

        # Nothing may be extracted or queued.
        for untouched_mock in (_extract_file, mock_send_job):
            untouched_mock.assert_not_called()

        # Check what was persisted for the only job in the database.
        stored_job = DownloaderJob.objects.get()
        self.assertFalse(stored_job.success)
        self.assertIsNotNone(stored_job.start_time)
        self.assertIsNotNone(stored_job.end_time)
        self.assertEqual(stored_job.failure_reason,
                         "Exception caught while downloading batch")
    def test_download_failure(self, _upload_files, _open, mock_send_job):
        """A failing download stops the job before upload and queuing."""
        # Configure the mocks: the patched open call always raises.
        _open.side_effect = Exception()
        mock_send_job.return_value = None

        batches = self.insert_objects()
        job = DownloaderJob.create_job_and_relationships(batches=batches)

        # Run the downloader under test.
        transcriptome_index.download_transcriptome(job.id)

        # Nothing may be uploaded or queued.
        for untouched_mock in (_upload_files, mock_send_job):
            untouched_mock.assert_not_called()

        # Check what was persisted for the only job in the database.
        stored_job = DownloaderJob.objects.get()
        self.assertFalse(stored_job.success)
        self.assertIsNotNone(stored_job.start_time)
        self.assertIsNotNone(stored_job.end_time)

        # The failure reason names the first file's download URL.
        expected_reason = "Exception caught while downloading file from: {}".format(
            batches[0].files[0].download_url)
        self.assertEqual(stored_job.failure_reason, expected_reason)
Example #20
0
 def create_batch_and_downloader_job(self) -> DownloaderJob:
     """Return a new ARRAY_EXPRESS DownloaderJob for one inserted batch.

     The batch comes from insert_batch and the job starts with
     num_retries set to 0.
     """
     new_batch = self.insert_batch()
     job = DownloaderJob.create_job_and_relationships(
         num_retries=0, batches=[new_batch], downloader_task="ARRAY_EXPRESS")
     return job