Example #1
    def test_unmated_reads(self):
        """Survey, download, then process a sample we know is SRA and has unmated reads.

        This test uses VCR to remove the dependence upon NCBI's
        servers, but the downloader job hits ENA's FTP and Aspera
        servers. Unfortunately there's not much that can be done to
        avoid that behavior from here because the downloader jobs
        always check ENA's FTP server to see if the file has an
        unmated read. For now we'll just have to be content with the
        fact that NCBI going down won't affect this test.
        """
        # Clear out pre-existing work dirs so there are no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set("RUNNING_IN_CLOUD", "False")
        with self.env:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            # prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
            organism.save()

            # Survey just a single run to make things faster!
            # This sample has unmated reads!
            survey_job = surveyor.survey_experiment("SRR1603661", "SRA")

            self.assertTrue(survey_job.success)

            # Let's give the downloader a little bit to get started
            # and to update the OriginalFiles' source_urls.
            time.sleep(60)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), 1)
            downloader_job = downloader_jobs.first()

            self.assertIsNotNone(downloader_job.start_time)

            for original_file in downloader_job.original_files.all():
                self.assertIn(".fastq.gz", original_file.source_url)

            # The downloader job will take a while to complete. Let's not wait.
            print(downloader_job.kill_nomad_job())
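
The fixed time.sleep(60) above is a blunt instrument. Below is a minimal sketch of a polling alternative; it is not part of the original test, the import path for DownloaderJob is assumed, and the timeout values are arbitrary.

import time
from datetime import timedelta

from django.utils import timezone

# Import path assumed; adjust to wherever DownloaderJob lives in this project.
from data_refinery_common.models import DownloaderJob


def wait_for_downloader_to_start(timeout_seconds=120, poll_seconds=1):
    """Poll until some DownloaderJob has recorded a start_time, or give up.

    Returns True if a job started within the timeout, False otherwise.
    """
    deadline = timezone.now() + timedelta(seconds=timeout_seconds)
    while timezone.now() < deadline:
        if DownloaderJob.objects.filter(start_time__isnull=False).exists():
            return True
        time.sleep(poll_seconds)
    return False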
Example #2
    def test_survey(self):
        """Survey the given sample"""

        # Clear out pre-existing work dirs so there are no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set("RUNNING_IN_CLOUD", "False")
        with self.env:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            survey_job = surveyor.survey_experiment(
                get_env_variable("ACCESSION"), get_env_variable("SURVEYOR"))

            self.assertTrue(survey_job.success)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertGreater(downloader_jobs.count(), 0)

            logger.info(
                "Survey Job finished, waiting for Downloader Jobs to complete."
            )
            start_time = timezone.now()
            for downloader_job in downloader_jobs:
                downloader_job = wait_for_job(downloader_job, DownloaderJob,
                                              start_time)
                self.assertTrue(downloader_job.success)

            processor_jobs = ProcessorJob.objects.all().exclude(
                abort=True)  # exclude aborted processor jobs
            self.assertGreater(processor_jobs.count(), 0)

            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            start_time = timezone.now()
            for processor_job in processor_jobs:
                processor_job = wait_for_job(processor_job, ProcessorJob,
                                             start_time)
                if not processor_job.success:
                    logger.error(processor_job.failure_reason)
                self.assertTrue(processor_job.success)
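
The wait_for_job helper used throughout these tests is not shown here. The sketch below illustrates the kind of polling its call sites imply; the field name end_time, the default timeout, and the raising behavior are assumptions, and the real helper may differ.

import time
from datetime import timedelta

from django.utils import timezone


# Illustrative only; the real wait_for_job in this test suite may behave differently.
def wait_for_job(job, job_class, start_time, loop_time=5, max_wait=timedelta(minutes=30)):
    """Poll the database until the given job finishes, then return it refreshed.

    Raises if the job does not finish within max_wait, which is why some
    callers wrap this in try/except.
    """
    while timezone.now() - start_time < max_wait:
        job = job_class.objects.get(pk=job.pk)
        if job.end_time is not None:
            return job
        time.sleep(loop_time)
    raise RuntimeError("Job %s did not finish within the allotted time." % job.pk)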
Example #3
def run_surveyor_for_accession(accession: str) -> None:
    """Chooses the correct surveyor based on the pattern of the accession"""
    if "GSE" in accession[:3]:
        surveyor.survey_experiment(accession, "GEO")
    elif "E-" in accession[:2]:
        surveyor.survey_experiment(accession, "ARRAY_EXPRESS")
    elif " " in accession:
        args = accession.split(",")
        # Allow organism to be unspecified so we survey the entire division.
        organism = args[0] if len(args[0]) > 0 else None
        if len(args) > 1:
            division = args[1].strip()
        else:
            division = "Ensembl"
        surveyor.survey_transcriptome_index(organism, division)
    else:
        surveyor.survey_experiment(accession, "SRA")
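
For reference, here is how a few accession strings would be dispatched. The GEO, ArrayExpress, and SRA accessions are taken from the other examples here; the transcriptome index string is a hypothetical example of the "ORGANISM, DIVISION" form (the space after the comma is what routes it to that branch).

# GSE prefix -> GEO surveyor (accession used in Example #6).
run_surveyor_for_accession("GSE100388")

# "E-" prefix -> ArrayExpress surveyor (accession used in Example #4).
run_surveyor_for_accession("E-GEOD-3303")

# Contains a space -> transcriptome index surveyor; hypothetical organism/division pair.
run_surveyor_for_accession("ARABIDOPSIS_THALIANA, EnsemblPlants")

# Anything else -> SRA surveyor (accession used in Example #5).
run_surveyor_for_accession("SRP040623")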
Example #4
    def test_no_op(self):
        """Survey, download, then process an experiment we know is NO_OP."""
        # Clear out pre-existing work dirs so there are no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set('RUNNING_IN_CLOUD', 'False')
        with self.env:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            # Make sure there are no already existing jobs we might poll for unsuccessfully.
            DownloaderJobOriginalFileAssociation.objects.all().delete()
            DownloaderJob.objects.all().delete()
            ProcessorJobOriginalFileAssociation.objects.all().delete()
            ProcessorJob.objects.all().delete()

            # Prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
            organism.save()

            accession_code = "E-GEOD-3303"
            survey_job = surveyor.survey_experiment(accession_code,
                                                    "ARRAY_EXPRESS")

            self.assertTrue(survey_job.success)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertGreater(downloader_jobs.count(), 0)

            logger.info(
                "Survey Job finished, waiting for Downloader Jobs to complete."
            )
            start_time = timezone.now()
            for downloader_job in downloader_jobs:
                downloader_job = wait_for_job(downloader_job, DownloaderJob,
                                              start_time)
                self.assertTrue(downloader_job.success)

            processor_jobs = ProcessorJob.objects.all()
            self.assertGreater(processor_jobs.count(), 0)

            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            start_time = timezone.now()
            for processor_job in processor_jobs:
                processor_job = wait_for_job(processor_job, ProcessorJob,
                                             start_time)
                self.assertTrue(processor_job.success)

            # Test that the unsurveyor deletes all objects related to the experiment
            purge_experiment(accession_code)

            self.assertEqual(Experiment.objects.all().count(), 0)
            self.assertEqual(ExperimentAnnotation.objects.all().count(), 0)
            self.assertEqual(ExperimentSampleAssociation.objects.all().count(),
                             0)
            self.assertEqual(Sample.objects.all().count(), 0)
            self.assertEqual(SampleAnnotation.objects.all().count(), 0)
            self.assertEqual(OriginalFile.objects.all().count(), 0)
            self.assertEqual(
                OriginalFileSampleAssociation.objects.all().count(), 0)
            self.assertEqual(SampleResultAssociation.objects.all().count(), 0)
            self.assertEqual(ComputationalResult.objects.all().count(), 0)
            self.assertEqual(
                ComputationalResultAnnotation.objects.all().count(), 0)
            self.assertEqual(
                SampleComputedFileAssociation.objects.all().count(), 0)
            self.assertEqual(ComputedFile.objects.all().count(), 0)
            self.assertEqual(DownloaderJob.objects.all().count(), 0)
            self.assertEqual(
                DownloaderJobOriginalFileAssociation.objects.all().count(), 0)
            self.assertEqual(ProcessorJob.objects.all().count(), 0)
            self.assertEqual(
                ProcessorJobOriginalFileAssociation.objects.all().count(), 0)
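
The long run of assertEqual calls above can also be expressed as a loop over the purged model classes. A minimal sketch, assuming the same models are importable as in the test and placed inside the same with-block:

            # Sketch only: the same empty-table checks as above, as one loop.
            purged_models = [
                Experiment, ExperimentAnnotation, ExperimentSampleAssociation,
                Sample, SampleAnnotation,
                OriginalFile, OriginalFileSampleAssociation,
                SampleResultAssociation, ComputationalResult,
                ComputationalResultAnnotation, SampleComputedFileAssociation,
                ComputedFile,
                DownloaderJob, DownloaderJobOriginalFileAssociation,
                ProcessorJob, ProcessorJobOriginalFileAssociation,
            ]
            for model in purged_models:
                self.assertEqual(model.objects.count(), 0, model.__name__)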
Example #5
    def test_sra_redownloading(self):
        """Survey, download, then process an experiment we know is SRA."""
        # Clear out pre-existing work dirs so there are no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set("RUNNING_IN_CLOUD", "False")
        with self.env:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            # prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
            organism.save()

            survey_job = surveyor.survey_experiment("SRP040623", "SRA")

            self.assertTrue(survey_job.success)

            # This experiment has 4 samples that each need a downloader job.
            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), 4)

            # We want one ProcessorJob to fail because it doesn't have
            # the file it was expecting, so we need to wait until one
            # DownloaderJob finishes, delete a file that is
            # downloaded, and then not delete any more.
            logger.info(
                "Survey Job finished, waiting for Downloader Jobs to complete."
            )
            start_time = timezone.now()
            file_deleted = False
            for downloader_job in downloader_jobs:
                # We want to try and delete the file as quickly as
                # possible, so pass a short loop time and let the waiting
                # loop spin really fast so we lose as little time as
                # possible.
                downloader_job = wait_for_job(downloader_job, DownloaderJob,
                                              start_time, 0.1)
                self.assertTrue(downloader_job.success)
                if not file_deleted:
                    for original_file in OriginalFile.objects.filter(
                            is_downloaded=True):
                        if not original_file.is_archive:
                            original_file.delete_local_file()
                            file_deleted = True

                            # And then to make sure that we can handle
                            # cases where the downloader job is missing:
                            downloader_job.delete()
                            break

            # There's a chance that the processor job with a missing
            # file is aborted before the last downloader job
            # completes, so just check that there are at least 3
            # processor jobs.
            processor_jobs = ProcessorJob.objects.all()
            self.assertGreater(processor_jobs.count(), 2)

            doomed_processor_job = original_file.processor_jobs.all()[0]
            logger.info(
                "Waiting on processor Nomad job %s to fail because it realized it is missing a file.",
                doomed_processor_job.nomad_job_id,
            )

            start_time = timezone.now()
            wait_for_job(doomed_processor_job, ProcessorJob, start_time)

            # The processor job that had a missing file will have
            # recreated its DownloaderJob, which would bring the total
            # to 5, but we also deleted one on purpose, so there are 4.
            downloader_jobs = DownloaderJob.objects.all().order_by("-id")
            self.assertEqual(downloader_jobs.count(), 4)

            # However DownloaderJobs don't get queued immediately, so
            # we have to run a foreman function to make it happen:
            retry_lost_downloader_jobs()

            # And we can check that the most recently created
            # DownloaderJob was successful as well:
            recreated_job = downloader_jobs[0]
            recreated_job.refresh_from_db()
            logger.info("Waiting on downloader Nomad job %s",
                        recreated_job.nomad_job_id)
            recreated_job = wait_for_job(recreated_job, DownloaderJob,
                                         start_time)
            self.assertTrue(recreated_job.success)

            # Once the Downloader job succeeds, it should create one
            # and only one processor job, so the total goes back up to 4:
            self.assertEqual(ProcessorJob.objects.all().count(), 4)

            # And finally we can make sure that all of the processor
            # jobs got started correctly, including the one that got
            # recreated. However, in order to save time when running
            # tests, we don't actually want to run the full salmon
            # processor, so we don't provide the transcriptome index
            # this organism needs. The jobs will therefore fail, but
            # that failure happens past the point that we're testing,
            # so we just check for the correct failure_reason.
            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            good_failure_reason = "Missing transcriptome index."
            successful_processor_jobs = []
            for processor_job in processor_jobs:
                # One of the two calls to wait_for_job will fail
                # because the job is going to abort when it
                # finds that the file it wants to process is missing.
                try:
                    processor_job = wait_for_job(processor_job, ProcessorJob,
                                                 start_time)
                    if not processor_job.success and processor_job.failure_reason.startswith(
                            good_failure_reason):
                        successful_processor_jobs.append(processor_job)
                except Exception:
                    pass

            self.assertEqual(len(successful_processor_jobs), 4)
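
Example #5 above and Examples #6 through #8 below all repeat a "delete one downloaded, non-archive file" step to force a processor job to find its input missing. A sketch of a shared helper for that step follows; the helper name is hypothetical, while the model fields come from the tests themselves.

def delete_one_non_archive_file():
    """Delete the local copy of the first downloaded, non-archive OriginalFile.

    Sketch only; mirrors the inline loops in these tests. Returns the
    OriginalFile whose local copy was removed, or None if nothing was found.
    """
    for original_file in OriginalFile.objects.filter(is_downloaded=True, is_archive=False):
        original_file.delete_local_file()
        return original_file
    return None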
Example #6
    def test_geo_celgz_redownloading(self):
        """Survey, download, then process an experiment we know is Affymetrix.

        Each of the experiment's samples is in its own .cel.gz
        file, which is another way we expect GEO data to come.

        This is another test that uses Aspera, so unfortunately it
        cannot be made to run without relying on NCBI's Aspera server.
        """
        self.env = EnvironmentVarGuard()
        self.env.set("RUNING_IN_CLOUD", "False")
        with self.env:
            # Clear out pre-existing work dirs so there are no conflicts:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            # Prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="MUS_MUSCULUS",
                                taxonomy_id=10090,
                                is_scientific_name=True)
            organism.save()

            accession_code = "GSE100388"
            survey_job = surveyor.survey_experiment(accession_code, "GEO")

            SAMPLES_IN_EXPERIMENT = 15

            self.assertTrue(survey_job.success)

            # This experiment's samples each have their own file so
            # they each get their own downloader job.
            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), SAMPLES_IN_EXPERIMENT)

            logger.info(
                "Survey Job finished, waiting for Downloader Jobs to complete."
            )

            # We're going to spin as fast as we can so we can delete
            # the file in between when a downloader job finishes and
            # the processor job starts.
            start_time = timezone.now()
            file_deleted = False
            while not file_deleted and timezone.now() - start_time < MAX_WAIT_TIME:
                non_archive_files = OriginalFile.objects.filter(
                    is_archive=False)
                for original_file in non_archive_files:
                    if original_file.absolute_file_path and os.path.exists(
                            original_file.absolute_file_path):
                        os.remove(original_file.absolute_file_path)
                        file_deleted = True
                        break

            # Wait for each of the DownloaderJobs to finish
            for downloader_job in downloader_jobs:
                downloader_job = wait_for_job(downloader_job, DownloaderJob,
                                              start_time)
                self.assertTrue(downloader_job.success)

            try:
                doomed_processor_job = original_file.processor_jobs.all()[0]
            except Exception:
                # The doomed job may be aborted before we can get
                # it. This is fine, we just can't look at it.
                doomed_processor_job = None

            if doomed_processor_job:
                logger.info(
                    "Waiting on processor Nomad job %s to fail because it realized it is missing a file.",
                    doomed_processor_job.nomad_job_id,
                )

                start_time = timezone.now()
                doomed_processor_job = wait_for_job(doomed_processor_job,
                                                    ProcessorJob, start_time)
                self.assertTrue(doomed_processor_job.abort)

            # The processor job that had a missing file will have
            # recreated its DownloaderJob, which means there should
            # now be SAMPLES_IN_EXPERIMENT + 1 downloader jobs.
            downloader_jobs = DownloaderJob.objects.all().order_by("-id")
            self.assertEqual(downloader_jobs.count(),
                             SAMPLES_IN_EXPERIMENT + 1)

            # However DownloaderJobs don't get queued immediately, so
            # we have to run a foreman function to make it happen:
            retry_lost_downloader_jobs()

            # And we can check that the most recently created
            # DownloaderJob was successful as well:
            recreated_job = downloader_jobs[0]
            recreated_job.refresh_from_db()
            logger.info("Waiting on downloader Nomad job %s",
                        recreated_job.nomad_job_id)
            recreated_job = wait_for_job(recreated_job, DownloaderJob,
                                         start_time)
            self.assertTrue(recreated_job.success)

            # And finally we can make sure that all of the processor
            # jobs were successful, including the one that got
            # recreated. The processor job that recreated that job has
            # abort=True
            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            processor_jobs = ProcessorJob.objects.all().exclude(
                abort=True)  # exclude aborted jobs
            for processor_job in processor_jobs:
                processor_job = wait_for_job(processor_job, ProcessorJob,
                                             start_time)
                self.assertTrue(processor_job.success)

            self.assertEqual(processor_jobs.count(), SAMPLES_IN_EXPERIMENT)
Example #7
    def test_geo_archive_redownloading(self):
        """Survey, download, then process an experiment we know is NO_OP.

        All the data for the experiment are in the same archive, which
        is one of the ways we expect GEO data to come.

        This is another test that uses Aspera, so unfortunately it
        cannot be made to run without relying on NCBI's Aspera server.
        """
        # Clear out pre-existing work dirs so there are no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set("RUNNING_IN_CLOUD", "False")
        with self.env:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            # Prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
            organism.save()

            accession_code = "GSE102571"
            survey_job = surveyor.survey_experiment(accession_code, "GEO")

            self.assertTrue(survey_job.success)

            # This experiment has multiple samples that are contained in the
            # same archive, so only one job is needed.
            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), 1)

            logger.info(
                "Survey Job finished, waiting for Downloader Job with Nomad ID %s to complete.",
                downloader_jobs[0].nomad_job_id,
            )

            # We're going to spin as fast as we can so we can delete
            # the file in between when the downloader job finishes and
            # the processor job starts.
            start_time = timezone.now()
            file_deleted = False
            while not file_deleted and timezone.now() - start_time < MAX_WAIT_TIME:
                non_archive_files = OriginalFile.objects.filter(
                    is_archive=False)
                for original_file in non_archive_files:
                    if original_file.absolute_file_path and os.path.exists(
                            original_file.absolute_file_path):
                        os.remove(original_file.absolute_file_path)
                        file_deleted = True
                        break

            downloader_job = wait_for_job(downloader_jobs[0], DownloaderJob,
                                          start_time)
            self.assertTrue(downloader_job.success)

            try:
                doomed_processor_job = original_file.processor_jobs.all()[0]
            except Exception:
                # The doomed job may be aborted before we can get
                # it. This is fine, we just can't look at it.
                doomed_processor_job = None

            if doomed_processor_job:
                logger.info(
                    "Waiting on processor Nomad job %s to fail because it realized it is missing a file.",
                    doomed_processor_job.nomad_job_id,
                )

                start_time = timezone.now()
                doomed_processor_job = wait_for_job(doomed_processor_job,
                                                    ProcessorJob, start_time)
                self.assertTrue(doomed_processor_job.abort)

            # The processor job that had a missing file will have
            # recreated its DownloaderJob, which means there should now be two.
            downloader_jobs = DownloaderJob.objects.all().order_by("-id")
            self.assertEqual(downloader_jobs.count(), 2)

            # However DownloaderJobs don't get queued immediately, so
            # we have to run a foreman function to make it happen:
            retry_lost_downloader_jobs()

            # And we can check that the most recently created
            # DownloaderJob was successful as well:
            recreated_job = downloader_jobs[0]
            recreated_job.refresh_from_db()
            logger.info("Waiting on downloader Nomad job %s",
                        recreated_job.nomad_job_id)
            recreated_job = wait_for_job(recreated_job, DownloaderJob,
                                         start_time)
            self.assertTrue(recreated_job.success)

            # And finally we can make sure that all of the
            # processor jobs were successful, including the one that
            # got recreated.
            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            processor_jobs = ProcessorJob.objects.all().exclude(
                abort=True)  # exclude aborted processor jobs
            for processor_job in processor_jobs:
                processor_job = wait_for_job(processor_job, ProcessorJob,
                                             start_time)
                if not processor_job.success:
                    logger.error(processor_job.failure_reason)
                self.assertTrue(processor_job.success)

            # Apparently this experiment has a variable number of
            # files, as GEO processed experiments sometimes do. That's
            # okay because there's at least one file per sample, so
            # each sample will get processed at least once, which is
            # the best we can do given the state of GEO. This is why
            # we compare against the number of samples rather than a
            # hard-coded count of files we expect the experiment to
            # have.
            self.assertEqual(processor_jobs.count(),
                             Sample.objects.all().count())
Example #8
    def test_array_express_redownloading(self, mock_surveyor):
        """Survey, download, then process an experiment we know is NO_OP."""

        mock_surveyor.side_effect = build_surveyor_init_mock("ARRAY_EXPRESS")
        # Clear out pre-existing work dirs so there are no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set("RUNNING_IN_CLOUD", "False")
        with self.env:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            # Prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
            organism.save()

            NUM_SAMPLES_IN_EXPERIMENT = 12
            accession_code = "E-GEOD-3303"
            survey_job = surveyor.survey_experiment(accession_code,
                                                    "ARRAY_EXPRESS")

            self.assertTrue(survey_job.success)

            # All of this experiment's samples are contained in the
            # same archive, so only one job is needed.
            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), 1)

            logger.info(
                "Survey Job finished, waiting for Downloader Jobs to complete."
            )
            start_time = timezone.now()
            # We want to try and delete the file as quickly as
            # possible, so pass a short loop time and let the waiting
            # loop spin really fast so we lose as little time as
            # possible.
            downloader_job = wait_for_job(downloader_jobs[0], DownloaderJob,
                                          start_time, 0.1)
            self.assertTrue(downloader_job.success)

            # Now we're going to delete one of the extracted files but not the other.
            deleted_file = OriginalFile.objects.filter(
                is_archive=False).first()
            self.assertIsNotNone(deleted_file)
            deleted_file.delete_local_file()

            # The one downloader job should have extracted all the files
            # and created as many processor jobs.
            processor_jobs = ProcessorJob.objects.all()
            self.assertEqual(processor_jobs.count(), NUM_SAMPLES_IN_EXPERIMENT)

            doomed_processor_job = deleted_file.processor_jobs.all()[0]
            logger.info(
                "Waiting on processor Nomad job %s to fail because it realized it is missing a file.",
                doomed_processor_job.nomad_job_id,
            )

            start_time = timezone.now()
            doomed_processor_job = wait_for_job(doomed_processor_job,
                                                ProcessorJob, start_time)
            self.assertTrue(doomed_processor_job.abort)

            # The processor job that had a missing file will have
            # recreated its DownloaderJob, which means there should now be two.
            downloader_jobs = DownloaderJob.objects.all().order_by("-id")
            self.assertEqual(downloader_jobs.count(), 2)

            # However DownloaderJobs don't get queued immediately, so
            # we have to run a foreman function to make it happen:
            retry_lost_downloader_jobs()

            # And we can check that the most recently created
            # DownloaderJob was successful as well:
            recreated_job = downloader_jobs[0]
            recreated_job.refresh_from_db()
            logger.info("Waiting on downloader Nomad job %s",
                        recreated_job.nomad_job_id)
            recreated_job = wait_for_job(recreated_job, DownloaderJob,
                                         start_time)
            self.assertTrue(recreated_job.success)

            # Once the Downloader job succeeds, it should create one
            # and only one processor job, after which the total goes back up
            # to NUM_SAMPLES_IN_EXPERIMENT:
            processor_jobs = ProcessorJob.objects.all().exclude(
                abort=True)  # exclude aborted processor jobs
            logger.error(processor_jobs)
            self.assertEqual(processor_jobs.count(), NUM_SAMPLES_IN_EXPERIMENT)

            # And finally we can make sure that all of the
            # processor jobs were successful, including the one that
            # got recreated.
            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            for processor_job in processor_jobs:
                processor_job = wait_for_job(processor_job, ProcessorJob,
                                             start_time)
                self.assertTrue(processor_job.success)