Example 1
def run_surveyor_for_accession(accession: str) -> None:
    """Chooses the correct surveyor based on the pattern of the accession"""
    if "GSE" in accession[:3]:
        surveyor.survey_experiment(accession, "GEO")
    elif "E-" in accession[:2]:
        surveyor.survey_experiment(accession, "ARRAY_EXPRESS")
    elif " " in accession:
        args = accession.split(",")
        # Allow organism to be unspecified so we survey the entire division.
        organism = args[0] if len(args[0]) > 0 else None
        if len(args) > 1:
            division = args[1].strip()
        else:
            division = "Ensembl"
        surveyor.survey_transcriptome_index(organism, division)
    else:
        surveyor.survey_experiment(accession, "SRA")
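
The dispatch rules above can be exercised directly. The snippet below is a minimal sketch, assuming run_surveyor_for_accession is importable (the module path shown is hypothetical) and patching the surveyor module so no real surveys run; it only checks which surveyor each accession pattern selects.

from unittest import mock

# Hypothetical module path; adjust to wherever run_surveyor_for_accession lives.
from foreman.surveyor_dispatch import run_surveyor_for_accession


@mock.patch("foreman.surveyor_dispatch.surveyor")
def check_dispatch(mock_surveyor):
    run_surveyor_for_accession("GSE12417")
    mock_surveyor.survey_experiment.assert_called_with("GSE12417", "GEO")

    run_surveyor_for_accession("E-MTAB-3050")
    mock_surveyor.survey_experiment.assert_called_with("E-MTAB-3050", "ARRAY_EXPRESS")

    # "<organism>, <division>" routes to the transcriptome index surveyor.
    run_surveyor_for_accession("Caenorhabditis elegans, Ensembl")
    mock_surveyor.survey_transcriptome_index.assert_called_with(
        "Caenorhabditis elegans", "Ensembl")

    # Anything else falls through to SRA.
    run_surveyor_for_accession("SRP047410")
    mock_surveyor.survey_experiment.assert_called_with("SRP047410", "SRA")


check_dispatch()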
Example 2
    def test_transcriptome_redownloading(self, mock_surveyor):
        """Survey, download, then process a transcriptome index. """

        mock_surveyor.side_effect = build_surveyor_init_mock(
            "TRANSCRIPTOME_INDEX")

        # Clear out pre-existing work dirs so there are no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set("RUNNING_IN_CLOUD", "False")
        with self.env:
            # I'm not sure why, but sometimes there are already downloader jobs
            # in the database from previous tests even though they should be
            # removed, so pause a bit
            time.sleep(10)
            downloader_jobs = DownloaderJob.objects.all()
            for job in downloader_jobs:
                print(job)
                print(job.accession_code)
            self.assertEqual(downloader_jobs.count(), 0)

            for length in ["LONG", "SHORT"]:
                work_dir_glob = (LOCAL_ROOT_DIR + "/Caenorhabditis_elegans/" +
                                 length + "/processor_job_*")
                for work_dir in glob.glob(work_dir_glob):
                    shutil.rmtree(work_dir)

            # Prevent a call from being made to NCBI's API to determine
            # the organism name/id.
            organism = Organism(name="CAENORHABDITIS_ELEGANS",
                                taxonomy_id=6239,
                                is_scientific_name=True)
            organism.save()

            # Make sure that we can delete the file before the processors begin
            # by preventing the downloaders from dispatching processor jobs
            # automatically. We send those jobs manually later.
            no_dispatch = EnvironmentVarGuard()
            no_dispatch.set("AUTO_DISPATCH_NOMAD_JOBS", "False")
            with no_dispatch:
                survey_job = surveyor.survey_transcriptome_index(
                    "Caenorhabditis elegans", "Ensembl")

            self.assertTrue(survey_job.success)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), 1)

            logger.info(
                "Survey Job finished, waiting for Downloader Job with Nomad ID %s to complete.",
                downloader_jobs[0].nomad_job_id,
            )

            downloader_job = wait_for_job(downloader_jobs[0], DownloaderJob,
                                          timezone.now())
            self.assertTrue(downloader_job.success)

            og_file_to_delete = OriginalFile.objects.all()[0]
            os.remove(og_file_to_delete.absolute_file_path)

            processor_jobs = ProcessorJob.objects.all()
            for processor_job in processor_jobs:
                # FIXME: we run these in serial because of
                # https://github.com/AlexsLemonade/refinebio/issues/2321
                send_job(
                    ProcessorPipeline[processor_job.pipeline_applied],
                    job=processor_job,
                    is_dispatch=True,
                )
                try:
                    wait_for_job(processor_job, ProcessorJob, timezone.now())
                except Exception:
                    pass

            # The processor job that had a missing file will have
            # recreated its DownloaderJob, which means there should now be two.
            downloader_jobs = DownloaderJob.objects.all().order_by("-id")
            self.assertEqual(downloader_jobs.count(), 2)

            # However, DownloaderJobs don't get queued immediately, so
            # we have to run a foreman function to make it happen:
            retry_lost_downloader_jobs()

            # And we can check that the most recently created
            # DownloaderJob was successful as well:
            recreated_job = downloader_jobs[0]
            recreated_job.refresh_from_db()
            logger.info("Waiting on downloader Nomad job %s",
                        recreated_job.nomad_job_id)
            recreated_job = wait_for_job(recreated_job, DownloaderJob,
                                         timezone.now())
            self.assertTrue(recreated_job.success)

            # Once the Downloader job succeeds, it should create two
            # processor jobs, one for long and one for short indices:
            processor_jobs = ProcessorJob.objects.all()
            self.assertEqual(processor_jobs.count(), 4)

            # Wait for the processor jobs to be dispatched
            time.sleep(15)

            # And finally we can make sure that processor jobs for both
            # lengths were successful, including the recreated ones.
            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            successful_processor_jobs = []
            for processor_job in processor_jobs:
                processor_job.refresh_from_db()
                # One of the calls to wait_for_job will fail if the
                # job aborts before we have selected all the
                # processor jobs.
                processor_job = wait_for_job(processor_job, ProcessorJob,
                                             timezone.now())
                if processor_job.success:
                    successful_processor_jobs.append(processor_job)

            # While one of the original ProcessorJobs will be aborted,
            # it is hard to be sure what will happen to the other
            # because of the race between processor jobs getting
            # started and us deleting the files they need.
            # Therefore, we're just going to verify that one processor
            # job completed successfully for each length, since that
            # is the main thing we need.
            has_long = False
            has_short = False
            for processor_job in successful_processor_jobs:
                if processor_job.pipeline_applied == "TRANSCRIPTOME_INDEX_LONG":
                    has_long = True
                elif processor_job.pipeline_applied == "TRANSCRIPTOME_INDEX_SHORT":
                    has_short = True

            self.assertTrue(has_long)
            self.assertTrue(has_short)
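
Both of these tests block on wait_for_job until the dispatched work finishes. refinebio's actual helper isn't reproduced in these examples; the sketch below shows roughly how such a poller could look, assuming the job models expose an end_time field that the workers set on completion and that MAX_WAIT_TIME is a timedelta like the constant referenced in Example 3 below.

import time
from datetime import timedelta

from django.utils import timezone

MAX_WAIT_TIME = timedelta(minutes=15)  # value assumed for illustration


def wait_for_job_sketch(job, job_class, start_time, poll_interval=5):
    """Poll the database until the job records an end_time or we time out."""
    while timezone.now() - start_time < MAX_WAIT_TIME:
        # Re-read the row so we see state written by the worker process.
        job = job_class.objects.get(pk=job.pk)
        if job.end_time is not None:
            return job
        time.sleep(poll_interval)
    raise RuntimeError("Job %s did not finish within MAX_WAIT_TIME" % job.pk)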
Example 3
    def test_transcriptome_redownloading(self):
        """Survey, download, then process a transcriptome index."""
        # Clear out pre-existing work dirs so there are no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set('RUNNING_IN_CLOUD', 'False')
        with self.env:
            for length in ["LONG", "SHORT"]:
                work_dir_glob = LOCAL_ROOT_DIR + "/Caenorhabditis_elegans/" + length + "/processor_job_*"
                for work_dir in glob.glob(work_dir_glob):
                    shutil.rmtree(work_dir)

            # Prevent a call from being made to NCBI's API to determine
            # the organism name/id.
            organism = Organism(name="CAENORHABDITIS_ELEGANS",
                                taxonomy_id=6239,
                                is_scientific_name=True)
            organism.save()

            survey_job = surveyor.survey_transcriptome_index(
                "Caenorhabditis elegans", "Ensembl")

            self.assertTrue(survey_job.success)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), 1)

            logger.info(
                "Survey Job finished, waiting for Downloader Job with Nomad ID %s to complete.",
                downloader_jobs[0].nomad_job_id)
            og_file_to_delete = OriginalFile.objects.all()[0]
            start_time = timezone.now()

            # We're going to spin as fast as we can so we can delete
            # the file in between when the downloader job finishes and
            # the processor job starts.
            while timezone.now() - start_time < MAX_WAIT_TIME:
                og_file_to_delete.refresh_from_db()
                if og_file_to_delete.absolute_file_path and os.path.exists(
                        og_file_to_delete.absolute_file_path):
                    os.remove(og_file_to_delete.absolute_file_path)
                    break

            # We want the file deleted as quickly as possible, which is
            # why the loop above spins without sleeping, so we lose as
            # little time as possible.
            downloader_job = wait_for_job(downloader_jobs[0], DownloaderJob,
                                          start_time)
            self.assertTrue(downloader_job.success)

            start_time = timezone.now()
            processor_jobs = ProcessorJob.objects.all()
            for processor_job in processor_jobs:
                # It's hard to guarantee that we'll be able to delete
                # the files before the first job starts, but since the
                # two jobs don't start at the same time we'll
                # definitely get it before the second one. This is
                # actually desirable for testing, because we should be
                # able to handle it either way.
                try:
                    wait_for_job(processor_job, ProcessorJob, start_time)
                except Exception:
                    pass

            # The processor job that had a missing file will have
            # recreated its DownloaderJob, which means there should now be two.
            downloader_jobs = DownloaderJob.objects.all().order_by('-id')
            self.assertEqual(downloader_jobs.count(), 2)

            # However, DownloaderJobs don't get queued immediately, so
            # we have to run a foreman function to make it happen:
            retry_lost_downloader_jobs()

            # And we can check that the most recently created
            # DownloaderJob was successful as well:
            recreated_job = downloader_jobs[0]
            recreated_job.refresh_from_db()
            logger.info("Waiting on downloader Nomad job %s",
                        recreated_job.nomad_job_id)
            recreated_job = wait_for_job(recreated_job, DownloaderJob,
                                         start_time)
            self.assertTrue(recreated_job.success)

            # Once the Downloader job succeeds, it should create one
            # and only one new processor job, bringing the total up to 3:
            processor_jobs = ProcessorJob.objects.all()
            self.assertEqual(processor_jobs.count(), 3)

            # And finally we can make sure that both of the
            # processor jobs were successful, including the one that
            # got recreated.
            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            successful_processor_jobs = []
            for processor_job in processor_jobs:
                # One of the calls to wait_for_job will fail if the
                # job deletes itself before we have selected all the
                # processor jobs.
                try:
                    processor_job = wait_for_job(processor_job, ProcessorJob,
                                                 start_time)
                    if processor_job.success:
                        successful_processor_jobs.append(processor_job)
                except Exception:
                    pass

            # While one of the original ProcessorJobs will definitely
            # delete itself, it is hard to be sure what will happen
            # to the other because of the race between processor jobs
            # getting started and us deleting the files they need.
            # Therefore, we're just going to verify that one processor
            # job completed successfully for each length, since that
            # is the main thing we need.
            has_long = False
            has_short = False
            for processor_job in successful_processor_jobs:
                if processor_job.pipeline_applied == "TRANSCRIPTOME_INDEX_LONG":
                    has_long = True
                elif processor_job.pipeline_applied == "TRANSCRIPTOME_INDEX_SHORT":
                    has_short = True

            self.assertTrue(has_long)
            self.assertTrue(has_short)
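
Both versions of the test call retry_lost_downloader_jobs() because the recreated DownloaderJob isn't queued immediately. The foreman's real implementation isn't shown in these examples; the sketch below only illustrates the idea, under assumed field names (start_time, success) and an assumed Downloaders lookup that mirrors the ProcessorPipeline[...] usage in Example 2.

# DownloaderJob, Downloaders, and send_job are assumed to be importable from
# the project's model and messaging modules (paths omitted here).


def retry_lost_downloader_jobs_sketch():
    """Requeue downloader jobs that were created but never started."""
    # Assumed schema: a job with no start_time and no success flag was
    # recorded in the database but never picked up by a worker.
    lost_jobs = DownloaderJob.objects.filter(
        start_time__isnull=True,
        success__isnull=True,
    )
    for job in lost_jobs:
        # Dispatch the job again; the Downloaders[...] enum lookup is an
        # assumption, not confirmed by the examples above.
        send_job(Downloaders[job.downloader_task], job=job, is_dispatch=True)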