def handle(self, *args, **options):
    # Create all the dummy data that would have been created
    # before a downloader job could have been generated.
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()

    batch = Batch(survey_job=survey_job,
                  source_type="ARRAY_EXPRESS",
                  pipeline_required="AFFY_TO_PCL",
                  platform_accession_code="A-AFFY-141",
                  experiment_accession_code="E-GEOD-59071",
                  experiment_title="It doesn't really matter.",
                  organism_id=9606,
                  organism_name="HOMO SAPIENS",
                  release_date="2017-05-05",
                  last_uploaded_date="2017-05-05",
                  status=BatchStatuses.NEW.value)
    batch.save()

    file = File(
        batch=batch,
        size_in_bytes=0,
        download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
        raw_format="CEL",
        processed_format="PCL",
        name="GSM1426072_CD_colon_active_2.CEL",
        internal_location="A-AFFY-141/AFFY_TO_PCL")
    file.save()

    downloader_job = DownloaderJob.create_job_and_relationships(batches=[batch])
    send_job(Downloaders["ARRAY_EXPRESS"], downloader_job.id)

def create_quantpendia(organisms, organisms_exclude):
    all_organisms = Organism.objects.all()
    if organisms:
        organisms = organisms.upper().replace(" ", "_").split(",")
        all_organisms = all_organisms.filter(name__in=organisms)
    if organisms_exclude:
        organisms = organisms_exclude.upper().replace(" ", "_").split(",")
        all_organisms = all_organisms.exclude(name__in=organisms)

    logger.debug("Generating quantpendia for organisms", organisms=all_organisms)

    created_jobs = []
    for organism in all_organisms:
        # Only generate the quantpendia for organisms that have some samples
        # with quant.sf files.
        has_quantsf_files = organism.sample_set.filter(
            technology="RNA-SEQ", results__computedfile__filename="quant.sf"
        ).exists()
        if not has_quantsf_files:
            continue

        job = create_job_for_organism(organism)
        logger.info(
            "Sending compendia job for Organism", job_id=str(job.pk), organism=str(organism)
        )
        send_job(ProcessorPipeline.CREATE_QUANTPENDIA, job)
        created_jobs.append(job)

    return created_jobs

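# Illustrative usage sketch only (not part of the codebase): create_quantpendia
# uppercases organism names and replaces spaces with underscores itself, so the
# include/exclude arguments can be passed as comma-separated names. The organism
# names below are just examples.
#
#     jobs = create_quantpendia("homo sapiens,mus musculus", None)  # include only these
#     jobs = create_quantpendia(None, "danio rerio")                # everything except zebrafish
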
def create_long_and_short_processor_jobs(files_to_process):
    """Creates two processor jobs for the files needed for this transcriptome."""
    processor_job_long = ProcessorJob()
    processor_job_long.pipeline_applied = "TRANSCRIPTOME_INDEX_LONG"
    processor_job_long.ram_amount = 8192
    processor_job_long.save()

    for original_file in files_to_process:
        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_long
        assoc.save()

    send_job(ProcessorPipeline[processor_job_long.pipeline_applied], processor_job_long)

    processor_job_short = ProcessorJob()
    processor_job_short.pipeline_applied = "TRANSCRIPTOME_INDEX_SHORT"
    processor_job_short.ram_amount = 8192
    processor_job_short.save()

    for original_file in files_to_process:
        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_short
        assoc.save()

    send_job(ProcessorPipeline[processor_job_short.pipeline_applied], processor_job_short)

def create_compendia(svd_algorithm, organisms):
    """Create a compendium for one or more organisms."""
    svd_algorithm_choices = ["ARPACK", "RANDOMIZED", "NONE"]
    if svd_algorithm and svd_algorithm not in svd_algorithm_choices:
        raise Exception(
            "Invalid svd_algorithm option provided. Possible values are "
            + str(svd_algorithm_choices))

    svd_algorithm = svd_algorithm or "ARPACK"

    target_organisms = get_target_organisms(organisms)
    grouped_organisms = group_organisms_by_biggest_platform(target_organisms)

    logger.debug("Generating compendia for organisms",
                 organism_groups=str(grouped_organisms))

    created_jobs = []
    for organism in grouped_organisms:
        job = create_job_for_organism(organism, svd_algorithm)
        logger.info("Sending compendia job for Organism",
                    job_id=str(job.pk),
                    organism=str(organism))
        send_job(ProcessorPipeline.CREATE_COMPENDIA, job)
        created_jobs.append(job)

    return created_jobs

def create_processor_job_for_original_files(original_files: List[OriginalFile],
                                            volume_index: int):
    """
    Create a processor job and queue a processor task for a sample related to an experiment.
    """
    # If there are no original files then we've created all the jobs we need to!
    if len(original_files) == 0:
        return

    # For anything that has raw data there should only be one Sample per OriginalFile.
    sample_object = original_files[0].samples.first()
    pipeline_to_apply = determine_processor_pipeline(sample_object, original_files[0])

    if pipeline_to_apply == ProcessorPipeline.NONE:
        logger.info("No valid processor pipeline found to apply to sample.",
                    sample=sample_object.id,
                    original_file=original_files[0].id)
        for original_file in original_files:
            original_file.delete_local_file()
            original_file.is_downloaded = False
            original_file.save()
    else:
        processor_job = ProcessorJob()
        processor_job.pipeline_applied = pipeline_to_apply.value
        processor_job.ram_amount = determine_ram_amount(sample_object, processor_job)
        processor_job.volume_index = volume_index
        processor_job.save()

        for original_file in original_files:
            assoc = ProcessorJobOriginalFileAssociation()
            assoc.original_file = original_file
            assoc.processor_job = processor_job
            assoc.save()

        logger.debug("Queuing processor job.", processor_job=processor_job.id)

        send_job(pipeline_to_apply, processor_job)

def handle(self, *args, **options):
    """Create a quantpendia for one or more organisms."""
    all_organisms = Organism.objects.all()
    if options["organisms"] is not None:
        organisms = options["organisms"].upper().replace(" ", "_").split(",")
        all_organisms = all_organisms.filter(name__in=organisms)
    if options["organisms_exclude"]:
        organisms = options["organisms_exclude"].upper().replace(" ", "_").split(",")
        all_organisms = all_organisms.exclude(name__in=organisms)

    logger.debug("Generating quantpendia for organisms", organisms=all_organisms)

    for organism in all_organisms:
        # Only generate the quantpendia for organisms that have some samples
        # with quant.sf files.
        has_quantsf_files = organism.sample_set.filter(
            technology="RNA-SEQ",
            results__computedfile__filename="quant.sf").exists()
        if not has_quantsf_files:
            continue

        job = create_job_for_organism(organism)
        logger.info("Sending compendia job for Organism",
                    job_id=str(job.pk),
                    organism=str(organism))
        send_job(ProcessorPipeline.CREATE_QUANTPENDIA, job)

    sys.exit(0)

def run_tximport():
    """Creates a tximport job for all eligible experiments."""
    eligible_experiments = (Experiment.objects.annotate(
        num_organisms=Count("organisms")).filter(
            num_organisms=1,
            technology="RNA-SEQ",
            num_processed_samples=0).prefetch_related("samples__results"))

    paginator = Paginator(eligible_experiments, PAGE_SIZE)
    page = paginator.page()

    # Next is to figure out how many samples were processed for
    # each experiment. Should be able to reuse code from salmon
    # because it does this stuff.
    tximport_pipeline = ProcessorPipeline.TXIMPORT

    while True:
        creation_count = 0

        for experiment in page.object_list:
            quant_results = get_quant_results_for_experiment(experiment)

            if should_run_tximport(experiment, quant_results, True):
                processor_job = ProcessorJob()
                processor_job.pipeline_applied = tximport_pipeline.value
                processor_job.ram_amount = 8192
                # This job doesn't need to run on a specific volume
                # but it uses the same Nomad job as Salmon jobs which
                # do require the volume index.
                processor_job.volume_index = random.choice(list(get_active_volumes()))
                processor_job.save()

                assoc = ProcessorJobOriginalFileAssociation()
                # Any original file linked to any sample of the
                # experiment will work. Tximport is somewhat special
                # in that it doesn't actually use original files so
                # this is just used to point to the experiment.
                assoc.original_file = experiment.samples.all()[0].original_files.all()[0]
                assoc.processor_job = processor_job
                assoc.save()

                creation_count += 1

                try:
                    send_job(tximport_pipeline, processor_job)
                except Exception:
                    # If we cannot queue the job now the Foreman will do
                    # it later.
                    pass

        logger.info("Created %d tximport jobs for experiments past the thresholds.",
                    creation_count)

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

def queue_downloader_job_for_original_files(
    self,
    original_files: List[OriginalFile],
    experiment_accession_code: str = None,
    is_transcriptome: bool = False,
):
    """Creates a single DownloaderJob with multiple files to download."""
    # Transcriptome is a special case because there's no sample_object.
    # It's alright to re-process transcriptome indices.
    if is_transcriptome:
        downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
    else:
        source_urls = [original_file.source_url for original_file in original_files]

        # Check whether there is already a downloader job associated with these files.
        old_assocs_count = DownloaderJobOriginalFileAssociation.objects.filter(
            original_file__source_url__in=source_urls).count()
        if old_assocs_count > 0:
            logger.debug("We found an existing DownloaderJob for these urls.",
                         source_urls=source_urls)
            return False

        sample_object = original_files[0].samples.first()
        downloader_task = job_lookup.determine_downloader_task(sample_object)

    if downloader_task == job_lookup.Downloaders.NONE:
        logger.info(
            "No valid downloader task found for sample.",
            sample=sample_object.id,
            original_file=original_files[0].id,
        )
    else:
        downloader_job = DownloaderJob()
        downloader_job.downloader_task = downloader_task.value
        downloader_job.accession_code = experiment_accession_code
        downloader_job.save()

        downloaded_urls = []
        for original_file in original_files:
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=downloader_job, original_file=original_file)
            downloaded_urls.append(original_file.source_url)

        try:
            logger.info(
                "Queuing downloader job.",
                survey_job=self.survey_job.id,
                downloader_job=downloader_job.id,
                downloaded_urls=downloaded_urls,
            )
            message_queue.send_job(downloader_task, downloader_job)
        except Exception:
            # If we fail to queue the job, it will be requeued.
            pass

def queue_downloader_job_for_original_files(self,
                                            original_files: List[OriginalFile],
                                            experiment_accession_code: str = None,
                                            is_transcriptome: bool = False):
    """Creates a single DownloaderJob with multiple files to download."""
    source_urls = [original_file.source_url for original_file in original_files]

    # Check whether there is already a downloader job associated with these files.
    old_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        original_file__source_url__in=source_urls)
    if len(old_assocs) > 0:
        logger.debug("We found an existing DownloaderJob for these urls.",
                     source_urls=source_urls)
        return False

    # Transcriptome is a special case because there's no sample_object.
    if is_transcriptome:
        downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
    else:
        sample_object = original_files[0].samples.first()
        downloader_task = job_lookup.determine_downloader_task(sample_object)

    if downloader_task == job_lookup.Downloaders.NONE:
        logger.info("No valid downloader task found for sample.",
                    sample=sample_object.id,
                    original_file=original_files[0].id)
    else:
        downloader_job = DownloaderJob()
        downloader_job.downloader_task = downloader_task.value
        downloader_job.accession_code = experiment_accession_code
        downloader_job.save()

        downloaded_urls = []
        for original_file in original_files:
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=downloader_job, original_file=original_file)
            downloaded_urls.append(original_file.source_url)

        try:
            logger.info("Queuing downloader job.",
                        survey_job=self.survey_job.id,
                        downloader_job=downloader_job.id,
                        downloaded_urls=downloaded_urls)
            message_queue.send_job(downloader_task, downloader_job)
        except Exception as e:
            # If the task doesn't get sent we don't want the
            # downloader_job to be left floating.
            logger.exception("Failed to enqueue downloader job.",
                             survey_job=self.survey_job.id,
                             downloader_job=downloader_job.id,
                             error=str(e))
            downloader_job.success = False
            downloader_job.failure_reason = str(e)
            downloader_job.save()

def run_transcriptome_processor(self):
    # Create all the dummy data that would have been created
    # before a processor job could have been generated.
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="TRANSCRIPTOME_INDEX",
        pipeline_required="TRANSCRIPTOME_INDEX",
        platform_accession_code="EnsemblPlants",
        experiment_accession_code="aegilops_tauschii",
        experiment_title="It doesn't really matter.",
        organism_id=37682,
        organism_name="AEGILOPS TAUSCHII",
        release_date="2017-11-02",
        last_uploaded_date="2017-11-02",
        status=BatchStatuses.DOWNLOADED.value,
    )
    batch.save()

    kmer_size_property = BatchKeyValue(batch=batch, key="kmer_size", value="31")
    kmer_size_property.save()

    gtf_file = File(
        name="aegilops_tauschii_short.gtf.gz",
        download_url=("ftp://ftp.ensemblgenomes.org/pub/release-37/plants/gtf"
                      "/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.37.gtf.gz"),
        raw_format="gtf.gz",
        processed_format="tar.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        size_in_bytes=-1,
        batch=batch)
    gtf_file.save()

    fasta_file = File(
        name="aegilops_tauschii_short.fa.gz",
        download_url=("ftp://ftp.ensemblgenomes.org/pub/release-37/plants/fasta"
                      "/aegilops_tauschii/dna/Aegilops_tauschii."
                      "ASM34733v1.dna.toplevel.fa.gz"),
        raw_format="fa.gz",
        processed_format="tar.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        size_in_bytes=-1,
        batch=batch)
    fasta_file.save()

    processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])
    logger.info("Queuing a processor job.")
    send_job(ProcessorPipeline[batch.pipeline_required], processor_job.id)

def send_janitor_jobs():
    """Dispatch a Janitor job for each job queue.

    TODO: make this dispatch janitor jobs for all job queues.
    https://github.com/AlexsLemonade/refinebio/issues/2789
    """
    new_job = ProcessorJob(num_retries=0, pipeline_applied="JANITOR", ram_amount=2048)
    new_job.save()
    logger.info("Sending Janitor Job.", job_id=new_job.id)
    try:
        send_job(ProcessorPipeline["JANITOR"], job=new_job, is_dispatch=True)
    except Exception:
        # If we can't dispatch this job, something else has gone wrong; we can get it next loop.
        return

def retry_unqueued_survey_jobs() -> None:
    """Retry survey jobs that never made it into the Batch job queue."""
    potentially_lost_jobs = SurveyJob.unqueued_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("created_at")
    paginator = Paginator(potentially_lost_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No failed jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()
    if queue_capacity <= 0:
        logger.info("Not handling unqueued survey jobs "
                    "because there is no capacity for them.")

    while queue_capacity > 0:
        for survey_job in database_page.object_list:
            if send_job(SurveyJobTypes.SURVEYOR, job=survey_job, is_dispatch=True):
                queue_capacity -= 1
            else:
                # Can't communicate with Batch just now, leave the job for a later loop.
                break

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break

def requeue_processor_job(last_job: ProcessorJob) -> None:
    """Queues a new processor job.

    The new processor job will have num_retries one greater than
    last_job.num_retries.
    """
    num_retries = last_job.num_retries + 1

    # The Salmon pipeline is quite RAM-sensitive.
    # Try it again with an increased RAM amount, if possible.
    new_ram_amount = last_job.ram_amount

    # These initial values are set in common/job_lookup.py:determine_ram_amount
    if last_job.pipeline_applied == "SALMON":
        if new_ram_amount == 12288:
            new_ram_amount = 16384
        elif new_ram_amount == 16384:
            new_ram_amount = 32768
    # The AFFY pipeline is somewhat RAM-sensitive.
    # Try it again with an increased RAM amount, if possible.
    elif last_job.pipeline_applied == "AFFY_TO_PCL":
        if new_ram_amount == 2048:
            new_ram_amount = 4096
        elif new_ram_amount == 4096:
            new_ram_amount = 8192

    new_job = ProcessorJob(num_retries=num_retries,
                           pipeline_applied=last_job.pipeline_applied,
                           ram_amount=new_ram_amount,
                           volume_index=last_job.volume_index)
    new_job.save()

    for original_file in last_job.original_files.all():
        ProcessorJobOriginalFileAssociation.objects.get_or_create(
            processor_job=new_job, original_file=original_file)

    for dataset in last_job.datasets.all():
        ProcessorJobDatasetAssociation.objects.get_or_create(
            processor_job=new_job, dataset=dataset)

    try:
        logger.debug(
            "Requeuing Processor Job which had ID %d with a new Processor Job with ID %d.",
            last_job.id, new_job.id)
        if send_job(ProcessorPipeline[last_job.pipeline_applied],
                    job=new_job,
                    is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with Nomad just now, leave the job for a later loop.
            new_job.delete()
    except Exception:
        logger.error(
            "Failed to requeue Processor Job which had ID %d with a new Processor Job with ID %d.",
            last_job.id, new_job.id)
        # Can't communicate with Nomad just now, leave the job for a later loop.
        new_job.delete()

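# Illustrative sketch only (not part of the codebase): the retry RAM escalation
# used in requeue_processor_job above can be restated as a small lookup helper.
# The helper name and table below are hypothetical.
RAM_RETRY_TIERS = {
    "SALMON": {12288: 16384, 16384: 32768},
    "AFFY_TO_PCL": {2048: 4096, 4096: 8192},
}


def bumped_ram_amount(pipeline_applied: str, current_ram: int) -> int:
    """Return the next RAM tier for a retried job, or the current amount if the
    pipeline is not RAM-sensitive or is already at its top tier."""
    return RAM_RETRY_TIERS.get(pipeline_applied, {}).get(current_ram, current_ram)
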
def create_processor_jobs_for_original_files(original_files: List[OriginalFile],
                                             downloader_job: DownloaderJob = None):
    """
    Create processor jobs and queue a processor task for samples related to an experiment.
    """
    for original_file in original_files:
        sample_object = original_file.samples.first()

        if not delete_if_blacklisted(original_file):
            continue

        pipeline_to_apply = determine_processor_pipeline(sample_object, original_file)

        if pipeline_to_apply == ProcessorPipeline.NONE:
            logger.info("No valid processor pipeline found to apply to sample.",
                        sample=sample_object.id,
                        original_file=original_file.id)
            original_file.delete_local_file()
            original_file.is_downloaded = False
            original_file.save()
        else:
            processor_job = ProcessorJob()
            processor_job.pipeline_applied = pipeline_to_apply.value
            processor_job.ram_amount = determine_ram_amount(sample_object, processor_job)
            processor_job.save()

            assoc = ProcessorJobOriginalFileAssociation()
            assoc.original_file = original_file
            assoc.processor_job = processor_job
            assoc.save()

            if downloader_job:
                logger.debug("Queuing processor job.",
                             processor_job=processor_job.id,
                             original_file=original_file.id,
                             downloader_job=downloader_job.id)
            else:
                logger.debug("Queuing processor job.",
                             processor_job=processor_job.id,
                             original_file=original_file.id)

            send_job(pipeline_to_apply, processor_job)

def create_long_and_short_processor_jobs(downloader_job, long_files_to_process,
                                         short_files_to_process):
    """Creates two processor jobs for the files needed for this transcriptome."""
    processor_job_long = ProcessorJob()
    processor_job_long.downloader_job = downloader_job
    processor_job_long.pipeline_applied = "TRANSCRIPTOME_INDEX_LONG"
    processor_job_long.ram_amount = 4096
    processor_job_long.save()

    for original_file in long_files_to_process:
        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_long
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_long.pipeline_applied],
                 processor_job_long)
    except Exception:
        # This is fine, the foreman will requeue these later.
        logger.exception("Problem with submitting a long transcriptome index job.")

    processor_job_short = ProcessorJob()
    processor_job_short.downloader_job = downloader_job
    processor_job_short.pipeline_applied = "TRANSCRIPTOME_INDEX_SHORT"
    processor_job_short.ram_amount = 4096
    processor_job_short.save()

    for original_file in short_files_to_process:
        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_short
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_short.pipeline_applied],
                 processor_job_short)
    except Exception:
        # This is fine, the foreman will requeue these later.
        logger.exception("Problem with submitting a short transcriptome index job.")

def run_sra_processor(self):
    # Create all the dummy data that would have been created
    # before a processor job could have been generated.
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="SRA",
        pipeline_required="SALMON",
        platform_accession_code="IlluminaHiSeq2500",
        experiment_accession_code="PRJEB5018",
        experiment_title="It doesn't really matter.",
        organism_id=10090,
        organism_name="MUS MUSCULUS",
        release_date="2014-03-25",
        last_uploaded_date="2016-05-20",
        status=BatchStatuses.NEW.value,
    )
    batch.save()

    File(name="ERR1680082_1.fastq",
         download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR168/002/"
                       "ERR1680082/ERR1680082_1.fastq.gz"),
         raw_format="fastq",
         processed_format="sf",
         internal_location="IlluminaHiSeq2500/SALMON",
         size_in_bytes=2214725074,
         batch=batch).save()
    File(name="ERR1680082_2.fastq",
         download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR168/002/"
                       "ERR1680082/ERR1680082_2.fastq.gz"),
         raw_format="fastq",
         processed_format="sf",
         internal_location="IlluminaHiSeq2500/SALMON",
         size_in_bytes=2214725074,
         batch=batch).save()

    processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])
    logger.info("Queuing a processor job.")
    send_job(ProcessorPipeline[batch.pipeline_required], processor_job.id)

def requeue_survey_job(last_job: SurveyJob) -> None:
    """Queues a new survey job.

    The new survey job will have num_retries one greater than
    last_job.num_retries.
    """
    num_retries = last_job.num_retries + 1

    new_job = SurveyJob(num_retries=num_retries, source_type=last_job.source_type)

    if new_job.num_retries == 1:
        new_job.ram_amount = 4096
    elif new_job.num_retries in [2, 3]:
        new_job.ram_amount = 16384
    else:
        new_job.ram_amount = 1024

    new_job.save()

    keyvalues = SurveyJobKeyValue.objects.filter(survey_job=last_job)
    for keyvalue in keyvalues:
        SurveyJobKeyValue.objects.get_or_create(
            survey_job=new_job,
            key=keyvalue.key,
            value=keyvalue.value,
        )

    logger.debug(
        "Requeuing SurveyJob which had ID %d with a new SurveyJob with ID %d.",
        last_job.id,
        new_job.id,
    )

    try:
        if send_job(SurveyJobTypes.SURVEYOR, job=new_job, is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with Batch just now, leave the job for a later loop.
            new_job.delete()
    except Exception:
        logger.error(
            "Failed to requeue Survey Job which had ID %d with a new Survey Job with ID %d.",
            last_job.id,
            new_job.id,
        )
        # Can't communicate with AWS just now, leave the job for a later loop.
        new_job.delete()

    return True

def queue_task(processor_job, batch):
    if batch.pipeline_required in ProcessorPipeline.__members__:
        send_job(ProcessorPipeline[batch.pipeline_required], processor_job.id)
        logger.info("Queuing processor job.",
                    downloader_job=job.id,
                    processor_job=processor_job.id,
                    batch=batch.id)
        return True
    else:
        failure_template = "Could not find Processor Pipeline {} in the lookup."
        failure_message = failure_template.format(batch.pipeline_required)
        logger.error(failure_message, downloader_job=job.id, batch=batch.id)

        processor_job.failure_reason = failure_message
        processor_job.success = False
        processor_job.retried = True
        processor_job.save()
        return False

def requeue_downloader_job(last_job: DownloaderJob) -> None:
    """Queues a new downloader job.

    The new downloader job will have num_retries one greater than
    last_job.num_retries.
    """
    num_retries = last_job.num_retries + 1

    new_job = DownloaderJob.create_job_and_relationships(
        num_retries=num_retries,
        batches=list(last_job.batches.all()),
        downloader_task=last_job.downloader_task)
    logger.info(
        "Requeuing Downloader Job which had ID %d with a new Downloader Job with ID %d.",
        last_job.id, new_job.id)
    send_job(Downloaders[last_job.downloader_task], new_job.id)

    last_job.retried = True
    last_job.success = False
    last_job.retried_job = new_job
    last_job.save()

def requeue_processor_job(last_job: ProcessorJob) -> None:
    """Queues a new processor job.

    The new processor job will have num_retries one greater than
    last_job.num_retries.
    """
    num_retries = last_job.num_retries + 1

    new_job = ProcessorJob.create_job_and_relationships(
        num_retries=num_retries,
        batches=list(last_job.batches.all()),
        pipeline_applied=last_job.pipeline_applied)
    logger.info(
        "Requeuing Processor Job which had ID %d with a new Processor Job with ID %d.",
        last_job.id, new_job.id)
    send_job(ProcessorPipeline[last_job.pipeline_applied], new_job.id)

    last_job.retried = True
    last_job.success = False
    last_job.retried_job = new_job
    last_job.save()

def queue_downloader_jobs(self, batches: List[Batch]):
    if len(batches) > 0:
        downloader_task = self.downloader_task()

        with transaction.atomic():
            downloader_job = DownloaderJob.create_job_and_relationships(
                batches=batches, downloader_task=downloader_task.value)

        logger.info("Queuing downloader job.",
                    survey_job=self.survey_job.id,
                    downloader_job=downloader_job.id)
        try:
            send_job(downloader_task, downloader_job.id)
        except Exception:
            # If the task doesn't get sent we don't want the
            # downloader_job to be left floating.
            downloader_job.delete()
            raise
    else:
        logger.info("Survey job found no new Batches.",
                    survey_job=self.survey_job.id)

def create_long_and_short_processor_jobs(files_to_process):
    """Creates two processor jobs for the files needed for this transcriptome."""
    processor_job_long = ProcessorJob()
    processor_job_long.pipeline_applied = "TRANSCRIPTOME_INDEX_LONG"
    processor_job_long.ram_amount = 4096
    processor_job_long.save()

    for original_file in files_to_process:
        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_long
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_long.pipeline_applied],
                 processor_job_long)
    except Exception:
        # This is fine, the foreman will requeue these later.
        pass

    processor_job_short = ProcessorJob()
    processor_job_short.pipeline_applied = "TRANSCRIPTOME_INDEX_SHORT"
    processor_job_short.ram_amount = 4096
    processor_job_short.save()

    for original_file in files_to_process:
        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_short
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_short.pipeline_applied],
                 processor_job_short)
    except Exception:
        # This is fine, the foreman will requeue these later.
        pass

def handle(self, *args, **options):
    """Create a compendium for one or more organisms."""
    svd_algorithm = options["svd_algorithm"] or "ARPACK"

    svd_algorithm_choices = ["ARPACK", "RANDOMIZED", "NONE"]
    if options["svd_algorithm"] and options["svd_algorithm"] not in svd_algorithm_choices:
        raise Exception(
            "Invalid svd_algorithm option provided. Possible values are "
            + str(svd_algorithm_choices)
        )

    target_organisms = self._get_target_organisms(options)
    grouped_organisms = group_organisms_by_biggest_platform(target_organisms)

    logger.debug("Generating compendia for organisms",
                 organism_groups=str(grouped_organisms))

    for organism in grouped_organisms:
        job = create_job_for_organism(organism, svd_algorithm)
        logger.info(
            "Sending compendia job for Organism", job_id=str(job.pk), organism=str(organism)
        )
        send_job(ProcessorPipeline.CREATE_COMPENDIA, job)

def run_tximport_if_eligible(experiment: Experiment, dispatch_jobs=True) -> bool:
    """Checks if an experiment is eligible to have tximport run on it and creates a job for it.

    If the dispatch_jobs parameter is True a Batch job will be dispatched for it.

    Returns the ProcessorJob if a job was created or None if one was not.
    """
    tximport_pipeline = ProcessorPipeline.TXIMPORT

    if get_tximport_inputs_if_eligible(experiment, True):
        processor_job = ProcessorJob()
        processor_job.pipeline_applied = tximport_pipeline.value
        processor_job.ram_amount = 32768
        processor_job.save()

        assoc = ProcessorJobOriginalFileAssociation()
        # Any original file linked to any sample of the
        # experiment will work. Tximport is somewhat special
        # in that it doesn't actually use original files so
        # this is just used to point to the experiment.
        assoc.original_file = experiment.samples.all()[0].original_files.all()[0]
        assoc.processor_job = processor_job
        assoc.save()

        if dispatch_jobs:
            try:
                send_job(tximport_pipeline, processor_job)
            except Exception:
                # If we cannot queue the job now the Foreman will do
                # it later.
                pass

        return processor_job

    return None

def send_janitor_jobs():
    """Dispatch a Janitor job for each instance in the cluster."""
    try:
        active_volumes = get_active_volumes()
    except Exception:
        # If we cannot reach Nomad now then we can wait until a later loop.
        return

    for volume_index in active_volumes:
        new_job = ProcessorJob(num_retries=0,
                               pipeline_applied="JANITOR",
                               ram_amount=2048,
                               volume_index=volume_index)
        new_job.save()
        logger.info("Sending Janitor with index: ",
                    job_id=new_job.id,
                    index=volume_index)
        try:
            send_job(ProcessorPipeline["JANITOR"], job=new_job, is_dispatch=True)
        except Exception:
            # If we can't dispatch this job, something else has gone wrong.
            continue

def dispatch_job(self, serializer, obj):
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "SMASHER"
    processor_job.ram_amount = 4096
    processor_job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = processor_job
    pjda.dataset = obj
    pjda.save()

    job_sent = False

    try:
        # Hidden method of non-dispatching for testing purposes.
        if not self.request.data.get("no_send_job", False):
            job_sent = send_job(ProcessorPipeline.SMASHER, processor_job)
        else:
            # We didn't actually send it, but we also didn't want to.
            job_sent = True
    except Exception as e:
        # Just log whatever exception happens, because the foreman will requeue the job anyway.
        logger.error(e)

    if not job_sent:
        raise APIException(
            "Unable to queue download job. Something has gone"
            " wrong and we have been notified about it."
        )

    serializer.validated_data["is_processing"] = True
    obj = serializer.save()

    # Create a new dataset annotation with the information of this request.
    annotation = DatasetAnnotation()
    annotation.dataset = obj
    annotation.data = {
        "start": True,
        "ip": get_client_ip(self.request),
        "user_agent": self.request.META.get("HTTP_USER_AGENT", None),
    }
    annotation.save()

def requeue_downloader_job(last_job: DownloaderJob) -> None:
    """Queues a new downloader job.

    The new downloader job will have num_retries one greater than
    last_job.num_retries.
    """
    num_retries = last_job.num_retries + 1

    new_job = DownloaderJob(num_retries=num_retries,
                            downloader_task=last_job.downloader_task,
                            accession_code=last_job.accession_code)
    new_job.save()

    for original_file in last_job.original_files.all():
        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=new_job, original_file=original_file)

    logger.debug(
        "Requeuing Downloader Job which had ID %d with a new Downloader Job with ID %d.",
        last_job.id, new_job.id)

    try:
        if send_job(Downloaders[last_job.downloader_task],
                    job=new_job,
                    is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with Nomad just now, leave the job for a later loop.
            new_job.delete()
    except Exception:
        logger.error(
            "Failed to requeue Downloader Job which had ID %d with a new Downloader Job with ID %d.",
            last_job.id, new_job.id)
        # Can't communicate with Nomad just now, leave the job for a later loop.
        new_job.delete()

def handle(self, *args, **options):
    """
    Dispatch QN_REFERENCE creation jobs for all Organisms with a platform
    with enough processed samples.
    """
    organisms = Organism.objects.all()

    for organism in organisms:
        samples = Sample.processed_objects.filter(
            organism=organism,
            has_raw=True,
            technology="MICROARRAY",
            is_processed=True,
            platform_name__contains="Affymetrix",
        )
        if samples.count() < MIN:
            logger.info(
                "Total processed samples don't meet minimum threshold",
                organism=organism,
                count=samples.count(),
                min=MIN,
            )
            continue

        platform_counts = (
            samples.values("platform_accession_code")
            .annotate(dcount=Count("platform_accession_code"))
            .order_by("-dcount")
        )
        biggest_platform = platform_counts[0]["platform_accession_code"]

        sample_codes_results = Sample.processed_objects.filter(
            platform_accession_code=biggest_platform,
            has_raw=True,
            technology="MICROARRAY",
            organism=organism,
            is_processed=True,
        ).values("accession_code")

        if sample_codes_results.count() < MIN:
            logger.info(
                "Number of processed samples for largest platform didn't meet threshold.",
                organism=organism,
                platform_accession_code=biggest_platform,
                count=sample_codes_results.count(),
                min=MIN,
            )
            continue

        sample_codes = [res["accession_code"] for res in sample_codes_results]

        dataset = Dataset()
        dataset.data = {organism.name + "_(" + biggest_platform + ")": sample_codes}
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False
        dataset.save()

        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        logger.info("Sending QN_REFERENCE for Organism",
                    job_id=str(job.pk),
                    organism=str(organism))
        send_job(ProcessorPipeline.QN_REFERENCE, job)

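# Illustrative sketch only (not part of the codebase): the Dataset built above keys
# every sample accession code under a single "ORGANISM_(platform)" label. The platform
# and accession codes below are placeholders showing the resulting shape of dataset.data.
example_dataset_data = {
    "HOMO_SAPIENS_(A-AFFY-44)": ["GSM0000001", "GSM0000002", "GSM0000003"],
}
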
def test_transcriptome_redownloading(self, mock_surveyor):
    """Survey, download, then process a transcriptome index."""
    mock_surveyor.side_effect = build_surveyor_init_mock("TRANSCRIPTOME_INDEX")

    self.env = EnvironmentVarGuard()
    self.env.set("RUNNING_IN_CLOUD", "False")
    with self.env:
        # I'm not sure why, but sometimes there are already downloader jobs
        # in the database from previous tests even though they should be
        # removed, so pause a bit.
        time.sleep(10)

        downloader_jobs = DownloaderJob.objects.all()
        for job in downloader_jobs:
            print(job)
            print(job.accession_code)
        self.assertEqual(downloader_jobs.count(), 0)

        # Clear out pre-existing work dirs so there are no conflicts:
        for length in ["LONG", "SHORT"]:
            work_dir_glob = (LOCAL_ROOT_DIR + "/Caenorhabditis_elegans/" + length +
                             "/processor_job_*")
            for work_dir in glob.glob(work_dir_glob):
                shutil.rmtree(work_dir)

        # Prevent a call being made to NCBI's API to determine
        # organism name/id.
        organism = Organism(name="CAENORHABDITIS_ELEGANS",
                            taxonomy_id=6239,
                            is_scientific_name=True)
        organism.save()

        # Make sure that we can delete the file before the processors begin
        # by preventing the downloaders from sending the processors
        # automatically. We send the jobs manually later.
        no_dispatch = EnvironmentVarGuard()
        no_dispatch.set("AUTO_DISPATCH_NOMAD_JOBS", "False")
        with no_dispatch:
            survey_job = surveyor.survey_transcriptome_index(
                "Caenorhabditis elegans", "Ensembl")
            self.assertTrue(survey_job.success)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), 1)
            logger.info(
                "Survey Job finished, waiting for Downloader Job with Nomad ID %s to complete.",
                downloader_jobs[0].nomad_job_id,
            )
            downloader_job = wait_for_job(downloader_jobs[0], DownloaderJob, timezone.now())
            self.assertTrue(downloader_job.success)

            og_file_to_delete = OriginalFile.objects.all()[0]
            os.remove(og_file_to_delete.absolute_file_path)

        processor_jobs = ProcessorJob.objects.all()
        for processor_job in processor_jobs:
            # FIXME: we run these in serial because of
            # https://github.com/AlexsLemonade/refinebio/issues/2321
            send_job(
                ProcessorPipeline[processor_job.pipeline_applied],
                job=processor_job,
                is_dispatch=True,
            )
            try:
                wait_for_job(processor_job, ProcessorJob, timezone.now())
            except Exception:
                pass

        # The processor job that had a missing file will have
        # recreated its DownloaderJob, which means there should now be two.
        downloader_jobs = DownloaderJob.objects.all().order_by("-id")
        self.assertEqual(downloader_jobs.count(), 2)

        # However DownloaderJobs don't get queued immediately, so
        # we have to run a foreman function to make it happen:
        retry_lost_downloader_jobs()

        # And we can check that the most recently created
        # DownloaderJob was successful as well:
        recreated_job = downloader_jobs[0]
        recreated_job.refresh_from_db()
        logger.info("Waiting on downloader Nomad job %s", recreated_job.nomad_job_id)
        recreated_job = wait_for_job(recreated_job, DownloaderJob, timezone.now())
        self.assertTrue(recreated_job.success)

        # Once the Downloader job succeeds, it should create two
        # processor jobs, one for long and one for short indices.
        processor_jobs = ProcessorJob.objects.all()
        self.assertEqual(processor_jobs.count(), 4)

        # Wait for the processor jobs to be dispatched.
        time.sleep(15)

        # And finally we can make sure that both of the
        # processor jobs were successful, including the one that
        # got recreated.
        logger.info("Downloader Jobs finished, waiting for processor Jobs to complete.")

        successful_processor_jobs = []
        for processor_job in processor_jobs:
            processor_job.refresh_from_db()
            # One of the calls to wait_for_job will fail if the
            # job aborts before we selected all the processor jobs.
            processor_job = wait_for_job(processor_job, ProcessorJob, timezone.now())
            if processor_job.success:
                successful_processor_jobs.append(processor_job)

        # While one of the original ProcessorJobs will be aborted,
        # it is hard to be sure what will happen to the other because
        # of the racing that happens between processor jobs getting
        # started and us deleting the files they need.
        # Therefore, we're just going to verify that one processor
        # job completed successfully for each length, since that
        # is the main thing we need.
        has_long = False
        has_short = False
        for processor_job in successful_processor_jobs:
            if processor_job.pipeline_applied == "TRANSCRIPTOME_INDEX_LONG":
                has_long = True
            elif processor_job.pipeline_applied == "TRANSCRIPTOME_INDEX_SHORT":
                has_short = True

        self.assertTrue(has_long)
        self.assertTrue(has_short)

def queue_downloader_jobs(self, experiment: Experiment, samples: List[Sample]):
    """This enqueues DownloaderJobs on a per-file basis.

    There is a complementary function below for enqueueing multi-file
    DownloaderJobs.
    """
    files_to_download = []
    for sample in samples:
        files_for_sample = OriginalFile.objects.filter(sample=sample, is_downloaded=False)
        for og_file in files_for_sample:
            files_to_download.append(og_file)

    download_urls_with_jobs = {}
    for original_file in files_to_download:
        # We don't need to create multiple downloaders for the same file.
        # However, we do want to associate original_files with the
        # DownloaderJobs that will download them.
        if original_file.source_url in download_urls_with_jobs.keys():
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=download_urls_with_jobs[original_file.source_url],
                original_file=original_file,
            )
            continue

        # Check whether there is already a downloader job associated with this file.
        old_assocs_count = DownloaderJobOriginalFileAssociation.objects.filter(
            original_file__source_url=original_file.source_url).count()
        if old_assocs_count > 0:
            logger.debug(
                "We found an existing DownloaderJob for this file/url.",
                original_file_id=original_file.id,
            )
            continue

        sample_object = original_file.samples.first()
        downloader_task = determine_downloader_task(sample_object)

        if downloader_task == Downloaders.NONE:
            logger.info(
                "No valid downloader task found for sample.",
                sample=sample_object.id,
                original_file=original_file.id,
            )
        else:
            downloader_job = DownloaderJob()
            downloader_job.downloader_task = downloader_task.value
            downloader_job.accession_code = experiment.accession_code
            downloader_job.save()

            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=downloader_job, original_file=original_file)

            download_urls_with_jobs[original_file.source_url] = downloader_job

            try:
                logger.info(
                    "Queuing downloader job for URL: " + original_file.source_url,
                    survey_job=self.survey_job.id,
                    downloader_job=downloader_job.id,
                )
                send_job(downloader_task, downloader_job)
            except Exception:
                # If we fail to queue the job, it will be requeued.
                pass