def retry_failed_survey_jobs() -> None:
    """Handle survey jobs that were marked as a failure."""
    failed_jobs = SurveyJob.failed_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("pk")

    paginator = Paginator(failed_jobs, 200)
    page = paginator.page()
    page_count = 0

    if len(page.object_list) <= 0:
        # No failed jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    while queue_capacity > 0:
        logger.info(
            "Handling page %d of failed (explicitly-marked-as-failure) survey jobs!", page_count
        )
        handle_survey_jobs(page.object_list, queue_capacity)

        if page.has_next():
            page = paginator.page(page.next_page_number())
            page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def run_tximport_for_all_eligible_experiments(dispatch_jobs=True):
    """Creates a tximport job for all eligible experiments."""
    eligible_experiments = (
        Experiment.objects.annotate(num_organisms=Count("organisms"))
        .filter(num_organisms=1, technology="RNA-SEQ", num_processed_samples=0)
        .prefetch_related("samples__results")
    )

    paginator = Paginator(eligible_experiments, PAGE_SIZE)
    page = paginator.page()

    # For each page, create a tximport job for every experiment whose
    # processed-sample counts pass the thresholds; run_tximport_if_eligible
    # performs that check.
    created_jobs = []
    while True:
        creation_count = 0
        for experiment in page.object_list:
            processor_job = run_tximport_if_eligible(experiment)
            if processor_job:
                creation_count += 1
                created_jobs.append(processor_job)

        logger.info("Created %d tximport jobs for experiments past the thresholds.", creation_count)

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

    return created_jobs
def retry_unqueued_survey_jobs() -> None:
    """Retry survey jobs that never made it into the Batch job queue."""
    potentially_lost_jobs = SurveyJob.unqueued_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("created_at")

    paginator = Paginator(potentially_lost_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No unqueued jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling unqueued survey jobs because there is no capacity for them."
        )

    while queue_capacity > 0:
        for survey_job in database_page.object_list:
            if send_job(SurveyJobTypes.SURVEYOR, job=survey_job, is_dispatch=True):
                queue_capacity -= 1
            else:
                # Can't communicate with Batch just now, leave the job for a later loop.
                break

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def requeue_samples(sample_queryset, dry_run=False):
    paginator = PerformantPaginator(sample_queryset, PAGE_SIZE)
    page = paginator.page()

    # Loop through the samples one by one to see if they've been
    # erroneously marked as processed. If so, mark them as unprocessed
    # and kick off a new job so they can get processed correctly.
    # Do this before deleting the computed files in case we get
    # interrupted; otherwise it'd be harder to tell which samples were
    # erroneously marked as processed.
    while True:
        counter = 0
        for sample in page.object_list:
            if requeue_sample(sample, dry_run):
                counter += 1

            # requeue_sample makes database calls, so it's not a good idea
            # to call it in a loop without sleeping between iterations.
            time.sleep(1)

        print(f"Requeued {counter} samples in that page.")

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())
def run_tximport():
    """Creates a tximport job for all eligible experiments."""
    eligible_experiments = (
        Experiment.objects.annotate(num_organisms=Count("organisms"))
        .filter(num_organisms=1, technology="RNA-SEQ", num_processed_samples=0)
        .prefetch_related("samples__results")
    )

    paginator = Paginator(eligible_experiments, PAGE_SIZE)
    page = paginator.page()

    # For each page, check each experiment's quant results against the
    # thresholds and create a tximport job when it qualifies.
    tximport_pipeline = ProcessorPipeline.TXIMPORT

    while True:
        creation_count = 0

        for experiment in page.object_list:
            quant_results = get_quant_results_for_experiment(experiment)

            if should_run_tximport(experiment, quant_results, True):
                processor_job = ProcessorJob()
                processor_job.pipeline_applied = tximport_pipeline.value
                processor_job.ram_amount = 8192

                # This job doesn't need to run on a specific volume
                # but it uses the same Nomad job as Salmon jobs which
                # do require the volume index.
                processor_job.volume_index = random.choice(list(get_active_volumes()))
                processor_job.save()

                assoc = ProcessorJobOriginalFileAssociation()
                # Any original file linked to any sample of the
                # experiment will work. Tximport is somewhat special
                # in that it doesn't actually use original files, so
                # this is just used to point to the experiment.
                assoc.original_file = experiment.samples.all()[0].original_files.all()[0]
                assoc.processor_job = processor_job
                assoc.save()

                creation_count += 1

                try:
                    send_job(tximport_pipeline, processor_job)
                except Exception:
                    # If we cannot queue the job now the Foreman will do it later.
                    pass

        logger.info("Created %d tximport jobs for experiments past the thresholds.", creation_count)

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())
def queryset_page_iterator(queryset, page_size=2000):
    """Use the PerformantPaginator to iterate over each page in a queryset."""
    paginator = PerformantPaginator(queryset, page_size)
    page = paginator.page()
    while True:
        yield page.object_list

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())
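A minimal usage sketch for the helper above, not part of the original module: it assumes a Django environment where queryset_page_iterator and a model such as Sample are importable, and count_samples_without_results is a hypothetical function added only to show per-page batch processing.

def count_samples_without_results(queryset, page_size=500):
    """Walk a large queryset one page at a time instead of loading it all at once."""
    missing = 0
    for object_list in queryset_page_iterator(queryset, page_size):
        for sample in object_list:
            # Hypothetical per-sample check; swap in whatever work each batch needs.
            if sample.results.count() == 0:
                missing += 1
    return missing

# Example call (assumes Sample is a Django model with a `results` relation):
# count_samples_without_results(Sample.processed_objects.all())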
def handle(self, *args, **options):
    """Requeues downloader jobs for samples that haven't been processed and
    whose original files have no downloader jobs associated with them.
    """
    supported_microarray_platforms = [
        x["platform_accession"] for x in get_supported_microarray_platforms()
    ]
    supported_rnaseq_platforms = [x.replace(" ", "") for x in get_supported_rnaseq_platforms()]
    all_supported_platforms = supported_microarray_platforms + supported_rnaseq_platforms

    # https://www.postgresql.org/docs/9.1/functions-array.html
    # Ensure selected samples have valid platforms
    samples_without_downloader = (
        Sample.objects.all()
        .filter(platform_accession_code__in=all_supported_platforms)
        .annotate(
            original_files_count=Count("original_files"),
            downloader_job_count=Count("original_files__downloader_jobs"),
        )
        .filter(is_processed=False, original_files_count__gt=0, downloader_job_count=0)
    )

    if options.get("created_after", None):
        samples_without_downloader = samples_without_downloader.filter(
            created_at__gt=options["created_after"]
        )

    samples_without_downloader = samples_without_downloader.prefetch_related("original_files")

    logger.info(
        "Found %d samples without downloader jobs, starting to create them now.",
        samples_without_downloader.count(),
    )

    paginator = Paginator(samples_without_downloader, PAGE_SIZE)
    page = paginator.page()

    while True:
        created_count = 0
        for sample in page.object_list:
            logger.debug("Creating downloader job for a sample.", sample=sample.accession_code)
            if create_downloader_job(sample.original_files.all()):
                created_count += 1

        # Log the number actually created rather than the page size, which
        # overcounts on a partial last page or when creation fails.
        logger.info(
            "Created %d new downloader jobs because their samples didn't have any.", created_count
        )

        if not page.has_next():
            break

        page = paginator.page(page.next_page_number())
def handle(self, *args, **options):
    samples = Sample.processed_objects.all()

    paginator = PerformantPaginator(samples, PAGE_SIZE)
    page = paginator.page()

    counter = 0
    while True:
        for sample in page.object_list:
            counter += 1
            if sample.results.count() == 0:
                print(sample.accession_code)

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

        if counter % 10000 == 0:
            print("Checked another 10,000 samples.")
def requeue_samples(eligible_samples):
    paginator = Paginator(eligible_samples, PAGE_SIZE)
    page = paginator.page()

    creation_count = 0
    while True:
        for sample in page.object_list:
            if create_downloader_job(sample.original_files.all(), force=True):
                creation_count += 1

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

        logger.info("Creating new downloader jobs. %d so far", creation_count)

        # 2000 samples queued up every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)

    return creation_count
def retry_hung_downloader_jobs() -> None:
    """Retry downloader jobs that were started but never finished."""
    potentially_hung_jobs = (
        DownloaderJob.hung_objects.filter(created_at__gt=utils.JOB_CREATED_AT_CUTOFF)
        .order_by("created_at")
        .prefetch_related("original_files__samples")
    )

    paginator = Paginator(potentially_hung_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No hung jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_downloader_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling hung (started-but-never-finished) downloader jobs "
            "because there is no capacity for them."
        )

    while queue_capacity > 0:
        hung_jobs = utils.check_hung_jobs(database_page.object_list)

        if hung_jobs:
            logger.info(
                "Handling page %d of hung (started-but-never-finished) downloader jobs!",
                database_page_count,
                jobs_count=len(hung_jobs),
            )
            handle_downloader_jobs(hung_jobs)

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_downloader_jobs()
        else:
            break
def retry_failed_processor_jobs() -> None:
    """Handle processor jobs that were marked as a failure.

    Ignores Janitor jobs since they are queued every half hour anyway."""
    failed_jobs = (
        ProcessorJob.failed_objects.filter(created_at__gt=utils.JOB_CREATED_AT_CUTOFF)
        .exclude(pipeline_applied="JANITOR")
        .order_by("created_at")
        .prefetch_related("original_files__samples")
    )

    paginator = Paginator(failed_jobs, 200, "created_at")
    page = paginator.page()
    page_count = 0

    if len(page.object_list) <= 0:
        # No failed jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling failed (explicitly-marked-as-failure) processor jobs "
            "because there is no capacity for them."
        )

    while queue_capacity > 0:
        logger.info(
            "Handling page %d of failed (explicitly-marked-as-failure) processor jobs!",
            page_count,
        )
        handle_processor_jobs(page.object_list, queue_capacity)

        if page.has_next():
            page = paginator.page(page.next_page_number())
            page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def handle(self, *args, **options):
    timed_out_jobs = ProcessorJob.objects.filter(
        success="f",
        failure_reason="Salmon timed out because it failed to complete within 3 hours.",
        retried_job_id__isnull=True,  # Only get jobs that weren't retried.
    )

    total = 0
    i = 0

    paginator = Paginator(timed_out_jobs, 1000)
    page = paginator.page()

    while page:
        for processor_job in page.object_list:
            # Reset the job to 8192 MB so its retry will start at 12288
            # (the RAM amount only goes up from here).
            processor_job.ram_amount = 8192
            # And so it'll be retried twice more.
            processor_job.num_retries = 0
            processor_job.retried = False

            # We don't actually have to send this off to Batch ourselves.
            # The Foreman will find it and requeue it for us!
            processor_job.save()

            total += 1
            i += 1

            # Only queue 300 of these an hour so we don't overload ENA.
            if i == 300:
                logger.info("Requeued 300 more jobs (total %d). Sleeping for 1 hour.", total)
                time.sleep(60 * 60)
                i = 0

        if page.has_next():
            page = paginator.page(page.next_page_number())
        else:
            break
def retry_lost_survey_jobs() -> None:
    """Retry survey jobs that were started but never finished."""
    potentially_lost_jobs = SurveyJob.lost_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("created_at")

    paginator = Paginator(potentially_lost_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No lost jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling lost (never-started) survey jobs "
            "because there is no capacity for them."
        )

    while queue_capacity > 0:
        lost_jobs = utils.check_lost_jobs(database_page.object_list)

        if lost_jobs:
            logger.info(
                "Handling page %d of lost (never-started) survey jobs!",
                database_page_count,
                jobs_count=len(lost_jobs),
            )
            handle_survey_jobs(lost_jobs)

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def retry_lost_processor_jobs() -> None:
    """Retry processor jobs that were started but never finished."""
    potentially_lost_jobs = (
        ProcessorJob.lost_objects.filter(created_at__gt=utils.JOB_CREATED_AT_CUTOFF)
        .exclude(pipeline_applied="JANITOR")
        .order_by("created_at")
        .prefetch_related("original_files__samples")
    )

    paginator = Paginator(potentially_lost_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No lost jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling lost (never-started) processor jobs "
            "because there is no capacity for them."
        )

    while queue_capacity > 0:
        lost_jobs = utils.check_lost_jobs(database_page.object_list)

        if lost_jobs:
            logger.info(
                "Handling page %d of lost (never-started) processor jobs!",
                database_page_count,
                jobs_count=len(lost_jobs),
            )
            handle_processor_jobs(lost_jobs)

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
def retry_failed_downloader_jobs() -> None:
    """Handle downloader jobs that were marked as a failure."""
    failed_jobs = (
        DownloaderJob.failed_objects.filter(created_at__gt=utils.JOB_CREATED_AT_CUTOFF)
        .order_by("created_at")
        .prefetch_related("original_files__samples")
    )

    paginator = Paginator(failed_jobs, utils.PAGE_SIZE, "created_at")
    page = paginator.page()
    page_count = 0

    if len(page.object_list) <= 0:
        # No failed jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_downloader_jobs()

    if queue_capacity <= 0:
        logger.info(
            "Not handling failed (explicitly-marked-as-failure) downloader jobs "
            "because there is no capacity for them."
        )

    while queue_capacity > 0:
        logger.info(
            "Handling page %d of failed (explicitly-marked-as-failure) downloader jobs!", page_count
        )
        handle_downloader_jobs(page.object_list)

        if page.has_next():
            page = paginator.page(page.next_page_number())
            page_count += 1
            queue_capacity = get_capacity_for_downloader_jobs()
        else:
            break
def handle(self, *args, **options):
    """Refreshes the metadata for all experiments, or for experiments from a
    specific source database.
    """
    possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

    if options.get("source_database", None) is None:
        experiments = Experiment.objects.all()
    elif options["source_database"] in possible_source_databases:
        source_database = options["source_database"]
        experiments = Experiment.objects.filter(source_database=source_database)
    else:
        logger.error(
            'Invalid source database "{}"'.format(options["source_database"])
            + "\nPossible source databases: {}".format(", ".join(possible_source_databases))
        )
        sys.exit(1)

    paginator = PerformantPaginator(experiments, PAGE_SIZE)
    page = paginator.page()

    while True:
        for experiment in page.object_list:
            logger.debug(
                "Refreshing metadata for an experiment.",
                experiment=experiment.accession_code,
            )
            try:
                if experiment.source_database == "SRA":
                    metadata = SraSurveyor.gather_all_metadata(
                        experiment.samples.first().accession_code
                    )
                    SraSurveyor._apply_metadata_to_experiment(experiment, metadata)

                elif experiment.source_database == "GEO":
                    gse = GEOparse.get_GEO(
                        experiment.accession_code,
                        destdir="/tmp/management",
                        silent=True,
                    )
                    GeoSurveyor._apply_metadata_to_experiment(experiment, gse)

                elif experiment.source_database == "ARRAY_EXPRESS":
                    request_url = EXPERIMENTS_URL + experiment.accession_code
                    experiment_request = utils.requests_retry_session().get(
                        request_url, timeout=60
                    )
                    try:
                        parsed_json = experiment_request.json()["experiments"]["experiment"][0]
                    except KeyError:
                        logger.error(
                            "Remote experiment has no Experiment data!",
                            experiment_accession_code=experiment.accession_code,
                            survey_job=self.survey_job.id,
                        )
                        continue
                    ArrayExpressSurveyor._apply_metadata_to_experiment(experiment, parsed_json)

                experiment.save()

            # If there are any errors, just continue. It's likely that it's
            # just a problem with this experiment.
            except Exception:
                logger.exception(
                    "exception caught while updating metadata for {}".format(
                        experiment.accession_code
                    )
                )

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

        # 2000 experiments refreshed every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)
def handle(self, *args, **options):
    """Re-surveys GEO experiments containing samples with incorrect platform information."""
    # Check against the CDF corrected accessions table to prevent
    # re-correcting the same samples.
    corrected_experiments = CdfCorrectedAccession.objects.all().values("accession_code")

    gse_experiments = Experiment.objects.filter(source_database="GEO").exclude(
        accession_code__in=corrected_experiments
    )

    paginator = Paginator(gse_experiments, PAGE_SIZE)
    page = paginator.page()

    while True:
        for experiment in page.object_list:
            try:
                gse = GEOparse.get_GEO(
                    experiment.accession_code,
                    destdir=GEO_TEMP_DIR,
                    how="brief",
                    silent=True,
                )

                sample_accessions = list(gse.gsms.keys())
                samples = Sample.objects.filter(accession_code__in=sample_accessions)

                wrong_platform = False
                for sample in samples:
                    gpl = gse.gsms[sample.accession_code].metadata["platform_id"][0]
                    internal_accession = get_internal_microarray_accession(gpl)
                    if internal_accession != sample.platform_accession_code:
                        wrong_platform = True
                        break

                if wrong_platform:
                    if options["dry_run"]:
                        logger.info(
                            "Would have re-surveyed experiment with accession code %s",
                            experiment.accession_code,
                        )
                    else:
                        logger.info(
                            "Re-surveying experiment with accession code %s",
                            experiment.accession_code,
                        )
                        purge_experiment(experiment.accession_code)
                        queue_surveyor_for_accession(experiment.accession_code)

                        current_time = timezone.now()
                        CdfCorrectedAccession(
                            accession_code=experiment.accession_code, created_at=current_time
                        ).save()

            except Exception:
                logger.exception("Caught an exception with %s!", experiment.accession_code)
            finally:
                # GEOparse downloads files here and never cleans them up! Grrrr!
                download_path = GEO_TEMP_DIR + experiment.accession_code + "_family.soft.gz"
                try:
                    os.remove(download_path)
                except Exception:
                    # Don't let anything interrupt this, like say,
                    # GEOparse downloading a directory instead of a file...
                    logger.exception("Failed to delete an archive.")

        if not page.has_next():
            break

        page = paginator.page(page.next_page_number())
def handle(self, *args, **options):
    """Refreshes the metadata for all samples, or for samples from a specific
    source database.
    """
    possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

    if options.get("source_database", None) is None:
        samples = Sample.objects.all()
    elif options["source_database"] in possible_source_databases:
        source_database = options["source_database"]
        samples = Sample.objects.filter(source_database=source_database)
    else:
        logger.error(
            'Invalid source database "{}"'.format(options["source_database"])
            + "\nPossible source databases: {}".format(", ".join(possible_source_databases))
        )
        sys.exit(1)

    paginator = PerformantPaginator(samples, PAGE_SIZE)
    page = paginator.page()

    while True:
        # Iterate over the current page, not the whole queryset, so the
        # pagination actually limits how much work is done per loop.
        for sample in page.object_list:
            logger.debug("Refreshing metadata for a sample.", sample=sample.accession_code)

            if sample.source_database == "SRA":
                metadata = SraSurveyor.gather_all_metadata(sample.accession_code)
                SraSurveyor._apply_harmonized_metadata_to_sample(sample, metadata)

            elif sample.source_database == "GEO":
                gse = GEOparse.get_GEO(
                    sample.experiments.first().accession_code,
                    destdir="/tmp/management",
                    how="brief",
                    silent=True,
                )
                preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
                harmonized_samples = harmony.harmonize(preprocessed_samples)
                GeoSurveyor._apply_harmonized_metadata_to_sample(
                    sample, harmonized_samples[sample.title]
                )

            elif sample.source_database == "ARRAY_EXPRESS":
                SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
                sdrf_url = SDRF_URL_TEMPLATE.format(
                    code=sample.experiments.first().accession_code
                )
                sdrf_samples = harmony.parse_sdrf(sdrf_url)
                harmonized_samples = harmony.harmonize(sdrf_samples)
                ArrayExpressSurveyor._apply_harmonized_metadata_to_sample(
                    sample, harmonized_samples[sample.title]
                )

            sample.save()

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

        # 2000 samples refreshed every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)