def handle_survey_jobs(jobs: List[SurveyJob]) -> bool:
    """For each job in jobs, either retry it or log it.

    Jobs with fewer than MAX_NUM_RETRIES retries are passed to
    requeue_survey_job; all others are passed to handle_repeated_failure.
    Processing stops early once the number of jobs Nomad reports plus the
    number of jobs requeued here reaches MAX_TOTAL_JOBS.

    Returns:
        True if every job in `jobs` was handled, False if we stopped (or
        never started) because the total jobs ceiling was reached.
    """
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)

    # Maximum number of total jobs running at a time.
    # We do this now rather than import time for testing purposes.
    MAX_TOTAL_JOBS = int(get_env_variable_gracefully("MAX_TOTAL_JOBS", DEFAULT_MAX_JOBS))
    len_all_jobs = len(nomad_client.jobs.get_jobs())
    if len_all_jobs >= MAX_TOTAL_JOBS:
        logger.info("Not requeuing job until we're running fewer jobs.")
        return False

    jobs_dispatched = 0
    for count, job in enumerate(jobs):
        if job.num_retries < MAX_NUM_RETRIES:
            requeue_survey_job(job)
            jobs_dispatched = jobs_dispatched + 1
        else:
            handle_repeated_failure(job)

        # Only ask Nomad for a fresh job count every 100 jobs so that we
        # don't query it once per job.
        if (count % 100) == 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

        if (jobs_dispatched + len_all_jobs) >= MAX_TOTAL_JOBS:
            logger.info(
                "We hit the maximum total jobs ceiling, so we're not handling any more survey jobs now."
            )
            return False

    return True
def retry_hung_downloader_jobs() -> None:
    """Find downloader jobs that started but never ended and hand them off for retry.

    A candidate is treated as hung when Nomad reports a status other than
    "running" and the job still has no end_time after being re-read from the
    database, or when Nomad does not know the job at all.
    """
    candidates = DownloaderJob.objects.filter(
        success=None,
        retried=False,
        end_time=None,
        start_time__isnull=False,
        no_retry=False,
    ).prefetch_related("original_files__samples")

    host = get_env_variable("NOMAD_HOST")
    port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(host, port=int(port), timeout=30)

    stalled = []
    for job in candidates:
        try:
            status = nomad_client.job.get_job(job.nomad_job_id)["Status"]
            if status == "running":
                continue

            # The job may have completed after the initial query ran, so
            # re-read it before deciding it is hung.
            job.refresh_from_db()
            if job.end_time is None:
                stalled.append(job)
        except URLNotFoundNomadException:
            # Nomad has no record of this job.
            stalled.append(job)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about Downloader Job.", downloader_job=job.id)

    if not stalled:
        return

    logger.info(
        "Handling hung (started-but-never-finished) downloader jobs!",
        jobs_count=len(stalled),
    )
    handle_downloader_jobs(stalled)
def retry_hung_processor_jobs() -> None:
    """Retry processor jobs that were started but never finished.

    Ignores Janitor jobs since they are queued every half hour anyway.
    Only jobs whose volume_index is in the currently active volumes are
    considered. If the active volumes cannot be determined, nothing is
    retried on this pass.
    """
    try:
        active_volumes = get_active_volumes()
    except Exception:
        # If we cannot reach Nomad now then we can wait until a later loop.
        # Returning here matters: without a value for active_volumes the
        # query below would fail with a NameError.
        logger.info("Couldn't determine the active volumes, not retrying hung processor jobs now.")
        return

    potentially_hung_jobs = (
        ProcessorJob.objects.filter(
            success=None,
            retried=False,
            end_time=None,
            start_time__isnull=False,
            no_retry=False,
            volume_index__in=active_volumes,
        )
        .exclude(pipeline_applied="JANITOR")
        .prefetch_related("original_files__samples")
    )

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    hung_jobs = []
    for job in potentially_hung_jobs:
        try:
            job_status = nomad_client.job.get_job(job.nomad_job_id)["Status"]
            if job_status != "running":
                # Make sure it didn't finish since our original query.
                job.refresh_from_db()
                if job.end_time is None:
                    hung_jobs.append(job)
        except URLNotFoundNomadException:
            hung_jobs.append(job)
        except TypeError:
            # Almost certainly a python-nomad issue: its endpoint builder
            # does "/".join(args), which raises
            #   TypeError: sequence item 1: expected str instance, NoneType found
            # when the job id it is given is None.
            logger.info("Couldn't query Nomad about Processor Job.", processor_job=job.id)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about Processor Job.", processor_job=job.id)

    if hung_jobs:
        logger.info(
            "Handling hung (started-but-never-finished) processor jobs!", len_jobs=len(hung_jobs)
        )
        handle_processor_jobs(hung_jobs)
def handle(self, *args, **options):
    """Requeue every unprocessed RNA-Seq sample of the requested organism.

    Reads the organism from options["organism_name"], builds the prioritized
    job list for it and feeds Nomad in batches: each pass tops Nomad up to
    MAX_JOBS_FOR_THIS_MODE jobs, then sleeps five minutes if work remains.
    Exits the process with status 1 when no organism name was given and with
    status 0 when there is nothing to process.
    """
    organism_name = options["organism_name"]
    if organism_name is None:
        logger.error("You must specify an organism-name.")
        sys.exit(1)

    organism = Organism.objects.get(name=organism_name)
    pending_jobs = build_prioritized_jobs_list(organism)

    if not len(pending_jobs):
        logger.info("Found no samples that need to be processed. I guess I'm done!")
        sys.exit(0)

    logger.info(
        "Found %d samples that need to be processed. Beginning to queue jobs!",
        len(pending_jobs),
    )

    host = get_env_variable("NOMAD_HOST")
    port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(host, port=int(port), timeout=30)

    while len(pending_jobs) > 0:
        jobs_in_nomad = len(nomad_client.jobs.get_jobs())
        open_slots = MAX_JOBS_FOR_THIS_MODE - jobs_in_nomad

        if open_slots > 0:
            # Jobs assigned to an unavailable volume would just sit in the
            # queue, so pick from the active volumes only, and pick at
            # random to spread the work around. The lookup is done once per
            # five-minute pass to avoid hammering Nomad.
            volume_index = random.choice(list(get_active_volumes()))
            batch_size = min(open_slots, len(pending_jobs))
            for _ in range(batch_size):
                requeue_job(pending_jobs.pop(0), volume_index)

        # Give the queued work five minutes to actually get done before
        # queuing more.
        if len(pending_jobs) > 0:
            logger.info("Sleeping for 5 minutes while jobs get done.")
            time.sleep(300)

    logger.info("Successfully requeued all jobs for unprocessed %s samples.", organism_name)
def kill_nomad_job(self) -> bool:
    """Ask Nomad to deregister the Nomad job recorded on this object.

    Returns:
        False when there is no nomad_job_id or when Nomad raises a
        BaseNomadException; True when the deregister call went through.
    """
    if not self.nomad_job_id:
        return False

    try:
        host = get_env_variable("NOMAD_HOST")
        port = get_env_variable("NOMAD_PORT", "4646")
        client = Nomad(host, port=int(port), timeout=30)
        client.job.deregister_job(self.nomad_job_id)
    except nomad.api.exceptions.BaseNomadException:
        return False
    else:
        return True
def retry_lost_survey_jobs() -> None:
    """Retry survey jobs which never even got started for too long.

    A job is considered lost when it has no start_time and its Nomad job is
    neither "pending" nor "running", or when Nomad has no record of it.
    Lost jobs are handed to handle_survey_jobs.
    """
    potentially_lost_jobs = SurveyJob.objects.filter(
        success=None, retried=False, start_time=None, end_time=None, no_retry=False
    ).order_by("pk")

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    lost_jobs = []

    for job in potentially_lost_jobs:
        try:
            # Surveyor jobs didn't always have nomad_job_ids. If they
            # don't have one then by this point they've definitely died.
            if job.nomad_job_id:
                job_status = nomad_client.job.get_job(job.nomad_job_id)["Status"]
            else:
                job_status = "absent"

            # If the job is still pending, then it makes sense that it
            # hasn't started and if it's running then it may not have
            # been able to mark the job record as started yet.
            if job_status != "pending" and job_status != "running":
                logger.debug(
                    (
                        "Determined that a survey job needs to be requeued because its"
                        " Nomad Job's status is: %s."
                    ),
                    job_status,
                    job_id=job.id,
                )
                lost_jobs.append(job)
        except URLNotFoundNomadException:
            logger.debug(
                (
                    "Determined that a survey job needs to be requeued because "
                    "querying for its Nomad job failed."
                ),
                job_id=job.id,
            )
            lost_jobs.append(job)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            # This used to say "Processor Job", which pointed whoever read
            # the logs at the wrong job table.
            logger.exception("Couldn't query Nomad about Survey Job.", survey_job=job.id)

    if lost_jobs:
        logger.info("Handling lost (never-started) survey jobs!", len_jobs=len(lost_jobs))
        handle_survey_jobs(lost_jobs)
def send_job(job_type: Enum, job_id: int) -> None:
    """Dispatch a Nomad job that will run the given worker job.

    job_type has to be a member of ProcessorPipeline or Downloaders (see
    data_refinery_common.job_lookup); anything else raises ValueError.
    job_id must correspond to an existing ProcessorJob or DownloaderJob
    record.
    """
    client = nomad.Nomad(get_env_variable("NOMAD_HOST"), timeout=5)

    # Once every job is specced out with its own Nomad job, the meta won't
    # need "JOB_NAME" any more because naming the Nomad job to dispatch will
    # be enough.
    spec_by_enum = (
        (ProcessorPipeline, NOMAD_PROCESSOR_JOB),
        (Downloaders, NOMAD_DOWNLOADER_JOB),
    )
    for job_enum, spec_name in spec_by_enum:
        if job_type in list(job_enum):
            nomad_job = spec_name
            break
    else:
        raise ValueError("Invalid job_type.")

    logger.info(
        "Queuing %s nomad job to run DR job %s with id %d.", nomad_job, job_type.value, job_id
    )
    dispatch_meta = {"JOB_NAME": job_type.value, "JOB_ID": str(job_id)}
    client.job.dispatch_job(nomad_job, meta=dispatch_meta)
def get_latest_organism_index(organism):
    """Return the most recently created OrganismIndex for `organism` built
    with the current salmon version, or None when there is no such index."""
    # The stored salmon version is whatever salmon prints, which carries the
    # "salmon " prefix.
    version_number = get_env_variable("SALMON_VERSION", "0.13.1")
    matching_indices = OrganismIndex.objects.filter(
        salmon_version="salmon " + version_number, organism=organism
    )
    return matching_indices.order_by("-created_at").first()
def handle(self, *args, **options):
    """Main function for this command.

    For every GEO sample currently marked as RNA-SEQ, looks for an original
    file that is Affymetrix data. When one is found the file is downloaded,
    the brainarray package is detected if possible, and the sample is
    re-labelled as an Affymetrix microarray sample and saved. The scratch
    directory is removed when the command finishes, including when it
    finishes with an error.
    """
    # Create working dir
    LOCAL_ROOT_DIR = get_env_variable("LOCAL_ROOT_DIR", "/home/user/data_store")
    work_dir = LOCAL_ROOT_DIR + "/affy_correction/"
    os.makedirs(work_dir, exist_ok=True)

    try:
        for sample in Sample.objects.filter(technology="RNA-SEQ", source_database="GEO"):
            for original_file in sample.original_files.all():
                if original_file.is_affy_data():
                    input_file_path = work_dir + original_file.source_filename
                    download_success = _download_file(original_file.source_url, input_file_path)

                    if download_success:
                        try:
                            brainarray_package = _determine_brainarray_package(input_file_path)

                            if brainarray_package:
                                # Pass the package as a log argument rather than
                                # concatenating it into the format string, so a
                                # "%" in the name cannot break formatting.
                                logger.info(
                                    "Determined the package for sample %d is: %s",
                                    sample.id,
                                    brainarray_package,
                                )
                                # If we've detected the platform using affy, then this
                                # is the best source of truth we'll be able to get, so
                                # update the sample to match it.
                                platform_name = get_readable_affymetrix_names()[brainarray_package]

                                sample.platform_accession_code = brainarray_package
                                sample.platform_name = platform_name
                        except Exception:
                            # Detection is best-effort, but don't swallow
                            # SystemExit/KeyboardInterrupt like a bare except would.
                            logger.exception(
                                "Failed to detect platform from downloaded file %s.",
                                input_file_path,
                            )

                    # Regardless of whether we could detect the
                    # platform successfully or not, we definitely know
                    # it's an Affymetrix Microarray because that's the
                    # only one that makes .CEL files.
                    sample.technology = "MICROARRAY"
                    sample.manufacturer = "AFFYMETRIX"
                    sample.save()

                    # If there's other original files associated with
                    # this sample, we don't need them because we
                    # already corrected the platform.
                    break
    finally:
        # Cleanup after ourselves, even if something above raised.
        shutil.rmtree(work_dir)
def handle_processor_jobs(jobs: List[ProcessorJob]) -> bool:
    """For each job in jobs, either retry it or log it.

    Jobs with fewer than MAX_NUM_RETRIES retries are passed to
    requeue_processor_job; all others are passed to handle_repeated_failure.
    Processing stops early once the number of jobs Nomad reports plus the
    number of jobs requeued here reaches MAX_TOTAL_JOBS.

    Returns:
        True if every job in `jobs` was handled, False if we stopped (or
        never started) because the total jobs ceiling was reached.
    """
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)

    # Maximum number of total jobs running at a time.
    # We do this now rather than import time for testing purposes.
    MAX_TOTAL_JOBS = int(get_env_variable_gracefully("MAX_TOTAL_JOBS", DEFAULT_MAX_JOBS))
    len_all_jobs = len(nomad_client.jobs.get_jobs())
    if len_all_jobs >= MAX_TOTAL_JOBS:
        logger.info("Not requeuing job until we're running fewer jobs.")
        return False

    # We want zebrafish data first, then hgu133plus2, then data
    # related to pediatric cancer, then to finish salmon experiments
    # that are close to completion.
    # Each function moves the jobs it prioritizes to the front of the
    # list, so apply them in backwards order.
    # jobs = prioritize_salmon_jobs(jobs)
    # jobs = prioritize_jobs_by_accession(jobs, PEDIATRIC_ACCESSION_LIST)
    # jobs = prioritize_jobs_by_accession(jobs, HGU133PLUS2_ACCESSION_LIST)
    # jobs = prioritize_zebrafish_jobs(jobs)

    jobs_dispatched = 0
    for count, job in enumerate(jobs):
        if job.num_retries < MAX_NUM_RETRIES:
            requeue_processor_job(job)
            jobs_dispatched = jobs_dispatched + 1
        else:
            handle_repeated_failure(job)

        # Only ask Nomad for a fresh job count every 100 jobs so that we
        # don't query it once per job.
        if (count % 100) == 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

        if (jobs_dispatched + len_all_jobs) >= MAX_TOTAL_JOBS:
            logger.info(
                "We hit the maximum total jobs ceiling, so we're not handling any more processor jobs now."
            )
            return False

    return True
def test_survey(self):
    """Survey the given sample and wait for its downloader and processor jobs.

    The accession and surveyor to use are read from the ACCESSION and
    SURVEYOR environment variables. The test passes only if the survey job
    and every downloader job and non-aborted processor job succeed.
    """
    self.env = EnvironmentVarGuard()
    # The variable name was previously misspelled "RUNING_IN_CLOUD", so the
    # guard set a variable nothing else refers to.
    self.env.set("RUNNING_IN_CLOUD", "False")
    with self.env:
        # Clear out pre-existing work dirs so there's no conflicts:
        for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
            shutil.rmtree(work_dir)

        survey_job = surveyor.survey_experiment(
            get_env_variable("ACCESSION"), get_env_variable("SURVEYOR")
        )

        self.assertTrue(survey_job.success)

        downloader_jobs = DownloaderJob.objects.all()
        self.assertGreater(downloader_jobs.count(), 0)

        logger.info("Survey Job finished, waiting for Downloader Jobs to complete.")
        start_time = timezone.now()
        for downloader_job in downloader_jobs:
            downloader_job = wait_for_job(downloader_job, DownloaderJob, start_time)
            self.assertTrue(downloader_job.success)

        # exclude aborted processor jobs
        processor_jobs = ProcessorJob.objects.all().exclude(abort=True)
        self.assertGreater(processor_jobs.count(), 0)

        logger.info("Downloader Jobs finished, waiting for processor Jobs to complete.")
        start_time = timezone.now()
        for processor_job in processor_jobs:
            processor_job = wait_for_job(processor_job, ProcessorJob, start_time)
            if not processor_job.success:
                # Surface the reason in the log before the assertion fails.
                logger.error(processor_job.failure_reason)
            self.assertTrue(processor_job.success)
def retry_hung_survey_jobs() -> None:
    """Find survey jobs that started but never ended and hand them off for retry.

    A candidate is treated as hung when its Nomad status is anything other
    than "running" and it still has no end_time after being re-read from the
    database, or when Nomad does not know the job at all.
    """
    candidates = SurveyJob.objects.filter(
        success=None,
        retried=False,
        end_time=None,
        start_time__isnull=False,
        no_retry=False,
    ).order_by("pk")

    host = get_env_variable("NOMAD_HOST")
    port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(host, port=int(port), timeout=30)

    stalled = []
    for job in candidates:
        try:
            # Surveyor jobs didn't always get a nomad_job_id; one without
            # an id has definitely died by this point.
            status = (
                nomad_client.job.get_job(job.nomad_job_id)["Status"]
                if job.nomad_job_id
                else "absent"
            )
            if status == "running":
                continue

            # The job may have completed after the initial query ran, so
            # re-read it before deciding it is hung.
            job.refresh_from_db()
            if job.end_time is None:
                stalled.append(job)
        except URLNotFoundNomadException:
            stalled.append(job)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about SurveyJob Job.", survey_job=job.id)

    if not stalled:
        return

    logger.info(
        "Handling hung (started-but-never-finished) survey jobs!", len_jobs=len(stalled)
    )
    handle_survey_jobs(stalled)
def get_quant_results_for_experiment(experiment: Experiment, filter_old_versions=True):
    """Returns a set of salmon quant results from `experiment`.

    Only results that have a computed file with both an S3 bucket and an S3
    key are considered. When `filter_old_versions` is true, results are
    further restricted to those whose organism index was built with the
    current salmon version for one of the experiment's organisms.

    Results are grouped by the exact set of samples they are associated
    with, and only the most recently created result of each group is
    returned.
    """
    all_results = ComputationalResult.objects.filter(sample__in=experiment.samples.all())

    if filter_old_versions:
        # Salmon version gets saved as what salmon outputs, which includes this prefix.
        current_salmon_version = "salmon " + get_env_variable("SALMON_VERSION", "0.13.1")
        organisms = experiment.organisms.all()
        organism_indices = OrganismIndex.objects.filter(
            salmon_version=current_salmon_version, organism__in=organisms
        )
        all_results = all_results.filter(organism_index__id__in=organism_indices.values("id"))

    all_results = all_results.prefetch_related("computedfile_set").filter(
        computedfile__s3_bucket__isnull=False, computedfile__s3_key__isnull=False
    )

    # Group with a dict keyed on the frozen sample-id set. Sorting by sets and
    # then using groupby is not reliable: "<" on sets is the subset relation,
    # which is only a partial order, so equal sets are not guaranteed to end
    # up adjacent and one group could yield several "latest" results.
    latest_by_sample_ids = {}
    for result in all_results:
        sample_ids = frozenset(sample.id for sample in result.samples.all())
        current_latest = latest_by_sample_ids.get(sample_ids)
        if current_latest is None or result.created_at > current_latest.created_at:
            latest_by_sample_ids[sample_ids] = result

    return set(latest_by_sample_ids.values())
def get_quant_results_for_experiment(experiment: Experiment, filter_old_versions=True):
    """Return a queryset holding, for each sample of `experiment`, its newest
    eligible salmon quant result.

    A result is eligible when it has a computed file with both an S3 bucket
    and an S3 key and, if `filter_old_versions` is true, when its organism
    index was built with the current salmon version. Samples without any
    eligible result contribute nothing.
    """
    # Built on Django subquery expressions:
    # https://docs.djangoproject.com/en/2.2/ref/models/expressions/#subquery-expressions

    # The stored salmon version is whatever salmon prints, which carries the
    # "salmon " prefix.
    salmon_version = "salmon " + get_env_variable("SALMON_VERSION", "0.13.1")

    candidates = ComputationalResult.objects
    if filter_old_versions:
        candidates = candidates.prefetch_related("organism_index").filter(
            organism_index__salmon_version=salmon_version
        )
    else:
        candidates = candidates.all()

    # Results that never got uploaded can't be used.
    uploaded = candidates.select_related("computedfile").filter(
        computedfile__s3_bucket__isnull=False, computedfile__s3_key__isnull=False
    )

    # Correlated with the outer sample query through OuterRef("id"); newest
    # salmon quant result first.
    newest_first_for_sample = uploaded.filter(
        samples=OuterRef("id"),
        processor__name=ProcessorEnum.SALMON_QUANT.value["name"],
    ).order_by("-created_at")

    # Tag each sample of the experiment with the id of its newest result and
    # drop the samples that have none.
    annotated_samples = experiment.samples.all().annotate(
        latest_computational_result_id=Subquery(newest_first_for_sample.values("id")[:1])
    )
    latest_result_ids = annotated_samples.filter(
        latest_computational_result_id__isnull=False
    ).values_list("latest_computational_result_id", flat=True)

    return ComputationalResult.objects.all().filter(id__in=latest_result_ids)
def test_tximport(self):
    """find_processor creates the TXIMPORT processor once, records the
    expected environment details, and returns the same record afterwards."""
    proc_key = "TXIMPORT"

    # No processor yet
    self.assertEqual(Processor.objects.count(), 0)
    tximport_processor = utils.find_processor(proc_key)
    # New processor created
    self.assertEqual(Processor.objects.count(), 1)

    # Validate some information of the new processor
    self.assertEqual(tximport_processor.name, utils.ProcessorEnum[proc_key].value["name"])
    self.assertEqual(tximport_processor.version, get_env_variable("SYSTEM_VERSION"))
    self.assertEqual(
        tximport_processor.docker_image, utils.ProcessorEnum[proc_key].value["docker_img"]
    )
    self.assertEqual(tximport_processor.environment["os_distribution"], utils.get_os_distro())

    os_pkg_name = "r-base"
    recorded_os_pkg = tximport_processor.environment["os_pkg"][os_pkg_name]
    self.assertEqual(recorded_os_pkg, utils.get_os_pkgs([os_pkg_name])[os_pkg_name])

    pip_pkg_name = "data-refinery-common"
    recorded_pip_pkg = tximport_processor.environment["python"][pip_pkg_name]
    self.assertEqual(recorded_pip_pkg, utils.get_pip_pkgs([pip_pkg_name])[pip_pkg_name])

    r_pkg_names = ["Bioconductor", "tximport"]
    r_pkg_info = utils.get_r_pkgs(r_pkg_names)
    for r_pkg in r_pkg_names:
        self.assertEqual(tximport_processor.environment["R"][r_pkg], r_pkg_info[r_pkg])

    # Confirm that there is only one processor in one runtime environment
    for _ in range(3):
        proc2 = utils.find_processor(proc_key)
        # No new processor
        self.assertEqual(Processor.objects.count(), 1)
        # Same processor instance
        self.assertEqual(tximport_processor, proc2)
import rpy2.robjects as ro from rpy2.rinterface import RRuntimeError from data_refinery_common.job_lookup import PipelineEnum from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models import ( ComputationalResult, ComputedFile, Pipeline, SampleComputedFileAssociation, SampleResultAssociation, ) from data_refinery_common.utils import get_env_variable from data_refinery_workers.processors import utils S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery") logger = get_and_configure_logger(__name__) def _prepare_files(job_context: Dict) -> Dict: """Populate our job_context with appropriate inputs and outputs Also adds the keys "input_file_path" and "output_file_path" to job_context so everything is prepared for processing. """ original_file = job_context["original_files"][0] job_context["input_file_path"] = original_file.absolute_file_path # Turns /home/user/data_store/E-GEOD-8607/raw/foo.txt into /home/user/data_store/E-GEOD-8607/processed/foo.cel pre_part = original_file.absolute_file_path.split("/")[:-2] end_part = original_file.absolute_file_path.split("/")[-1]
OrganismIndex, OriginalFile, Pipeline, Processor, ProcessorJob, ) from data_refinery_common.utils import get_env_variable, get_env_variable_gracefully from data_refinery_workers.processors import utils logger = get_and_configure_logger(__name__) JOB_DIR_PREFIX = "processor_job_" GENE_TO_TRANSCRIPT_TEMPLATE = "{gene_id}\t{transcript_id}\n" GENE_TYPE_COLUMN = 2 S3_TRANSCRIPTOME_INDEX_BUCKET_NAME = get_env_variable_gracefully("S3_TRANSCRIPTOME_INDEX_BUCKET_NAME", False) LOCAL_ROOT_DIR = get_env_variable("LOCAL_ROOT_DIR", "/home/user/data_store") # Removes each occurrance of ; and " IDS_CLEANUP_TABLE = str.maketrans({";": None, "\"": None}) def _compute_paths(job_context: Dict) -> str: """Computes the paths for all the directories used/created by this processor. Also computes a couple other path-based properties and adds them to the job_context. """ # All files for the job are in the same directory. first_file_path = job_context["original_files"][0].absolute_file_path job_context["base_file_path"] = '/'.join(first_file_path.split('/')[:-1]) job_context["work_dir"] = job_context["base_file_path"] + '/' + job_context["length"].upper() + '/' + \ JOB_DIR_PREFIX + str(job_context["job_id"]) + "/" try:
""" import os import sys from data_refinery_common.utils import get_env_variable, get_env_variable_gracefully # Build paths inside the project like this: os.path.join(BASE_DIR, ...) BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! SECRET_KEY = get_env_variable("DJANGO_SECRET_KEY") # SECURITY WARNING: don't run with debug turned on in production! DEBUG = get_env_variable("DJANGO_DEBUG") == "True" ALLOWED_HOSTS = [] # Application definition INSTALLED_APPS = [ "django.contrib.admin", "django.contrib.auth", "django.contrib.contenttypes", "django.contrib.sessions", "django.contrib.messages",
from django.db import models from django.utils import timezone import requests from computedfields.models import ComputedFieldsModel, computed from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models.compendium_result import CompendiumResult from data_refinery_common.utils import get_env_variable logger = get_and_configure_logger(__name__) NCBI_ROOT_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" NCBI_API_KEY = get_env_variable( "NCBI_API_KEY", "3a1f8d818b0aa05d1aa3c334fa2cc9a17e09" ) # This is only used by eUtils and for organisms that aren't cached yet - it's harmless to share. ESEARCH_URL = NCBI_ROOT_URL + "esearch.fcgi" EFETCH_URL = NCBI_ROOT_URL + "efetch.fcgi" TAXONOMY_DATABASE = "taxonomy" class UnscientificNameError(Exception): pass class InvalidNCBITaxonomyId(Exception): pass class UnknownOrganismId(Exception):
def send_job(job_type: Enum, job, is_dispatch=False) -> bool:
    """Queues a worker job by sending a Nomad Job dispatch message.

    job_type must be a valid Enum for ProcessorPipelines or
    Downloaders as defined in data_refinery_common.job_lookup.
    job must be an existing ProcessorJob or DownloaderJob record.

    Processor jobs are always dispatched directly. Other jobs are dispatched
    only when `is_dispatch` is true or when not running in the cloud;
    otherwise the job's retry counter is decremented and it is left for the
    Foreman to pick up.

    Returns True if the job was successfully dispatched or left for the
    Foreman.

    Raises:
        ValueError: if job_type is one of the NONE types or is not a known
            job type.
        Exception: any error raised while dispatching to Nomad is logged
            and re-raised.
    """
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = nomad.Nomad(nomad_host, port=int(nomad_port), timeout=30)

    is_processor = True
    # The NONE types are rejected first. Downloaders.NONE used to be checked
    # after the `in list(Downloaders)` test, which matched it first, so it was
    # dispatched instead of rejected. Both NONE branches also referenced an
    # undefined `job_id` name.
    if job_type is Downloaders.NONE:
        logger.warning("Not queuing %s job.", job_type, job_id=job.id)
        raise ValueError(NONE_JOB_ERROR_TEMPLATE.format(job_type.value, "Downloader", job.id))
    elif job_type is ProcessorPipeline.NONE:
        logger.warning("Not queuing %s job.", job_type, job_id=job.id)
        raise ValueError(NONE_JOB_ERROR_TEMPLATE.format(job_type.value, "Processor", job.id))
    elif (
        job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_LONG
        or job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_SHORT
    ):
        nomad_job = NOMAD_TRANSCRIPTOME_JOB
    elif job_type is ProcessorPipeline.SALMON or job_type is ProcessorPipeline.TXIMPORT:
        # Tximport uses the same job specification as Salmon.
        nomad_job = ProcessorPipeline.SALMON.value
    elif job_type is ProcessorPipeline.AFFY_TO_PCL:
        nomad_job = ProcessorPipeline.AFFY_TO_PCL.value
    elif job_type is ProcessorPipeline.NO_OP:
        nomad_job = ProcessorPipeline.NO_OP.value
    elif job_type is ProcessorPipeline.ILLUMINA_TO_PCL:
        nomad_job = ProcessorPipeline.ILLUMINA_TO_PCL.value
    elif job_type is ProcessorPipeline.SMASHER:
        nomad_job = ProcessorPipeline.SMASHER.value
    elif job_type is ProcessorPipeline.JANITOR:
        nomad_job = ProcessorPipeline.JANITOR.value
    elif job_type is ProcessorPipeline.QN_REFERENCE:
        nomad_job = ProcessorPipeline.QN_REFERENCE.value
    elif job_type is ProcessorPipeline.CREATE_COMPENDIA:
        nomad_job = ProcessorPipeline.CREATE_COMPENDIA.value
    elif job_type is ProcessorPipeline.CREATE_QUANTPENDIA:
        nomad_job = ProcessorPipeline.CREATE_QUANTPENDIA.value
    elif job_type is ProcessorPipeline.AGILENT_TWOCOLOR_TO_PCL:
        # Agilent twocolor uses the same job specification as Affy.
        nomad_job = ProcessorPipeline.AFFY_TO_PCL.value
    elif job_type in list(Downloaders):
        nomad_job = NOMAD_DOWNLOADER_JOB
        is_processor = False
    elif job_type in list(SurveyJobTypes):
        nomad_job = job_type.value
        is_processor = False
    else:
        raise ValueError("Invalid job_type: {}".format(job_type.value))

    logger.debug(
        "Queuing %s nomad job to run job %s with id %d.", nomad_job, job_type.value, job.id
    )

    # We only want to dispatch processor jobs directly.
    # Everything else will be handled by the Foreman, which will increment the retry counter.
    if is_processor or is_dispatch or (not settings.RUNNING_IN_CLOUD):

        # Smasher doesn't need to be on a specific instance since it will
        # download all the data to its instance anyway.
        if isinstance(job, ProcessorJob) and job_type not in SMASHER_JOB_TYPES:
            # Make sure this job goes to the correct EBS resource.
            # If this is being dispatched for the first time, make sure that
            # we store the currently attached index.
            # If this is being dispatched by the Foreman, it should already
            # have an attached volume index, so use that.
            if job.volume_index is None:
                job.volume_index = get_volume_index()
                job.save()
            nomad_job = nomad_job + "_" + job.volume_index + "_" + str(job.ram_amount)
        elif isinstance(job, SurveyJob):
            nomad_job = nomad_job + "_" + str(job.ram_amount)
        elif isinstance(job, DownloaderJob):
            volume_index = job.volume_index if settings.RUNNING_IN_CLOUD else "0"
            nomad_job = nomad_job + "_" + volume_index + "_" + str(job.ram_amount)

        try:
            nomad_response = nomad_client.job.dispatch_job(
                nomad_job, meta={"JOB_NAME": job_type.value, "JOB_ID": str(job.id)}
            )
            # Remember which Nomad job is running this record.
            job.nomad_job_id = nomad_response["DispatchedJobID"]
            job.save()
            return True
        except URLNotFoundNomadException:
            logger.info(
                "Dispatching Nomad job of type %s for job spec %s to host %s and port %s failed.",
                job_type,
                nomad_job,
                nomad_host,
                nomad_port,
                job=str(job.id),
            )
            raise
        except Exception as e:
            logger.info(
                "Unable to Dispatch Nomad Job.",
                job_name=job_type.value,
                job_id=str(job.id),
                reason=str(e),
            )
            raise
    else:
        # Not dispatched here: decrement the retry counter, since the
        # Foreman will increment it when it dispatches the job.
        job.num_retries = job.num_retries - 1
        job.save()
    return True
from django.utils import timezone import boto3 import pandas as pd import psutil import requests from botocore.exceptions import ClientError from sklearn import preprocessing from data_refinery_common.enums import PipelineEnum from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models import ComputedFile, Pipeline from data_refinery_common.utils import calculate_file_size, calculate_sha1, get_env_variable from data_refinery_workers.processors import smashing_utils, utils RESULTS_BUCKET = get_env_variable("S3_RESULTS_BUCKET_NAME", "refinebio-results-bucket") S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery") AWS_REGION = get_env_variable( "AWS_REGION", "us-east-1") # Default to us-east-1 if the region variable can't be found BODY_HTML = (Path("data_refinery_workers/processors/smasher_email.min.html"). read_text().replace("\n", "")) BODY_ERROR_HTML = ( Path("data_refinery_workers/processors/smasher_email_error.min.html" ).read_text().replace("\n", "")) BYTES_IN_GB = 1024 * 1024 * 1024 logger = get_and_configure_logger(__name__) ### DEBUG ### logger.setLevel(logging.getLevelName("DEBUG")) PROCESS_POOL_SIZE = max(1, int(psutil.cpu_count() / 2 - 1))
def test_all_endpoints(self):
    """Issue a GET against each listed API route and check the status code.

    Every route is expected to answer 200, except create_dataset which is
    expected to answer 405 for GET. The experiments response must also carry
    the SYSTEM_VERSION in its X-Source-Revision header.
    """

    def fetch(route_name, suffix="", *client_args):
        # Resolve the versioned route, append any path/query suffix and GET it.
        url = reverse(route_name, kwargs={"version": API_VERSION}) + suffix
        return self.client.get(url, *client_args)

    def assert_ok(api_response):
        self.assertEqual(api_response.status_code, status.HTTP_200_OK)

    experiments_response = fetch("experiments")
    assert_ok(experiments_response)
    self.assertEqual(
        experiments_response["X-Source-Revision"], get_env_variable("SYSTEM_VERSION")
    )

    assert_ok(fetch("samples"))
    assert_ok(fetch("samples", "", {"ids": str(self.sample.id) + ",1000"}))
    assert_ok(
        fetch("samples", "", {"accession_codes": str(self.sample.accession_code) + ",1000"})
    )

    # Don't know the best way to deal with the "1/" detail routes below:
    # since the other tests in different files create objects which are
    # then deleted, the new objects from these tests will have different
    # IDs. In this case, since this file is ran first, the IDs are 1, but
    # this may be a problem in the future.
    plain_get_routes = (
        ("organisms", ""),
        ("organisms", "HOMO_SAPIENS/"),
        ("platforms", ""),
        ("institutions", ""),
        ("survey_jobs", ""),
        ("downloader_jobs", ""),
        ("downloader_jobs", "1/"),  # change back
        ("processor_jobs", ""),
        ("processor_jobs", "1/"),
        ("stats", ""),
        ("results", ""),
        ("results", "1/"),
        ("schema_redoc", ""),
        ("transcriptome_indices", "?organism__name=DANIO_RERIO"),
        ("transcriptome_indices", "?result_id=1"),
        ("search", ""),
        ("transcriptome_indices", ""),
    )
    for route_name, suffix in plain_get_routes:
        assert_ok(fetch(route_name, suffix))

    self.assertEqual(
        fetch("create_dataset").status_code, status.HTTP_405_METHOD_NOT_ALLOWED
    )
import signal import sys from django.utils import timezone from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models import DownloaderJob, DownloaderJobOriginalFileAssociation from data_refinery_common.utils import get_env_variable, get_instance_id logger = get_and_configure_logger(__name__) # Let this fail if SYSTEM_VERSION is unset. SYSTEM_VERSION = get_env_variable("SYSTEM_VERSION") CURRENT_JOB = None def signal_handler(sig, frame): """Signal Handler, works for both SIGTERM and SIGINT""" global CURRENT_JOB if CURRENT_JOB: CURRENT_JOB.success = False CURRENT_JOB.end_time = timezone.now() CURRENT_JOB.num_retries = CURRENT_JOB.num_retries - 1 CURRENT_JOB.failure_reason = "Interruped by SIGTERM/SIGINT: " + str(sig) CURRENT_JOB.save() sys.exit(0) def start_job(job_id: int) -> DownloaderJob: """Record in the database that this job is being started.
Processor, ProcessorJob, ProcessorJobDatasetAssociation, ProcessorJobOriginalFileAssociation, Sample, ) from data_refinery_common.utils import ( get_env_variable, get_env_variable_gracefully, get_instance_id, ) logger = get_and_configure_logger(__name__) # Let this fail if SYSTEM_VERSION is unset. SYSTEM_VERSION = get_env_variable("SYSTEM_VERSION") S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery") DIRNAME = os.path.dirname(os.path.abspath(__file__)) CURRENT_JOB = None def signal_handler(sig, frame): """Signal Handler, works for both SIGTERM and SIGINT""" global CURRENT_JOB if not CURRENT_JOB: sys.exit(0) else: CURRENT_JOB.start_time = None CURRENT_JOB.num_retries = CURRENT_JOB.num_retries - 1 CURRENT_JOB.failure_reason = "Caught either a SIGTERM or SIGINT signal." CURRENT_JOB.success = False
from data_refinery_common.enums import PipelineEnum from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models import ( ComputationalResult, ComputedFile, Pipeline, SampleAnnotation, SampleComputedFileAssociation, SampleResultAssociation, ) from data_refinery_common.utils import get_env_variable, get_internal_microarray_accession from data_refinery_workers.processors import utils logger = get_and_configure_logger(__name__) LOCAL_ROOT_DIR = get_env_variable("LOCAL_ROOT_DIR", "/home/user/data_store") S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery") def _prepare_files(job_context: Dict) -> Dict: """A processor which takes externally-processed sample data and makes it smashable. """ try: original_file = job_context["original_files"][0] sample0 = job_context["samples"][0] if sample0.manufacturer == "ILLUMINA": job_context["is_illumina"] = True else: job_context["is_illumina"] = False # All files for the job are in the same directory.
def test_all_endpoints(self):
    """GET every public route once, then verify request throttling.

    Checks performed:

    * each regular route answers 200;
    * the ``experiments`` reply carries an ``X-Source-Revision`` header
      equal to the ``SYSTEM_VERSION`` environment variable;
    * job list and detail payloads report ``is_queued`` as false;
    * ``create_dataset`` rejects GET with 405;
    * an unknown query parameter on ``samples`` yields 400 and is named
      in the ``details`` list;
    * a burst of ``survey_jobs`` requests ends in a 429.

    ``cache.clear()`` is called after most requests; the last three
    requests before the burst are intentionally left uncleared.
    """

    def fetch(route, suffix="", query=None):
        # Resolve the versioned URL for `route`, append `suffix`, and GET it.
        url = reverse(route, kwargs={"version": API_VERSION}) + suffix
        if query is None:
            return self.client.get(url)
        return self.client.get(url, query)

    def fetch_ok(route, suffix="", query=None):
        # Same as fetch(), but the reply must be a 200.
        reply = fetch(route, suffix, query)
        self.assertEqual(reply.status_code, status.HTTP_200_OK)
        return reply

    experiments = fetch_ok("experiments")
    self.assertEqual(experiments["X-Source-Revision"], get_env_variable("SYSTEM_VERSION"))
    cache.clear()

    fetch_ok("samples")
    cache.clear()

    fetch_ok("samples", query={"ids": str(self.sample.id) + ",1000"})
    cache.clear()

    fetch_ok("samples", query={"accession_codes": str(self.sample.accession_code) + ",1000"})
    cache.clear()

    for route, suffix in (
        ("organisms", ""),
        ("organisms", "HOMO_SAPIENS/"),
        ("platforms", ""),
        ("institutions", ""),
    ):
        fetch_ok(route, suffix)
        cache.clear()

    # Job endpoints: list view first, then the detail view for id 1.
    # The hard-coded id only works because this file runs first; tests in
    # other files create and delete objects, which shifts the ids handed
    # out here, so this may become a problem in the future.
    for route in ("survey_jobs", "downloader_jobs", "processor_jobs"):
        listing = fetch_ok(route)
        self.assertFalse(listing.data["results"][0]["is_queued"])
        cache.clear()

        detail = fetch_ok(route, "1/")  # change back
        self.assertFalse(detail.data["is_queued"])
        cache.clear()

    for route, suffix in (
        ("stats", ""),
        ("results", ""),
        ("results", "1/"),
        ("schema_redoc", ""),
        ("transcriptome_indices", "?organism__name=DANIO_RERIO"),
        ("transcriptome_indices", "?result_id=1"),
        ("search", ""),
    ):
        fetch_ok(route, suffix)
        cache.clear()

    # From here on the cache is NOT cleared, so these three requests stay
    # on the books when the throttle is exercised below.
    fetch_ok("transcriptome_indices")

    not_allowed = fetch("create_dataset")
    self.assertEqual(not_allowed.status_code, status.HTTP_405_METHOD_NOT_ALLOWED)

    rejected = fetch("samples", "?foo=bar")
    self.assertEqual(rejected.status_code, status.HTTP_400_BAD_REQUEST)
    self.assertListEqual(rejected.json()["details"], ["foo"])

    # Per the original author's note, the tenth request since the last
    # cache reset should be throttled and three have already happened.
    # Send more than strictly needed to be sure the limit is crossed.
    throttled = None
    for _ in range(15):
        throttled = fetch("survey_jobs")
    self.assertEqual(throttled.status_code, status.HTTP_429_TOO_MANY_REQUESTS)
from django.utils import timezone import boto3 import pandas as pd import psutil import requests from botocore.exceptions import ClientError from sklearn import preprocessing from data_refinery_common.job_lookup import PipelineEnum from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models import ComputedFile, Pipeline from data_refinery_common.utils import calculate_file_size, calculate_sha1, get_env_variable from data_refinery_workers.processors import smashing_utils, utils RESULTS_BUCKET = get_env_variable("S3_RESULTS_BUCKET_NAME", "refinebio-results-bucket") S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery") BODY_HTML = (Path("data_refinery_workers/processors/smasher_email.min.html"). read_text().replace("\n", "")) BODY_ERROR_HTML = ( Path("data_refinery_workers/processors/smasher_email_error.min.html" ).read_text().replace("\n", "")) BYTES_IN_GB = 1024 * 1024 * 1024 logger = get_and_configure_logger(__name__) ### DEBUG ### logger.setLevel(logging.getLevelName("DEBUG")) PROCESS_POOL_SIZE = max(1, int(psutil.cpu_count() / 2 - 1)) SCALERS = { "MINMAX": preprocessing.MinMaxScaler,
def _find_and_remove_expired_jobs(job_context):
    """Find expired jobs and remove their working directories.

    Every entry directly under ``LOCAL_ROOT_DIR`` is examined:

    * ``processor_job_<id>`` entries (other than ones containing
      ``_index``) are deleted unless Nomad reports the matching job's
      status as "running". An entry is kept when Nomad or the database
      cannot be queried, and deleted when the database has no such job or
      Nomad has no record of it.
    * Entries whose name contains ``SRP``, ``ERP`` or ``DRR`` are scanned
      one level down: a child is deleted when a ``Sample`` with that
      accession code exists and has at least one computed file.

    Deletion is best-effort: a failure on one entry never stops the sweep.

    Args:
        job_context: The job context dict. It gains ``deleted_items`` (a
            list of the paths that were removed) and ``success``, which
            is always set to ``True``.

    Returns:
        The same ``job_context`` dict.
    """
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=15)

    job_context['deleted_items'] = []

    for item in os.listdir(LOCAL_ROOT_DIR):

        # Processor job working directories
        if 'processor_job_' in item:

            # TX Index jobs are the only ones who are allowed to hang around
            # after their jobs are finished. They're marked with an _index in their path.
            if '_index' in item:
                continue

            job_id = item.split('processor_job_')[1]

            # Okay, does this job exist?
            try:
                job = ProcessorJob.objects.get(id=job_id)

                # Is this job running?
                try:
                    job_status = nomad_client.job.get_job(
                        job.nomad_job_id)["Status"]

                    # This job is running, don't delete the working directory.
                    if job_status == "running":
                        continue
                except URLNotFoundNomadException:
                    # Nomad has no record of this job, meaning it has likely been GC'd after death.
                    # It can be purged.
                    pass
                except BaseNomadException:
                    # If we can't currently access Nomad,
                    # just continue until we can again.
                    continue
                except Exception:
                    # This job is likely vanished. No need for this directory.
                    # Or, possibly, another Nomad error outside of BaseNomadException.
                    logger.exception("Janitor found vanished job for " + item + " - why?")
                    continue
            except ProcessorJob.DoesNotExist:
                # This job has vanished from the DB - clean it up!
                logger.error("Janitor found no record of " + item + " - why?")
            except Exception:
                # We're unable to connect to the DB right now (or something), so hold onto it for right now.
                logger.exception("Problem finding job record for " + item + " - why?")
                continue

            # Delete it!
            try:
                to_delete = LOCAL_ROOT_DIR + '/' + item
                logger.info("Janitor deleting " + to_delete,
                            contents=str(os.listdir(to_delete)))
                shutil.rmtree(to_delete)
                job_context['deleted_items'].append(to_delete)
            except Exception:
                # Deliberately best-effort: the directory has most likely
                # vanished already, so there is nothing left to remove.
                pass

        # There may be successful processors
        # NOTE(review): 'SRP' and 'ERP' look like project-level accession
        # prefixes while 'DRR' looks like a run-level one - confirm whether
        # 'DRP' was intended.
        if 'SRP' in item or 'ERP' in item or 'DRR' in item:
            sub_path = os.path.join(LOCAL_ROOT_DIR, item)

            try:
                sub_items = os.listdir(sub_path)
            except OSError:
                # The entry is a plain file, or it disappeared after the
                # top-level listing was taken. Either way there is nothing
                # to clean here, and one odd entry must not abort the sweep.
                logger.info("Janitor could not list " + sub_path + " - skipping.")
                continue

            for sub_item in sub_items:
                try:
                    sample = Sample.objects.get(accession_code=sub_item)
                    if sample.computed_files.count() == 0:
                        # This doesn't have any associated computed files - leave it be.
                        continue
                except Sample.DoesNotExist:
                    # Interesting. This shouldn't happen at all.
                    continue
                except Exception:
                    # We can't contact the DB right now, skip deletion.
                    continue

                try:
                    sub_item_path = os.path.join(sub_path, sub_item)
                    logger.info("Janitor deleting " + sub_item_path,
                                contents=str(os.listdir(sub_item_path)))
                    shutil.rmtree(sub_item_path)
                    job_context['deleted_items'].append(sub_item_path)
                except Exception:
                    # Deliberately best-effort: the directory has most likely
                    # vanished already, so there is nothing left to remove.
                    pass

    job_context['success'] = True
    return job_context
from data_refinery_common.logging import get_and_configure_logger from data_refinery_common.models import ( ComputationalResult, ComputationalResultAnnotation, ComputedFile, Pipeline, Processor, SampleComputedFileAssociation, SampleResultAssociation, Organism ) from data_refinery_common.utils import get_env_variable from data_refinery_workers.processors import utils, smasher S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery") S3_COMPENDIA_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery-compendia") logger = get_and_configure_logger(__name__) def _prepare_input(job_context: Dict) -> Dict: # We're going to use the smasher outside of the smasher. # I'm not crazy about this yet. Maybe refactor later, # but I need the data now. job_context = smasher._prepare_files(job_context) job_context = smasher._smash(job_context, how="outer") if not 'final_frame' in job_context.keys(): logger.error("Unable to prepare files for creating compendia.", job_id=job_context['job'].id)
# Generated by Django 2.1.5 on 2019-04-04 14:27 import sys from django.conf import settings from django.db import migrations from data_refinery_common.utils import get_env_variable # We want this to throw if it can't access this, no point in running a # migration to set everything to a bad value. S3_QN_TARGET_BUCKET_NAME = get_env_variable("S3_QN_TARGET_BUCKET_NAME") def update_qn_bucket(apps, schema_editor): """Sets the s3_bucket for QN Targets to a bucket just for them. Based off of: https://simpleisbetterthancomplex.com/tutorial/2017/09/26/how-to-create-django-data-migrations.html We can't import the ComputedFile model directly as it may be a newer version than this migration expects. We use the historical version. """ if not settings.RUNNING_IN_CLOUD: return # Pagination isn't necessary here because we have very few QN targets. ComputedFile = apps.get_model("data_refinery_common", "ComputedFile") for computed_file in ComputedFile.objects.filter(is_qn_target=True): if not computed_file.s3_bucket or not computed_file.s3_key: