Example #1
    def calculate_dashboard_stats(cls, range_param):
        """ The dashboard doesn't need all of the stats, and we can
        significantly reduce the request time by only crunching the stats the
        dashboard cares about"""
        data = {}
        data["generated_on"] = timezone.now()
        data["survey_jobs"] = cls._get_job_stats(SurveyJob.objects,
                                                 range_param)
        data["downloader_jobs"] = cls._get_job_stats(DownloaderJob.objects,
                                                     range_param)
        data["processor_jobs"] = cls._get_job_stats(ProcessorJob.objects,
                                                    range_param)
        data["experiments"] = cls._get_object_stats(Experiment.objects,
                                                    range_param)

        # processed and unprocessed samples stats
        data["unprocessed_samples"] = cls._get_object_stats(
            Sample.objects.filter(is_processed=False), range_param,
            "last_modified")
        data["processed_samples"] = cls._get_object_stats(
            Sample.processed_objects, range_param, "last_modified")

        data["active_volumes"] = list(get_active_volumes())
        data["dataset"] = cls._get_dataset_stats(range_param)

        data.update(get_nomad_jobs_breakdown())
        return data
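
This snippet leans on a private helper, cls._get_job_stats, that is not shown. A minimal sketch of what such a helper could look like is below, assuming each job model has success, start_time, and created_at fields and that range_param is a number of days; this is an illustration, not the project's actual helper.

from datetime import timedelta

from django.db.models import Count, Q
from django.utils import timezone


def _get_job_stats(jobs, range_param):
    """Sketch: aggregate per-status job counts in a single query."""
    if range_param:
        # Hypothetical interpretation of range_param as "last N days".
        cutoff = timezone.now() - timedelta(days=int(range_param))
        jobs = jobs.filter(created_at__gte=cutoff)

    return jobs.aggregate(
        total=Count("id"),
        successful=Count("id", filter=Q(success=True)),
        failed=Count("id", filter=Q(success=False)),
        pending=Count("id", filter=Q(success__isnull=True,
                                     start_time__isnull=True)),
        open=Count("id", filter=Q(success__isnull=True,
                                  start_time__isnull=False)),
    )
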
Example #2
def run_tximport():
    """Creates a tximport job for all eligible experiments."""
    eligible_experiments = (Experiment.objects.annotate(
        num_organisms=Count("organisms")).filter(
            num_organisms=1, technology="RNA-SEQ",
            num_processed_samples=0).prefetch_related("samples__results"))

    paginator = Paginator(eligible_experiments, PAGE_SIZE)
    page = paginator.page(1)

    # Next we need to figure out how many samples were processed for
    # each experiment. We should be able to reuse the salmon code
    # since it already does this.
    tximport_pipeline = ProcessorPipeline.TXIMPORT

    while True:
        creation_count = 0

        for experiment in page.object_list:
            quant_results = get_quant_results_for_experiment(experiment)

            if should_run_tximport(experiment, quant_results, True):
                processor_job = ProcessorJob()
                processor_job.pipeline_applied = tximport_pipeline.value
                processor_job.ram_amount = 8192
                # This job doesn't need to run on a specific volume
                # but it uses the same Nomad job as Salmon jobs which
                # do require the volume index.
                processor_job.volume_index = random.choice(
                    list(get_active_volumes()))
                processor_job.save()

                assoc = ProcessorJobOriginalFileAssociation()
                # Any original file linked to any sample of the
                # experiment will work. Tximport is somewhat special
                # in that it doesn't actually use original files, so
                # this is just used to point to the experiment.
                assoc.original_file = (
                    experiment.samples.all()[0].original_files.all()[0]
                )
                assoc.processor_job = processor_job
                assoc.save()

                creation_count += 1

                try:
                    send_job(tximport_pipeline, processor_job)
                except Exception:
                    # If we cannot queue the job now the Foreman will do
                    # it later.
                    pass

        logger.info(
            "Created %d tximport jobs for experiments past the thresholds.",
            creation_count)

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())
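
The eligibility decision is delegated to should_run_tximport, which is not shown here. A rough sketch of the kind of threshold test such a helper might perform is below; the 80% cutoff and the way eligible samples are counted are assumptions for illustration only.

def should_run_tximport(experiment, quant_results, is_tximport_job):
    """Sketch: decide whether enough samples have quant results to make
    running tximport worthwhile (thresholds are assumptions)."""
    num_eligible = experiment.samples.count()
    if num_eligible == 0:
        return False

    fraction_quantified = len(quant_results) / num_eligible

    # Always run once every sample has been quantified; when dispatched
    # as a dedicated tximport job, accept a lower (assumed) threshold.
    if fraction_quantified >= 1.0:
        return True
    return is_tximport_job and fraction_quantified >= 0.8
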
Example #3
def retry_hung_processor_jobs() -> None:
    """Retry processor jobs that were started but never finished.

    Ignores Janitor jobs since they are queued every half hour anyway."""
    try:
        active_volumes = get_active_volumes()
    except Exception:
        # If we cannot reach Nomad now, then we can wait until a later loop.
        return

    potentially_hung_jobs = ProcessorJob.objects.filter(
        success=None,
        retried=False,
        end_time=None,
        start_time__isnull=False,
        no_retry=False,
        volume_index__in=active_volumes).exclude(
            pipeline_applied="JANITOR").prefetch_related(
                "original_files__samples")

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    hung_jobs = []
    for job in potentially_hung_jobs:
        try:
            job_status = nomad_client.job.get_job(job.nomad_job_id)["Status"]
            if job_status != "running":
                # Make sure it didn't finish since our original query.
                job.refresh_from_db()
                if job.end_time is None:
                    hung_jobs.append(job)
        except URLNotFoundNomadException:
            hung_jobs.append(job)
        except TypeError:
            # Almost certainly a python-nomad issue:
            # File "/usr/local/lib/python3.5/dist-packages/nomad/api/job.py", line 63, in get_job
            #   return self.request(id, method="get").json()
            # File "/usr/local/lib/python3.5/dist-packages/nomad/api/base.py", line 74, in request
            #   endpoint = self._endpoint_builder(self.ENDPOINT, *args)
            # File "/usr/local/lib/python3.5/dist-packages/nomad/api/base.py", line 28, in _endpoint_builder
            #   u = "/".join(args)
            # TypeError: sequence item 1: expected str instance, NoneType found
            logger.info("Couldn't query Nomad about Processor Job.",
                        processor_job=job.id)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about Processor Job.",
                             processor_job=job.id)

    if hung_jobs:
        logger.info(
            "Handling hung (started-but-never-finished) processor jobs!",
            len_jobs=len(hung_jobs))
        handle_processor_jobs(hung_jobs)
Example #4
    def handle(self, *args, **options):
        """Requeues all unprocessed RNA-Seq samples for an organism.
        """
        if options["organism_name"] is None:
            logger.error("You must specify an organism-name.")
            sys.exit(1)
        else:
            organism_name = options["organism_name"]

        organism = Organism.objects.get(name=organism_name)

        prioritized_job_list = build_prioritized_jobs_list(organism)

        if not len(prioritized_job_list):
            logger.info(
                "Found no samples that need to be processed. I guess I'm done!"
            )
            sys.exit(0)

        logger.info(
            "Found %d samples that need to be processed. Beginning to queue jobs!",
            len(prioritized_job_list),
        )

        nomad_host = get_env_variable("NOMAD_HOST")
        nomad_port = get_env_variable("NOMAD_PORT", "4646")
        nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)

        while len(prioritized_job_list) > 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

            num_short_from_max = MAX_JOBS_FOR_THIS_MODE - len_all_jobs
            if num_short_from_max > 0:
                # We don't want these jobs to sit in our queue because
                # the volume we assigned isn't available, so only use
                # active volumes. Also, in order to spread the jobs
                # around, pick the volume randomly. We don't want to
                # hammer Nomad to get the active volumes though, so
                # just do it once per 5-minute loop.
                volume_index = random.choice(list(get_active_volumes()))
                for i in range(num_short_from_max):
                    if len(prioritized_job_list) > 0:
                        requeue_job(prioritized_job_list.pop(0), volume_index)

            # Wait 5 minutes in between queuing additional work to
            # give it time to actually get done.
            if len(prioritized_job_list) > 0:
                logger.info("Sleeping for 5 minutes while jobs get done.")
                time.sleep(300)

        logger.info(
            "Successfully requeued all jobs for unprocessed %s samples.",
            organism_name)
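
Only handle() is shown above; in a Django management command, the --organism-name option it reads would normally be declared in add_arguments(). The minimal sketch below shows that declaration; the class skeleton and help text are illustrative, not the project's actual code.

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Requeues all unprocessed RNA-Seq samples for an organism."

    def add_arguments(self, parser):
        # argparse exposes "--organism-name" as options["organism_name"],
        # which is what handle() checks above; it defaults to None when
        # the flag is omitted.
        parser.add_argument("--organism-name", type=str, default=None)
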
Example #5
def retry_failed_processor_jobs() -> None:
    """Handle processor jobs that were marked as a failure.

    Ignores Janitor jobs since they are queued every half hour anyway."""
    try:
        active_volumes = get_active_volumes()
    except Exception:
        # If we cannot reach Nomad now, then we can wait until a later loop.
        return

    failed_jobs = ProcessorJob.objects.filter(
        success=False, retried=False, volume_index__in=active_volumes).exclude(
            pipeline_applied="JANITOR").prefetch_related(
                "original_files__samples")

    failed_jobs_list = list(failed_jobs)

    if failed_jobs_list:
        logger.info(
            "Handling failed (explicitly-marked-as-failure) processor jobs!",
            len_jobs=len(failed_jobs_list))
        handle_processor_jobs(failed_jobs_list)
Example #6
def send_janitor_jobs():
    """Dispatch a Janitor job for each instance in the cluster"""
    try:
        active_volumes = get_active_volumes()
    except Exception:
        # If we cannot reach Nomad now, then we can wait until a later loop.
        return

    for volume_index in active_volumes:
        new_job = ProcessorJob(num_retries=0,
                               pipeline_applied="JANITOR",
                               ram_amount=2048,
                               volume_index=volume_index)
        new_job.save()
        logger.info("Sending Janitor with index: ",
                    job_id=new_job.id,
                    index=volume_index)
        try:
            send_job(ProcessorPipeline["JANITOR"],
                     job=new_job,
                     is_dispatch=True)
        except Exception:
            # If we can't dispatch this job, something else has gone
            # wrong; skip it and move on to the next volume.
            continue
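
Every example in this section calls get_active_volumes(), which is not shown. A rough sketch of how such a helper could be built with python-nomad is below, assuming each ready Nomad client node advertises its EBS volume index under Meta["volume_index"]; the metadata key and the helper body are assumptions, not the project's actual implementation (get_env_variable is the same settings helper used in the surrounding examples).

import nomad


def get_active_volumes() -> set:
    """Sketch: return the volume indices mounted on ready Nomad nodes."""
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = nomad.Nomad(nomad_host, port=int(nomad_port), timeout=30)

    volumes = set()
    for node in nomad_client.nodes.get_nodes():
        # Node stubs don't carry metadata, so fetch the full node record.
        node_detail = nomad_client.node.get_node(node["ID"])
        meta = node_detail.get("Meta") or {}
        if node_detail.get("Status") == "ready" and "volume_index" in meta:
            volumes.add(meta["volume_index"])

    return volumes
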
Example #7
def cleanup_the_queue():
    """This cleans up any jobs which cannot currently be queued.

    We often have more volumes than instances because we have enough
    volumes for the scenario where the entire cluster is using the
    smallest instance type; however, that doesn't happen very often.
    It is therefore possible for some volumes to not be mounted, which
    means that jobs constrained to run on instances with those volumes
    cannot be placed and just clog up the queue.

    So we clear out jobs of that type every once in a while to keep
    our queue dedicated to jobs that can actually be placed.
    """
    # Smasher and QN Reference jobs aren't tied to a specific EBS volume.
    indexed_job_types = [
        e.value for e in ProcessorPipeline
        if e.value not in ["SMASHER", "QN_REFERENCE"]
    ]

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = nomad.Nomad(nomad_host, port=int(nomad_port), timeout=30)

    try:
        active_volumes = get_active_volumes()
        jobs = nomad_client.jobs.get_jobs()
    except Exception:
        # If we cannot reach Nomad now then we can wait until a later loop.
        return

    jobs_to_kill = []
    for job in jobs:
        # Skip over the Parameterized Jobs because we need those to
        # always be running.
        if "ParameterizedJob" not in job or job["ParameterizedJob"]:
            continue

        for job_type in indexed_job_types:
            # We're only concerned with jobs that have to be tied to a volume index.
            if "ParentID" not in job or not job["ParentID"].startswith(
                    job_type):
                continue

            # If this job has an index, then its ParentID will
            # have the pattern of <job-type>_<index>_<RAM-amount>
            # and we want to check the value of <index>:
            split_parent_id = job["ParentID"].split("_")
            if len(split_parent_id) < 2:
                continue
            else:
                index = split_parent_id[-2]

            if index not in active_volumes:
                # The index for this job isn't currently mounted, kill
                # the job and decrement the retry counter (since it
                # will be incremented when it is requeued).
                try:
                    nomad_client.job.deregister_job(job["ID"], purge=True)
                    # Look up the existing job record so the decrement
                    # applies to its actual retry counter.
                    job_record = ProcessorJob.objects.get(
                        nomad_job_id=job["ID"])
                    job_record.num_retries = job_record.num_retries - 1
                    job_record.save()
                except Exception:
                    # If we can't do this for some reason, we'll get it next loop.
                    pass
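
As a concrete illustration of the ParentID convention described in the comments above, a dispatched job might carry a (made-up) ParentID like the one below; taking the second-to-last underscore-separated field yields the volume index even when the job type itself contains underscores.

# Hypothetical ParentID following the <job-type>_<index>_<RAM-amount> pattern.
parent_id = "TRANSCRIPTOME_INDEX_LONG_3_12288"

split_parent_id = parent_id.split("_")
volume_index = split_parent_id[-2]  # -> "3"
ram_amount = split_parent_id[-1]    # -> "12288"
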
Example #8
def retry_lost_processor_jobs() -> None:
    """Retry processor jobs which never even got started for too long.

    Ignores Janitor jobs since they are queued every half hour anyway."""
    try:
        active_volumes = get_active_volumes()
    except Exception:
        # If we cannot reach Nomad now, then we can wait until a later loop.
        return

    potentially_lost_jobs = ProcessorJob.objects.filter(
        success=None,
        retried=False,
        start_time=None,
        end_time=None,
        no_retry=False,
        volume_index__in=active_volumes).exclude(
            pipeline_applied="JANITOR").prefetch_related(
                "original_files__samples")

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=5)
    lost_jobs = []
    for job in potentially_lost_jobs:
        try:
            if job.nomad_job_id:
                job_status = nomad_client.job.get_job(
                    job.nomad_job_id)["Status"]
                # If the job is still pending, then it makes sense that it
                # hasn't started and if it's running then it may not have
                # been able to mark the job record as started yet.
                if job_status != "pending" and job_status != "running":
                    logger.debug((
                        "Determined that a processor job needs to be requeued because its"
                        " Nomad Job's status is: %s."),
                                 job_status,
                                 job_id=job.id)
                    lost_jobs.append(job)
            else:
                # If there is no nomad_job_id field set, we could be
                # in the small window where the job was created but
                # hasn't yet gotten a chance to be queued.
                # If this job really should be restarted we'll get it in the next loop.
                if timezone.now() - job.created_at > MIN_LOOP_TIME:
                    lost_jobs.append(job)
        except URLNotFoundNomadException:
            logger.debug(
                "Determined that a processor job needs to be requeued because "
                "querying for its Nomad job failed.",
                job_id=job.id)
            lost_jobs.append(job)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about Processor Job.",
                             processor_job=job.id)

    if lost_jobs:
        logger.info("Handling lost (never-started) processor jobs!",
                    len_jobs=len(lost_jobs))
        handle_processor_jobs(lost_jobs)
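
Taken together, these examples sketch the moving parts of a Foreman-style monitoring loop. The snippet below is a minimal, illustrative way to drive the retry, cleanup, and janitor helpers shown above on a schedule; the two-minute loop interval, the half-hour janitor cadence, and the ordering are assumptions rather than the project's actual main loop.

import time
from datetime import timedelta

from django.utils import timezone

# Assumed cadences; the examples above only imply that janitors run
# roughly every half hour.
MIN_LOOP_TIME = timedelta(minutes=2)
JANITOR_DISPATCH_TIME = timedelta(minutes=30)


def monitor_jobs():
    """Sketch: periodically run the retry/cleanup helpers shown above."""
    last_janitorial_time = timezone.now()

    while True:
        loop_start = timezone.now()

        retry_lost_processor_jobs()
        retry_hung_processor_jobs()
        retry_failed_processor_jobs()
        cleanup_the_queue()

        if timezone.now() - last_janitorial_time > JANITOR_DISPATCH_TIME:
            send_janitor_jobs()
            last_janitorial_time = timezone.now()

        # Sleep out the remainder of the loop interval so we don't
        # hammer Nomad or the database.
        elapsed = timezone.now() - loop_start
        if elapsed < MIN_LOOP_TIME:
            time.sleep((MIN_LOOP_TIME - elapsed).total_seconds())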