Example #1
    def test_get_instance_id_local(self):
        """Test that local is used for instance id."""
        # Ensure utils.INSTANCE_ID hasn't been set yet in case the
        # order the tests are run in ever changes
        utils.INSTANCE_ID = None

        with self.settings(RUNNING_IN_CLOUD=False):
            self.assertEqual(utils.get_instance_id(), "local")

        # Ensure that the second call uses the now-set global value
        # by changing what settings would tell it.
        with self.settings(RUNNING_IN_CLOUD=True):
            self.assertEqual(utils.get_instance_id(), "local")
Example #2
    def test_get_instance_id_cloud(self, mock_get):
        """Test that a request is made and the global value is stored"""
        # Ensure utils.INSTANCE_ID hasn't been set yet in case the
        # order the tests are run in ever changes
        utils.INSTANCE_ID = None
        mock_get.return_value = Mock(ok=True)
        mock_get.return_value.text = "instance_id"

        with self.settings(RUNNING_IN_CLOUD=True):
            self.assertEqual(utils.get_instance_id(), "instance_id")

        # Ensure that the second call uses the now-set global value.
        # (By resetting the mocks, calling it again, and checking that
        # the values didn't need to be set again).
        mock_get.reset_mock()
        utils.get_instance_id()
        mock_get.assert_not_called()
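
Taken together, these two tests pin down the behavior of data_refinery_common.utils.get_instance_id: the result is cached in the module-level INSTANCE_ID global, the instance metadata service is only queried when RUNNING_IN_CLOUD is set, and "local" is returned otherwise. A minimal sketch of a function with that behavior follows; the metadata URL and timeout are assumptions, not taken from the tests.

import requests
from django.conf import settings

INSTANCE_ID = None


def get_instance_id() -> str:
    """Return this worker's instance id, caching it after the first lookup (sketch)."""
    global INSTANCE_ID
    if INSTANCE_ID is not None:
        return INSTANCE_ID

    if settings.RUNNING_IN_CLOUD:
        # Hypothetical metadata endpoint and timeout; the real values may differ.
        response = requests.get(
            "http://169.254.169.254/latest/meta-data/instance-id", timeout=5
        )
        if response.ok:
            INSTANCE_ID = response.text
    else:
        INSTANCE_ID = "local"

    return INSTANCE_ID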
Example #3
def start_job(job_id: int) -> DownloaderJob:
    """Record in the database that this job is being started.

    Retrieves the job from the database and returns it after marking
    it as started.
    """
    logger.debug("Starting Downloader Job.", downloader_job=job_id)
    try:
        job = DownloaderJob.objects.get(id=job_id)
    except DownloaderJob.DoesNotExist:
        logger.error("Cannot find downloader job record.",
                     downloader_job=job_id)
        raise

    worker_id = get_instance_id()

    # This job should not have been started.
    if job.start_time is not None:
        logger.error("This downloader job has already been started!!!",
                     downloader_job=job.id)
        raise Exception(
            "downloaders.start_job called on a job that has already been started!"
        )

    # Set up the SIGTERM handler so we can appropriately handle being interrupted.
    # (`docker stop` uses SIGTERM, not SIGINT, but better to catch both.)
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    job.worker_id = worker_id
    job.worker_version = SYSTEM_VERSION
    job.start_time = timezone.now()
    job.save()

    needs_downloading = any(original_file.needs_downloading()
                            for original_file in job.original_files.all())

    if not needs_downloading:
        logger.error("No files associated with this job need to be downloaded! Aborting!",
                     job_id=job.id)
        job.start_time = timezone.now()
        job.failure_reason = "Was told to redownload file(s) that are already downloaded!"
        job.success = False
        job.no_retry = True
        job.end_time = timezone.now()
        job.save()
        sys.exit(0)

    global CURRENT_JOB
    CURRENT_JOB = job

    return job
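
start_job registers a signal_handler that none of these snippets define. A plausible minimal version, assuming it uses the CURRENT_JOB global set above to mark the interrupted job as failed before exiting (the exact fields and failure message are guesses based on the surrounding code):

def signal_handler(sig, frame):
    """Mark the in-flight job as failed when the container receives SIGTERM/SIGINT (sketch)."""
    if CURRENT_JOB:
        CURRENT_JOB.success = False
        CURRENT_JOB.end_time = timezone.now()
        CURRENT_JOB.failure_reason = "Interrupted by SIGTERM/SIGINT."
        CURRENT_JOB.save()
    sys.exit(0)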
Example #4
    def test_dharma(self):
        """Test that start_job commits harakiri when the node is already at its downloader job limit."""
        dlj1 = DownloaderJob()
        dlj1.accession_code = 'D1'
        dlj1.worker_id = get_instance_id()
        dlj1.start_time = datetime.datetime.now()
        dlj1.save()

        dlj2 = DownloaderJob()
        dlj2.accession_code = 'D2'
        dlj2.worker_id = get_instance_id()
        dlj2.start_time = datetime.datetime.now()
        dlj2.save()

        dlj3 = DownloaderJob()
        dlj3.accession_code = 'D3'
        dlj3.worker_id = get_instance_id()
        dlj3.save()

        original_file = OriginalFile()
        original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip"
        original_file.source_filename = "Waldhof_020604_R30_01-2753_U133A.CEL"
        original_file.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.downloader_job = dlj3
        assoc.save()

        sample = Sample()
        sample.accession_code = 'Blahblahblah'
        sample.technology = "MICROARRAY"
        sample.manufacturer = "AFFYMETRIX"
        sample.has_raw = True
        sample.platform_accession_code = "hgu133a"
        sample.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample, original_file=original_file)

        exited = False
        try:
            utils.start_job(dlj3.id,
                            max_downloader_jobs_per_node=2,
                            force_harakiri=True)
        except SystemExit:
            # This is supposed to happen!
            exited = True
        except Exception:
            # This isn't!
            self.fail("start_job raised an unexpected exception instead of exiting.")
        self.assertTrue(exited)

        exited = False
        try:
            utils.start_job(dlj3.id,
                            max_downloader_jobs_per_node=15,
                            force_harakiri=True)
        except SystemExit:
            # This is not supposed to happen!
            self.fail("start_job exited even though the node was under its downloader job limit.")
        except Exception:
            # An ordinary exception here is acceptable.
            pass
        self.assertFalse(exited)
Example #5
def start_job(job_id: int,
              max_downloader_jobs_per_node=MAX_DOWNLOADER_JOBS_PER_NODE,
              force_harakiri=False) -> DownloaderJob:
    """Record in the database that this job is being started.

    Retrieves the job from the database and returns it after marking
    it as started.
    """
    logger.debug("Starting Downloader Job.", downloader_job=job_id)
    try:
        job = DownloaderJob.objects.get(id=job_id)
    except DownloaderJob.DoesNotExist:
        logger.error("Cannot find downloader job record.",
                     downloader_job=job_id)
        raise

    worker_id = get_instance_id()
    num_downloader_jobs_currently_running = DownloaderJob.objects.filter(
        worker_id=worker_id,
        start_time__isnull=False,
        end_time__isnull=True,
        success__isnull=True,
        retried=False).count()

    # Death and rebirth.
    # if settings.RUNNING_IN_CLOUD or force_harakiri:
    #     if num_downloader_jobs_currently_running >= int(max_downloader_jobs_per_node):
    #         # Wait for the death window
    #         while True:
    #             seconds = datetime.datetime.now().second
    #             # Mass harakiri happens every 15 seconds.
    #             if seconds % 15 == 0:
    #                 job.start_time = None
    #                 job.num_retries = job.num_retries - 1
    #                 job.failure_reason = "Killed by harakiri"
    #                 job.success = False
    #                 job.save()

    #                 # What is dead may never die!
    #                 sys.exit(0)

    # This job should not have been started.
    if job.start_time is not None:
        logger.error("This downloader job has already been started!!!",
                     downloader_job=job.id)
        raise Exception(
            "downloaders.start_job called on a job that has already been started!"
        )

    # Set up the SIGTERM handler so we can appropriately handle being interrupted.
    # (`docker stop` uses SIGTERM, not SIGINT; however, Nomad sends a SIGINT, so catch both.)
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    job.worker_id = worker_id
    job.worker_version = SYSTEM_VERSION
    job.start_time = timezone.now()
    job.save()

    needs_downloading = False
    for original_file in job.original_files.all():
        if original_file.needs_downloading():
            needs_downloading = True
            break

    if not needs_downloading:
        logger.error("No files associated with this job need to be downloaded! Aborting!",
                     job_id=job.id)
        job.start_time = timezone.now()
        job.failure_reason = "Was told to redownload file(s) that are already downloaded!"
        job.success = False
        job.no_retry = True
        job.end_time = timezone.now()
        job.save()
        sys.exit(0)

    global CURRENT_JOB
    CURRENT_JOB = job

    return job
Example #6
import logging
import sys

from django.conf import settings

import daiquiri

from data_refinery_common.utils import get_env_variable_gracefully, get_instance_id

# Most of the formatting in this string is for the logging system. All
# that the call to format() does is replace the "{0}" in the string
# with the worker id.
FORMAT_STRING = ("%(asctime)s {0} %(name)s %(color)s%(levelname)s%(extras)s"
                 ": %(message)s%(color_stop)s").format(get_instance_id())
LOG_LEVEL = None


def unconfigure_root_logger():
    """Prevents the root logger from duplicating our messages.

    The root handler comes preconfigured with a handler. This causes
    all our logs to be logged twice, once with our cool handler and
    one that lacks all context. This function removes that stupid
    extra handler.
    """
    root_logger = logging.getLogger(None)
    # Remove all handlers
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)
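
The %(color)s, %(extras)s, and %(color_stop)s placeholders in FORMAT_STRING match daiquiri's color/extras formatter, so the module presumably feeds this string into daiquiri when it configures logging. A sketch of what that wiring might look like, reusing the imports above (setup_logging is a hypothetical name, and the real module likely derives the level from get_env_variable_gracefully rather than a default argument):

def setup_logging(level=logging.INFO):
    """Configure daiquiri so every log line carries this worker's instance id (sketch)."""
    unconfigure_root_logger()
    daiquiri.setup(
        level=level,
        outputs=[
            daiquiri.output.Stream(
                sys.stdout,
                formatter=daiquiri.formatter.ColorExtrasFormatter(fmt=FORMAT_STRING),
            )
        ],
    )


# Hypothetical usage:
# setup_logging()
# logger = daiquiri.getLogger(__name__)
# logger.info("Job started.", downloader_job=123)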

Example #7
def start_job(job_context: Dict):
    """A processor function to start jobs.

    Records in the database that this job is being started, retrieves
    the job's batches from the database, and adds them to the
    dictionary passed in under the key 'batches'.
    """
    job = job_context["job"]

    # This job should not have been started.
    if job.start_time is not None and settings.RUNNING_IN_CLOUD:

        if job.success:
            failure_reason = "ProcessorJob has already completed successfully - why are we here again? Bad Nomad!"
            logger.error(failure_reason, job_id=job.id)
            job_context["original_files"] = []
            job_context["computed_files"] = []
            job_context["abort"] = True
            # Will be saved by end_job.
            job_context["job"].failure_reason = failure_reason
            return job_context
        if job.success is False:
            failure_reason = "ProcessorJob has already completed with a failure - why are we here again? Bad Nomad!"
            logger.error(failure_reason, job_id=job.id)
            job_context["original_files"] = []
            job_context["computed_files"] = []
            job_context["abort"] = True
            # Will be saved by end_job.
            job_context["job"].failure_reason = failure_reason
            return job_context

        logger.error("This processor job has already been started!!!", processor_job=job.id)
        raise Exception("processors.start_job called on job %s that has already been started!" % str(job.id))

    original_file = job.original_files.first()
    if original_file and not original_file.needs_processing(job_context["job_id"]):
        failure_reason = ("Sample has a good computed file, it must have been processed, "
                          "so it doesn't need to be downloaded! Aborting!")
        logger.error(failure_reason, job_id=job.id, original_file=original_file)
        job_context["original_files"] = []
        job_context["computed_files"] = []
        job_context['abort'] = True
        # Will be saved by end_job.
        job_context['job'].failure_reason = failure_reason
        return job_context

    # Set up the SIGTERM handler so we can appropriately handle being interrupted.
    # (`docker stop` uses SIGTERM, not SIGINT; however, Nomad sends a SIGINT, so catch both.)
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    job.worker_id = get_instance_id()
    job.worker_version = SYSTEM_VERSION
    job.start_time = timezone.now()
    job.save()

    global CURRENT_JOB
    CURRENT_JOB = job

    logger.debug("Starting processor Job.", processor_job=job.id, pipeline=job.pipeline_applied)

    # Janitor jobs don't operate on file objects.
    # Tximport jobs don't need to download the original file, they
    # just need it to know what experiment to process.
    if job.pipeline_applied not in ["JANITOR", "TXIMPORT"]:
        # Some jobs take OriginalFiles, others take Datasets.
        if job.pipeline_applied not in ["SMASHER", "QN_REFERENCE", "COMPENDIA"]:
            job_context = prepare_original_files(job_context)
            if not job_context.get("success", True):
                return job_context
        else:
            job_context = prepare_dataset(job_context)
            if not job_context.get("success", True):
                return job_context
    else:
        # Just in case
        job_context["original_files"] = []
        job_context["computed_files"] = []

    return job_context
Example #8
def start_job(job_context: Dict):
    """A processor function to start jobs.

    Records in the database that this job is being started, retrieves
    the job's batches from the database, and adds them to the
    dictionary passed in under the key 'batches'.
    """
    job = job_context["job"]

    original_file = job.original_files.first()
    if (
        not job.pipeline_applied == ProcessorPipeline.TXIMPORT.value
        and original_file
        and not original_file.needs_processing(job_context["job_id"])
    ):
        failure_reason = (
            "Sample has a good computed file, it must have been processed, "
            "so it doesn't need to be downloaded! Aborting!"
        )
        logger.error(failure_reason, job_id=job.id, original_file=original_file)
        job_context["original_files"] = []
        job_context["computed_files"] = []
        job_context["abort"] = True
        # Will be saved by end_job.
        job_context["job"].failure_reason = failure_reason
        return job_context

    # Set up the SIGTERM handler so we can appropriately handle being interrupted.
    # (`docker stop` uses SIGTERM, not SIGINT; however, Nomad sends a SIGINT, so catch both.)
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    # This job should not have been started, for some reason Nomad restarts some of our jobs
    # https://github.com/AlexsLemonade/refinebio/issues/1487
    if job.start_time is not None and settings.RUNNING_IN_CLOUD:
        # Let's just log the event and let the job run instead of failing,
        # and also reset the end time and failure reason, since those fields
        # might have been set.
        logger.warn(
            "ProcessorJob was restarted by Nomad. We do not know why this happened.",
            processor_job=job.id,
            success=job.success,
            failure_reason=job.failure_reason,
            start_time=job.start_time,
            end_time=job.end_time,
        )
        job.end_time = None
        job.failure_reason = None

    job.worker_id = get_instance_id()
    job.worker_version = SYSTEM_VERSION
    job.start_time = timezone.now()
    job.save()

    global CURRENT_JOB
    CURRENT_JOB = job

    logger.debug("Starting processor Job.", processor_job=job.id, pipeline=job.pipeline_applied)

    # Janitor jobs don't operate on file objects.
    # Tximport jobs don't need to download the original file, they
    # just need it to know what experiment to process.
    if job.pipeline_applied not in [
        ProcessorPipeline.JANITOR.value,
        ProcessorPipeline.TXIMPORT.value,
    ]:
        # Some jobs take OriginalFiles, others take Datasets.
        if ProcessorPipeline[job.pipeline_applied] not in SMASHER_JOB_TYPES:
            job_context = prepare_original_files(job_context)
            if not job_context.get("success", True):
                return job_context
        else:
            job_context = prepare_dataset(job_context)
            if not job_context.get("success", True):
                return job_context
    else:
        # Just in case
        job_context["original_files"] = []
        job_context["computed_files"] = []

    return job_context
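
In every version shown here, start_job is one link in a chain of processor functions that each accept and return job_context, signalling problems through the "success" and "abort" keys. A minimal sketch of how such a chain could be driven (run_pipeline, do_work, and end_job are hypothetical names used only for illustration):

def run_pipeline(job_context: Dict, steps) -> Dict:
    """Apply each processor function in order, stopping early on failure or abort (sketch)."""
    for step in steps:
        job_context = step(job_context)
        if job_context.get("abort") or not job_context.get("success", True):
            break
    return job_context


# Hypothetical usage, where do_work and end_job stand in for real pipeline steps:
# job_context = run_pipeline({"job": job, "job_id": job.id}, [start_job, do_work, end_job])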
Example #9
def start_job(job_context: Dict):
    """A processor function to start jobs.

    Records in the database that this job is being started, retrieves
    the job's batches from the database, and adds them to the
    dictionary passed in under the key 'batches'.
    """
    job = job_context["job"]

    # This job should not have been started.
    if job.start_time is not None and settings.RUNNING_IN_CLOUD:

        if job.success:
            logger.error(
                "ProcessorJob has already completed successfully - why are we here again? Bad Nomad!",
                job_id=job.id)
            job_context["original_files"] = []
            job_context["computed_files"] = []
            job_context["abort"] = True
            return job_context
        if job.success is False:
            logger.error(
                "ProcessorJob has already completed with a failure - why are we here again? Bad Nomad!",
                job_id=job.id)
            job_context["original_files"] = []
            job_context["computed_files"] = []
            job_context["abort"] = True
            return job_context

        logger.error("This processor job has already been started!!!",
                     processor_job=job.id)
        raise Exception(
            "processors.start_job called on job %s that has already been started!"
            % str(job.id))

    # Set up the SIGTERM handler so we can appropriately handle being interrupted.
    # (`docker stop` uses SIGTERM, not SIGINT; however, Nomad sends a SIGINT, so catch both.)
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    job.worker_id = get_instance_id()
    job.worker_version = SYSTEM_VERSION
    job.start_time = timezone.now()
    job.save()

    global CURRENT_JOB
    CURRENT_JOB = job

    logger.debug("Starting processor Job.",
                 processor_job=job.id,
                 pipeline=job.pipeline_applied)

    # Janitor jobs don't operate on file objects.
    if job.pipeline_applied not in ["JANITOR"]:
        # Some jobs take OriginalFiles, others take Datasets.
        if job.pipeline_applied not in [
                "SMASHER", "QN_REFERENCE", "COMPENDIA"
        ]:
            job_context = prepare_original_files(job_context)
            if not job_context.get("success", True):
                return job_context
        else:
            job_context = prepare_dataset(job_context)
            if not job_context.get("success", True):
                return job_context
    else:
        # Just in case
        job_context["original_files"] = []
        job_context["computed_files"] = []

    return job_context