Example #1
def handle_survey_jobs(jobs: List[SurveyJob]) -> bool:
    """For each job in jobs, either retry it or log it."""

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    # Maximum number of total jobs running at a time.
    # We read this here rather than at import time for testing purposes.
    MAX_TOTAL_JOBS = int(
        get_env_variable_gracefully("MAX_TOTAL_JOBS", DEFAULT_MAX_JOBS))
    len_all_jobs = len(nomad_client.jobs.get_jobs())
    if len_all_jobs >= MAX_TOTAL_JOBS:
        logger.info("Not requeuing job until we're running fewer jobs.")
        return False

    jobs_dispatched = 0
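    # Dispatch jobs until we hit the global MAX_TOTAL_JOBS ceiling; the Nomad
    # job count is refreshed every 100 iterations below.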
    for count, job in enumerate(jobs):
        if job.num_retries < MAX_NUM_RETRIES:
            requeue_survey_job(job)
            jobs_dispatched = jobs_dispatched + 1
        else:
            handle_repeated_failure(job)

        if (count % 100) == 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

        if (jobs_dispatched + len_all_jobs) >= MAX_TOTAL_JOBS:
            logger.info(
                "We hit the maximum total jobs ceiling, so we're not handling any more survey jobs now."
            )
            return False

    return True
Example #2
def retry_hung_downloader_jobs() -> None:
    """Retry downloader jobs that were started but never finished."""
    potentially_hung_jobs = DownloaderJob.objects.filter(
        success=None,
        retried=False,
        end_time=None,
        start_time__isnull=False,
        no_retry=False).prefetch_related("original_files__samples")

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    hung_jobs = []
    for job in potentially_hung_jobs:
        try:
            job_status = nomad_client.job.get_job(job.nomad_job_id)["Status"]
            if job_status != "running":
                # Make sure it didn't finish since our original query.
                job.refresh_from_db()
                if job.end_time is None:
                    hung_jobs.append(job)
        except URLNotFoundNomadException:
            hung_jobs.append(job)
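        # Other Nomad API errors are re-raised so the caller knows Nomad
        # itself may be unavailable.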
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about Downloader Job.",
                             downloader_job=job.id)

    if hung_jobs:
        logger.info(
            "Handling hung (started-but-never-finished) downloader jobs!",
            jobs_count=len(hung_jobs))
        handle_downloader_jobs(hung_jobs)
Example #3
def retry_hung_processor_jobs() -> None:
    """Retry processor jobs that were started but never finished.

    Ignores Janitor jobs since they are queued every half hour anyway."""
    try:
        active_volumes = get_active_volumes()
    except Exception:
        # If we cannot reach Nomad now then we can wait until a later loop.
        return

    potentially_hung_jobs = ProcessorJob.objects.filter(
        success=None,
        retried=False,
        end_time=None,
        start_time__isnull=False,
        no_retry=False,
        volume_index__in=active_volumes).exclude(
            pipeline_applied="JANITOR").prefetch_related(
                "original_files__samples")

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    hung_jobs = []
    for job in potentially_hung_jobs:
        try:
            job_status = nomad_client.job.get_job(job.nomad_job_id)["Status"]
            if job_status != "running":
                # Make sure it didn't finish since our original query.
                job.refresh_from_db()
                if job.end_time is None:
                    hung_jobs.append(job)
        except URLNotFoundNomadException:
            hung_jobs.append(job)
        except TypeError:
            # Almost certainly a python-nomad issue:
            # File "/usr/local/lib/python3.5/dist-packages/nomad/api/job.py", line 63, in get_job
            #   return self.request(id, method="get").json()
            # File "/usr/local/lib/python3.5/dist-packages/nomad/api/base.py", line 74, in request
            #   endpoint = self._endpoint_builder(self.ENDPOINT, *args)
            # File "/usr/local/lib/python3.5/dist-packages/nomad/api/base.py", line 28, in _endpoint_builder
            #   u = "/".join(args)
            # TypeError: sequence item 1: expected str instance, NoneType found
            logger.info("Couldn't query Nomad about Processor Job.",
                        processor_job=job.id)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about Processor Job.",
                             processor_job=job.id)

    if hung_jobs:
        logger.info(
            "Handling hung (started-but-never-finished) processor jobs!",
            len_jobs=len(hung_jobs))
        handle_processor_jobs(hung_jobs)
Example #4
    def handle(self, *args, **options):
        """Requeues all unprocessed RNA-Seq samples for an organism.
        """
        if options["organism_name"] is None:
            logger.error("You must specify an organism-name.")
            sys.exit(1)
        else:
            organism_name = options["organism_name"]

        organism = Organism.objects.get(name=organism_name)

        prioritized_job_list = build_prioritized_jobs_list(organism)

        if not len(prioritized_job_list):
            logger.info(
                "Found no samples that need to be processed. I guess I'm done!"
            )
            sys.exit(0)

        logger.info(
            "Found %d samples that need to be processed. Beginning to queue jobs!",
            len(prioritized_job_list),
        )

        nomad_host = get_env_variable("NOMAD_HOST")
        nomad_port = get_env_variable("NOMAD_PORT", "4646")
        nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)

        while len(prioritized_job_list) > 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

            num_short_from_max = MAX_JOBS_FOR_THIS_MODE - len_all_jobs
            if num_short_from_max > 0:
                # We don't want these jobs to sit in our queue because
                # the volume we assigned isn't available, so only use
                # active volumes. Also, to spread the jobs around, pick
                # a volume at random. We don't want to hammer Nomad for
                # the active volumes though, so only do it once per
                # 5-minute loop.
                volume_index = random.choice(list(get_active_volumes()))
                for i in range(num_short_from_max):
                    if len(prioritized_job_list) > 0:
                        requeue_job(prioritized_job_list.pop(0), volume_index)

            # Wait 5 minutes in between queuing additional work to
            # give it time to actually get done.
            if len(prioritized_job_list) > 0:
                logger.info("Sleeping for 5 minutes while jobs get done.")
                time.sleep(300)

        logger.info(
            "Successfully requeued all jobs for unprocessed %s samples.",
            organism_name)
Example #5
    def kill_nomad_job(self) -> bool:
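        """Deregister this job's Nomad job, if it has one.

        Returns True if the deregistration call succeeded, False otherwise.
        """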
        if not self.nomad_job_id:
            return False

        try:
            nomad_host = get_env_variable("NOMAD_HOST")
            nomad_port = get_env_variable("NOMAD_PORT", "4646")
            nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
            nomad_client.job.deregister_job(self.nomad_job_id)
        except nomad.api.exceptions.BaseNomadException:
            return False

        return True
Example #6
def retry_lost_survey_jobs() -> None:
    """Retry survey jobs which never even got started for too long."""
    potentially_lost_jobs = SurveyJob.objects.filter(
        success=None,
        retried=False,
        start_time=None,
        end_time=None,
        no_retry=False).order_by('pk')

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    lost_jobs = []

    for job in potentially_lost_jobs:
        try:
            # Surveyor jobs didn't always have nomad_job_ids. If they
            # don't have one then by this point they've definitely died.
            if job.nomad_job_id:
                job_status = nomad_client.job.get_job(
                    job.nomad_job_id)["Status"]
            else:
                job_status = "absent"

            # If the job is still pending, then it makes sense that it
            # hasn't started and if it's running then it may not have
            # been able to mark the job record as started yet.
            if job_status != "pending" and job_status != "running":
                logger.debug((
                    "Determined that a survey job needs to be requeued because its"
                    " Nomad Job's status is: %s."),
                             job_status,
                             job_id=job.id)
                lost_jobs.append(job)
        except URLNotFoundNomadException:
            logger.debug(
                ("Determined that a survey job needs to be requeued because "
                 "querying for its Nomad job failed."),
                job_id=job.id)
            lost_jobs.append(job)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about Processor Job.",
                             survey_job=job.id)

    if lost_jobs:
        logger.info("Handling lost (never-started) survey jobs!",
                    len_jobs=len(lost_jobs))
        handle_survey_jobs(lost_jobs)
Example #7
def send_job(job_type: Enum, job_id: int) -> None:
    """Queues a worker job by sending a Nomad Job dispatch message.

    job_type must be a valid Enum for ProcessorPipelines or
    Downloaders as defined in data_refinery_common.job_lookup.
    job_id must correspond to an existing ProcessorJob or
    DownloaderJob record.
    """
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_client = nomad.Nomad(nomad_host, timeout=5)

    # Once I have every job specced out with its own Nomad job, this
    # code can change and the meta won't need "JOB_NAME" in it because
    # just specifying the nomad_job to dispatch will be enough.
    if job_type in list(ProcessorPipeline):
        nomad_job = NOMAD_PROCESSOR_JOB
    elif job_type in list(Downloaders):
        nomad_job = NOMAD_DOWNLOADER_JOB
    else:
        raise ValueError("Invalid job_type.")

    logger.info("Queuing %s nomad job to run DR job %s with id %d.",
                nomad_job,
                job_type.value,
                job_id)
    nomad_client.job.dispatch_job(nomad_job, meta={"JOB_NAME": job_type.value,
                                                   "JOB_ID": str(job_id)})
Example #8
def get_latest_organism_index(organism):
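    """Return the newest OrganismIndex for `organism` built with the current
    salmon version, or None if no such index exists."""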
    # Salmon version gets saved as what salmon outputs, which includes this prefix.
    current_salmon_version = "salmon " + get_env_variable(
        "SALMON_VERSION", "0.13.1")
    return (OrganismIndex.objects.filter(
        salmon_version=current_salmon_version,
        organism=organism).order_by("-created_at").first())
Example #9
    def handle(self, *args, **options):
        """Main function for this command.

        Basically does what is described at the top of this file.
        """
        # Create working dir
        LOCAL_ROOT_DIR = get_env_variable("LOCAL_ROOT_DIR",
                                          "/home/user/data_store")
        work_dir = LOCAL_ROOT_DIR + "/affy_correction/"
        os.makedirs(work_dir, exist_ok=True)
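        # Check every GEO sample currently labeled RNA-SEQ; any whose original
        # files turn out to be Affymetrix CEL data gets corrected to MICROARRAY below.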
        for sample in Sample.objects.filter(technology="RNA-SEQ",
                                            source_database="GEO"):
            for original_file in sample.original_files.all():
                if original_file.is_affy_data():
                    input_file_path = work_dir + original_file.source_filename
                    download_success = _download_file(original_file.source_url,
                                                      input_file_path)

                    if download_success:
                        try:
                            brainarray_package = _determine_brainarray_package(
                                input_file_path)

                            if brainarray_package:
                                logger.info(
                                    "Determined the package for sample %d is: "
                                    + brainarray_package,
                                    sample.id,
                                )
                                # If we've detected the platform using affy, then this
                                # is the best source of truth we'll be able to get, so
                                # update the sample to match it.
                                platform_name = get_readable_affymetrix_names(
                                )[brainarray_package]

                                sample.platform_accession_code = brainarray_package
                                sample.platform_name = platform_name
                        except Exception:
                            logger.exception(
                                "Failed to detect platform from downloaded file %s.",
                                input_file_path,
                            )

                    # Regardless of whether we could detect the
                    # platform successfully or not, we definitely know
                    # it's an Affymetrix Microarray because that's the
                    # only one that makes .CEL files.
                    sample.technology = "MICROARRAY"
                    sample.manufacturer = "AFFYMETRIX"
                    sample.save()

                    # If there's other original files associated with
                    # this sample, we don't need them because we
                    # already corrected the platform.
                    break

        # Cleanup after ourselves:
        shutil.rmtree(work_dir)
Example #10
def handle_processor_jobs(jobs: List[ProcessorJob]) -> bool:
    """For each job in jobs, either retry it or log it."""

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    # Maximum number of total jobs running at a time.
    # We read this here rather than at import time for testing purposes.
    MAX_TOTAL_JOBS = int(
        get_env_variable_gracefully("MAX_TOTAL_JOBS", DEFAULT_MAX_JOBS))
    len_all_jobs = len(nomad_client.jobs.get_jobs())
    if len_all_jobs >= MAX_TOTAL_JOBS:
        logger.info("Not requeuing job until we're running fewer jobs.")
        return False

    # We want zebrafish data first, then hgu133plus2, then data
    # related to pediatric cancer, then to finish salmon experiments
    # that are close to completion.
    # Each function moves the jobs it prioritizes to the front of the
    # list, so apply them in backwards order.
    # jobs = prioritize_salmon_jobs(jobs)
    # jobs = prioritize_jobs_by_accession(jobs, PEDIATRIC_ACCESSION_LIST)
    # jobs = prioritize_jobs_by_accession(jobs, HGU133PLUS2_ACCESSION_LIST)
    # jobs = prioritize_zebrafish_jobs(jobs)

    jobs_dispatched = 0
    for count, job in enumerate(jobs):
        if job.num_retries < MAX_NUM_RETRIES:
            requeue_processor_job(job)
            jobs_dispatched = jobs_dispatched + 1
        else:
            handle_repeated_failure(job)

        if (count % 100) == 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

        if (jobs_dispatched + len_all_jobs) >= MAX_TOTAL_JOBS:
            logger.info(
                "We hit the maximum total jobs ceiling, so we're not handling any more processor jobs now."
            )
            return False

    return True
Example #11
    def test_survey(self):
        """Survey the given sample"""

        # Clear out pre-existing work dirs so there are no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set("RUNING_IN_CLOUD", "False")
        with self.env:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            survey_job = surveyor.survey_experiment(
                get_env_variable("ACCESSION"), get_env_variable("SURVEYOR"))

            self.assertTrue(survey_job.success)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertGreater(downloader_jobs.count(), 0)

            logger.info(
                "Survey Job finished, waiting for Downloader Jobs to complete."
            )
            start_time = timezone.now()
            for downloader_job in downloader_jobs:
                downloader_job = wait_for_job(downloader_job, DownloaderJob,
                                              start_time)
                self.assertTrue(downloader_job.success)

            processor_jobs = ProcessorJob.objects.all().exclude(
                abort=True)  # exclude aborted processor jobs
            self.assertGreater(processor_jobs.count(), 0)

            logger.info(
                "Downloader Jobs finished, waiting for Processor Jobs to complete."
            )
            start_time = timezone.now()
            for processor_job in processor_jobs:
                processor_job = wait_for_job(processor_job, ProcessorJob,
                                             start_time)
                if not processor_job.success:
                    logger.error(processor_job.failure_reason)
                self.assertTrue(processor_job.success)
Example #12
def retry_hung_survey_jobs() -> None:
    """Retry survey jobs that were started but never finished."""
    potentially_hung_jobs = SurveyJob.objects.filter(
        success=None,
        retried=False,
        end_time=None,
        start_time__isnull=False,
        no_retry=False).order_by('pk')

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    hung_jobs = []
    for job in potentially_hung_jobs:
        try:
            # Surveyor jobs didn't always have nomad_job_ids. If they
            # don't have one then by this point they've definitely died.
            if job.nomad_job_id:
                job_status = nomad_client.job.get_job(
                    job.nomad_job_id)["Status"]
            else:
                job_status = "absent"

            if job_status != "running":
                # Make sure it didn't finish since our original query.
                job.refresh_from_db()
                if job.end_time is None:
                    hung_jobs.append(job)
        except URLNotFoundNomadException:
            hung_jobs.append(job)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about SurveyJob Job.",
                             survey_job=job.id)

    if hung_jobs:
        logger.info("Handling hung (started-but-never-finished) survey jobs!",
                    len_jobs=len(hung_jobs))
        handle_survey_jobs(hung_jobs)
Example #13
def get_quant_results_for_experiment(experiment: Experiment,
                                     filter_old_versions=True):
    """Returns a set of salmon quant results from `experiment`."""
    # Subquery to calculate quant results
    # https://docs.djangoproject.com/en/2.2/ref/models/expressions/#subquery-expressions
    all_results = ComputationalResult.objects.filter(
        sample__in=experiment.samples.all())

    if filter_old_versions:
        # Salmon version gets saved as what salmon outputs, which includes this prefix.
        current_salmon_version = "salmon " + get_env_variable(
            "SALMON_VERSION", "0.13.1")
        organisms = experiment.organisms.all()
        organism_indices = OrganismIndex.objects.filter(
            salmon_version=current_salmon_version, organism__in=organisms)
        all_results = all_results.filter(
            organism_index__id__in=organism_indices.values("id"))

    all_results = all_results.prefetch_related("computedfile_set").filter(
        computedfile__s3_bucket__isnull=False,
        computedfile__s3_key__isnull=False)

    def get_sample_id_set(result):
        return {sample.id for sample in result.samples.all()}

    def get_sorted_sample_ids(result):
        # Sets have no total ordering, so sort and group on a sorted tuple
        # of sample ids so results covering the same samples end up adjacent.
        return tuple(sorted(get_sample_id_set(result)))

    latest_results = set()
    for k, group in groupby(sorted(all_results, key=get_sorted_sample_ids),
                            get_sorted_sample_ids):
        latest_result = None
        for result in group:
            if not latest_result:
                latest_result = result
            else:
                if result.created_at > latest_result.created_at:
                    latest_result = result

        latest_results.add(latest_result)

    return latest_results
Example #14
def get_quant_results_for_experiment(experiment: Experiment, filter_old_versions=True):
    """Returns a queryset of salmon quant results from `experiment`."""
    # Subquery to calculate quant results
    # https://docs.djangoproject.com/en/2.2/ref/models/expressions/#subquery-expressions

    # Salmon version gets saved as what salmon outputs, which includes this prefix.
    current_salmon_version = "salmon " + get_env_variable("SALMON_VERSION", "0.13.1")

    if filter_old_versions:
        eligible_results = ComputationalResult.objects.prefetch_related("organism_index").filter(
            organism_index__salmon_version=current_salmon_version
        )
    else:
        eligible_results = ComputationalResult.objects.all()

    # A result is only eligible to be used if it actually got uploaded.
    eligible_results = eligible_results.select_related("computedfile").filter(
        computedfile__s3_bucket__isnull=False, computedfile__s3_key__isnull=False
    )

    # Find the computational results, ordered newest first, that are associated with a given
    # sample (referenced from the outer query).
    newest_computational_results = eligible_results.filter(
        samples=OuterRef("id"), processor__name=ProcessorEnum.SALMON_QUANT.value["name"],
    ).order_by("-created_at")

    # Annotate each sample in the experiment with the id of the most recent computational result
    computational_results_ids = (
        experiment.samples.all()
        .annotate(
            latest_computational_result_id=Subquery(newest_computational_results.values("id")[:1])
        )
        .filter(latest_computational_result_id__isnull=False)
        .values_list("latest_computational_result_id", flat=True)
    )

    # return the computational results that match those ids
    return ComputationalResult.objects.all().filter(id__in=computational_results_ids)
Example #15
    def test_tximport(self):
        self.assertEqual(Processor.objects.count(), 0)  # No processor yet

        proc_key = "TXIMPORT"
        tximport_processor = utils.find_processor(proc_key)
        self.assertEqual(Processor.objects.count(), 1)  # New processor created

        # Validate some information of the new processor
        self.assertEqual(tximport_processor.name,
                         utils.ProcessorEnum[proc_key].value['name'])
        self.assertEqual(tximport_processor.version,
                         get_env_variable("SYSTEM_VERSION"))
        self.assertEqual(tximport_processor.docker_image,
                         utils.ProcessorEnum[proc_key].value['docker_img'])
        self.assertEqual(tximport_processor.environment['os_distribution'],
                         utils.get_os_distro())

        os_pkg_name = 'r-base'
        self.assertEqual(tximport_processor.environment['os_pkg'][os_pkg_name],
                         utils.get_os_pkgs([os_pkg_name])[os_pkg_name])

        pip_pkg_name = 'data-refinery-common'
        self.assertEqual(tximport_processor.environment['python'][pip_pkg_name],
                         utils.get_pip_pkgs([pip_pkg_name])[pip_pkg_name])

        r_pkg_names = ['Bioconductor', 'tximport']
        r_pkg_info = utils.get_r_pkgs(r_pkg_names)
        for r_pkg in r_pkg_names:
            self.assertEqual(tximport_processor.environment['R'][r_pkg],
                             r_pkg_info[r_pkg])

        # Confirm that there is only one processor in one runtime environment
        for i in range(3):
            proc2 = utils.find_processor(proc_key)
            self.assertEqual(Processor.objects.count(), 1)  # No new processor
            self.assertEqual(tximport_processor, proc2)     # Same processor instance
Example #16
import rpy2.robjects as ro
from rpy2.rinterface import RRuntimeError

from data_refinery_common.job_lookup import PipelineEnum
from data_refinery_common.logging import get_and_configure_logger
from data_refinery_common.models import (
    ComputationalResult,
    ComputedFile,
    Pipeline,
    SampleComputedFileAssociation,
    SampleResultAssociation,
)
from data_refinery_common.utils import get_env_variable
from data_refinery_workers.processors import utils

S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery")

logger = get_and_configure_logger(__name__)


def _prepare_files(job_context: Dict) -> Dict:
    """Populate our job_context with appropriate inputs and outputs

    Also adds the keys "input_file_path" and "output_file_path" to
    job_context so everything is prepared for processing.
    """
    original_file = job_context["original_files"][0]
    job_context["input_file_path"] = original_file.absolute_file_path
    # Turns /home/user/data_store/E-GEOD-8607/raw/foo.txt into /home/user/data_store/E-GEOD-8607/processed/foo.cel
    pre_part = original_file.absolute_file_path.split("/")[:-2]
    end_part = original_file.absolute_file_path.split("/")[-1]
Example #17
    OrganismIndex,
    OriginalFile,
    Pipeline,
    Processor,
    ProcessorJob,
)
from data_refinery_common.utils import get_env_variable, get_env_variable_gracefully
from data_refinery_workers.processors import utils


logger = get_and_configure_logger(__name__)
JOB_DIR_PREFIX = "processor_job_"
GENE_TO_TRANSCRIPT_TEMPLATE = "{gene_id}\t{transcript_id}\n"
GENE_TYPE_COLUMN = 2
S3_TRANSCRIPTOME_INDEX_BUCKET_NAME = get_env_variable_gracefully("S3_TRANSCRIPTOME_INDEX_BUCKET_NAME", False)
LOCAL_ROOT_DIR = get_env_variable("LOCAL_ROOT_DIR", "/home/user/data_store")
# Removes each occurrence of ; and "
IDS_CLEANUP_TABLE = str.maketrans({";": None, "\"": None})


def _compute_paths(job_context: Dict) -> str:
    """Computes the paths for all the directories used/created by this processor.

    Also computes a couple other path-based properties and adds them to the job_context.
    """
    # All files for the job are in the same directory.
    first_file_path = job_context["original_files"][0].absolute_file_path
    job_context["base_file_path"] = '/'.join(first_file_path.split('/')[:-1])
    job_context["work_dir"] = job_context["base_file_path"] + '/' + job_context["length"].upper() + '/' + \
                              JOB_DIR_PREFIX + str(job_context["job_id"]) + "/"
    try:
Example #18
"""

import os
import sys

from data_refinery_common.utils import get_env_variable, get_env_variable_gracefully

# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = get_env_variable("DJANGO_SECRET_KEY")

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = get_env_variable("DJANGO_DEBUG") == "True"

ALLOWED_HOSTS = []


# Application definition

INSTALLED_APPS = [
    "django.contrib.admin",
    "django.contrib.auth",
    "django.contrib.contenttypes",
    "django.contrib.sessions",
    "django.contrib.messages",
Example #19
from django.db import models
from django.utils import timezone

import requests
from computedfields.models import ComputedFieldsModel, computed

from data_refinery_common.logging import get_and_configure_logger
from data_refinery_common.models.compendium_result import CompendiumResult
from data_refinery_common.utils import get_env_variable

logger = get_and_configure_logger(__name__)

NCBI_ROOT_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
NCBI_API_KEY = get_env_variable(
    "NCBI_API_KEY", "3a1f8d818b0aa05d1aa3c334fa2cc9a17e09"
)  # This is only used by eUtils and for organisms that aren't cached yet - it's harmless to share.
ESEARCH_URL = NCBI_ROOT_URL + "esearch.fcgi"
EFETCH_URL = NCBI_ROOT_URL + "efetch.fcgi"
TAXONOMY_DATABASE = "taxonomy"


class UnscientificNameError(Exception):
    pass


class InvalidNCBITaxonomyId(Exception):
    pass


class UnknownOrganismId(Exception):
Example #20
def send_job(job_type: Enum, job, is_dispatch=False) -> bool:
    """Queues a worker job by sending a Nomad Job dispatch message.

    job_type must be a valid Enum for ProcessorPipelines or
    Downloaders as defined in data_refinery_common.job_lookup.
    job must be an existing ProcessorJob or DownloaderJob record.

    Returns True if the job was successfully dispatched, False otherwise.
    """
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = nomad.Nomad(nomad_host, port=int(nomad_port), timeout=30)

    is_processor = True
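    # Assume a processor pipeline by default; the Downloader and Surveyor
    # branches below clear this flag so that those job types are dispatched by
    # the Foreman instead (see the is_dispatch check further down).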
    if (job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_LONG
            or job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_SHORT):
        nomad_job = NOMAD_TRANSCRIPTOME_JOB
    elif job_type is ProcessorPipeline.SALMON or job_type is ProcessorPipeline.TXIMPORT:
        # Tximport uses the same job specification as Salmon.
        nomad_job = ProcessorPipeline.SALMON.value
    elif job_type is ProcessorPipeline.AFFY_TO_PCL:
        nomad_job = ProcessorPipeline.AFFY_TO_PCL.value
    elif job_type is ProcessorPipeline.NO_OP:
        nomad_job = ProcessorPipeline.NO_OP.value
    elif job_type is ProcessorPipeline.ILLUMINA_TO_PCL:
        nomad_job = ProcessorPipeline.ILLUMINA_TO_PCL.value
    elif job_type is ProcessorPipeline.SMASHER:
        nomad_job = ProcessorPipeline.SMASHER.value
    elif job_type is ProcessorPipeline.JANITOR:
        nomad_job = ProcessorPipeline.JANITOR.value
    elif job_type is ProcessorPipeline.QN_REFERENCE:
        nomad_job = ProcessorPipeline.QN_REFERENCE.value
    elif job_type is ProcessorPipeline.CREATE_COMPENDIA:
        nomad_job = ProcessorPipeline.CREATE_COMPENDIA.value
    elif job_type is ProcessorPipeline.CREATE_QUANTPENDIA:
        nomad_job = ProcessorPipeline.CREATE_QUANTPENDIA.value
    elif job_type is ProcessorPipeline.AGILENT_TWOCOLOR_TO_PCL:
        # Agilent twocolor uses the same job specification as Affy.
        nomad_job = ProcessorPipeline.AFFY_TO_PCL.value
    elif job_type is Downloaders.NONE:
        # NONE is a member of Downloaders, so check for it before the general
        # membership check below, otherwise this branch would be unreachable.
        logger.warn("Not queuing %s job.", job_type, job_id=job.id)
        raise ValueError(
            NONE_JOB_ERROR_TEMPLATE.format(job_type.value, "Downloader",
                                           job.id))
    elif job_type is ProcessorPipeline.NONE:
        logger.warn("Not queuing %s job.", job_type, job_id=job.id)
        raise ValueError(
            NONE_JOB_ERROR_TEMPLATE.format(job_type.value, "Processor",
                                           job.id))
    elif job_type in list(Downloaders):
        nomad_job = NOMAD_DOWNLOADER_JOB
        is_processor = False
    elif job_type in list(SurveyJobTypes):
        nomad_job = job_type.value
        is_processor = False
    else:
        raise ValueError("Invalid job_type: {}".format(job_type.value))

    logger.debug("Queuing %s nomad job to run job %s with id %d.", nomad_job,
                 job_type.value, job.id)

    # We only want to dispatch processor jobs directly.
    # Everything else will be handled by the Foreman, which will increment the retry counter.
    if is_processor or is_dispatch or (not settings.RUNNING_IN_CLOUD):

        # Smasher doesn't need to be on a specific instance since it will
        # download all the data to its instance anyway.
        if isinstance(job, ProcessorJob) and job_type not in SMASHER_JOB_TYPES:
            # Make sure this job goes to the correct EBS resource.
            # If this is being dispatched for the first time, make sure that
            # we store the currently attached index.
            # If this is being dispatched by the Foreman, it should already
            # have an attached volume index, so use that.
            if job.volume_index is None:
                job.volume_index = get_volume_index()
                job.save()
            nomad_job = nomad_job + "_" + job.volume_index + "_" + str(
                job.ram_amount)
        elif isinstance(job, SurveyJob):
            nomad_job = nomad_job + "_" + str(job.ram_amount)
        elif isinstance(job, DownloaderJob):
            volume_index = job.volume_index if settings.RUNNING_IN_CLOUD else "0"
            nomad_job = nomad_job + "_" + volume_index + "_" + str(
                job.ram_amount)

        try:
            nomad_response = nomad_client.job.dispatch_job(nomad_job,
                                                           meta={
                                                               "JOB_NAME":
                                                               job_type.value,
                                                               "JOB_ID":
                                                               str(job.id)
                                                           })
            job.nomad_job_id = nomad_response["DispatchedJobID"]
            job.save()
            return True
        except URLNotFoundNomadException:
            logger.info(
                "Dispatching Nomad job of type %s for job spec %s to host %s and port %s failed.",
                job_type,
                nomad_job,
                nomad_host,
                nomad_port,
                job=str(job.id),
            )
            raise
        except Exception as e:
            logger.info(
                "Unable to Dispatch Nomad Job.",
                job_name=job_type.value,
                job_id=str(job.id),
                reason=str(e),
            )
            raise
    else:
        job.num_retries = job.num_retries - 1
        job.save()
    return True
Example #21
from django.utils import timezone

import boto3
import pandas as pd
import psutil
import requests
from botocore.exceptions import ClientError
from sklearn import preprocessing

from data_refinery_common.enums import PipelineEnum
from data_refinery_common.logging import get_and_configure_logger
from data_refinery_common.models import ComputedFile, Pipeline
from data_refinery_common.utils import calculate_file_size, calculate_sha1, get_env_variable
from data_refinery_workers.processors import smashing_utils, utils

RESULTS_BUCKET = get_env_variable("S3_RESULTS_BUCKET_NAME",
                                  "refinebio-results-bucket")
S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery")
AWS_REGION = get_env_variable(
    "AWS_REGION",
    "us-east-1")  # Default to us-east-1 if the region variable can't be found
BODY_HTML = (Path("data_refinery_workers/processors/smasher_email.min.html").
             read_text().replace("\n", ""))
BODY_ERROR_HTML = (
    Path("data_refinery_workers/processors/smasher_email_error.min.html"
         ).read_text().replace("\n", ""))
BYTES_IN_GB = 1024 * 1024 * 1024
logger = get_and_configure_logger(__name__)
### DEBUG ###
logger.setLevel(logging.getLevelName("DEBUG"))

PROCESS_POOL_SIZE = max(1, int(psutil.cpu_count() / 2 - 1))
Example #22
    def test_all_endpoints(self):
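        """Hit each API endpoint and check that it returns the expected status code."""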
        response = self.client.get(
            reverse("experiments", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response["X-Source-Revision"],
                         get_env_variable("SYSTEM_VERSION"))

        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}),
            {"ids": str(self.sample.id) + ",1000"},
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}),
            {"accession_codes": str(self.sample.accession_code) + ",1000"},
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("organisms", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("organisms", kwargs={"version": API_VERSION}) +
            "HOMO_SAPIENS/")
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("platforms", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("institutions", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("survey_jobs", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("downloader_jobs", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        # Don't know the best way to deal with this, but since the other tests in different files
        # create objects which are then deleted, the new objects from these tests will have different
        # IDs. In this case, since this file is run first, the IDs are 1, but this may be a problem
        # in the future.
        response = self.client.get(
            reverse("downloader_jobs", kwargs={"version": API_VERSION}) +
            "1/"  # change back
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("processor_jobs", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("processor_jobs", kwargs={"version": API_VERSION}) + "1/")
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("stats", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("results", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("results", kwargs={"version": API_VERSION}) + "1/")
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("schema_redoc", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("transcriptome_indices", kwargs={"version": API_VERSION}) +
            "?organism__name=DANIO_RERIO")
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("transcriptome_indices", kwargs={"version": API_VERSION}) +
            "?result_id=1")
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("search", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("transcriptome_indices", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("create_dataset", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code,
                         status.HTTP_405_METHOD_NOT_ALLOWED)
Example #23
import signal
import sys

from django.utils import timezone

from data_refinery_common.logging import get_and_configure_logger
from data_refinery_common.models import DownloaderJob, DownloaderJobOriginalFileAssociation
from data_refinery_common.utils import get_env_variable, get_instance_id

logger = get_and_configure_logger(__name__)
# Let this fail if SYSTEM_VERSION is unset.
SYSTEM_VERSION = get_env_variable("SYSTEM_VERSION")

CURRENT_JOB = None


def signal_handler(sig, frame):
    """Signal Handler, works for both SIGTERM and SIGINT"""
    global CURRENT_JOB
    if CURRENT_JOB:
        CURRENT_JOB.success = False
        CURRENT_JOB.end_time = timezone.now()
        CURRENT_JOB.num_retries = CURRENT_JOB.num_retries - 1
        CURRENT_JOB.failure_reason = "Interrupted by SIGTERM/SIGINT: " + str(sig)
        CURRENT_JOB.save()

    sys.exit(0)


def start_job(job_id: int) -> DownloaderJob:
    """Record in the database that this job is being started.
Example #24
    Processor,
    ProcessorJob,
    ProcessorJobDatasetAssociation,
    ProcessorJobOriginalFileAssociation,
    Sample,
)
from data_refinery_common.utils import (
    get_env_variable,
    get_env_variable_gracefully,
    get_instance_id,
)


logger = get_and_configure_logger(__name__)
# Let this fail if SYSTEM_VERSION is unset.
SYSTEM_VERSION = get_env_variable("SYSTEM_VERSION")
S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery")
DIRNAME = os.path.dirname(os.path.abspath(__file__))
CURRENT_JOB = None


def signal_handler(sig, frame):
    """Signal Handler, works for both SIGTERM and SIGINT"""
    global CURRENT_JOB
    if not CURRENT_JOB:
        sys.exit(0)
    else:
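        # Reset the job so it looks un-started; decrementing num_retries
        # presumably keeps this interrupted attempt from counting against the
        # retry limit.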
        CURRENT_JOB.start_time = None
        CURRENT_JOB.num_retries = CURRENT_JOB.num_retries - 1
        CURRENT_JOB.failure_reason = "Caught either a SIGTERM or SIGINT signal."
        CURRENT_JOB.success = False
Example #25
from data_refinery_common.enums import PipelineEnum
from data_refinery_common.logging import get_and_configure_logger
from data_refinery_common.models import (
    ComputationalResult,
    ComputedFile,
    Pipeline,
    SampleAnnotation,
    SampleComputedFileAssociation,
    SampleResultAssociation,
)
from data_refinery_common.utils import get_env_variable, get_internal_microarray_accession
from data_refinery_workers.processors import utils

logger = get_and_configure_logger(__name__)
LOCAL_ROOT_DIR = get_env_variable("LOCAL_ROOT_DIR", "/home/user/data_store")
S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery")


def _prepare_files(job_context: Dict) -> Dict:
    """A processor which takes externally-processed sample data and makes it smashable.
    """
    try:
        original_file = job_context["original_files"][0]
        sample0 = job_context["samples"][0]
        if sample0.manufacturer == "ILLUMINA":
            job_context["is_illumina"] = True
        else:
            job_context["is_illumina"] = False

        # All files for the job are in the same directory.
Example #26
    def test_all_endpoints(self):
        response = self.client.get(reverse("experiments", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response["X-Source-Revision"], get_env_variable("SYSTEM_VERSION"))
        cache.clear()

        response = self.client.get(reverse("samples", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}),
            {"ids": str(self.sample.id) + ",1000"},
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}),
            {"accession_codes": str(self.sample.accession_code) + ",1000"},
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(reverse("organisms", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(
            reverse("organisms", kwargs={"version": API_VERSION}) + "HOMO_SAPIENS/"
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(reverse("platforms", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(reverse("institutions", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(reverse("survey_jobs", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertFalse(response.data["results"][0]["is_queued"])
        cache.clear()

        response = self.client.get(
            reverse("survey_jobs", kwargs={"version": API_VERSION}) + "1/"  # change back
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertFalse(response.data["is_queued"])
        cache.clear()

        response = self.client.get(reverse("downloader_jobs", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertFalse(response.data["results"][0]["is_queued"])
        cache.clear()

        # Don't know the best way to deal with this, but since the
        # other tests in different files create objects which are then
        # deleted, the new objects from these tests will have
        # different IDs. In this case, since this file is run first,
        # the IDs are 1, but this may be a problem in the future.
        response = self.client.get(
            reverse("downloader_jobs", kwargs={"version": API_VERSION}) + "1/"  # change back
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertFalse(response.data["is_queued"])
        cache.clear()

        response = self.client.get(reverse("processor_jobs", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertFalse(response.data["results"][0]["is_queued"])
        cache.clear()

        response = self.client.get(
            reverse("processor_jobs", kwargs={"version": API_VERSION}) + "1/"
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertFalse(response.data["is_queued"])
        cache.clear()

        response = self.client.get(reverse("stats", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(reverse("results", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(reverse("results", kwargs={"version": API_VERSION}) + "1/")
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(reverse("schema_redoc", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(
            reverse("transcriptome_indices", kwargs={"version": API_VERSION})
            + "?organism__name=DANIO_RERIO"
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(
            reverse("transcriptome_indices", kwargs={"version": API_VERSION}) + "?result_id=1"
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(reverse("search", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        cache.clear()

        response = self.client.get(
            reverse("transcriptome_indices", kwargs={"version": API_VERSION})
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(reverse("create_dataset", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_405_METHOD_NOT_ALLOWED)

        response = self.client.get(reverse("samples", kwargs={"version": API_VERSION}) + "?foo=bar")
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        self.assertListEqual(response.json()["details"], ["foo"])

        # The tenth call since reset_cache() should be throttled, and three calls have already
        # happened. Make more calls than necessary to ensure we hit the throttle.
        for i in range(15):
            response = self.client.get(reverse("survey_jobs", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_429_TOO_MANY_REQUESTS)
Example #27
from django.utils import timezone

import boto3
import pandas as pd
import psutil
import requests
from botocore.exceptions import ClientError
from sklearn import preprocessing

from data_refinery_common.job_lookup import PipelineEnum
from data_refinery_common.logging import get_and_configure_logger
from data_refinery_common.models import ComputedFile, Pipeline
from data_refinery_common.utils import calculate_file_size, calculate_sha1, get_env_variable
from data_refinery_workers.processors import smashing_utils, utils

RESULTS_BUCKET = get_env_variable("S3_RESULTS_BUCKET_NAME",
                                  "refinebio-results-bucket")
S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery")
BODY_HTML = (Path("data_refinery_workers/processors/smasher_email.min.html").
             read_text().replace("\n", ""))
BODY_ERROR_HTML = (
    Path("data_refinery_workers/processors/smasher_email_error.min.html"
         ).read_text().replace("\n", ""))
BYTES_IN_GB = 1024 * 1024 * 1024
logger = get_and_configure_logger(__name__)
### DEBUG ###
logger.setLevel(logging.getLevelName("DEBUG"))

PROCESS_POOL_SIZE = max(1, int(psutil.cpu_count() / 2 - 1))

SCALERS = {
    "MINMAX": preprocessing.MinMaxScaler,
Example #28
def _find_and_remove_expired_jobs(job_context):
    """ Finds expired jobs and removes their working directories """

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=15)

    job_context['deleted_items'] = []

    for item in os.listdir(LOCAL_ROOT_DIR):

        # Processor job working directories
        if 'processor_job_' in item:

            # TX Index jobs are the only ones that are allowed to hang around
            # after their jobs are finished. They're marked with an _index in their path.
            if '_index' in item:
                continue

            job_id = item.split('processor_job_')[1]

            # Okay, does this job exist?
            try:
                job = ProcessorJob.objects.get(id=job_id)

                # Is this job running?
                try:
                    job_status = nomad_client.job.get_job(
                        job.nomad_job_id)["Status"]

                    # This job is running, don't delete the working directory.
                    if job_status == "running":
                        continue
                except URLNotFoundNomadException as e:
                    # Nomad has no record of this job, meaning it has likely been GC'd after death.
                    # It can be purged.
                    pass
                except BaseNomadException as e:
                    # If we can't currently access Nomad,
                    # just continue until we can again.
                    continue
                except Exception as e:
                    # This job is likely vanished. No need for this directory.
                    # Or, possibly, another Nomad error outside of BaseNomadException.
                    logger.exception("Janitor found vanished job for " + item +
                                     " - why?")
                    continue
            except ProcessorJob.DoesNotExist:
                # This job has vanished from the DB - clean it up!
                logger.error("Janitor found no record of " + item + " - why?")
                pass
            except Exception:
                # We're unable to connect to the DB right now (or something), so hold onto it for right now.
                logger.exception("Problem finding job record for " + item +
                                 " - why?")
                continue

            # Delete it!
            try:
                to_delete = LOCAL_ROOT_DIR + '/' + item
                logger.info("Janitor deleting " + to_delete,
                            contents=str(os.listdir(to_delete)))
                shutil.rmtree(to_delete)
                job_context['deleted_items'].append(to_delete)
            except Exception as e:
                # This job is likely vanished. No need for this directory.
                pass

        # There may also be leftover directories from successful processor
        # jobs, named with SRP/ERP/DRR accession codes.
        if 'SRP' in item or 'ERP' in item or 'DRR' in item:
            sub_path = os.path.join(LOCAL_ROOT_DIR, item)
            for sub_item in os.listdir(sub_path):
                try:
                    sample = Sample.objects.get(accession_code=sub_item)
                    if sample.computed_files.count() == 0:
                        # This doesn't have any associated computed files - leave it be.
                        continue
                except Sample.DoesNotExist:
                    # Interesting. This shouldn't happen at all.
                    continue
                except Exception:
                    # We can't contact the DB right now, skip deletion.
                    continue

                try:
                    sub_item_path = os.path.join(sub_path, sub_item)
                    logger.info("Janitor deleting " + sub_item_path,
                                contents=str(os.listdir(sub_item_path)))
                    shutil.rmtree(sub_item_path)
                    job_context['deleted_items'].append(sub_item_path)
                except Exception as e:
                    # This job is likely vanished. No need for this directory.
                    pass

    job_context['success'] = True
    return job_context
Example #29
from data_refinery_common.logging import get_and_configure_logger
from data_refinery_common.models import (
    ComputationalResult,
    ComputationalResultAnnotation,
    ComputedFile,
    Pipeline,
    Processor,
    SampleComputedFileAssociation,
    SampleResultAssociation,
    Organism
)
from data_refinery_common.utils import get_env_variable
from data_refinery_workers.processors import utils, smasher


S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery")
S3_COMPENDIA_BUCKET_NAME = get_env_variable("S3_COMPENDIA_BUCKET_NAME", "data-refinery-compendia")
logger = get_and_configure_logger(__name__)


def _prepare_input(job_context: Dict) -> Dict:
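    """Reuse the smasher's file preparation and smashing steps to build the
    final data frame used as input for compendia creation."""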

    # We're going to use the smasher outside of the smasher.
    # I'm not crazy about this yet. Maybe refactor later,
    # but I need the data now.
    job_context = smasher._prepare_files(job_context)
    job_context = smasher._smash(job_context, how="outer")

    if 'final_frame' not in job_context:
        logger.error("Unable to prepare files for creating compendia.",
            job_id=job_context['job'].id)
Example #30
# Generated by Django 2.1.5 on 2019-04-04 14:27

import sys

from django.conf import settings
from django.db import migrations

from data_refinery_common.utils import get_env_variable

# We want this to throw if it can't access this, no point in running a
# migration to set everything to a bad value.
S3_QN_TARGET_BUCKET_NAME = get_env_variable("S3_QN_TARGET_BUCKET_NAME")


def update_qn_bucket(apps, schema_editor):
    """Sets the s3_bucket for QN Targets to a bucket just for them.

    Based off of:
    https://simpleisbetterthancomplex.com/tutorial/2017/09/26/how-to-create-django-data-migrations.html

    We can't import the ComputedFile model directly as it may be a newer
    version than this migration expects. We use the historical version.
    """

    if not settings.RUNNING_IN_CLOUD:
        return

    # Pagination isn't necessary here because we have very few QN targets.
    ComputedFile = apps.get_model("data_refinery_common", "ComputedFile")
    for computed_file in ComputedFile.objects.filter(is_qn_target=True):
        if not computed_file.s3_bucket or not computed_file.s3_key: