Example no. 1
def retry_hung_downloader_jobs() -> None:
    """Retry downloader jobs that were started but never finished."""
    potentially_hung_jobs = DownloaderJob.objects.filter(
        success=None,
        retried=False,
        end_time=None,
        start_time__isnull=False,
        no_retry=False).prefetch_related("original_files__samples")

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    hung_jobs = []
    for job in potentially_hung_jobs:
        try:
            job_status = nomad_client.job.get_job(job.nomad_job_id)["Status"]
            if job_status != "running":
                # Make sure it didn't finish since our original query.
                job.refresh_from_db()
                if job.end_time is None:
                    hung_jobs.append(job)
        except URLNotFoundNomadException:
            hung_jobs.append(job)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about Downloader Job.",
                             downloader_job=job.id)

    if hung_jobs:
        logger.info(
            "Handling hung (started-but-never-finished) downloader jobs!",
            jobs_count=len(hung_jobs))
        handle_downloader_jobs(hung_jobs)
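`handle_downloader_jobs` is called here but not shown in this section. A minimal sketch of what it might look like, modeled on the `handle_survey_jobs` and `handle_processor_jobs` handlers that appear later; `requeue_downloader_job` is an assumed helper analogous to `requeue_survey_job`:

def handle_downloader_jobs(jobs: List[DownloaderJob]) -> bool:
    """Hedged sketch: requeue each downloader job unless it has exhausted its retries."""
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)

    # Assumed to mirror the other handlers: respect the global job ceiling.
    MAX_TOTAL_JOBS = int(
        get_env_variable_gracefully("MAX_TOTAL_JOBS", DEFAULT_MAX_JOBS))
    if len(nomad_client.jobs.get_jobs()) >= MAX_TOTAL_JOBS:
        logger.info("Not requeuing job until we're running fewer jobs.")
        return False

    for job in jobs:
        if job.num_retries < MAX_NUM_RETRIES:
            requeue_downloader_job(job)  # assumed helper, analogous to requeue_survey_job
        else:
            handle_repeated_failure(job)
    return True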
Example no. 2
def handle_survey_jobs(jobs: List[SurveyJob]) -> bool:
    """For each job in jobs, either retry it or log it."""

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    # Maximum number of total jobs running at a time.
    # We do this now rather than at import time for testing purposes.
    MAX_TOTAL_JOBS = int(
        get_env_variable_gracefully("MAX_TOTAL_JOBS", DEFAULT_MAX_JOBS))
    len_all_jobs = len(nomad_client.jobs.get_jobs())
    if len_all_jobs >= MAX_TOTAL_JOBS:
        logger.info("Not requeuing job until we're running fewer jobs.")
        return False

    jobs_dispatched = 0
    for count, job in enumerate(jobs):
        if job.num_retries < MAX_NUM_RETRIES:
            requeue_survey_job(job)
            jobs_dispatched = jobs_dispatched + 1
        else:
            handle_repeated_failure(job)

        if (count % 100) == 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

        if (jobs_dispatched + len_all_jobs) >= MAX_TOTAL_JOBS:
            logger.info(
                "We hit the maximum total jobs ceiling, so we're not handling any more survey jobs now."
            )
            return False

    return True
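`handle_repeated_failure` is used by these handlers but never shown. A hedged sketch of what it plausibly does, based on the job fields used elsewhere in this section (`failure_reason` is an assumed field name):

def handle_repeated_failure(job) -> None:
    """Hedged sketch: mark a job that has exhausted its retries so it is never requeued again."""
    job.no_retry = True  # field used by the filters in this section
    job.success = False
    job.failure_reason = "Failed too many times."  # assumed field name
    job.save()
    logger.info("Job failed too many times; not retrying.", job_id=job.id)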
    def __init__(self):
        self.drive = Nomad()
        self.oi = OI()
        self.ds = DriverStation.getInstance()

        self.infont = NetworkTables.getTable('info')
        self.sensors = Sensors()
def data_create():
    nomad = Nomad()
    ip_range = pdu_ip_range_finder()
    # cmd=os.system('./un.sh > all_unknown.txt')
    with open('manual_unknown.txt', 'r') as f:
        hosts = f.readlines()
    ip = IpChecker()
    bucket = ip.ip_check()
    oob_ips = oob_ip_range_finder()
    hosts_ticket, owners = owner_finder(hosts)
    data = {}
    emails = []
    for key, value in hosts_ticket.items():
        nomad_data = nomad.check_host(key)
        for keys, values in bucket.items():
            if values:
                # Compare this host against every IP in the bucket (as strings).
                temp_ip = [str(addr) for addr in values]
                for ip in temp_ip:
                    if key in ip and key not in ip_range and key not in oob_ips:
                        owner_key = keys.split('_')[0]
                        owner_name = owners[owner_key][keys]['owner']
                        if owners[owner_key][keys]['email'] != 'NA' and owners[
                                owner_key][keys]['email'] not in emails:
                            emails.append(owners[owner_key][keys]['email'])
                        data.update(
                            {key: [keys,
                                   str(owner_name), value, nomad_data]})
                    elif key in ip_range:
                        owner_name = 'Clay Alvord'
                        emails.append('*****@*****.**')
                        data.update({
                            key:
                            ['PDU-OOB',
                             str(owner_name), value, nomad_data]
                        })
                    elif key in oob_ips:
                        owner_name = 'Clay Alvord'
                        emails.append('*****@*****.**')
                        data.update(
                            {key: ['OOB',
                                   str(owner_name), value, nomad_data]})
    return data, emails
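A minimal usage sketch for `data_create` (purely illustrative; the report format is invented):

data, emails = data_create()
for host, (bucket_name, owner, ticket, nomad_data) in data.items():
    # Each value is [bucket key, owner name, ticket info, Nomad host data].
    print(f"{host}: bucket={bucket_name}, owner={owner}, ticket={ticket}")
print("Would notify:", ", ".join(emails))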
Example no. 5
def retry_hung_processor_jobs() -> None:
    """Retry processor jobs that were started but never finished.

    Ignores Janitor jobs since they are queued every half hour anyway."""
    try:
        active_volumes = get_active_volumes()
    except Exception:
        # If we cannot reach Nomad now then we can wait until a later loop.
        return

    potentially_hung_jobs = ProcessorJob.objects.filter(
        success=None,
        retried=False,
        end_time=None,
        start_time__isnull=False,
        no_retry=False,
        volume_index__in=active_volumes).exclude(
            pipeline_applied="JANITOR").prefetch_related(
                "original_files__samples")

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    hung_jobs = []
    for job in potentially_hung_jobs:
        try:
            job_status = nomad_client.job.get_job(job.nomad_job_id)["Status"]
            if job_status != "running":
                # Make sure it didn't finish since our original query.
                job.refresh_from_db()
                if job.end_time is None:
                    hung_jobs.append(job)
        except URLNotFoundNomadException:
            hung_jobs.append(job)
        except TypeError:
            # Almost certainly a python-nomad issue:
            # File "/usr/local/lib/python3.5/dist-packages/nomad/api/job.py", line 63, in get_job
            #   return self.request(id, method="get").json()
            # File "/usr/local/lib/python3.5/dist-packages/nomad/api/base.py", line 74, in request
            #   endpoint = self._endpoint_builder(self.ENDPOINT, *args)
            # File "/usr/local/lib/python3.5/dist-packages/nomad/api/base.py", line 28, in _endpoint_builder
            #   u = "/".join(args)
            # TypeError: sequence item 1: expected str instance, NoneType found
            logger.info("Couldn't query Nomad about Processor Job.",
                        processor_job=job.id)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about Processor Job.",
                             processor_job=job.id)

    if hung_jobs:
        logger.info(
            "Handling hung (started-but-never-finished) processor jobs!",
            len_jobs=len(hung_jobs))
        handle_processor_jobs(hung_jobs)
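`get_active_volumes()` is used by several of these retry functions but is not shown. A rough sketch, assuming each Nomad client node advertises its volume index through node metadata (the `volume_index` meta key and the node-detail lookup are assumptions):

def get_active_volumes() -> set:
    """Hedged sketch: return the volume indices attached to ready Nomad nodes."""
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)

    volumes = set()
    for node in nomad_client.nodes.get_nodes():
        if node.get("Status") != "ready":
            continue
        node_detail = nomad_client.node.get_node(node["ID"])
        # Assumption: the volume index is published in the node's Meta block.
        volume_index = (node_detail.get("Meta") or {}).get("volume_index")
        if volume_index is not None:
            volumes.add(volume_index)
    return volumes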
def handle_asg_lifecycle_event(event, context) -> dict:
    """Handle Nomad node ASG lifecycle events.

    Note: Currently only intended to handle "terminate lifecycle" actions.

    Args:
        event: ASG Lifecycle event data provided by AWS.
        context: Runtime information for the lambda.

    Raises:
        NotImplementedError: Raised when we receive an event with a "detail-type" other than
        "EC2 Instance-terminate Lifecycle Action".

    Returns:
        dict: Includes the relevant details from the original event (so that this function can call itself with its
        own outputs) as well as information about any actions taken (e.g., whether the instance was ready for termination).
    """
    if event['detail-type'] != 'EC2 Instance-terminate Lifecycle Action':
        raise NotImplementedError('Only EC2 instance terminate lifecycle notifications currently supported')

    # Map the terminating EC2 instance's ID to the corresponding nomad node ID
    nomad_api = Nomad(
        host=os.environ['NOMAD_ADDR'],
        timeout=60,
    )
    node_id = get_node_id_by_instance_id(
        nomad_api=nomad_api,
        instance_id=event['detail']['EC2InstanceId'],
    )

    # Ensure the node associated with the terminating instance is draining jobs
    ensure_node_is_draining(
        nomad_api=nomad_api,
        node_id=node_id,
    )

    # Check to see if we're ready to terminate the associated nomad node.
    ready_for_termination = is_node_ready_for_termination(
        nomad_api=nomad_api,
        node_id=node_id,
    )
    max_wait_time_exceeded = is_max_wait_time_exceeded(event['time'])
    send_asg_lifecycle_notifications(
        event_detail=event['detail'],
        complete_action=ready_for_termination or max_wait_time_exceeded,
    )

    return {
        'detail-type': event['detail-type'],
        'time': event['time'],
        'detail': event['detail'],
        'ready_for_termination': ready_for_termination,
        'max_wait_time_exceeded': max_wait_time_exceeded,
    }
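`get_node_id_by_instance_id` is central to this handler but not shown. One plausible implementation, assuming Nomad's AWS fingerprinter exposes the EC2 instance ID as the `unique.platform.aws.instance-id` node attribute:

def get_node_id_by_instance_id(nomad_api: Nomad, instance_id: str) -> str:
    """Hedged sketch: map a terminating EC2 instance ID to its Nomad node ID."""
    for node in nomad_api.nodes.get_nodes():
        node_detail = nomad_api.node.get_node(node["ID"])
        # Assumption: the AWS fingerprinter publishes the instance ID as an attribute.
        aws_id = node_detail.get("Attributes", {}).get("unique.platform.aws.instance-id")
        if aws_id == instance_id:
            return node["ID"]
    raise ValueError("No Nomad node found for instance " + instance_id)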
Example no. 7
    def handle(self, *args, **options):
        """Requeues all unprocessed RNA-Seq samples for an organism.
        """
        if options["organism_name"] is None:
            logger.error("You must specify an organism-name.")
            sys.exit(1)
        else:
            organism_name = options["organism_name"]

        organism = Organism.objects.get(name=organism_name)

        prioritized_job_list = build_prioritized_jobs_list(organism)

        if not len(prioritized_job_list):
            logger.info(
                "Found no samples that need to be processed. I guess I'm done!"
            )
            sys.exit(0)

        logger.info(
            "Found %d samples that need to be processed. Beginning to queue jobs!",
            len(prioritized_job_list),
        )

        nomad_host = get_env_variable("NOMAD_HOST")
        nomad_port = get_env_variable("NOMAD_PORT", "4646")
        nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)

        while len(prioritized_job_list) > 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

            num_short_from_max = MAX_JOBS_FOR_THIS_MODE - len_all_jobs
            if num_short_from_max > 0:
                # We don't want these jobs to sit in our queue because
                # the volume we assigned isn't available, so only use
                # active volumes. Also in order to spread them around
                # do so randomly. We don't want to hammer Nomad to
                # get the active volumes though, so just do it once
                # per 5 minute loop.
                volume_index = random.choice(list(get_active_volumes()))
                for i in range(num_short_from_max):
                    if len(prioritized_job_list) > 0:
                        requeue_job(prioritized_job_list.pop(0), volume_index)

            # Wait 5 minutes in between queuing additional work to
            # give it time to actually get done.
            if len(prioritized_job_list) > 0:
                logger.info("Sleeping for 5 minutes while jobs get done.")
                time.sleep(300)

        logger.info(
            "Successfully requeued all jobs for unprocessed %s samples.",
            organism_name)
Example no. 8
    def kill_nomad_job(self) -> bool:
        if not self.nomad_job_id:
            return False

        try:
            nomad_host = get_env_variable("NOMAD_HOST")
            nomad_port = get_env_variable("NOMAD_PORT", "4646")
            nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
            nomad_client.job.deregister_job(self.nomad_job_id)
        except nomad.api.exceptions.BaseNomadException:
            return False

        return True
Example no. 9
def retry_lost_survey_jobs() -> None:
    """Retry survey jobs which never even got started for too long."""
    potentially_lost_jobs = SurveyJob.objects.filter(
        success=None,
        retried=False,
        start_time=None,
        end_time=None,
        no_retry=False).order_by('pk')

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    lost_jobs = []

    for job in potentially_lost_jobs:
        try:
            # Surveyor jobs didn't always have nomad_job_ids. If they
            # don't have one then by this point they've definitely died.
            if job.nomad_job_id:
                job_status = nomad_client.job.get_job(
                    job.nomad_job_id)["Status"]
            else:
                job_status = "absent"

            # If the job is still pending, then it makes sense that it
            # hasn't started and if it's running then it may not have
            # been able to mark the job record as started yet.
            if job_status != "pending" and job_status != "running":
                logger.debug((
                    "Determined that a survey job needs to be requeued because its"
                    " Nomad Job's status is: %s."),
                             job_status,
                             job_id=job.id)
                lost_jobs.append(job)
        except URLNotFoundNomadException:
            logger.debug(
                ("Determined that a survey job needs to be requeued because "
                 "querying for its Nomad job failed."),
                job_id=job.id)
            lost_jobs.append(job)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about Processor Job.",
                             survey_job=job.id)

    if lost_jobs:
        logger.info("Handling lost (never-started) survey jobs!",
                    len_jobs=len(lost_jobs))
        handle_survey_jobs(lost_jobs)
Example no. 10
def handle_processor_jobs(jobs: List[ProcessorJob]) -> bool:
    """For each job in jobs, either retry it or log it."""

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    # Maximum number of total jobs running at a time.
    # We do this now rather than at import time for testing purposes.
    MAX_TOTAL_JOBS = int(
        get_env_variable_gracefully("MAX_TOTAL_JOBS", DEFAULT_MAX_JOBS))
    len_all_jobs = len(nomad_client.jobs.get_jobs())
    if len_all_jobs >= MAX_TOTAL_JOBS:
        logger.info("Not requeuing job until we're running fewer jobs.")
        return False

    # We want zebrafish data first, then hgu133plus2, then data
    # related to pediatric cancer, then to finish salmon experiments
    # that are close to completion.
    # Each function moves the jobs it prioritizes to the front of the
    # list, so apply them in backwards order.
    # jobs = prioritize_salmon_jobs(jobs)
    # jobs = prioritize_jobs_by_accession(jobs, PEDIATRIC_ACCESSION_LIST)
    # jobs = prioritize_jobs_by_accession(jobs, HGU133PLUS2_ACCESSION_LIST)
    # jobs = prioritize_zebrafish_jobs(jobs)

    jobs_dispatched = 0
    for count, job in enumerate(jobs):
        if job.num_retries < MAX_NUM_RETRIES:
            requeue_processor_job(job)
            jobs_dispatched = jobs_dispatched + 1
        else:
            handle_repeated_failure(job)

        if (count % 100) == 0:
            len_all_jobs = len(nomad_client.jobs.get_jobs())

        if (jobs_dispatched + len_all_jobs) >= MAX_TOTAL_JOBS:
            logger.info(
                "We hit the maximum total jobs ceiling, so we're not handling any more processor jobs now."
            )
            return False

    return True
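The commented-out prioritization helpers above each move the jobs they care about to the front of the list. A hedged sketch of that pattern for `prioritize_jobs_by_accession`; the path from a processor job to its experiment accession codes is an assumption about the data model:

def prioritize_jobs_by_accession(jobs: List[ProcessorJob],
                                 accession_list: List[str]) -> List[ProcessorJob]:
    """Hedged sketch: stable-partition jobs so those touching the given accessions run first."""
    wanted = set(accession_list)
    prioritized, remaining = [], []
    for job in jobs:
        # Assumption: samples expose the accession codes of their experiments.
        job_accessions = {
            experiment.accession_code
            for original_file in job.original_files.all()
            for sample in original_file.samples.all()
            for experiment in sample.experiments.all()
        }
        (prioritized if job_accessions & wanted else remaining).append(job)
    return prioritized + remaining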
Example no. 11
def retry_hung_survey_jobs() -> None:
    """Retry survey jobs that were started but never finished."""
    potentially_hung_jobs = SurveyJob.objects.filter(
        success=None,
        retried=False,
        end_time=None,
        start_time__isnull=False,
        no_retry=False).order_by('pk')

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    hung_jobs = []
    for job in potentially_hung_jobs:
        try:
            # Surveyor jobs didn't always have nomad_job_ids. If they
            # don't have one then by this point they've definitely died.
            if job.nomad_job_id:
                job_status = nomad_client.job.get_job(
                    job.nomad_job_id)["Status"]
            else:
                job_status = "absent"

            if job_status != "running":
                # Make sure it didn't finish since our original query.
                job.refresh_from_db()
                if job.end_time is None:
                    hung_jobs.append(job)
        except URLNotFoundNomadException:
            hung_jobs.append(job)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about SurveyJob Job.",
                             survey_job=job.id)

    if hung_jobs:
        logger.info("Handling hung (started-but-never-finished) survey jobs!",
                    len_jobs=len(hung_jobs))
        handle_survey_jobs(hung_jobs)
Example no. 12
def _find_and_remove_expired_jobs(job_context):
    """ Finds expired jobs and removes their working directories """

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=15)

    job_context['deleted_items'] = []

    for item in os.listdir(LOCAL_ROOT_DIR):

        # Processor job working directories
        if 'processor_job_' in item:

            # TX Index jobs are the only ones that are allowed to hang around
            # after their jobs are finished. They're marked with an _index in their path.
            if '_index' in item:
                continue

            job_id = item.split('processor_job_')[1]

            # Okay, does this job exist?
            try:
                job = ProcessorJob.objects.get(id=job_id)

                # Is this job running?
                try:
                    job_status = nomad_client.job.get_job(
                        job.nomad_job_id)["Status"]

                    # This job is running; don't delete the working directory.
                    if job_status == "running":
                        continue
                except URLNotFoundNomadException as e:
                    # Nomad has no record of this job, meaning it has likely been GC'd after death.
                    # It can be purged.
                    pass
                except BaseNomadException as e:
                    # If we can't currently access Nomad,
                    # just continue until we can again.
                    continue
                except Exception as e:
                    # This job is likely vanished. No need for this directory.
                    # Or, possibly, another Nomad error outside of BaseNomadException.
                    logger.exception("Janitor found vanished job for " + item +
                                     " - why?")
                    continue
            except ProcessorJob.DoesNotExist:
                # This job has vanished from the DB - clean it up!
                logger.error("Janitor found no record of " + item + " - why?")
                pass
            except Exception:
                # We're unable to connect to the DB right now (or something), so hold onto it for right now.
                logger.exception("Problem finding job record for " + item +
                                 " - why?")
                continue

            # Delete it!
            try:
                to_delete = LOCAL_ROOT_DIR + '/' + item
                logger.info("Janitor deleting " + to_delete,
                            contents=str(os.listdir(to_delete)))
                shutil.rmtree(to_delete)
                job_context['deleted_items'].append(to_delete)
            except Exception as e:
                # This job is likely vanished. No need for this directory.
                pass

        # There may be leftover directories from successful processor jobs.
        if 'SRP' in item or 'ERP' in item or 'DRR' in item:
            sub_path = os.path.join(LOCAL_ROOT_DIR, item)
            for sub_item in os.listdir(sub_path):
                try:
                    sample = Sample.objects.get(accession_code=sub_item)
                    if sample.computed_files.count() == 0:
                        # This doesn't have any associated computed files - leave it be.
                        continue
                except Sample.DoesNotExist:
                    # Interesting. This shouldn't happen at all.
                    continue
                except Exception:
                    # We can't contact the DB right now, skip deletion.
                    continue

                try:
                    sub_item_path = os.path.join(sub_path, sub_item)
                    logger.info("Janitor deleting " + sub_item_path,
                                contents=str(os.listdir(sub_item_path)))
                    shutil.rmtree(sub_item_path)
                    job_context['deleted_items'].append(sub_item_path)
                except Exception as e:
                    # This job is likely vanished. No need for this directory.
                    pass

    job_context['success'] = True
    return job_context
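The janitor follows a job_context convention: each pipeline step receives the context dict, mutates it, and returns it. A small illustrative usage (the initial keys are invented):

job_context = {"job_id": 123, "success": None}  # illustrative starting context
job_context = _find_and_remove_expired_jobs(job_context)
if job_context["success"]:
    logger.info("Janitor removed expired working directories.",
                deleted_count=len(job_context["deleted_items"]))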
Example no. 13
import cProfile
from nomad import Nomad

if __name__ == "__main__":
    s = '251131511325113151135611376113251131511325113151135611376113'
    model = Nomad([s])
    # model.find_best_grammar(100)
    cProfile.run('model.find_best_grammar(100, False)', )
    # cProfile.run('model.find_best_grammar(100, False)')
Example no. 14
def retry_lost_processor_jobs() -> None:
    """Retry processor jobs which never even got started for too long.

    Ignores Janitor jobs since they are queued every half hour anyway."""
    try:
        active_volumes = get_active_volumes()
    except Exception:
        # If we cannot reach Nomad now then we can wait until a later loop.
        return

    potentially_lost_jobs = ProcessorJob.objects.filter(
        success=None,
        retried=False,
        start_time=None,
        end_time=None,
        no_retry=False,
        volume_index__in=active_volumes).exclude(
            pipeline_applied="JANITOR").prefetch_related(
                "original_files__samples")

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=5)
    lost_jobs = []
    for job in potentially_lost_jobs:
        try:
            if job.nomad_job_id:
                job_status = nomad_client.job.get_job(
                    job.nomad_job_id)["Status"]
                # If the job is still pending, then it makes sense that it
                # hasn't started and if it's running then it may not have
                # been able to mark the job record as started yet.
                if job_status != "pending" and job_status != "running":
                    logger.debug((
                        "Determined that a processor job needs to be requeued because its"
                        " Nomad Job's status is: %s."),
                                 job_status,
                                 job_id=job.id)
                    lost_jobs.append(job)
            else:
                # If there is no nomad_job_id field set, we could be
                # in the small window where the job was created but
                # hasn't yet gotten a chance to be queued.
                # If this job really should be restarted we'll get it in the next loop.
                if timezone.now() - job.created_at > MIN_LOOP_TIME:
                    lost_jobs.append(job)
        except URLNotFoundNomadException:
            logger.debug((
                "Determined that a processor job needs to be requeued because "
                "querying for its Nomad job failed: "),
                         job_id=job.id)
            lost_jobs.append(job)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about Processor Job.",
                             processor_job=job.id)

    if lost_jobs:
        logger.info("Handling lost (never-started) processor jobs!",
                    len_jobs=len(lost_jobs))
        handle_processor_jobs(lost_jobs)
Example no. 15
def _get_nomad_client(namespace: Optional[str] = None) -> Nomad:
    address = os.getenv("NOMAD_ADDRESS") or "http://localhost:4646"
    assert address.startswith(
        "http"), f"Your nomad address needs a protocol: {address}"
    nomad_client = Nomad(address=address, timeout=10, namespace=namespace)
    return nomad_client
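A brief usage sketch of the helper above, listing jobs in a namespace (assumes NOMAD_ADDRESS points at a reachable agent, or that Nomad is listening on localhost:4646):

client = _get_nomad_client(namespace="default")
for job_stub in client.jobs.get_jobs():
    print(job_stub["ID"], job_stub["Status"])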
Example no. 16
def retry_lost_downloader_jobs() -> None:
    """Retry downloader jobs that went too long without being started.

    Idea: at some point this function could integrate with the spot
    instances to determine if jobs are hanging due to a lack of
    instances. A naive time-based implementation like this could end
    up retrying every single queued job if there were a long period
    during which the price of spot instances was higher than our bid
    price.
    """
    potentially_lost_jobs = DownloaderJob.objects.filter(
        success=None,
        retried=False,
        start_time=None,
        end_time=None,
        no_retry=False).prefetch_related("original_files__samples")

    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    lost_jobs = []
    for job in potentially_lost_jobs:
        try:
            if job.nomad_job_id:
                job_status = nomad_client.job.get_job(
                    job.nomad_job_id)["Status"]
                # If the job is still pending, then it makes sense that it
                # hasn't started and if it's running then it may not have
                # been able to mark the job record as started yet.
                if job_status != "pending" and job_status != "running":
                    logger.debug((
                        "Determined that a downloader job needs to be requeued because its"
                        " Nomad Job's status is: %s."),
                                 job_status,
                                 job_id=job.id)
                    lost_jobs.append(job)
            else:
                # If there is no nomad_job_id field set, we could be
                # in the small window where the job was created but
                # hasn't yet gotten a chance to be queued.
                # If this job really should be restarted we'll get it in the next loop.
                if timezone.now() - job.created_at > MIN_LOOP_TIME:
                    lost_jobs.append(job)
        except socket.timeout:
            logger.info("Timeout connecting to Nomad - is Nomad down?",
                        job_id=job.id)
        except URLNotFoundNomadException:
            logger.debug((
                "Determined that a downloader job needs to be requeued because "
                "querying for its Nomad job failed: "),
                         job_id=job.id)
            lost_jobs.append(job)
        except nomad.api.exceptions.BaseNomadException:
            raise
        except Exception:
            logger.exception("Couldn't query Nomad about Downloader Job.",
                             downloader_job=job.id)

    if lost_jobs:
        logger.info("Handling lost (never-started) downloader jobs!",
                    len_jobs=len(lost_jobs))
        handle_downloader_jobs(lost_jobs)
Example no. 17
    def handle(self, *args, **options):
        nomad_host = get_env_variable("NOMAD_HOST")
        nomad_port = get_env_variable("NOMAD_PORT", "4646")
        nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)

        with open("config/all_rna_seq_accessions.txt") as accession_list_file:
            all_rna_accessions = [line.strip() for line in accession_list_file]

        with open(
                "config/all_microarray_accessions.txt") as accession_list_file:
            all_microarray_accessions = [
                line.strip() for line in accession_list_file
            ]

        all_accessions = all_microarray_accessions + all_rna_accessions

        BATCH_SIZE = 1000
        batch_index = 0
        batch_accessions = all_accessions[0:BATCH_SIZE]

        fed_accessions = []

        while batch_accessions:
            logger.info(
                "Looping through another batch of 1000 experiments, starting with accession code: %s",
                batch_accessions[0],
            )

            # Check against surveyed accessions table to prevent resurveying
            surveyed_experiments = SurveyedAccession.objects.filter(
                accession_code__in=batch_accessions).values("accession_code")

            surveyed_accessions = [
                experiment["accession_code"]
                for experiment in surveyed_experiments
            ]

            missing_accessions = set(batch_accessions) - set(
                surveyed_accessions)
            while len(missing_accessions) > 0:
                try:
                    all_surveyor_jobs = nomad_client.jobs.get_jobs(
                        prefix="SURVEYOR")

                    num_surveyor_jobs = 0
                    for job in all_surveyor_jobs:
                        if job["ParameterizedJob"] and job["JobSummary"].get(
                                "Children", None):
                            num_surveyor_jobs = (
                                num_surveyor_jobs +
                                job["JobSummary"]["Children"]["Pending"])
                            num_surveyor_jobs = (
                                num_surveyor_jobs +
                                job["JobSummary"]["Children"]["Running"])
                except Exception:
                    logger.exception(
                        "Exception caught counting surveyor jobs!")
                    # Probably having trouble communicating with Nomad, let's try again next loop.
                    continue

                if num_surveyor_jobs < 15:
                    accession_code = missing_accessions.pop()
                    try:
                        queue_surveyor_for_accession(accession_code)
                        fed_accessions.append(accession_code)
                        time.sleep(30)
                    except Exception:
                        # We don't want to stop, gotta keep feeding the beast!!!!
                        logger.exception(
                            "Exception caught while looping through all accessions!",
                            accession_code=accession_code,
                        )
                else:
                    # Do it here so we don't sleep when there's an exception
                    time.sleep(30)

            # Bulk insert fed_accessions to SurveyedAccession
            new_surveyed_accessions = []
            current_time = timezone.now()

            for accession in fed_accessions:
                new_surveyed_accessions.append(
                    SurveyedAccession(accession_code=accession,
                                      created_at=current_time))

            SurveyedAccession.objects.bulk_create(new_surveyed_accessions)
            fed_accessions = []

            batch_index += 1
            if batch_index * BATCH_SIZE >= len(all_accessions):
                break

            batch_start = batch_index * BATCH_SIZE
            batch_end = batch_start + BATCH_SIZE
            batch_accessions = all_accessions[batch_start:batch_end]
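`queue_surveyor_for_accession` is not shown here. Given that the loop above counts the children of a parameterized "SURVEYOR" job, one plausible sketch dispatches that parameterized job with the accession code as dispatch metadata (the job name and meta key are assumptions):

def queue_surveyor_for_accession(accession_code: str) -> None:
    """Hedged sketch: dispatch the parameterized SURVEYOR job for a single accession."""
    nomad_host = get_env_variable("NOMAD_HOST")
    nomad_port = get_env_variable("NOMAD_PORT", "4646")
    nomad_client = Nomad(nomad_host, port=int(nomad_port), timeout=30)
    # Assumed parameterized job name and meta key.
    nomad_client.job.dispatch_job("SURVEYOR", meta={"ACCESSION": accession_code})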