Ejemplo n.º 1
0
def dsub_local(dsub_args):
  """Run dsub with the local provider.

  The provider and logging flags are placed ahead of the caller's arguments.

  Args:
    dsub_args: List of additional dsub command-line arguments.

  Returns:
    Whatever dsub_command.call returns for the assembled argument list.
  """
  run_dsub = dsub_command.call
  required_args = ["--provider", "local", "--logging", LOGGING]
  return run_dsub(required_args + dsub_args)
Ejemplo n.º 2
0
def dsub_google_v2(dsub_args):
    """Call dsub appending google-v2 required arguments.

    Besides the required flags, an optional flag is added for each of the
    module-level variables BOOT_DISK_SIZE and DISK_SIZE that is defined and
    truthy.

    Args:
      dsub_args: List of additional dsub command-line arguments; they are
        placed after the required and optional flags.

    Returns:
      Whatever dsub_command.call returns for the assembled argument list.
    """
    # pyformat: disable
    google_v2_opt_args = [("BOOT_DISK_SIZE", "--boot-disk-size"),
                          ("DISK_SIZE", "--disk-size")]
    # pyformat: enable

    opt_args = []
    for var_name, flag in google_v2_opt_args:
        # The variables are optional, so look them up dynamically instead of
        # referencing them by name (which would raise NameError when unset).
        val = globals().get(var_name)
        if val:
            # list.append() accepts exactly one item; the flag and its value
            # are two separate argument entries, so add both with extend().
            opt_args.extend([flag, val])

    # pyformat: disable
    return dsub_command.call([
        "--provider", "google-v2", "--project", PROJECT_ID, "--logging",
        LOGGING, "--regions", "us-central1"
    ] + opt_args + dsub_args)
Ejemplo n.º 3
0
def _run_job(dsub_args, pipeline_args, worker_index=0):
    """Runs a particular job with optional retry logic for preemptibles.

    When `pipeline_args.preemptible` is set, the job is first attempted with
    the `--preemptible` flag; once `pipeline_args.max_preemptible_tries`
    attempts have been used, the remaining attempts run without it. Only
    failures recognized as unexpected VM stops are retried.

    Args:
      dsub_args: A list of arguments (type string) to pass to dsub for running
        the pipeline.
      pipeline_args: The parsed arguments from running this pipeline. The
        attributes `preemptible`, `max_preemptible_tries` and
        `max_non_preemptible_tries` are read.
      worker_index: The index (0-based) of the worker running the job.

    Raises:
      RuntimeError: if there was an error running the pipeline, or if the
        configured number of tries does not allow a single attempt.
    """
    # Allow a bit of time for resources to get allocated. Otherwise, there would
    # be a race condition where all workers would check for available resources on
    # startup and would get assigned to a single zone as they would all see that
    # a particular zone has resources available.
    time.sleep(2 * worker_index)

    def is_unexpected_stop_error(e):
        # The preemption or unexpected stop error has the form:
        # "Error in job <job_name> - code 10: 14: VM <VM> stopped unexpectedly."
        # where the preemption/stop code, 13 or 14, is present after the 'code'
        # keyword.
        pattern = '.*code.*[ :](%s)[ :].*VM' % (
            '|'.join(_UNEXPECTED_STOP_ERROR_CODES))
        return any(re.match(pattern, str(error)) for error in e.error_list)

    max_tries = pipeline_args.max_non_preemptible_tries
    use_preemptibles = False
    # Concatenation already builds a new list, so dsub_args is left untouched.
    preemptible_dsub_args = dsub_args + ['--preemptible']
    if pipeline_args.preemptible:
        max_tries += pipeline_args.max_preemptible_tries
        use_preemptibles = True

    num_tries = 0
    while num_tries < max_tries:
        try:
            num_tries += 1
            dsub.call(preemptible_dsub_args if use_preemptibles else dsub_args)
            return
        except dsub_errors.JobExecutionError as e:
            if not is_unexpected_stop_error(e) or num_tries >= max_tries:
                # Either a genuine job failure or the retry budget is spent.
                logging.error('Job failed with error %s. Job args: %s',
                              str(e.error_list), dsub_args)
                raise RuntimeError('Job failed with error %s.' %
                                   str(e.error_list))
            elif not use_preemptibles:
                logging.warning(
                    'Job stopped unexpectedly. Retrying with a regular VM '
                    '(%d of %d)', num_tries, max_tries)
            elif num_tries < pipeline_args.max_preemptible_tries:
                logging.info('Job was preempted. Retrying (%d of %d).',
                             num_tries, pipeline_args.max_preemptible_tries)
            else:
                # Preemptible budget exhausted: fall back to a regular VM.
                logging.warning(
                    'Job was preempted %d times (max tries is %d). '
                    'Retrying WITHOUT a preemptible VM.', num_tries,
                    pipeline_args.max_preemptible_tries)
                use_preemptibles = False
        except (googleapiclient.errors.Error, dsub_errors.JobError) as e:
            logging.error('Job failed with error %s. Job args: %s', str(e),
                          dsub_args)
            # Note: Need to raise a native exception due to
            # https://bugs.python.org/issue9400.
            raise RuntimeError('Job failed with error %s.' % str(e))

    # Every loop iteration either returns or raises before the loop condition
    # can become false, so this point is reached only when max_tries was not
    # positive and the job was never attempted. Raise explicitly: an `assert`
    # is stripped under `python -O`, which would silently skip the job.
    logging.error('Job was not run: maximum number of tries is %d. '
                  'Job args: %s', max_tries, dsub_args)
    raise RuntimeError(
        'Job was not run: maximum number of tries is %d.' % max_tries)
Ejemplo n.º 4
0
def _run_job(dsub_args, pipeline_args, worker_index=0):
  """Runs a particular job with optional retry logic for preemptibles.

  When `pipeline_args.preemptible` is set, the job is first attempted with the
  `--preemptible` flag; once `pipeline_args.max_preemptible_tries` attempts
  have been used, the remaining attempts run without it. Only failures
  recognized as unexpected VM stops are retried.

  Args:
    dsub_args: A list of arguments (type string) to pass to dsub for running the
      pipeline.
    pipeline_args: The parsed arguments from running this pipeline. The
      attributes `preemptible`, `max_preemptible_tries` and
      `max_non_preemptible_tries` are read.
    worker_index: The index (0-based) of the worker running the job.

  Raises:
    RuntimeError: if there was an error running the pipeline, or if the
      configured number of tries does not allow a single attempt.
  """
  # Allow a bit of time for resources to get allocated. Otherwise, there would
  # be a race condition where all workers would check for available resources on
  # startup and would get assigned to a single zone as they would all see that
  # a particular zone has resources available.
  time.sleep(2 * worker_index)

  def is_unexpected_stop_error(e):
    # The preemption or unexpected stop error has the form:
    # "Error in job <job_name> - code 10: 14: VM <VM> stopped unexpectedly."
    # where the preemption/stop code, 13 or 14, is present after the 'code'
    # keyword.
    pattern = '.*code.*[ :](%s)[ :].*VM' % (
        '|'.join(_UNEXPECTED_STOP_ERROR_CODES))
    return any(re.match(pattern, str(error)) for error in e.error_list)

  max_tries = pipeline_args.max_non_preemptible_tries
  use_preemptibles = False
  # Concatenation already builds a new list, so dsub_args is left untouched.
  preemptible_dsub_args = dsub_args + ['--preemptible']
  if pipeline_args.preemptible:
    max_tries += pipeline_args.max_preemptible_tries
    use_preemptibles = True

  num_tries = 0
  while num_tries < max_tries:
    try:
      num_tries += 1
      dsub.call(preemptible_dsub_args if use_preemptibles else dsub_args)
      return
    except dsub_errors.JobExecutionError as e:
      if not is_unexpected_stop_error(e) or num_tries >= max_tries:
        # Either a genuine job failure or the retry budget is spent.
        logging.error('Job failed with error %s. Job args: %s', str(
            e.error_list), dsub_args)
        raise RuntimeError('Job failed with error %s.' % str(e.error_list))
      elif not use_preemptibles:
        logging.warning('Job stopped unexpectedly. Retrying with a regular VM '
                        '(%d of %d)', num_tries, max_tries)
      elif num_tries < pipeline_args.max_preemptible_tries:
        logging.info('Job was preempted. Retrying (%d of %d).', num_tries,
                     pipeline_args.max_preemptible_tries)
      else:
        # Preemptible budget exhausted: fall back to a regular VM.
        logging.warning('Job was preempted %d times (max tries is %d). '
                        'Retrying WITHOUT a preemptible VM.', num_tries,
                        pipeline_args.max_preemptible_tries)
        use_preemptibles = False
    except (googleapiclient.errors.Error, dsub_errors.JobError) as e:
      logging.error('Job failed with error %s. Job args: %s', str(e), dsub_args)
      # Note: Need to raise a native exception due to
      # https://bugs.python.org/issue9400.
      raise RuntimeError('Job failed with error %s.' % str(e))

  # Every loop iteration either returns or raises before the loop condition can
  # become false, so this point is reached only when max_tries was not positive
  # and the job was never attempted. Raise explicitly: an `assert` is stripped
  # under `python -O`, which would silently skip the job.
  logging.error('Job was not run: maximum number of tries is %d. '
                'Job args: %s', max_tries, dsub_args)
  raise RuntimeError(
      'Job was not run: maximum number of tries is %d.' % max_tries)