def dsub_local(dsub_args):
  """Run dsub through the local provider.

  The flags the local provider requires (`--provider local` and
  `--logging LOGGING`) are placed ahead of the caller-supplied arguments.

  Args:
    dsub_args: A list of additional dsub command-line arguments.

  Returns:
    Whatever `dsub_command.call` returns.
  """
  provider_args = ["--provider", "local", "--logging", LOGGING]
  return dsub_command.call(provider_args + dsub_args)
def dsub_google_v2(dsub_args):
  """Call dsub with the arguments required by the google-v2 provider.

  Besides the mandatory provider flags, one optional flag is added for each
  of the module-level globals BOOT_DISK_SIZE and DISK_SIZE that is defined
  and truthy. The caller-supplied arguments come last.

  Args:
    dsub_args: A list of additional dsub command-line arguments.

  Returns:
    Whatever `dsub_command.call` returns.
  """
  # pyformat: disable
  google_v2_opt_args = [
      ("BOOT_DISK_SIZE", "--boot-disk-size"),
      ("DISK_SIZE", "--disk-size"),
  ]
  # pyformat: enable

  opt_args = []
  for var_name, flag in google_v2_opt_args:
    # The globals are optional: a missing or falsy value means "omit the flag".
    val = globals().get(var_name)
    if val:
      # The flag and its value are two separate argv entries. list.append()
      # accepts exactly one argument, so extend() is required here. str()
      # keeps the argument list all-strings in case the global is numeric.
      opt_args.extend([flag, str(val)])

  # pyformat: disable
  return dsub_command.call([
      "--provider", "google-v2",
      "--project", PROJECT_ID,
      "--logging", LOGGING,
      "--regions", "us-central1"
      ] + opt_args + dsub_args)
def _run_job(dsub_args, pipeline_args, worker_index=0):
  """Runs a particular job with optional retry logic for preemptibles.

  When `pipeline_args.preemptible` is set, the job is first tried with the
  `--preemptible` flag for up to `pipeline_args.max_preemptible_tries` tries
  and then without it for up to `pipeline_args.max_non_preemptible_tries`
  tries. Only failures recognized as an unexpected VM stop / preemption are
  retried; any other failure aborts immediately.

  Args:
    dsub_args: A list of arguments (type string) to pass to dsub for running
      the pipeline.
    pipeline_args: The parsed arguments from running this pipeline.
    worker_index: The index (0-based) of the worker running the job.

  Raises:
    RuntimeError: if there was an error running the pipeline, or if the
      configured number of tries is not positive so the job was never run.
  """
  # NOTE(review): `_run_job` is defined a second time later in this file; the
  # later definition replaces this one at import time. Confirm the duplicate
  # is intentional.

  # Allow a bit of time for resources to get allocated. Otherwise, there would
  # be a race condition where all workers would check for available resources
  # on startup and would get assigned to a single zone as they would all see
  # that a particular zone has resources available.
  time.sleep(2 * worker_index)

  def is_unexpected_stop_error(e):
    # The preemption or unexpected stop error has the form:
    # "Error in job <job_name> - code 10: 14: VM <VM> stopped unexpectedly."
    # where the preemption/stop code, 13 or 14, is present after the 'code'
    # keyword.
    stop_error_pattern = re.compile(
        '.*code.*[ :](%s)[ :].*VM' % '|'.join(_UNEXPECTED_STOP_ERROR_CODES))
    return any(
        stop_error_pattern.match(str(error)) for error in e.error_list)

  max_tries = pipeline_args.max_non_preemptible_tries
  use_preemptibles = False
  preemptible_dsub_args = dsub_args + ['--preemptible']
  if pipeline_args.preemptible:
    max_tries += pipeline_args.max_preemptible_tries
    use_preemptibles = True

  num_tries = 0
  while num_tries < max_tries:
    num_tries += 1
    try:
      dsub.call(preemptible_dsub_args if use_preemptibles else dsub_args)
      return
    except dsub_errors.JobExecutionError as e:
      if not is_unexpected_stop_error(e) or num_tries >= max_tries:
        # Either a genuine job failure or the retry budget is exhausted.
        logging.error('Job failed with error %s. Job args: %s',
                      str(e.error_list), dsub_args)
        raise RuntimeError('Job failed with error %s.' % str(e.error_list))
      elif not use_preemptibles:
        logging.warning(
            'Job stopped unexpectedly. Retrying with a regular VM '
            '(%d of %d)', num_tries, max_tries)
      elif num_tries < pipeline_args.max_preemptible_tries:
        logging.info('Job was preempted. Retrying (%d of %d).', num_tries,
                     pipeline_args.max_preemptible_tries)
      else:
        # Preemptible budget is used up; the remaining tries use regular VMs.
        logging.warning(
            'Job was preempted %d times (max tries is %d). '
            'Retrying WITHOUT a preemptible VM.', num_tries,
            pipeline_args.max_preemptible_tries)
        use_preemptibles = False
    except (googleapiclient.errors.Error, dsub_errors.JobError) as e:
      logging.error('Job failed with error %s. Job args: %s', str(e),
                    dsub_args)
      # Note: Need to raise a native exception due to
      # https://bugs.python.org/issue9400.
      raise RuntimeError('Job failed with error %s.' % str(e))

  # Every loop iteration either returns or raises, or continues with
  # num_tries < max_tries, so this point is reached only when the loop never
  # ran, i.e. max_tries <= 0. Raise explicitly instead of `assert False`,
  # which is stripped under `python -O` and would make an un-run job look
  # like a success.
  raise RuntimeError(
      'Job was not run: the maximum number of tries (%d) must be positive.' %
      max_tries)
def _run_job(dsub_args, pipeline_args, worker_index=0):
  """Runs a particular job with optional retry logic for preemptibles.

  When `pipeline_args.preemptible` is set, the job is first tried with the
  `--preemptible` flag for up to `pipeline_args.max_preemptible_tries` tries
  and then without it for up to `pipeline_args.max_non_preemptible_tries`
  tries. Only failures recognized as an unexpected VM stop / preemption are
  retried; any other failure aborts immediately.

  Args:
    dsub_args: A list of arguments (type string) to pass to dsub for running
      the pipeline.
    pipeline_args: The parsed arguments from running this pipeline.
    worker_index: The index (0-based) of the worker running the job.

  Raises:
    RuntimeError: if there was an error running the pipeline, or if the
      configured number of tries is not positive so the job was never run.
  """
  # NOTE(review): this redefines a `_run_job` that already appears earlier in
  # this file; this later definition is the one in effect after import.
  # Confirm the duplicate is intentional.

  # Allow a bit of time for resources to get allocated. Otherwise, there would
  # be a race condition where all workers would check for available resources
  # on startup and would get assigned to a single zone as they would all see
  # that a particular zone has resources available.
  time.sleep(2 * worker_index)

  def is_unexpected_stop_error(e):
    # The preemption or unexpected stop error has the form:
    # "Error in job <job_name> - code 10: 14: VM <VM> stopped unexpectedly."
    # where the preemption/stop code, 13 or 14, is present after the 'code'
    # keyword.
    stop_error_pattern = re.compile(
        '.*code.*[ :](%s)[ :].*VM' % '|'.join(_UNEXPECTED_STOP_ERROR_CODES))
    return any(
        stop_error_pattern.match(str(error)) for error in e.error_list)

  max_tries = pipeline_args.max_non_preemptible_tries
  use_preemptibles = False
  preemptible_dsub_args = dsub_args + ['--preemptible']
  if pipeline_args.preemptible:
    max_tries += pipeline_args.max_preemptible_tries
    use_preemptibles = True

  num_tries = 0
  while num_tries < max_tries:
    num_tries += 1
    try:
      dsub.call(preemptible_dsub_args if use_preemptibles else dsub_args)
      return
    except dsub_errors.JobExecutionError as e:
      if not is_unexpected_stop_error(e) or num_tries >= max_tries:
        # Either a genuine job failure or the retry budget is exhausted.
        logging.error('Job failed with error %s. Job args: %s',
                      str(e.error_list), dsub_args)
        raise RuntimeError('Job failed with error %s.' % str(e.error_list))
      elif not use_preemptibles:
        logging.warning(
            'Job stopped unexpectedly. Retrying with a regular VM '
            '(%d of %d)', num_tries, max_tries)
      elif num_tries < pipeline_args.max_preemptible_tries:
        logging.info('Job was preempted. Retrying (%d of %d).', num_tries,
                     pipeline_args.max_preemptible_tries)
      else:
        # Preemptible budget is used up; the remaining tries use regular VMs.
        logging.warning(
            'Job was preempted %d times (max tries is %d). '
            'Retrying WITHOUT a preemptible VM.', num_tries,
            pipeline_args.max_preemptible_tries)
        use_preemptibles = False
    except (googleapiclient.errors.Error, dsub_errors.JobError) as e:
      logging.error('Job failed with error %s. Job args: %s', str(e),
                    dsub_args)
      # Note: Need to raise a native exception due to
      # https://bugs.python.org/issue9400.
      raise RuntimeError('Job failed with error %s.' % str(e))

  # Every loop iteration either returns or raises, or continues with
  # num_tries < max_tries, so this point is reached only when the loop never
  # ran, i.e. max_tries <= 0. Raise explicitly instead of `assert False`,
  # which is stripped under `python -O` and would make an un-run job look
  # like a success.
  raise RuntimeError(
      'Job was not run: the maximum number of tries (%d) must be positive.' %
      max_tries)