def _GetCompletedJob(self, job_id):
    """See base class."""
    cmd = self.cmd_prefix + [
        'emr', 'describe-step', '--cluster-id', self.cluster_id,
        '--step-id', job_id
    ]
    stdout, stderr, retcode = vm_util.IssueCommand(cmd,
                                                   raise_on_failure=False)
    if retcode:
        if 'ThrottlingException' in stderr:
            logging.warning(
                'Rate limited while polling EMR step:\n%s\nRetrying.',
                stderr)
            return None
        else:
            raise errors.VmUtil.IssueCommandError(
                f'Getting step status failed:\n{stderr}')
    result = json.loads(stdout)
    state = result['Step']['Status']['State']
    if state == 'FAILED':
        raise dpb_service.JobSubmissionError(
            result['Step']['Status']['FailureDetails'])
    if state == 'COMPLETED':
        pending_time = result['Step']['Status']['Timeline'][
            'CreationDateTime']
        start_time = result['Step']['Status']['Timeline']['StartDateTime']
        end_time = result['Step']['Status']['Timeline']['EndDateTime']
        return dpb_service.JobResult(
            run_time=end_time - start_time,
            pending_time=start_time - pending_time)
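
The Timeline arithmetic above relies on the AWS CLI returning EMR timestamps as numeric epoch values, so plain subtraction yields durations in seconds. A small worked sketch with invented values; only the key structure mirrors what `_GetCompletedJob` reads:

import json

# Illustrative `aws emr describe-step` payload; the timestamps are made up.
stdout = """
{
  "Step": {
    "Status": {
      "State": "COMPLETED",
      "Timeline": {
        "CreationDateTime": 1672531200.0,
        "StartDateTime": 1672531260.0,
        "EndDateTime": 1672531500.0
      }
    }
  }
}
"""
timeline = json.loads(stdout)['Step']['Status']['Timeline']
run_time = timeline['EndDateTime'] - timeline['StartDateTime']           # 240.0 seconds
pending_time = timeline['StartDateTime'] - timeline['CreationDateTime']  # 60.0 seconds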
Example 2
    def _IsStepDone(self, step_id):
        """Determine whether the step is done.

    Args:
      step_id: The step id to query.

    Returns:
      A dictionary describing the step if the step the step is complete,
          None otherwise.

    Raises:
      JobSubmissionError if job fails.
    """

        cmd = self.cmd_prefix + [
            'emr', 'describe-step', '--cluster-id', self.cluster_id,
            '--step-id', step_id
        ]
        stdout, _, _ = vm_util.IssueCommand(cmd)
        result = json.loads(stdout)
        state = result['Step']['Status']['State']
        if state == 'FAILED':
            raise dpb_service.JobSubmissionError()
        if state == 'COMPLETED':
            return result
        else:
            return None
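
Since `_IsStepDone` returns the parsed step description once the step completes and None otherwise, a caller is expected to poll it on an interval. A minimal driver sketch; the `cluster` object and the poll interval are stand-ins, not part of the snippet above:

import time

def wait_for_step(cluster, step_id, poll_interval_secs=5):
    # Hypothetical polling loop; the real framework supplies its own driver.
    while True:
        result = cluster._IsStepDone(step_id)
        if result is not None:
            return result  # The step's describe-step payload.
        time.sleep(poll_interval_secs)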
Example 3

def _GetCompletedJob(self, job_id):
    """See base class."""
    cmd = self.cmd_prefix + [
        'emr', 'describe-step', '--cluster-id', self.cluster_id, '--step-id',
        job_id
    ]
    stdout, _, _ = vm_util.IssueCommand(cmd)
    result = json.loads(stdout)
    state = result['Step']['Status']['State']
    if state == 'FAILED':
        raise dpb_service.JobSubmissionError(
            result['Step']['Status']['FailureDetails'])
    if state == 'COMPLETED':
        pending_time = result['Step']['Status']['Timeline']['CreationDateTime']
        start_time = result['Step']['Status']['Timeline']['StartDateTime']
        end_time = result['Step']['Status']['Timeline']['EndDateTime']
        return dpb_service.JobResult(
            run_time=end_time - start_time,
            pending_time=start_time - pending_time)
Example 4

def _GetCompletedJob(self, job_id):
    """See base class."""
    job_name, job_run_id = job_id
    cmd = self.cmd_prefix + [
        'glue', 'get-job-run', '--job-name', job_name, '--run-id',
        job_run_id
    ]
    stdout, stderr, retcode = vm_util.IssueCommand(cmd,
                                                   raise_on_failure=False)
    if retcode:
        raise errors.VmUtil.IssueCommandError(
            f'Getting job run status failed:\n{stderr}')
    result = json.loads(stdout)
    state = result['JobRun']['JobRunState']
    if state in ('FAILED', 'ERROR', 'TIMEOUT'):
        raise dpb_service.JobSubmissionError(
            result['JobRun'].get('ErrorMessage'))
    if state == 'SUCCEEDED':
        started_on = result['JobRun']['StartedOn']
        completed_on = result['JobRun']['CompletedOn']
        execution_time = result['JobRun']['ExecutionTime']
        return dpb_service.JobResult(
            run_time=execution_time,
            pending_time=completed_on - started_on - execution_time)
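
Glue reports `ExecutionTime` directly in seconds, while `StartedOn` and `CompletedOn` are timestamps, so the pending time above is the wall-clock span of the run minus the time actually spent executing. A worked sketch with invented numbers:

# Illustrative values only; they mirror the JobRun fields read above.
started_on = 1672531200.0    # JobRun.StartedOn (epoch seconds)
completed_on = 1672531330.0  # JobRun.CompletedOn (epoch seconds)
execution_time = 120         # JobRun.ExecutionTime (seconds)

run_time = execution_time                                  # 120 seconds
pending_time = completed_on - started_on - execution_time  # 10.0 seconds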
Example 5
    def SubmitJob(self,
                  jarfile=None,
                  classname=None,
                  pyspark_file=None,
                  query_file=None,
                  job_poll_interval=None,
                  job_stdout_file=None,
                  job_arguments=None,
                  job_files=None,
                  job_jars=None,
                  job_type=None,
                  properties=None):
        """See base class."""
        assert job_type
        args = ['jobs', 'submit', job_type]

        if job_type == self.PYSPARK_JOB_TYPE:
            args.append(pyspark_file)

        cmd = self.DataprocGcloudCommand(*args)

        cmd.flags['cluster'] = self.cluster_id
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        job_jars = job_jars or []
        if classname:
            if jarfile:
                # Dataproc does not support both a main class and a main jar so just
                # make the main jar an additional jar instead.
                job_jars.append(jarfile)
            cmd.flags['class'] = classname
        elif jarfile:
            cmd.flags['jar'] = jarfile

        if query_file:
            cmd.flags['file'] = query_file

        if job_files:
            cmd.flags['files'] = ','.join(job_files)
        if job_jars:
            cmd.flags['jars'] = ','.join(job_jars)

        # Dataproc emits on stdout an object describing the job execution.
        # Its stderr contains a mix of the job's stderr and the job's
        # stdout. We set the driver log level to FATAL to suppress those
        # log messages, so that the job's standard out can hopefully be
        # separated from them.
        cmd.flags['driver-log-levels'] = 'root={}'.format(FLAGS.dpb_log_level)

        all_properties = self.GetJobProperties()
        all_properties.update(properties or {})
        if all_properties:
            # For commas: https://cloud.google.com/sdk/gcloud/reference/topic/escaping
            cmd.flags['properties'] = '^@^' + '@'.join(
                '{}={}'.format(k, v) for k, v in all_properties.items())

        if job_arguments:
            cmd.additional_flags = ['--'] + job_arguments

        stdout, stderr, retcode = cmd.Issue(timeout=None,
                                            raise_on_failure=False)
        if retcode != 0:
            raise dpb_service.JobSubmissionError(stderr)

        results = json.loads(stdout)
        # Otherwise retcode would not have been 0
        assert results['status']['state'] == 'DONE'
        done_time = GcpDpbDataproc._ParseTime(
            results['status']['stateStartTime'])
        pending_time = None
        start_time = None
        for state in results['statusHistory']:
            if state['state'] == 'PENDING':
                pending_time = GcpDpbDataproc._ParseTime(
                    state['stateStartTime'])
            elif state['state'] == 'RUNNING':
                start_time = GcpDpbDataproc._ParseTime(state['stateStartTime'])

        assert pending_time and start_time and done_time

        return dpb_service.JobResult(
            run_time=(done_time - start_time).total_seconds(),
            pending_time=(start_time - pending_time).total_seconds())
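
The `^@^` prefix uses gcloud's alternate-delimiter escaping (the topic linked in the comment above), so property values may themselves contain commas. A small sketch of the resulting flag value, with hypothetical properties:

# Hypothetical properties; spark.jars.packages contains a comma, which is
# why '@' is declared as the delimiter instead of the default ','.
all_properties = {
    'spark.executor.memory': '4g',
    'spark.jars.packages': 'com.example:foo:1.0,com.example:bar:2.0',
}
flag_value = '^@^' + '@'.join(
    '{}={}'.format(k, v) for k, v in all_properties.items())
# flag_value == '^@^spark.executor.memory=4g'
#               '@spark.jars.packages=com.example:foo:1.0,com.example:bar:2.0'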
Example 6
    def SubmitJob(self,
                  jarfile=None,
                  classname=None,
                  pyspark_file=None,
                  query_file=None,
                  job_poll_interval=None,
                  job_stdout_file=None,
                  job_arguments=None,
                  job_files=None,
                  job_jars=None,
                  job_type=None,
                  properties=None):
        """See base class."""
        assert job_type
        args = ['batches', 'submit', job_type]
        additional_args = []

        if job_type == self.PYSPARK_JOB_TYPE:
            args.append(pyspark_file)

        cmd = self.DataprocGcloudCommand(*args)

        cmd.flags['batch'] = self.cluster_id
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        job_jars = job_jars or []
        if classname:
            if jarfile:
                # Dataproc does not support both a main class and a main jar so just
                # make the main jar an additional jar instead.
                job_jars.append(jarfile)
            cmd.flags['class'] = classname
        elif jarfile:
            cmd.flags['jar'] = jarfile

        if query_file:
            additional_args.append(query_file)

        if job_files:
            cmd.flags['files'] = ','.join(job_files)
        if job_jars:
            cmd.flags['jars'] = ','.join(job_jars)

        if FLAGS.gce_network_name:
            cmd.flags['network'] = FLAGS.gce_network_name

        if self.dpb_version:
            cmd.flags['version'] = self.dpb_version
        if FLAGS.gcp_dataproc_image:
            cmd.flags['container-image'] = FLAGS.gcp_dataproc_image

        all_properties = self.GetJobProperties()
        all_properties.update(properties or {})
        if all_properties:
            # For commas: https://cloud.google.com/sdk/gcloud/reference/topic/escaping
            cmd.flags['properties'] = '^@^' + '@'.join(
                '{}={}'.format(k, v) for k, v in all_properties.items())

        if job_arguments:
            additional_args += ['--'] + job_arguments
        cmd.additional_flags = additional_args

        _, stderr, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
        if retcode != 0:
            raise dpb_service.JobSubmissionError(stderr)

        fetch_batch_cmd = self.DataprocGcloudCommand('batches', 'describe',
                                                     self.cluster_id)
        stdout, stderr, retcode = fetch_batch_cmd.Issue(timeout=None,
                                                        raise_on_failure=False)
        if retcode != 0:
            raise dpb_service.JobSubmissionError(stderr)

        results = json.loads(stdout)
        # Otherwise the blocking submit above would not have returned 0.
        assert results['state'] == 'SUCCEEDED'
        done_time = self._ParseTime(results['stateTime'])
        pending_time = None
        start_time = None
        for state in results['stateHistory']:
            if state['state'] == 'PENDING':
                pending_time = self._ParseTime(state['stateStartTime'])
            elif state['state'] == 'RUNNING':
                start_time = self._ParseTime(state['stateStartTime'])

        assert pending_time and start_time and done_time

        return dpb_service.JobResult(
            run_time=(done_time - start_time).total_seconds(),
            pending_time=(start_time - pending_time).total_seconds())
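
The batch timeline is recovered from `stateHistory`: pending begins when the batch enters PENDING, execution when it enters RUNNING, and the final `stateTime` marks completion. A sketch of that computation on an invented payload, with stock ISO-8601 parsing standing in for the class's `_ParseTime` helper:

import datetime

def parse_time(iso_string):
    # Stand-in for _ParseTime; assumes RFC 3339 timestamps like those in
    # `gcloud dataproc batches describe` output.
    return datetime.datetime.fromisoformat(iso_string.replace('Z', '+00:00'))

results = {
    'state': 'SUCCEEDED',
    'stateTime': '2023-01-01T00:05:00Z',
    'stateHistory': [
        {'state': 'PENDING', 'stateStartTime': '2023-01-01T00:00:00Z'},
        {'state': 'RUNNING', 'stateStartTime': '2023-01-01T00:01:00Z'},
    ],
}
done_time = parse_time(results['stateTime'])
pending_time = start_time = None
for state in results['stateHistory']:
    if state['state'] == 'PENDING':
        pending_time = parse_time(state['stateStartTime'])
    elif state['state'] == 'RUNNING':
        start_time = parse_time(state['stateStartTime'])
print((done_time - start_time).total_seconds())     # 240.0
print((start_time - pending_time).total_seconds())  # 60.0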