# Example 1
    def modify_job_state(self, job_id, new_state):
        """Request that the service move a job into a new run state.

        Args:
          job_id: The id of the job to update.
          new_state: A string naming the desired state; one of
            'JOB_STATE_DONE', 'JOB_STATE_CANCELLED' or 'JOB_STATE_DRAINING'.

        Returns:
          True if the update request was issued; False if the requested
          state is not one a client is allowed to set.
        """
        # Only these states may be requested by a client; every other state
        # can only be set by the service itself.
        client_settable_states = (
            'JOB_STATE_DONE', 'JOB_STATE_CANCELLED', 'JOB_STATE_DRAINING')
        if new_state not in client_settable_states:
            return False
        # The enum member names match the state strings exactly.
        requested_state = getattr(
            dataflow.Job.RequestedStateValueValuesEnum, new_state)

        update_request = dataflow.DataflowProjectsJobsUpdateRequest()
        update_request.jobId = job_id
        update_request.projectId = self.google_cloud_options.project
        update_request.job = dataflow.Job(requestedState=requested_state)

        self._client.projects_jobs.Update(update_request)
        return True
# Example 2
 def __init__(self, options):
     """Build the job description from the supplied pipeline options.

     Args:
       options: The pipeline options for this job submission.

     Raises:
       ValueError: If any required Google Cloud option is missing.
     """
     self.options = options
     self.google_cloud_options = options.view_as(GoogleCloudOptions)
     # All of these must be set before a job can be submitted.
     required = ('project', 'job_name', 'staging_location', 'temp_location')
     missing = [
         name for name in required
         if not getattr(self.google_cloud_options, name)
     ]
     if missing:
         raise ValueError('Missing required configuration parameters: %s' %
                          missing)
     # Make the staging and temp locations job name and time specific. This is
     # needed to avoid clashes between job submissions using the same staging
     # area or team members using same job names. This method is not entirely
     # foolproof since two job submissions with same name can happen at exactly
     # the same time. However the window is extremely small given that
     # time.time() has at least microseconds granularity. We add the suffix only
     # for GCS staging locations where the potential for such clashes is high.
     if self.google_cloud_options.staging_location.startswith('gs://'):
         unique_suffix = '%s.%f' % (self.google_cloud_options.job_name,
                                    time.time())
         self.google_cloud_options.staging_location = utils.path.join(
             self.google_cloud_options.staging_location, unique_suffix)
         self.google_cloud_options.temp_location = utils.path.join(
             self.google_cloud_options.temp_location, unique_suffix)
     self.proto = dataflow.Job(name=self.google_cloud_options.job_name)
     job_types = dataflow.Job.TypeValueValuesEnum
     self.proto.type = (
         job_types.JOB_TYPE_STREAMING
         if self.options.view_as(StandardOptions).streaming
         else job_types.JOB_TYPE_BATCH)
     self.base64_str_re = re.compile(r'^[A-Za-z0-9+/]*=*$')
     self.coder_str_re = re.compile(r'^([A-Za-z]+\$)([A-Za-z0-9+/]*=*)$')