Example #1
    def modify_job_state(self, job_id, new_state):
        """Modify the run state of the job.

    Args:
      job_id: The id of the job.
      new_state: A string representing the new desired state. It could be set to
      either 'JOB_STATE_DONE', 'JOB_STATE_CANCELLED' or 'JOB_STATE_DRAINING'.

    Returns:
      True if the job was modified successfully.
    """
        if new_state == 'JOB_STATE_DONE':
            new_state = dataflow.Job.RequestedStateValueValuesEnum.JOB_STATE_DONE
        elif new_state == 'JOB_STATE_CANCELLED':
            new_state = dataflow.Job.RequestedStateValueValuesEnum.JOB_STATE_CANCELLED
        elif new_state == 'JOB_STATE_DRAINING':
            new_state = dataflow.Job.RequestedStateValueValuesEnum.JOB_STATE_DRAINING
        else:
            # Other states can only be set by the service.
            return False

        request = dataflow.DataflowProjectsLocationsJobsUpdateRequest()
        request.jobId = job_id
        request.projectId = self.google_cloud_options.project
        request.location = self.google_cloud_options.region
        request.job = dataflow.Job(requestedState=new_state)

        self._client.projects_locations_jobs.Update(request)
        return True
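
A minimal usage sketch, assuming the method above belongs to apiclient.DataflowApplicationClient (the client used in Examples #3 and #4); the project, region, bucket and job id are placeholders:

from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.dataflow.internal import apiclient

options = PipelineOptions([
    '--project', 'my-project',                 # placeholder project
    '--region', 'us-central1',                 # placeholder region
    '--temp_location', 'gs://my-bucket/temp',  # placeholder bucket
])
client = apiclient.DataflowApplicationClient(options)

# Ask the service to cancel the job. Only 'JOB_STATE_DONE',
# 'JOB_STATE_CANCELLED' and 'JOB_STATE_DRAINING' are accepted here; any other
# value returns False without calling the service.
job_id = '2021-08-19_21_18_43-0000000000000000000'  # placeholder job id
if not client.modify_job_state(job_id, 'JOB_STATE_CANCELLED'):
    print('Requested state can only be set by the service.')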
Example #2
    def __init__(self, options, proto_pipeline):
        self.options = options
        self.proto_pipeline = proto_pipeline
        self.google_cloud_options = options.view_as(GoogleCloudOptions)
        if not self.google_cloud_options.job_name:
            self.google_cloud_options.job_name = self.default_job_name(
                self.google_cloud_options.job_name)

        required_google_cloud_options = [
            'project', 'job_name', 'temp_location'
        ]
        missing = [
            option for option in required_google_cloud_options
            if not getattr(self.google_cloud_options, option)
        ]
        if missing:
            raise ValueError('Missing required configuration parameters: %s' %
                             missing)

        if not self.google_cloud_options.staging_location:
            logging.info(
                'Defaulting to the temp_location as staging_location: %s',
                self.google_cloud_options.temp_location)
            self.google_cloud_options.staging_location = (
                self.google_cloud_options.temp_location)

        # Make the staging and temp locations job-name- and time-specific. This
        # is needed to avoid clashes between job submissions using the same
        # staging area or team members using the same job names. This method is
        # not entirely foolproof since two job submissions with the same name
        # can happen at exactly the same time. However, the window is extremely
        # small given that time.time() has at least microsecond granularity. We
        # add the suffix only for GCS staging locations, where the potential
        # for such clashes is high.
        if self.google_cloud_options.staging_location.startswith('gs://'):
            path_suffix = '%s.%f' % (self.google_cloud_options.job_name,
                                     time.time())
            self.google_cloud_options.staging_location = FileSystems.join(
                self.google_cloud_options.staging_location, path_suffix)
            self.google_cloud_options.temp_location = FileSystems.join(
                self.google_cloud_options.temp_location, path_suffix)

        self.proto = dataflow.Job(name=self.google_cloud_options.job_name)
        if self.options.view_as(StandardOptions).streaming:
            self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_STREAMING
        else:
            self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_BATCH

        # Labels.
        if self.google_cloud_options.labels:
            self.proto.labels = dataflow.Job.LabelsValue()
            for label in self.google_cloud_options.labels:
                parts = label.split('=', 1)
                key = parts[0]
                value = parts[1] if len(parts) > 1 else ''
                self.proto.labels.additionalProperties.append(
                    dataflow.Job.LabelsValue.AdditionalProperty(key=key,
                                                                value=value))

        # Patterns for plain base64-encoded strings and for
        # '<CoderName>$<base64>' coder-tagged strings.
        self.base64_str_re = re.compile(r'^[A-Za-z0-9+/]*=*$')
        self.coder_str_re = re.compile(r'^([A-Za-z]+\$)([A-Za-z0-9+/]*=*)$')
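
The comment block above explains why a job-name-plus-timestamp suffix is appended to GCS staging and temp paths. A standalone sketch of that suffixing, using a made-up bucket and job name and plain string joining in place of FileSystems.join:

import time

staging_location = 'gs://my-bucket/staging'  # made-up bucket
job_name = 'wordcount'                       # made-up job name

# Same suffix format as above: '<job_name>.<time.time()>'.
path_suffix = '%s.%f' % (job_name, time.time())
print('/'.join([staging_location, path_suffix]))
# e.g. gs://my-bucket/staging/wordcount.1629535123.456789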
Example #3
  def test_create_job_returns_existing_job(self):
    pipeline_options = PipelineOptions([
        '--project',
        'test_project',
        '--job_name',
        'test_job_name',
        '--temp_location',
        'gs://test-location/temp',
    ])
    job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
    self.assertTrue(job.proto.clientRequestId)  # asserts non-empty string
    pipeline_options.view_as(GoogleCloudOptions).no_auth = True
    client = apiclient.DataflowApplicationClient(pipeline_options)

    response = dataflow.Job()
    # different clientRequestId from `job`
    response.clientRequestId = "20210821081910123456-1234"
    response.name = 'test_job_name'
    response.id = '2021-08-19_21_18_43-9756917246311111021'

    with mock.patch.object(client._client.projects_locations_jobs,
                           'Create',
                           side_effect=[response]):
      with mock.patch.object(client, 'create_job_description',
                             side_effect=None):
        with self.assertRaises(
            apiclient.DataflowJobAlreadyExistsError) as context:
          client.create_job(job)

        self.assertEqual(
            str(context.exception),
            'There is already active job named %s with id: %s. If you want to '
            'submit a second job, try again by setting a different name using '
            '--job_name.' % ('test_job_name', response.id))
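
A hedged sketch of how calling code might react to the error this test exercises; client and job are assumed to be set up as in the test above:

# Hypothetical handling: report the clash instead of letting submission fail.
try:
    client.create_job(job)
except apiclient.DataflowJobAlreadyExistsError as exc:
    print('Job submission rejected: %s' % exc)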
Example #4
  def test_update_job_returns_existing_job(self):
    pipeline_options = PipelineOptions([
        '--project',
        'test_project',
        '--job_name',
        'test_job_name',
        '--temp_location',
        'gs://test-location/temp',
        '--region',
        'us-central1',
        '--update',
    ])
    replace_job_id = '2021-08-21_00_00_01-6081497447916622336'
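    # With --update, constructing apiclient.Job looks up the id of the running
    # job being replaced via job_id_for_name; mock it here so no service call
    # is made.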
    with mock.patch('apache_beam.runners.dataflow.internal.apiclient.Job.'
                    'job_id_for_name',
                    return_value=replace_job_id) as job_id_for_name_mock:
      job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
    job_id_for_name_mock.assert_called_once()

    self.assertTrue(job.proto.clientRequestId)  # asserts non-empty string

    pipeline_options.view_as(GoogleCloudOptions).no_auth = True
    client = apiclient.DataflowApplicationClient(pipeline_options)

    response = dataflow.Job()
    # different clientRequestId from `job`
    response.clientRequestId = "20210821083254123456-1234"
    response.name = 'test_job_name'
    response.id = '2021-08-19_21_29_07-5725551945600207770'

    with mock.patch.object(client, 'create_job_description', side_effect=None):
      with mock.patch.object(client._client.projects_locations_jobs,
                             'Create',
                             side_effect=[response]):

        with self.assertRaises(
            apiclient.DataflowJobAlreadyExistsError) as context:
          client.create_job(job)

      self.assertEqual(
          str(context.exception),
          'The job named %s with id: %s has already been updated into job '
          'id: %s and cannot be updated again.' %
          ('test_job_name', replace_job_id, response.id))