def modify_job_state(self, job_id, new_state):
  """Modify the run state of the job.

  Args:
    job_id: The id of the job.
    new_state: A string representing the new desired state. It could be set
      to either 'JOB_STATE_DONE', 'JOB_STATE_CANCELLED' or
      'JOB_STATE_DRAINING'.

  Returns:
    True if the job was modified successfully.
  """
  if new_state == 'JOB_STATE_DONE':
    new_state = dataflow.Job.RequestedStateValueValuesEnum.JOB_STATE_DONE
  elif new_state == 'JOB_STATE_CANCELLED':
    new_state = dataflow.Job.RequestedStateValueValuesEnum.JOB_STATE_CANCELLED
  elif new_state == 'JOB_STATE_DRAINING':
    new_state = dataflow.Job.RequestedStateValueValuesEnum.JOB_STATE_DRAINING
  else:
    # Other states could only be set by the service.
    return False

  request = dataflow.DataflowProjectsLocationsJobsUpdateRequest()
  request.jobId = job_id
  request.projectId = self.google_cloud_options.project
  request.location = self.google_cloud_options.region
  request.job = dataflow.Job(requestedState=new_state)
  self._client.projects_locations_jobs.Update(request)
  return True
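# --- Illustrative usage sketch (not part of the original module). It shows one
# way a caller might drive modify_job_state() to cancel a running job; the
# project, bucket and job id passed in below are hypothetical placeholders.
def _example_cancel_job(job_id):
  from apache_beam.options.pipeline_options import PipelineOptions
  options = PipelineOptions([
      '--project', 'my-project',  # hypothetical project
      '--region', 'us-central1',
      '--temp_location', 'gs://my-bucket/temp',  # hypothetical bucket
  ])
  client = DataflowApplicationClient(options)
  # Returns True if the requested state was accepted; False for states that
  # only the service itself may set (e.g. 'JOB_STATE_RUNNING').
  return client.modify_job_state(job_id, 'JOB_STATE_CANCELLED')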
def __init__(self, options, proto_pipeline):
  self.options = options
  self.proto_pipeline = proto_pipeline
  self.google_cloud_options = options.view_as(GoogleCloudOptions)
  if not self.google_cloud_options.job_name:
    self.google_cloud_options.job_name = self.default_job_name(
        self.google_cloud_options.job_name)

  required_google_cloud_options = ['project', 'job_name', 'temp_location']
  missing = [
      option for option in required_google_cloud_options
      if not getattr(self.google_cloud_options, option)
  ]
  if missing:
    raise ValueError(
        'Missing required configuration parameters: %s' % missing)

  if not self.google_cloud_options.staging_location:
    logging.info(
        'Defaulting to the temp_location as staging_location: %s',
        self.google_cloud_options.temp_location)
    (
        self.google_cloud_options.staging_location
    ) = self.google_cloud_options.temp_location

  # Make the staging and temp locations job name and time specific. This is
  # needed to avoid clashes between job submissions using the same staging
  # area or team members using same job names. This method is not entirely
  # foolproof since two job submissions with same name can happen at exactly
  # the same time. However the window is extremely small given that
  # time.time() has at least microseconds granularity. We add the suffix only
  # for GCS staging locations where the potential for such clashes is high.
  if self.google_cloud_options.staging_location.startswith('gs://'):
    path_suffix = '%s.%f' % (self.google_cloud_options.job_name, time.time())
    self.google_cloud_options.staging_location = FileSystems.join(
        self.google_cloud_options.staging_location, path_suffix)
    self.google_cloud_options.temp_location = FileSystems.join(
        self.google_cloud_options.temp_location, path_suffix)

  self.proto = dataflow.Job(name=self.google_cloud_options.job_name)
  if self.options.view_as(StandardOptions).streaming:
    self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_STREAMING
  else:
    self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_BATCH

  # Labels.
  if self.google_cloud_options.labels:
    self.proto.labels = dataflow.Job.LabelsValue()
    for label in self.google_cloud_options.labels:
      parts = label.split('=', 1)
      key = parts[0]
      value = parts[1] if len(parts) > 1 else ''
      self.proto.labels.additionalProperties.append(
          dataflow.Job.LabelsValue.AdditionalProperty(key=key, value=value))

  self.base64_str_re = re.compile(r'^[A-Za-z0-9+/]*=*$')
  self.coder_str_re = re.compile(r'^([A-Za-z]+\$)([A-Za-z0-9+/]*=*)$')
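# --- Illustrative sketch (not part of the original module). It constructs a
# Job directly from pipeline options to show what the constructor above
# derives; the option values are hypothetical placeholders.
def _example_build_job(proto_pipeline):
  from apache_beam.options.pipeline_options import PipelineOptions
  options = PipelineOptions([
      '--project', 'my-project',  # hypothetical project/bucket names
      '--job_name', 'wordcount',
      '--temp_location', 'gs://my-bucket/temp',
  ])
  # Labels are 'key=value' strings; each one becomes a
  # LabelsValue.AdditionalProperty on the job proto.
  options.view_as(GoogleCloudOptions).labels = ['team=data']
  job = Job(options, proto_pipeline)
  # staging_location was defaulted to temp_location, and both now carry a
  # '<job_name>.<time>' suffix because they are gs:// paths.
  return job.proto  # dataflow.Job with name, type and labels populated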
def test_create_job_returns_existing_job(self):
  pipeline_options = PipelineOptions([
      '--project', 'test_project',
      '--job_name', 'test_job_name',
      '--temp_location', 'gs://test-location/temp',
  ])
  job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
  self.assertTrue(job.proto.clientRequestId)  # asserts non-empty string

  pipeline_options.view_as(GoogleCloudOptions).no_auth = True
  client = apiclient.DataflowApplicationClient(pipeline_options)

  response = dataflow.Job()
  # different clientRequestId from `job`
  response.clientRequestId = "20210821081910123456-1234"
  response.name = 'test_job_name'
  response.id = '2021-08-19_21_18_43-9756917246311111021'

  with mock.patch.object(client._client.projects_locations_jobs,
                         'Create',
                         side_effect=[response]):
    with mock.patch.object(client, 'create_job_description',
                           side_effect=None):
      with self.assertRaises(
          apiclient.DataflowJobAlreadyExistsError) as context:
        client.create_job(job)

  self.assertEqual(
      str(context.exception),
      'There is already active job named %s with id: %s. If you want to '
      'submit a second job, try again by setting a different name using '
      '--job_name.' % ('test_job_name', response.id))
def test_update_job_returns_existing_job(self):
  pipeline_options = PipelineOptions([
      '--project', 'test_project',
      '--job_name', 'test_job_name',
      '--temp_location', 'gs://test-location/temp',
      '--region', 'us-central1',
      '--update',
  ])
  replace_job_id = '2021-08-21_00_00_01-6081497447916622336'
  with mock.patch('apache_beam.runners.dataflow.internal.apiclient.Job.'
                  'job_id_for_name',
                  return_value=replace_job_id) as job_id_for_name_mock:
    job = apiclient.Job(pipeline_options, FAKE_PIPELINE_URL)
  job_id_for_name_mock.assert_called_once()
  self.assertTrue(job.proto.clientRequestId)  # asserts non-empty string

  pipeline_options.view_as(GoogleCloudOptions).no_auth = True
  client = apiclient.DataflowApplicationClient(pipeline_options)

  response = dataflow.Job()
  # different clientRequestId from `job`
  response.clientRequestId = "20210821083254123456-1234"
  response.name = 'test_job_name'
  response.id = '2021-08-19_21_29_07-5725551945600207770'

  with mock.patch.object(client, 'create_job_description', side_effect=None):
    with mock.patch.object(client._client.projects_locations_jobs,
                           'Create',
                           side_effect=[response]):
      with self.assertRaises(
          apiclient.DataflowJobAlreadyExistsError) as context:
        client.create_job(job)

  self.assertEqual(
      str(context.exception),
      'The job named %s with id: %s has already been updated into job '
      'id: %s and cannot be updated again.' %
      ('test_job_name', replace_job_id, response.id))