def get_work_items(response, env=maptask.WorkerEnvironment(),
                   context=maptask.ExecutionContext()):
  """Parses a lease work item response into a BatchWorkItem object.

  The response is received by the worker as a result of a LeaseWorkItem
  request to the Dataflow service.

  Args:
    response: A LeaseWorkItemResponse protobuf object returned by the service.
    env: An environment object with worker configuration.
    context: A maptask.ExecutionContext object providing context for operations
      to be executed.

  Returns:
    A BatchWorkItem wrapping the work item proto and the decoded map task
    (a list of Worker* operations; see definitions above), or None if the
    response contains no work items.
  """
  # Check if the request for work did not return anything.
  if not response.workItems:
    return None
  # For now the service always sends exactly one work item.
  assert len(response.workItems) == 1
  work_item = response.workItems[0]
  map_task = maptask.decode_map_task(work_item.mapTask, env, context)
  return BatchWorkItem(work_item, map_task)
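
# A minimal usage sketch (not part of the original module): how a batch worker
# loop might lease work and hand the response to get_work_items. The
# `lease_work` and `report_completion` calls on the client are hypothetical
# stand-ins for the worker client's actual leasing and reporting methods.
def run_one_work_item(worker):
  response = worker.client.lease_work(worker)  # hypothetical signature
  work_item = get_work_items(response, worker.environment)
  if work_item is None:
    return False  # Nothing leased; the caller would typically back off and retry.
  executor.MapTaskExecutor(work_item.map_task).execute()
  worker.client.report_completion(work_item)  # hypothetical helper
  return True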
def process_work_item(self, computation_id, map_task_proto,
                      input_data_watermark, work_item):
  """Processes a single streaming work item and commits the result."""
  workitem_commit_request = windmill_pb2.WorkItemCommitRequest(
      key=work_item.key, work_token=work_item.work_token)
  env = maptask.WorkerEnvironment()
  context = maptask.StreamingExecutionContext()

  # Set up Windmill-backed state for this key and work token.
  reader = windmillstate.WindmillStateReader(
      computation_id, work_item.key, work_item.work_token, self.windmill)
  state_internals = windmillstate.WindmillStateInternals(reader)
  state = windmillstate.WindmillUnmergedState(state_internals)

  output_data_watermark = windmillio.windmill_to_harness_timestamp(
      work_item.output_data_watermark)
  context.start(computation_id, work_item, input_data_watermark,
                output_data_watermark, workitem_commit_request, self.windmill,
                state)

  # Decode and execute the map task, then persist any state mutations into
  # the commit request.
  map_task = maptask.decode_map_task(map_task_proto, env, context)
  map_task_executor = executor.MapTaskExecutor(map_task)
  map_task_executor.execute()
  state_internals.persist_to(workitem_commit_request)

  # Send result to Windmill.
  # TODO(ccy): in the future, this will not be done serially with respect to
  # work execution.
  commit_request = windmill_pb2.CommitWorkRequest()
  computation_commit_request = windmill_pb2.ComputationCommitWorkRequest(
      computation_id=computation_id, requests=[workitem_commit_request])
  commit_request.requests.extend([computation_commit_request])
  self.windmill.CommitWork(commit_request)
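
# A minimal sketch (not from the original source) of the loop that could drive
# process_work_item. The GetWork request fields, the layout of the response
# (computation_id, input_data_watermark, work), and the lookup_map_task helper
# are assumptions made only to illustrate the call sequence.
def process_available_work(self):
  response = self.windmill.GetWork(windmill_pb2.GetWorkRequest())  # fields assumed
  for computation_work in response.work:  # response layout assumed
    map_task_proto = self.lookup_map_task(computation_work.computation_id)  # hypothetical helper
    input_data_watermark = windmillio.windmill_to_harness_timestamp(
        computation_work.input_data_watermark)  # field name assumed
    for work_item in computation_work.work:
      self.process_work_item(computation_work.computation_id, map_task_proto,
                             input_data_watermark, work_item)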
def get_work_items(response, env=maptask.WorkerEnvironment(),
                   context=maptask.ExecutionContext()):
  """Parses a lease work item response into a BatchWorkItem object.

  The response is received by the worker as a result of a LeaseWorkItem
  request to the Dataflow service.

  Args:
    response: A LeaseWorkItemResponse protobuf object returned by the service.
    env: An environment object with worker configuration.
    context: A maptask.ExecutionContext object providing context for operations
      to be executed.

  Returns:
    A BatchWorkItem carrying either a decoded map task (a list of Worker*
    operations; see definitions above) or a source operation split task,
    or None if the response contains no work items.

  Raises:
    ValueError: If the type of the work item cannot be determined.
  """
  # Check if the request for work did not return anything.
  if not response.workItems:
    return None
  # For now the service always sends exactly one work item.
  assert len(response.workItems) == 1
  work_item_proto = response.workItems[0]
  work_item = BatchWorkItem(work_item_proto)
  if work_item_proto.mapTask is not None:
    work_item.map_task = maptask.decode_map_task(
        work_item_proto.mapTask, env, context)
  elif (work_item_proto.sourceOperationTask and
        work_item_proto.sourceOperationTask.split):
    work_item.source_operation_split_task = (
        workercustomsources.SourceOperationSplitTask(
            work_item_proto.sourceOperationTask.split))
  else:
    raise ValueError('Unknown type of work item: %s' % work_item_proto)
  return work_item
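
# A minimal sketch (not part of the original module) of how a caller could
# dispatch on the two kinds of work items returned above. The BatchWorkItem
# attribute names come from get_work_items; execute_map_task and
# perform_source_split are hypothetical handlers.
def dispatch_work_item(work_item):
  if work_item is None:
    return
  if getattr(work_item, 'map_task', None) is not None:
    execute_map_task(work_item.map_task)  # hypothetical handler
  elif getattr(work_item, 'source_operation_split_task', None) is not None:
    perform_source_split(work_item.source_operation_split_task)  # hypothetical handler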
def __init__(self, properties, sdk_pipeline_options):
  """Initializes a worker object from command line arguments."""
  self.project_id = properties['project_id']
  self.job_id = properties['job_id']
  self.worker_id = properties['worker_id']
  self.service_path = properties['service_path']
  # TODO(silviuc): Make sure environment_info_path is always specified.
  self.environment_info_path = properties.get('environment_info_path', None)
  self.pipeline_options = options.PipelineOptions.from_dictionary(
      sdk_pipeline_options)
  self.capabilities = [self.worker_id, 'remote_source', 'custom_source']
  self.work_types = ['map_task', 'seq_map_task', 'remote_source_task']
  # The following properties are passed to the worker when its container
  # gets started and are not used right now.
  self.root_url = properties['root_url']
  self.reporting_enabled = properties['reporting_enabled']
  self.temp_gcs_directory = properties['temp_gcs_directory']
  # Detect if the worker is running in a GCE VM.
  self.running_in_gce = self.temp_gcs_directory.startswith('gs://')
  # When running in a GCE VM the local_staging_directory property is always
  # set. For non-VM scenarios (integration tests) the local staging directory
  # defaults to the temp directory.
  self.local_staging_directory = (properties['local_staging_directory']
                                  if self.running_in_gce
                                  else self.temp_gcs_directory)
  self.client = apiclient.DataflowWorkerClient(
      worker=self, skip_get_credentials=(not self.running_in_gce))
  self.environment = maptask.WorkerEnvironment()
  # If True, each work item will be profiled with cProfile. Results will be
  # logged and also saved to profile_location if set.
  self.work_item_profiling = sdk_pipeline_options.get('profile', False)
  self.profile_location = sdk_pipeline_options.get('profile_location', None)
  self._shutdown = False
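
# A minimal sketch (not part of the original module) of the inputs __init__
# expects: a properties dict with the keys read above plus an
# sdk_pipeline_options dict. All concrete values are illustrative placeholders
# and the Worker class name is assumed.
properties = {
    'project_id': 'example-project',
    'job_id': 'example-job-id',
    'worker_id': 'worker-0',
    'service_path': 'https://dataflow.googleapis.com',
    'root_url': 'https://dataflow.googleapis.com',
    'reporting_enabled': True,
    'temp_gcs_directory': 'gs://example-bucket/tmp',
    'local_staging_directory': '/tmp/staging',
}
sdk_pipeline_options = {'profile': False, 'profile_location': None}
# worker = Worker(properties, sdk_pipeline_options)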