Code example #1
def get_work_items(response, env=maptask.WorkerEnvironment(),
                   context=maptask.ExecutionContext()):
  """Parses a lease work item response into a list of Worker* objects.

  The response is received by the worker as a result of a LeaseWorkItem
  request to the Dataflow service.

  Args:
    response: A LeaseWorkItemResponse protobuf object returned by the service.
    env: An environment object with worker configuration.
    context: A maptask.ExecutionContext object providing context for operations
             to be executed.

  Returns:
    A BatchWorkItem wrapping the leased work item and its decoded map task
    (the Worker* operations, see definitions above), or None if the service
    returned no work.
  """
  # Check if the request for work did not return anything.
  if not response.workItems:
    return None
  # For now service always sends one work item only.
  assert len(response.workItems) == 1
  work_item = response.workItems[0]
  map_task = maptask.decode_map_task(work_item.mapTask, env, context)
  return BatchWorkItem(work_item, map_task)
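
For orientation, here is a minimal, hypothetical polling loop around this function. Only get_work_items comes from the snippet above; the lease_client object, its lease_work_item() method, and the executor.MapTaskExecutor usage (which appears in code example #2 below) are assumptions about the surrounding worker code.

# Hypothetical batch polling loop; lease_client and lease_work_item() are
# placeholders for whatever issues the LeaseWorkItem request.
def lease_and_run_once(lease_client):
  response = lease_client.lease_work_item()  # assumed to return a LeaseWorkItemResponse
  work_item = get_work_items(response)
  if work_item is None:
    return False  # the service had no work to hand out
  # BatchWorkItem is assumed to expose the decoded map task as .map_task.
  executor.MapTaskExecutor(work_item.map_task).execute()
  return True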
Code example #2
    def process_work_item(self, computation_id, map_task_proto,
                          input_data_watermark, work_item):
        """Process a work item."""
        workitem_commit_request = windmill_pb2.WorkItemCommitRequest(
            key=work_item.key, work_token=work_item.work_token)

        env = maptask.WorkerEnvironment()
        context = maptask.StreamingExecutionContext()

        reader = windmillstate.WindmillStateReader(computation_id,
                                                   work_item.key,
                                                   work_item.work_token,
                                                   self.windmill)
        state_internals = windmillstate.WindmillStateInternals(reader)
        state = windmillstate.WindmillUnmergedState(state_internals)
        output_data_watermark = windmillio.windmill_to_harness_timestamp(
            work_item.output_data_watermark)

        context.start(computation_id, work_item, input_data_watermark,
                      output_data_watermark, workitem_commit_request,
                      self.windmill, state)

        map_task = maptask.decode_map_task(map_task_proto, env, context)
        map_task_executor = executor.MapTaskExecutor(map_task)
        map_task_executor.execute()
        state_internals.persist_to(workitem_commit_request)

        # Send result to Windmill.
        # TODO(ccy): in the future, this will not be done serially with respect to
        # work execution.
        commit_request = windmill_pb2.CommitWorkRequest()
        computation_commit_request = windmill_pb2.ComputationCommitWorkRequest(
            computation_id=computation_id, requests=[workitem_commit_request])
        commit_request.requests.extend([computation_commit_request])
        self.windmill.CommitWork(commit_request)
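
A sketch of a caller follows. The pending_work iterable is an assumption: extracting (computation_id, map_task_proto, input_data_watermark, work_item) tuples from a Windmill GetWork response is not part of the snippet above.

# Hypothetical driver for process_work_item; pending_work is assumed to yield
# (computation_id, map_task_proto, input_data_watermark, work_item) tuples
# harvested elsewhere from a Windmill GetWork response.
def drain_pending_work(worker, pending_work):
    for computation_id, map_task_proto, watermark, work_item in pending_work:
        # Each call executes the map task against Windmill-backed state and
        # sends its own CommitWorkRequest back to Windmill.
        worker.process_work_item(computation_id, map_task_proto, watermark,
                                 work_item)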
Code example #3
File: workitem.py  Project: volnt/DataflowPythonSDK
def get_work_items(response,
                   env=maptask.WorkerEnvironment(),
                   context=maptask.ExecutionContext()):
    """Parses a lease work item response into a list of Worker* objects.

  The response is received by the worker as a result of a LeaseWorkItem
  request to the Dataflow service.

  Args:
    response: A LeaseWorkItemResponse protobuf object returned by the service.
    env: An environment object with worker configuration.
    context: A maptask.ExecutionContext object providing context for operations
             to be executed.

  Returns:
    A tuple of work item id and the list of Worker* objects (see definitions
    above) representing the list of operations to be executed as part of the
    work item.

  Raises:
    ValueError: if type of WorkItem cannot be determined.
  """
    # Check if the request for work did not return anything.
    if not response.workItems:
        return None
    # For now service always sends one work item only.
    assert len(response.workItems) == 1
    work_item_proto = response.workItems[0]
    work_item = BatchWorkItem(work_item_proto)

    if work_item_proto.mapTask is not None:
        map_task = maptask.decode_map_task(work_item_proto.mapTask, env,
                                           context)
        work_item.map_task = map_task
    elif (work_item_proto.sourceOperationTask
          and work_item_proto.sourceOperationTask.split):
        source_operation_split_task = workercustomsources.SourceOperationSplitTask(
            work_item_proto.sourceOperationTask.split)
        work_item.source_operation_split_task = source_operation_split_task
    else:
        raise ValueError('Unknown type of work item: %s' % work_item_proto)

    return work_item
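
Because this version can return either a map task or a custom-source split request, callers have to check which attribute was populated. The sketch below is hypothetical: handle_split_request is a placeholder, executor.MapTaskExecutor is borrowed from code example #2, and getattr is used because the snippet does not show whether BatchWorkItem pre-declares both attributes.

# Hypothetical dispatch over the two kinds of work get_work_items can return.
def execute_leased_work(response, env, context):
    work_item = get_work_items(response, env, context)
    if work_item is None:
        return  # nothing was leased
    map_task = getattr(work_item, 'map_task', None)
    if map_task is not None:
        # Ordinary map task: run its Worker* operations.
        executor.MapTaskExecutor(map_task).execute()
    else:
        # Custom-source split request set in the elif branch above.
        handle_split_request(work_item.source_operation_split_task)  # placeholder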
Code example #4
    def __init__(self, properties, sdk_pipeline_options):
        """Initializes a worker object from command line arguments."""
        self.project_id = properties['project_id']
        self.job_id = properties['job_id']
        self.worker_id = properties['worker_id']
        self.service_path = properties['service_path']
        # TODO(silviuc): Make sure environment_info_path is always specified.
        self.environment_info_path = properties.get('environment_info_path',
                                                    None)
        self.pipeline_options = options.PipelineOptions.from_dictionary(
            sdk_pipeline_options)
        self.capabilities = [self.worker_id, 'remote_source', 'custom_source']
        self.work_types = ['map_task', 'seq_map_task', 'remote_source_task']
        # The following properties are passed to the worker when its container
        # gets started and are not used right now.
        self.root_url = properties['root_url']
        self.reporting_enabled = properties['reporting_enabled']
        self.temp_gcs_directory = properties['temp_gcs_directory']
        # Detect if the worker is running in a GCE VM.
        self.running_in_gce = self.temp_gcs_directory.startswith('gs://')
        # When running in a GCE VM the local_staging_directory property is
        # always set. For non-VM scenarios (integration tests) the
        # local_staging_directory defaults to the temp directory.
        self.local_staging_directory = (properties['local_staging_directory']
                                        if self.running_in_gce else
                                        self.temp_gcs_directory)

        self.client = apiclient.DataflowWorkerClient(
            worker=self, skip_get_credentials=(not self.running_in_gce))

        self.environment = maptask.WorkerEnvironment()

        # If True, each work item will be profiled with cProfile. Results will
        # be logged and also saved to profile_location if set.
        self.work_item_profiling = sdk_pipeline_options.get('profile', False)
        self.profile_location = sdk_pipeline_options.get(
            'profile_location', None)

        self._shutdown = False
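
To make the expected inputs concrete, here is a hypothetical construction. Every key mirrors what __init__ reads above; the values, and the class name Worker, are placeholders rather than real endpoints or IDs, and building the API client would still require real credentials in practice.

# Hypothetical inputs for the constructor above. Because temp_gcs_directory is
# not a gs:// path, running_in_gce ends up False, local_staging_directory falls
# back to the temp directory, and the client skips fetching GCE credentials.
properties = {
    'project_id': 'example-project',
    'job_id': 'example-job-id',
    'worker_id': 'worker-0',
    'service_path': 'https://dataflow.googleapis.com',
    'root_url': 'https://dataflow.googleapis.com',
    'reporting_enabled': False,
    'temp_gcs_directory': '/tmp/dataflow-staging',
}
sdk_pipeline_options = {'profile': False}
worker = Worker(properties, sdk_pipeline_options)  # class name is assumed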