Example #1
    def __init__(self, control_address, worker_count, credentials=None):
        self._worker_count = worker_count
        self._worker_index = 0
        if credentials is None:
            logging.info('Creating insecure control channel.')
            self._control_channel = grpc.insecure_channel(control_address)
        else:
            logging.info('Creating secure control channel.')
            self._control_channel = grpc.secure_channel(
                control_address, credentials)
        grpc.channel_ready_future(self._control_channel).result(timeout=60)
        logging.info('Control channel established.')

        self._control_channel = grpc.intercept_channel(self._control_channel,
                                                       WorkerIdInterceptor())
        self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
            credentials)
        self._state_handler_factory = GrpcStateHandlerFactory()
        self.workers = queue.Queue()
        # One thread is enough for getting the progress report.
        # Assumption: progress report generation should not do IO or wait on
        # other resources. Without waiting, multiple threads will not improve
        # performance and will only add complexity.
        self._progress_thread_pool = futures.ThreadPoolExecutor(max_workers=1)
        self._process_thread_pool = futures.ThreadPoolExecutor(
            max_workers=self._worker_count)
        self._instruction_id_vs_worker = {}
        self._fns = {}
        self._responses = queue.Queue()
        self._process_bundle_queue = queue.Queue()
        self._unscheduled_process_bundle = set()
        logging.info('Initializing SDKHarness with %s workers.',
                     self._worker_count)
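
A minimal usage sketch for this variant (the SdkHarness class name, the run() call, and the localhost address are assumptions based on the Apache Beam Python SDK harness this snippet appears to come from, not part of the example itself):

from apache_beam.runners.worker.sdk_worker import SdkHarness

# Connect to a runner's control endpoint with four bundle-processing threads;
# run() then serves control-plane instructions until the stream is closed.
harness = SdkHarness(control_address='localhost:50051', worker_count=4)
harness.run()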
Example #2
 def __init__(self, control_address):
     self._control_channel = grpc.insecure_channel(control_address)
     self._data_channel_factory = data_plane.GrpcClientDataChannelFactory()
     # TODO: Ensure thread safety to run with more than 1 thread.
     self._default_work_thread_pool = futures.ThreadPoolExecutor(
         max_workers=1)
     self._progress_thread_pool = futures.ThreadPoolExecutor(max_workers=1)
Example #3
    def test_source_split(self):
        source = RangeSource(0, 100)
        expected_splits = list(source.split(30))

        worker = sdk_harness.SdkWorker(
            None, data_plane.GrpcClientDataChannelFactory())
        worker.register(
            beam_fn_api_pb2.RegisterRequest(process_bundle_descriptor=[
                beam_fn_api_pb2.ProcessBundleDescriptor(primitive_transform=[
                    beam_fn_api_pb2.PrimitiveTransform(
                        function_spec=sdk_harness.serialize_and_pack_py_fn(
                            SourceBundle(1.0, source, None, None),
                            sdk_harness.PYTHON_SOURCE_URN,
                            id="src"))
                ])
            ]))
        split_response = worker.initial_source_split(
            beam_fn_api_pb2.InitialSourceSplitRequest(
                desired_bundle_size_bytes=30, source_reference="src"))

        self.assertEqual(expected_splits, [
            sdk_harness.unpack_and_deserialize_py_fn(s.source)
            for s in split_response.splits
        ])

        self.assertEqual([s.weight for s in expected_splits],
                         [s.relative_size for s in split_response.splits])
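
For context, a hedged sketch of the expected_splits side of the assertion (RangeSource is a Beam test utility assumed importable here; the field names follow iobase.SourceBundle):

source = RangeSource(0, 100)
for bundle in source.split(desired_bundle_size=30):
    # Each item is a SourceBundle(weight, source, start_position,
    # stop_position); the test compares weight to relative_size above.
    print(bundle.weight, bundle.source)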
Example #4
    def __init__(
            self,
            control_address,
            worker_count,
            credentials=None,
            worker_id=None,
            # Caching is disabled by default
            state_cache_size=0,
            profiler_factory=None):
        self._alive = True
        self._worker_count = worker_count
        self._worker_index = 0
        self._worker_id = worker_id
        self._state_cache = StateCache(state_cache_size)
        if credentials is None:
            logging.info('Creating insecure control channel for %s.',
                         control_address)
            self._control_channel = GRPCChannelFactory.insecure_channel(
                control_address)
        else:
            logging.info('Creating secure control channel for %s.',
                         control_address)
            self._control_channel = GRPCChannelFactory.secure_channel(
                control_address, credentials)
        grpc.channel_ready_future(self._control_channel).result(timeout=60)
        logging.info('Control channel established.')

        self._control_channel = grpc.intercept_channel(
            self._control_channel, WorkerIdInterceptor(self._worker_id))
        self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
            credentials, self._worker_id)
        self._state_handler_factory = GrpcStateHandlerFactory(
            self._state_cache, credentials)
        self._profiler_factory = profiler_factory
        self._fns = {}
        # BundleProcessor cache across all workers.
        self._bundle_processor_cache = BundleProcessorCache(
            state_handler_factory=self._state_handler_factory,
            data_channel_factory=self._data_channel_factory,
            fns=self._fns)
        # workers for process/finalize bundle.
        self.workers = queue.Queue()
        # one worker for progress/split request.
        self.progress_worker = SdkWorker(
            self._bundle_processor_cache,
            profiler_factory=self._profiler_factory)
        # One thread is enough for getting the progress report.
        # Assumption: progress report generation should not do IO or wait on
        # other resources. Without waiting, multiple threads will not improve
        # performance and will only add complexity.
        self._progress_thread_pool = futures.ThreadPoolExecutor(max_workers=1)
        # finalize and process share one thread pool.
        self._process_thread_pool = futures.ThreadPoolExecutor(
            max_workers=self._worker_count)
        self._responses = queue.Queue()
        self._process_bundle_queue = queue.Queue()
        self._unscheduled_process_bundle = {}
        logging.info('Initializing SDKHarness with %s workers.',
                     self._worker_count)
Example #5
  def __init__(self,
               control_address,  # type: str
               credentials=None,
               worker_id=None,  # type: Optional[str]
               # Caching is disabled by default
               state_cache_size=0,
               # time-based data buffering is disabled by default
               data_buffer_time_limit_ms=0,
               profiler_factory=None,  # type: Optional[Callable[..., Profile]]
               status_address=None,  # type: Optional[str]
               ):
    self._alive = True
    self._worker_index = 0
    self._worker_id = worker_id
    self._state_cache = StateCache(state_cache_size)
    if credentials is None:
      _LOGGER.info('Creating insecure control channel for %s.', control_address)
      self._control_channel = GRPCChannelFactory.insecure_channel(
          control_address)
    else:
      _LOGGER.info('Creating secure control channel for %s.', control_address)
      self._control_channel = GRPCChannelFactory.secure_channel(
          control_address, credentials)
    grpc.channel_ready_future(self._control_channel).result(timeout=60)
    _LOGGER.info('Control channel established.')

    self._control_channel = grpc.intercept_channel(
        self._control_channel, WorkerIdInterceptor(self._worker_id))
    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
        credentials, self._worker_id, data_buffer_time_limit_ms)
    self._state_handler_factory = GrpcStateHandlerFactory(
        self._state_cache, credentials)
    self._profiler_factory = profiler_factory
    self._fns = {}  # type: Dict[str, beam_fn_api_pb2.ProcessBundleDescriptor]
    # BundleProcessor cache across all workers.
    self._bundle_processor_cache = BundleProcessorCache(
        state_handler_factory=self._state_handler_factory,
        data_channel_factory=self._data_channel_factory,
        fns=self._fns)

    if status_address:
      try:
        self._status_handler = FnApiWorkerStatusHandler(
            status_address, self._bundle_processor_cache)
      except Exception:
        traceback_string = traceback.format_exc()
        _LOGGER.warning(
            'Error creating worker status request handler, '
            'skipping status report. Traceback: %s', traceback_string)
    else:
      self._status_handler = None

    # TODO(BEAM-8998) use common UnboundedThreadPoolExecutor to process bundle
    #  progress once dataflow runner's excessive progress polling is removed.
    self._report_progress_executor = futures.ThreadPoolExecutor(max_workers=1)
    self._worker_thread_pool = UnboundedThreadPoolExecutor()
    self._responses = queue.Queue(
    )  # type: queue.Queue[beam_fn_api_pb2.InstructionResponse]
    _LOGGER.info('Initializing SDKHarness with unbounded number of workers.')
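
A hedged construction sketch for this expanded signature (the keyword names come from the snippet above; the addresses and limits are illustrative placeholders):

harness = SdkHarness(
    control_address='localhost:50051',
    state_cache_size=100,            # enable caching of user state
    data_buffer_time_limit_ms=1000,  # enable time-based output buffering
    status_address='localhost:50052')  # opt in to the worker status handler
harness.run()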
Example #6
    def __init__(
            self,
            control_address,
            credentials=None,
            worker_id=None,
            # Caching is disabled by default
            state_cache_size=0,
            profiler_factory=None):
        self._alive = True
        self._worker_index = 0
        self._worker_id = worker_id
        self._state_cache = StateCache(state_cache_size)
        if credentials is None:
            _LOGGER.info('Creating insecure control channel for %s.',
                         control_address)
            self._control_channel = GRPCChannelFactory.insecure_channel(
                control_address)
        else:
            _LOGGER.info('Creating secure control channel for %s.',
                         control_address)
            self._control_channel = GRPCChannelFactory.secure_channel(
                control_address, credentials)
        grpc.channel_ready_future(self._control_channel).result(timeout=60)
        _LOGGER.info('Control channel established.')

        self._control_channel = grpc.intercept_channel(
            self._control_channel, WorkerIdInterceptor(self._worker_id))
        self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
            credentials, self._worker_id)
        self._state_handler_factory = GrpcStateHandlerFactory(
            self._state_cache, credentials)
        self._profiler_factory = profiler_factory
        self._fns = {}
        # BundleProcessor cache across all workers.
        self._bundle_processor_cache = BundleProcessorCache(
            state_handler_factory=self._state_handler_factory,
            data_channel_factory=self._data_channel_factory,
            fns=self._fns)

        # TODO(BEAM-8998) use common UnboundedThreadPoolExecutor to process bundle
        #  progress once dataflow runner's excessive progress polling is removed.
        self._report_progress_executor = futures.ThreadPoolExecutor(
            max_workers=1)
        self._worker_thread_pool = UnboundedThreadPoolExecutor()
        self._responses = queue.Queue()
        _LOGGER.info(
            'Initializing SDKHarness with unbounded number of workers.')
Example #7
 def __init__(self, control_address, worker_count):
   self._worker_count = worker_count
   self._worker_index = 0
   self._control_channel = grpc.insecure_channel(control_address)
   self._data_channel_factory = data_plane.GrpcClientDataChannelFactory()
   self.workers = queue.Queue()
   # One thread is enough for getting the progress report.
   # Assumption: progress report generation should not do IO or wait on
   # other resources. Without waiting, multiple threads will not improve
   # performance and will only add complexity.
   self._progress_thread_pool = futures.ThreadPoolExecutor(max_workers=1)
   self._process_thread_pool = futures.ThreadPoolExecutor(
       max_workers=self._worker_count)
   self._instruction_id_vs_worker = {}
   self._fns = {}
   self._responses = queue.Queue()
   self._process_bundle_queue = queue.Queue()
   logging.info('Initializing SDKHarness with %s workers.', self._worker_count)
Example #8
  def __init__(self,
               control_address,  # type: str
               credentials=None,
               worker_id=None,  # type: Optional[str]
               # Caching is disabled by default
               state_cache_size=0,
               profiler_factory=None  # type: Optional[Callable[..., Profile]]
              ):
    self._alive = True
    self._worker_index = 0
    self._worker_id = worker_id
    self._state_cache = StateCache(state_cache_size)
    if credentials is None:
      _LOGGER.info('Creating insecure control channel for %s.', control_address)
      self._control_channel = GRPCChannelFactory.insecure_channel(
          control_address)
    else:
      _LOGGER.info('Creating secure control channel for %s.', control_address)
      self._control_channel = GRPCChannelFactory.secure_channel(
          control_address, credentials)
    grpc.channel_ready_future(self._control_channel).result(timeout=60)
    _LOGGER.info('Control channel established.')

    self._control_channel = grpc.intercept_channel(
        self._control_channel, WorkerIdInterceptor(self._worker_id))
    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
        credentials, self._worker_id)
    self._state_handler_factory = GrpcStateHandlerFactory(self._state_cache,
                                                          credentials)
    self._profiler_factory = profiler_factory
    self._fns = {}  # type: Dict[str, beam_fn_api_pb2.ProcessBundleDescriptor]
    # BundleProcessor cache across all workers.
    self._bundle_processor_cache = BundleProcessorCache(
        state_handler_factory=self._state_handler_factory,
        data_channel_factory=self._data_channel_factory,
        fns=self._fns)
    self._worker_thread_pool = UnboundedThreadPoolExecutor()
    self._responses = queue.Queue()  # type: queue.Queue[beam_fn_api_pb2.InstructionResponse]
    _LOGGER.info('Initializing SDKHarness with unbounded number of workers.')
Example #9
  def __init__(
      self,
      control_address,  # type: str
      credentials=None,  # type: Optional[grpc.ChannelCredentials]
      worker_id=None,  # type: Optional[str]
      # Caching is disabled by default
      state_cache_size=0,  # type: int
      # time-based data buffering is disabled by default
      data_buffer_time_limit_ms=0,  # type: int
      profiler_factory=None,  # type: Optional[Callable[..., Profile]]
      status_address=None,  # type: Optional[str]
      # Heap dump through status api is disabled by default
      enable_heap_dump=False,  # type: bool
  ):
    # type: (...) -> None
    self._alive = True
    self._worker_index = 0
    self._worker_id = worker_id
    self._state_cache = StateCache(state_cache_size)
    options = [('grpc.max_receive_message_length', -1),
               ('grpc.max_send_message_length', -1)]
    if credentials is None:
      _LOGGER.info('Creating insecure control channel for %s.', control_address)
      self._control_channel = GRPCChannelFactory.insecure_channel(
          control_address, options=options)
    else:
      _LOGGER.info('Creating secure control channel for %s.', control_address)
      self._control_channel = GRPCChannelFactory.secure_channel(
          control_address, credentials, options=options)
    grpc.channel_ready_future(self._control_channel).result(timeout=60)
    _LOGGER.info('Control channel established.')

    self._control_channel = grpc.intercept_channel(
        self._control_channel, WorkerIdInterceptor(self._worker_id))
    self._data_channel_factory = data_plane.GrpcClientDataChannelFactory(
        credentials, self._worker_id, data_buffer_time_limit_ms)
    self._state_handler_factory = GrpcStateHandlerFactory(
        self._state_cache, credentials)
    self._profiler_factory = profiler_factory

    def default_factory(id):
      # type: (str) -> beam_fn_api_pb2.ProcessBundleDescriptor
      return self._control_stub.GetProcessBundleDescriptor(
          beam_fn_api_pb2.GetProcessBundleDescriptorRequest(
              process_bundle_descriptor_id=id))

    self._fns = KeyedDefaultDict(default_factory)
    # BundleProcessor cache across all workers.
    self._bundle_processor_cache = BundleProcessorCache(
        state_handler_factory=self._state_handler_factory,
        data_channel_factory=self._data_channel_factory,
        fns=self._fns)

    if status_address:
      try:
        self._status_handler = FnApiWorkerStatusHandler(
            status_address, self._bundle_processor_cache,
            enable_heap_dump)  # type: Optional[FnApiWorkerStatusHandler]
      except Exception:
        traceback_string = traceback.format_exc()
        _LOGGER.warning(
            'Error creating worker status request handler, '
            'skipping status report. Traceback: %s', traceback_string)
    else:
      self._status_handler = None

    # TODO(BEAM-8998) use common
    # thread_pool_executor.shared_unbounded_instance() to process bundle
    # progress once dataflow runner's excessive progress polling is removed.
    self._report_progress_executor = futures.ThreadPoolExecutor(max_workers=1)
    self._worker_thread_pool = thread_pool_executor.shared_unbounded_instance()
    self._responses = queue.Queue(
    )  # type: queue.Queue[Union[beam_fn_api_pb2.InstructionResponse, Sentinel]]
    _LOGGER.info('Initializing SDKHarness with unbounded number of workers.')
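
KeyedDefaultDict itself is not shown above; a minimal sketch of the behavior default_factory relies on, assuming it mirrors collections.defaultdict except that the factory receives the missing key:

class KeyedDefaultDict(dict):
    """dict that builds absent values by calling a factory with the key."""
    def __init__(self, default_factory):
        super().__init__()
        self.default_factory = default_factory

    def __missing__(self, key):
        # dict.__getitem__ invokes __missing__ for absent keys; cache the
        # result so each ProcessBundleDescriptor is fetched only once.
        self[key] = self.default_factory(key)
        return self[key]

With this in place, self._fns[bundle_id] lazily fetches a descriptor over the control channel on first access instead of requiring up-front registration.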
Example #10
 def __init__(self, control_address):
     self._control_channel = grpc.insecure_channel(control_address)
     self._data_channel_factory = data_plane.GrpcClientDataChannelFactory()
Example #11
 def __init__(self, control_channel):
     self._control_channel = control_channel
     self._data_channel_factory = data_plane.GrpcClientDataChannelFactory()
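
A hedged usage sketch for the channel-injecting variant above (the class name and the idea of passing a pre-built channel, e.g. from a test, are assumptions):

import grpc

channel = grpc.insecure_channel('localhost:50051')
harness = SdkHarness(channel)  # inject a control channel created by the caller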