Example #1
    if 'PYFLINK_LOOPBACK_SERVER_ADDRESS' in os.environ:
        logging.info("Starting up Python harness in loopback mode.")

        # Forward the boot environment (plus the semi-persistent directory)
        # to the worker that the pool will start.
        params = dict(os.environ)
        params.update({'SEMI_PERSISTENT_DIRECTORY': semi_persist_dir})

        with grpc.insecure_channel(
                os.environ['PYFLINK_LOOPBACK_SERVER_ADDRESS']) as channel:
            client = BeamFnExternalWorkerPoolStub(channel=channel)
            # Ask the pool to spawn a new SDK worker with this configuration.
            request = StartWorkerRequest(
                worker_id=worker_id,
                provision_endpoint=ApiServiceDescriptor(
                    url=provision_endpoint),
                params=params)
            client.StartWorker(request)
    else:
        logging.info("Starting up Python harness in a standalone process.")
        metadata = [("worker_id", worker_id)]

        # read job information from provision stub
        with grpc.insecure_channel(provision_endpoint) as channel:
            client = ProvisionServiceStub(channel=channel)
            info = client.GetProvisionInfo(GetProvisionInfoRequest(),
                                           metadata=metadata).info
            options = json_format.MessageToJson(info.pipeline_options)
            logging_endpoint = info.logging_endpoint.url
            control_endpoint = info.control_endpoint.url

        os.environ["WORKER_ID"] = worker_id
        os.environ["PIPELINE_OPTIONS"] = options
        os.environ["SEMI_PERSISTENT_DIRECTORY"] = semi_persist_dir
        os.environ[
            "LOGGING_API_SERVICE_DESCRIPTOR"] = text_format.MessageToString(
                ApiServiceDescriptor(url=logging_endpoint))
        os.environ[
            "CONTROL_API_SERVICE_DESCRIPTOR"] = text_format.MessageToString(
                ApiServiceDescriptor(url=control_endpoint))
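
In the standalone branch, the endpoints are handed to the child harness through environment variables holding text-format protobufs. A minimal sketch of how the child process can read one back, assuming the same text_format and ApiServiceDescriptor imports used above; the variable names are illustrative:

    import os
    from google.protobuf import text_format

    # The env var written above holds a text-format protobuf; parse it back.
    logging_descriptor = ApiServiceDescriptor()
    text_format.Parse(
        os.environ["LOGGING_API_SERVICE_DESCRIPTOR"], logging_descriptor)
    print("logging endpoint:", logging_descriptor.url)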
Example #2
    def _start_sdk_worker_main(
            self, start_worker_request: beam_fn_api_pb2.StartWorkerRequest):
        params = start_worker_request.params
        self._parse_param_lock.acquire()
        # The first thread to start prepares the execution environment
        # (sys.path, working directory, environment variables) for the process.
        if not self._ref_cnt:
            if 'PYTHONPATH' in params:
                self._old_python_path = sys.path[:]
                python_path_list = params['PYTHONPATH'].split(':')
                # Reverse before inserting at index 0 so the first
                # PYTHONPATH entry ends up first on sys.path.
                python_path_list.reverse()
                for path in python_path_list:
                    sys.path.insert(0, path)
            if '_PYTHON_WORKING_DIR' in params:
                self._old_working_dir = os.getcwd()
                os.chdir(params['_PYTHON_WORKING_DIR'])
            os.environ.update(params)
        self._ref_cnt += 1
        self._parse_param_lock.release()

        # read job information from provision stub
        metadata = [("worker_id", start_worker_request.worker_id)]
        provision_endpoint = start_worker_request.provision_endpoint.url
        with grpc.insecure_channel(provision_endpoint) as channel:
            client = ProvisionServiceStub(channel=channel)
            info = client.GetProvisionInfo(GetProvisionInfoRequest(),
                                           metadata=metadata).info
            options = json_format.MessageToJson(info.pipeline_options)
            logging_endpoint = info.logging_endpoint.url
            control_endpoint = info.control_endpoint.url

        try:
            logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor(
                url=logging_endpoint)

            # Send all logs to the runner.
            fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
            logging.getLogger().setLevel(logging.INFO)
            # Remove all built-in log handlers so records go only to the runner.
            logging.getLogger().handlers = []
            logging.getLogger().addHandler(fn_log_handler)
            logging.info("Starting up Python worker in loopback mode.")
        except Exception:
            _LOGGER.error(
                "Failed to set up logging handler, continuing without.",
                exc_info=True)
            fn_log_handler = None

        sdk_pipeline_options = sdk_worker_main._parse_pipeline_options(options)

        _worker_id = start_worker_request.worker_id

        try:
            control_service_descriptor = endpoints_pb2.ApiServiceDescriptor(
                url=control_endpoint)
            status_service_descriptor = endpoints_pb2.ApiServiceDescriptor()

            experiments = sdk_pipeline_options.view_as(
                DebugOptions).experiments or []
            enable_heap_dump = 'enable_heap_dump' in experiments
            SdkHarness(control_address=control_service_descriptor.url,
                       status_address=status_service_descriptor.url,
                       worker_id=_worker_id,
                       state_cache_size=sdk_worker_main._get_state_cache_size(
                           experiments),
                       data_buffer_time_limit_ms=sdk_worker_main.
                       _get_data_buffer_time_limit_ms(experiments),
                       profiler_factory=profiler.Profile.factory_from_options(
                           sdk_pipeline_options.view_as(ProfilingOptions)),
                       enable_heap_dump=enable_heap_dump).run()
        except:  # pylint: disable=broad-except
            _LOGGER.exception('Python sdk harness failed: ')
            raise
        finally:
            self._parse_param_lock.acquire()
            self._ref_cnt -= 1
            # The last thread to exit restores the original working directory
            # and sys.path.
            if self._ref_cnt == 0:
                if self._old_python_path is not None:
                    sys.path.clear()
                    for item in self._old_python_path:
                        sys.path.append(item)
                    self._old_python_path = None
                if self._old_working_dir is not None:
                    os.chdir(self._old_working_dir)
                    self._old_working_dir = None
            self._parse_param_lock.release()
            if fn_log_handler:
                fn_log_handler.close()
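
One detail worth calling out: the reverse() before the insert(0, ...) loop is what preserves the original PYTHONPATH order at the front of sys.path. A tiny self-contained illustration:

    paths = 'a:b:c'.split(':')
    sys_path = ['existing-entry']
    for p in reversed(paths):  # same effect as reverse() + insert(0, ...)
        sys_path.insert(0, p)
    assert sys_path == ['a', 'b', 'c', 'existing-entry']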
Example #3
    semi_persist_dir = args.semi_persist_dir

    check_not_empty(worker_id, "No id provided.")
    check_not_empty(logging_endpoint, "No logging endpoint provided.")
    check_not_empty(artifact_endpoint, "No artifact endpoint provided.")
    check_not_empty(provision_endpoint, "No provision endpoint provided.")
    check_not_empty(control_endpoint, "No control endpoint provided.")

    logging.info("Initializing python harness: %s" % " ".join(sys.argv))

    metadata = [("worker_id", worker_id)]

    # read job information from provision stub
    with grpc.insecure_channel(provision_endpoint) as channel:
        client = ProvisionServiceStub(channel=channel)
        info = client.GetProvisionInfo(GetProvisionInfoRequest(), metadata=metadata).info
        options = json_format.MessageToJson(info.pipeline_options)

    staged_dir = os.path.join(semi_persist_dir, "staged")

    # download files
    with grpc.insecure_channel(artifact_endpoint) as channel:
        client = ArtifactRetrievalServiceStub(channel=channel)
        # get file list via retrieval token
        response = client.GetManifest(GetManifestRequest(retrieval_token=info.retrieval_token),
                                      metadata=metadata)
        artifacts = response.manifest.artifact
        # download files and check hash values
        for artifact in artifacts:
            name = artifact.name
            permissions = artifact.permissions
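
check_not_empty is not included in the fragment. A minimal sketch consistent with its call sites above; the exact exit behavior is an assumption:

    import logging
    import sys

    def check_not_empty(value, error_message):
        # Fail fast during boot if a required argument is missing.
        if not value:
            logging.fatal(error_message)
            sys.exit(1)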
Example #4
    def _start_sdk_worker_main(
            self, start_worker_request: beam_fn_api_pb2.StartWorkerRequest):
        params = start_worker_request.params
        self._parse_param_lock.acquire()
        # Unlike Example #2, this variant mutates sys.path, the working
        # directory and os.environ without saving state to restore later.
        if 'PYTHONPATH' in params:
            python_path_list = params['PYTHONPATH'].split(':')
            python_path_list.reverse()
            for path in python_path_list:
                sys.path.insert(0, path)
        if '_PYTHON_WORKING_DIR' in params:
            os.chdir(params['_PYTHON_WORKING_DIR'])
        os.environ.update(params)
        self._parse_param_lock.release()

        # read job information from provision stub
        metadata = [("worker_id", start_worker_request.worker_id)]
        provision_endpoint = start_worker_request.provision_endpoint.url
        with grpc.insecure_channel(provision_endpoint) as channel:
            client = ProvisionServiceStub(channel=channel)
            info = client.GetProvisionInfo(GetProvisionInfoRequest(),
                                           metadata=metadata).info
            options = json_format.MessageToJson(info.pipeline_options)
            logging_endpoint = info.logging_endpoint.url
            control_endpoint = info.control_endpoint.url

        try:
            logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor(
                url=logging_endpoint)

            # Send all logs to the runner.
            fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
            logging.getLogger().setLevel(logging.ERROR)
            logging.getLogger().addHandler(fn_log_handler)
        except Exception:
            _LOGGER.error(
                "Failed to set up logging handler, continuing without.",
                exc_info=True)
            fn_log_handler = None

        sdk_pipeline_options = sdk_worker_main._parse_pipeline_options(options)

        _worker_id = start_worker_request.worker_id

        try:
            control_service_descriptor = endpoints_pb2.ApiServiceDescriptor(
                url=control_endpoint)
            status_service_descriptor = endpoints_pb2.ApiServiceDescriptor()

            experiments = sdk_pipeline_options.view_as(
                DebugOptions).experiments or []
            enable_heap_dump = 'enable_heap_dump' in experiments
            SdkHarness(control_address=control_service_descriptor.url,
                       status_address=status_service_descriptor.url,
                       worker_id=_worker_id,
                       state_cache_size=sdk_worker_main._get_state_cache_size(
                           experiments),
                       data_buffer_time_limit_ms=sdk_worker_main.
                       _get_data_buffer_time_limit_ms(experiments),
                       profiler_factory=profiler.Profile.factory_from_options(
                           sdk_pipeline_options.view_as(ProfilingOptions)),
                       enable_heap_dump=enable_heap_dump).run()
        except:  # pylint: disable=broad-except
            _LOGGER.exception('Python sdk harness failed: ')
            raise
        finally:
            if fn_log_handler:
                fn_log_handler.close()
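
For context, _start_sdk_worker_main is invoked from the worker pool's StartWorker RPC, which typically runs it on a daemon thread and reports failures through the response message. A minimal sketch under that assumption, reusing the beam_fn_api_pb2 messages the examples already import; not the exact upstream method:

    import functools
    import threading

    def StartWorker(self, start_worker_request, unused_context):
        try:
            # Run the worker main on a daemon thread so the RPC returns
            # immediately and the pool process can keep serving requests.
            threading.Thread(
                name='worker_%s' % start_worker_request.worker_id,
                target=functools.partial(
                    self._start_sdk_worker_main, start_worker_request),
                daemon=True).start()
            return beam_fn_api_pb2.StartWorkerResponse()
        except Exception as exn:  # pylint: disable=broad-except
            return beam_fn_api_pb2.StartWorkerResponse(error=str(exn))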