Example #1
    def test_full_completion(self):
        # Create a dummy file and close it. This is needed because Windows does
        # not allow a NamedTemporaryFile to be reopened elsewhere before it is
        # closed.
        dummy_file = tempfile.NamedTemporaryFile(delete=False)
        dummy_file_name = dummy_file.name
        dummy_file.close()

        dummy_dir = tempfile.mkdtemp()

        remote_runner = DataflowRunner()
        pipeline = Pipeline(
            remote_runner,
            options=PipelineOptions([
                '--dataflow_endpoint=ignored',
                '--sdk_location=' + dummy_file_name, '--job_name=test-job',
                '--project=test-project', '--staging_location=' + dummy_dir,
                '--temp_location=/dev/null',
                '--template_location=' + dummy_file_name, '--no_auth=True'
            ]))

        pipeline | beam.Create([1, 2, 3]) | beam.Map(lambda x: x)  # pylint: disable=expression-not-assigned
        pipeline.run().wait_until_finish()
        with open(dummy_file_name) as template_file:
            saved_job_dict = json.load(template_file)
            self.assertEqual(
                saved_job_dict['environment']['sdkPipelineOptions']['options']
                ['project'], 'test-project')
            self.assertEqual(
                saved_job_dict['environment']['sdkPipelineOptions']['options']
                ['job_name'], 'test-job')
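
The test above exercises template creation: when --template_location is set, running the pipeline serializes the job description as JSON to that path instead of launching a job. Below is a minimal sketch of the same flow outside of a test; the project, region, bucket, and job names are placeholders, and running it for real would require Google Cloud credentials.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Placeholder project/bucket values; replace with real ones before running.
options = PipelineOptions([
    '--runner=DataflowRunner',
    '--project=my-project',
    '--region=us-central1',
    '--staging_location=gs://my-bucket/staging',
    '--temp_location=gs://my-bucket/temp',
    '--template_location=gs://my-bucket/templates/my-template',
    '--job_name=template-job',
])

pipeline = beam.Pipeline(options=options)
pipeline | beam.Create([1, 2, 3]) | beam.Map(lambda x: x)  # pylint: disable=expression-not-assigned

# With --template_location set, run() writes the job description to that path
# rather than submitting it to the Dataflow service.
pipeline.run().wait_until_finish()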
Example #2
    def test_bad_path(self):
        dummy_sdk_file = tempfile.NamedTemporaryFile()
        remote_runner = DataflowRunner()
        pipeline = Pipeline(
            remote_runner,
            options=PipelineOptions([
                '--dataflow_endpoint=ignored',
                '--sdk_location=' + dummy_sdk_file.name, '--job_name=test-job',
                '--project=test-project', '--staging_location=ignored',
                '--temp_location=/dev/null', '--template_location=/bad/path',
                '--no_auth=True'
            ]))
        remote_runner.job = apiclient.Job(pipeline._options)

        with self.assertRaises(IOError):
            pipeline.run().wait_until_finish()
Example #4
    def run_pipeline(
            self,
            pipeline,  # type: Pipeline
            options  # type: pipeline_options.PipelineOptions
    ):
        # type: (...) -> RunnerResult
        RuntimeValueProvider.set_runtime_options({})

        # Setup "beam_fn_api" experiment options if lacked.
        experiments = (options.view_as(
            pipeline_options.DebugOptions).experiments or [])
        if not 'beam_fn_api' in experiments:
            experiments.append('beam_fn_api')
        options.view_as(
            pipeline_options.DebugOptions).experiments = experiments

        # This is sometimes needed if type checking is disabled
        # to enforce that the inputs (and outputs) of GroupByKey operations
        # are known to be KVs.
        from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
        # TODO: Move group_by_key_input_visitor() to a non-dataflow specific file.
        pipeline.visit(
            DataflowRunner.group_by_key_input_visitor(
                not options.view_as(pipeline_options.TypeOptions
                                    ).allow_non_deterministic_key_coders))
        self._bundle_repeat = self._bundle_repeat or options.view_as(
            pipeline_options.DirectOptions).direct_runner_bundle_repeat
        pipeline_direct_num_workers = options.view_as(
            pipeline_options.DirectOptions).direct_num_workers
        if pipeline_direct_num_workers == 0:
            self._num_workers = multiprocessing.cpu_count()
        else:
            self._num_workers = pipeline_direct_num_workers or self._num_workers

        # Set the direct workers' running mode if it is defined in the pipeline options.
        running_mode = \
          options.view_as(pipeline_options.DirectOptions).direct_running_mode
        if running_mode == 'multi_threading':
            self._default_environment = environments.EmbeddedPythonGrpcEnvironment(
            )
        elif running_mode == 'multi_processing':
            command_string = '%s -m apache_beam.runners.worker.sdk_worker_main' \
                          % sys.executable
            self._default_environment = environments.SubprocessSDKEnvironment(
                command_string=command_string)

        self._profiler_factory = Profile.factory_from_options(
            options.view_as(pipeline_options.ProfilingOptions))

        self._latest_run_result = self.run_via_runner_api(
            pipeline.to_runner_api(
                default_environment=self._default_environment))
        return self._latest_run_result
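
run_pipeline above reads its worker configuration from DirectOptions, so the worker count, bundle repeat, and running mode are all ordinary pipeline options. A hedged sketch of how a caller might set them; the option names are taken from the code above, while the pipeline contents are placeholders.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# direct_num_workers=0 falls back to multiprocessing.cpu_count() in the code
# above; direct_running_mode chooses between threaded and subprocess workers.
options = PipelineOptions([
    '--direct_num_workers=0',
    '--direct_running_mode=multi_processing',
    '--direct_runner_bundle_repeat=0',
])

with beam.Pipeline(options=options) as pipeline:
    pipeline | beam.Create(range(10)) | beam.Map(lambda x: x * x)  # pylint: disable=expression-not-assigned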
Example #5
  def run_pipeline(self, pipeline, options):
    MetricsEnvironment.set_metrics_supported(False)
    RuntimeValueProvider.set_runtime_options({})
    # This is sometimes needed if type checking is disabled
    # to enforce that the inputs (and outputs) of GroupByKey operations
    # are known to be KVs.
    from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
    pipeline.visit(DataflowRunner.group_by_key_input_visitor())
    self._bundle_repeat = self._bundle_repeat or options.view_as(
        pipeline_options.DirectOptions).direct_runner_bundle_repeat
    self._profiler_factory = profiler.Profile.factory_from_options(
        options.view_as(pipeline_options.ProfilingOptions))
    return self.run_via_runner_api(pipeline.to_runner_api(
        default_environment=self._default_environment))
Example #7
def create_streaming_job(
    service_name,
    service_id,
    project_name,
    region,
    image_uri,
    setup_file_path=DEFAULT_SETUP_FILE_PATH,
    temporary_files_location=DEFAULT_DATAFLOW_TEMPORARY_FILES_LOCATION,
    service_account_email=None,
    worker_machine_type=None,
    maximum_instances=None,
    update=False,
    extra_options=None,
):
    """Deploy an `octue` service as a streaming Google Dataflow Prime job.

    :param str service_name: the name to give the Dataflow job
    :param str service_id: the Pub/Sub topic name for the Dataflow job to subscribe to
    :param str project_name: the name of the project to deploy the job to
    :param str region: the region to deploy the job to
    :param str image_uri: the URI of the `apache-beam`-based Docker image to use for the job
    :param str setup_file_path: the path to the Python `setup.py` file to use for the job
    :param str temporary_files_location: the Google Cloud Storage path at which to save the job's temporary files
    :param str|None service_account_email: the email of the service account to run the Dataflow VMs as
    :param str|None worker_machine_type: the machine type to create Dataflow worker VMs as. See https://cloud.google.com/compute/docs/machine-types for a list of valid options. If not set, the Dataflow service will choose a reasonable default.
    :param int|None maximum_instances: the maximum number of workers to use when executing the Dataflow job
    :param bool update: if `True`, update the existing job with the same name
    :param dict|None extra_options: any further arguments to be passed to Apache Beam as pipeline options
    :raise DeploymentError: if a Dataflow job with the service name already exists
    :return None:
    """
    pipeline_options = {
        "project": project_name,
        "region": region,
        "temp_location": temporary_files_location,
        "job_name": service_name,
        "sdk_container_image": image_uri,
        "setup_file": os.path.abspath(setup_file_path),
        "update": update,
        "streaming": True,
        **(extra_options or {}),
    }

    if service_account_email:
        pipeline_options["service_account_email"] = service_account_email

    if worker_machine_type:
        pipeline_options["worker_machine_type"] = worker_machine_type
    else:
        # Dataflow Prime can only be used if a worker machine type is not specified.
        pipeline_options["dataflow_service_options"] = ["enable_prime"]

    if maximum_instances:
        pipeline_options["max_num_workers"] = maximum_instances

    pipeline_options = PipelineOptions.from_dictionary(pipeline_options)
    pipeline = apache_beam.Pipeline(options=pipeline_options)

    service_topic = Topic(
        name=service_id,
        namespace=OCTUE_NAMESPACE,
        service=Service(backend=GCPPubSubBackend(project_name=project_name)),
    )

    service_topic.create(allow_existing=True)

    (
        pipeline
        | "Read from Pub/Sub" >> apache_beam.io.ReadFromPubSub(topic=service_topic.path, with_attributes=True)
        | "Answer question" >> apache_beam.Map(lambda question: answer_question(question, project_name=project_name))
    )

    try:
        DataflowRunner().run_pipeline(pipeline, options=pipeline_options)
    except DataflowJobAlreadyExistsError:
        raise DeploymentError(f"A Dataflow job with name {service_name!r} already exists.") from None
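
A hedged usage sketch for create_streaming_job; every argument value below is a placeholder chosen for illustration rather than something defined above. Leaving worker_machine_type unset follows the branch above that enables Dataflow Prime.

# All values are placeholders; extra_options is forwarded to Apache Beam as
# additional pipeline options.
create_streaming_job(
    service_name="my-service",
    service_id="octue.services.my-service",
    project_name="my-gcp-project",
    region="europe-west2",
    image_uri="eu.gcr.io/my-gcp-project/my-service:latest",
    maximum_instances=5,
    extra_options={"network": "my-vpc"},
)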