Example #1
    def test_bad_path(self):
        dummy_sdk_file = tempfile.NamedTemporaryFile()
        remote_runner = DataflowRunner()
        pipeline = Pipeline(
            remote_runner,
            options=PipelineOptions([
                '--dataflow_endpoint=ignored',
                '--sdk_location=' + dummy_sdk_file.name, '--job_name=test-job',
                '--project=test-project', '--staging_location=ignored',
                '--temp_location=/dev/null', '--template_location=/bad/path',
                '--no_auth=True'
            ]))
        remote_runner.job = apiclient.Job(pipeline._options)

        with self.assertRaises(IOError):
            pipeline.run().wait_until_finish()
 def test_environment_override_translation(self):
   self.default_properties.append('--experiments=beam_fn_api')
   self.default_properties.append('--worker_harness_container_image=FOO')
   remote_runner = DataflowRunner()
   p = Pipeline(remote_runner,
                options=PipelineOptions(self.default_properties))
   (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
    | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
    | ptransform.GroupByKey())
   p.run()
   self.assertEqual(
       list(remote_runner.proto_pipeline.components.environments.values()),
       [beam_runner_api_pb2.Environment(
           urn=common_urns.environments.DOCKER.urn,
           payload=beam_runner_api_pb2.DockerPayload(
               container_image='FOO').SerializeToString())])
def run():         
    PROJECT_ID = 'corvid-276516'
    BUCKET = 'gs://covid-bucket19' # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'format-codes--df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)
    
    # Query to pass to BigQuery. Keep the result set small when testing with the
    # DirectRunner, which runs locally and does not parallelize work across workers.
    sql = 'SELECT * FROM covid_staging.googleMobility ORDER BY date, country_region'

    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    # read the query results into a new PCollection
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # apply the ParDo to the query results
    format_alphaCode_pcoll = query_results | 'Change the country code for Greece, the UK, and Hong Kong. Drop Reunion' >> beam.ParDo(format_alphaCodeFn())

    # write PCollection to log file
    format_alphaCode_pcoll | 'Write log 1' >> WriteToText('geodist_beam.txt')

    dataset_id = 'covid_modeled'
    table_id = 'mobility_beam'
    schema_id = 'code:STRING, country:STRING, date:DATE, average_change:INTEGER, retail_and_recreation:INTEGER, grocery_and_pharmacy:INTEGER, parks:INTEGER, transit_stations:INTEGER, workplaces:INTEGER, residential:INTEGER'

    # write PCollection to new BQ table
    format_alphaCode_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(dataset=dataset_id,
                                                table=table_id,
                                                schema=schema_id,
                                                project=PROJECT_ID,
                                                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                                                batch_size=100)
     
    result = p.run()
    result.wait_until_finish()      
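The ParDo above relies on a format_alphaCodeFn DoFn that is not included in this snippet. A minimal sketch of what such a DoFn could look like, assuming the query rows arrive as dictionaries keyed by column name and the outputs must match the mobility_beam schema declared above (the field names and code mapping here are illustrative assumptions, not the original implementation):

import apache_beam as beam

class format_alphaCodeFn(beam.DoFn):
    # Hypothetical sketch; the real field names and mapping rules are not shown above.
    CODE_OVERRIDES = {'Greece': 'GR', 'United Kingdom': 'GB', 'Hong Kong': 'HK'}  # assumed mapping

    def process(self, element):
        country = element.get('country')
        if country == 'Reunion':
            return  # drop Reunion records entirely
        if country in self.CODE_OVERRIDES:
            element['code'] = self.CODE_OVERRIDES[country]
        yield element  # one dict per output row, matching the BigQuery schema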
def run():         
    PROJECT_ID = 'starry-center-266501' # change to your project id
    BUCKET = 'gs://imdb-beam' # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'split-directors-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)
    
    sql = 'SELECT directors, tConst FROM imdb_modeled.Directs WHERE directors IN (SELECT DISTINCT nConst FROM imdb_modeled.People)'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write PCollection to log file
    query_results | 'Write log 1' >> WriteToText(DIR_PATH + 'query_results.txt')

    # apply ParDo to split up the directors
    split_directors_pcoll = query_results | 'Split up directors' >> beam.ParDo(SplitDirectorsFn())

    # write PCollection to log file
    split_directors_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'split_directors_pcoll.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Directs_Beam_DF'
    schema_id = 'director:STRING,tConst:STRING'

    # write PCollection to new BQ table
    split_directors_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(dataset=dataset_id, 
                                                  table=table_id, 
                                                  schema=schema_id,
                                                  project=PROJECT_ID,
                                                  create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                  write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    result = p.run()
    result.wait_until_finish()
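SplitDirectorsFn is likewise defined outside this snippet. A plausible sketch, assuming the directors column holds one or more comma-separated nConst identifiers and that each output row must match the director:STRING,tConst:STRING schema used above (the separator is an assumption):

import apache_beam as beam

class SplitDirectorsFn(beam.DoFn):
    # Hypothetical sketch: emit one {director, tConst} row per director id.
    def process(self, element):
        directors = element.get('directors') or ''
        for director in directors.split(','):  # assumed comma-separated ids
            director = director.strip()
            if director:
                yield {'director': director, 'tConst': element['tConst']}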
  def test_visit_entire_graph(self):
    pipeline = Pipeline()
    pcoll1 = pipeline | 'pcoll' >> beam.Impulse()
    pcoll2 = pcoll1 | 'do1' >> FlatMap(lambda x: [x + 1])
    pcoll3 = pcoll2 | 'do2' >> FlatMap(lambda x: [x + 1])
    pcoll4 = pcoll2 | 'do3' >> FlatMap(lambda x: [x + 1])
    transform = PipelineTest.CustomTransform()
    pcoll5 = pcoll4 | transform

    visitor = PipelineTest.Visitor(visited=[])
    pipeline.visit(visitor)
    self.assertEqual({pcoll1, pcoll2, pcoll3, pcoll4, pcoll5},
                     set(visitor.visited))
    self.assertEqual(set(visitor.enter_composite), set(visitor.leave_composite))
    self.assertEqual(2, len(visitor.enter_composite))
    self.assertEqual(visitor.enter_composite[1].transform, transform)
    self.assertEqual(visitor.leave_composite[0].transform, transform)
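PipelineTest.Visitor is not shown in this excerpt. A sketch consistent with how the test uses it, subclassing Beam's PipelineVisitor (the exact bookkeeping is inferred from the assertions above):

from apache_beam.pipeline import PipelineVisitor

class Visitor(PipelineVisitor):
    # Sketch: record every visited value and every composite transform node.
    def __init__(self, visited):
        self.visited = visited
        self.enter_composite = []
        self.leave_composite = []

    def visit_value(self, value, producer_node):
        self.visited.append(value)

    def enter_composite_transform(self, transform_node):
        self.enter_composite.append(transform_node)

    def leave_composite_transform(self, transform_node):
        self.leave_composite.append(transform_node)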
Example #6
 def test_after_count(self):
   p = Pipeline('DirectRunner')
   result = (p
             | beam.Create([1, 2, 3, 4, 5, 10, 11])
             | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
             | beam.Map(lambda k_t: TimestampedValue(k_t, k_t[1]))
             | beam.WindowInto(FixedWindows(10), trigger=AfterCount(3),
                               accumulation_mode=AccumulationMode.DISCARDING)
             | beam.GroupByKey()
             | beam.Map(lambda k_v: ('%s-%s' % (k_v[0], len(k_v[1])),
                                     set(k_v[1]))))
   assert_that(result, equal_to(
       {
           'A-5': {1, 2, 3, 4, 5},
           # A-10, A-11 never emitted due to AfterCount(3) never firing.
           'B-4': {6, 7, 8, 9},
           'B-3': {10, 15, 16},
       }.items()))
   p.run()
Example #7
    def test_direct_runner_metrics(self):
        from apache_beam.metrics.metric import Metrics

        class MyDoFn(beam.DoFn):
            def start_bundle(self):
                count = Metrics.counter(self.__class__, 'bundles')
                count.inc()

            def finish_bundle(self):
                count = Metrics.counter(self.__class__, 'finished_bundles')
                count.inc()

            def process(self, element):
                count = Metrics.counter(self.__class__, 'elements')
                count.inc()
                distro = Metrics.distribution(self.__class__, 'element_dist')
                distro.update(element)
                return [element]

        runner = DirectRunner()
        p = Pipeline(runner, options=PipelineOptions(self.default_properties))
        pcoll = (p | ptransform.Create([1, 2, 3, 4, 5])
                 | 'Do' >> beam.ParDo(MyDoFn()))
        assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
        result = p.run()
        result.wait_until_finish()
        metrics = result.metrics().query()
        namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

        hc.assert_that(
            metrics['counters'],
            hc.contains_inanyorder(
                MetricResult(
                    MetricKey('Do', MetricName(namespace, 'elements')), 5, 5),
                MetricResult(MetricKey('Do', MetricName(namespace, 'bundles')),
                             1, 1),
                MetricResult(
                    MetricKey('Do', MetricName(namespace, 'finished_bundles')),
                    1, 1)))
        hc.assert_that(
            metrics['distributions'],
            hc.contains_inanyorder(
                MetricResult(
                    MetricKey('Do', MetricName(namespace, 'element_dist')),
                    DistributionResult(DistributionData(15, 5, 1, 5)),
                    DistributionResult(DistributionData(15, 5, 1, 5)))))
  def test_streaming_create_translation(self):
    remote_runner = DataflowRunner()
    self.default_properties.append("--streaming")
    p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    remote_runner.job = apiclient.Job(p._options)
    # Performing configured PTransform overrides here.
    p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
    super(DataflowRunner, remote_runner).run(p)
    job_dict = json.loads(str(remote_runner.job))
    self.assertEqual(len(job_dict[u'steps']), 2)

    self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
    self.assertEqual(
        job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
        '_starting_signal/')
    self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
Example #9
    def test_ptransform_overrides(self):
        class MyParDoOverride(PTransformOverride):
            def matches(self, applied_ptransform):
                return isinstance(applied_ptransform.transform, DoubleParDo)

            def get_replacement_transform(self, ptransform):
                if isinstance(ptransform, DoubleParDo):
                    return TripleParDo()
                raise ValueError('Unsupported type of transform: %r' %
                                 ptransform)

        p = Pipeline()
        pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
        assert_that(pcoll, equal_to([3, 6, 9]))

        p.replace_all([MyParDoOverride()])
        p.run()
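DoubleParDo and TripleParDo are helpers defined elsewhere in the test module. A minimal sketch consistent with the assertion (the override has to install TripleParDo for [1, 2, 3] to become [3, 6, 9]):

import apache_beam as beam

class DoubleParDo(beam.PTransform):
    # Sketch: double every element.
    def expand(self, pcoll):
        return pcoll | beam.Map(lambda x: x * 2)

class TripleParDo(beam.PTransform):
    # Sketch: triple every element; this is what the override swaps in.
    def expand(self, pcoll):
        return pcoll | beam.Map(lambda x: x * 3)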
Example #10
 def test_no_group_by_key_directly_after_bigquery(self):
   remote_runner = DataflowRunner()
   p = Pipeline(remote_runner,
                options=PipelineOptions([
                    '--dataflow_endpoint=ignored',
                    '--job_name=test-job',
                    '--project=test-project',
                    '--staging_location=ignored',
                    '--temp_location=/dev/null',
                    '--no_auth=True'
                ]))
   rows = p | beam.io.Read(beam.io.BigQuerySource('dataset.faketable'))
   with self.assertRaises(ValueError,
                          msg=('Coder for the GroupByKey operation '
                               '"GroupByKey" is not a key-value coder: '
                               'RowAsDictJsonCoder')):
     unused_invalid = rows | beam.GroupByKey()
Example #11
    def test_ptransform_override_multiple_inputs(self):
        class MyParDoOverride(PTransformOverride):
            def matches(self, applied_ptransform):
                return isinstance(applied_ptransform.transform,
                                  FlattenAndDouble)

            def get_replacement_transform(self, applied_ptransform):
                return FlattenAndTriple()

        p = Pipeline()
        pcoll1 = p | 'pc1' >> beam.Create([1, 2, 3])
        pcoll2 = p | 'pc2' >> beam.Create([4, 5, 6])
        pcoll3 = (pcoll1, pcoll2) | 'FlattenAndMultiply' >> FlattenAndDouble()
        assert_that(pcoll3, equal_to([3, 6, 9, 12, 15, 18]))

        p.replace_all([MyParDoOverride()])
        p.run()
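FlattenAndDouble and FlattenAndTriple are also defined outside this snippet. A sketch consistent with the multi-input usage and the expected output [3, 6, 9, 12, 15, 18]:

import apache_beam as beam

class FlattenAndDouble(beam.PTransform):
    # Sketch: flatten the tuple of input PCollections, then double each element.
    def expand(self, pcolls):
        return pcolls | beam.Flatten() | beam.Map(lambda x: x * 2)

class FlattenAndTriple(beam.PTransform):
    # Sketch: the replacement installed by the override; triples instead of doubling.
    def expand(self, pcolls):
        return pcolls | beam.Flatten() | beam.Map(lambda x: x * 3)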
Example #12
  def test_default_environment_get_set(self):

    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp'
    ])

    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    test_environment = DockerEnvironment(container_image='test_default_image')
    proto_pipeline, _ = pipeline.to_runner_api(
        return_context=True, default_environment=test_environment)

    dummy_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=(
            beam_runner_api_pb2.DockerPayload(
                container_image='dummy_image')).SerializeToString())
    proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

    dummy_transform = beam_runner_api_pb2.PTransform(
        environment_id='dummy_env_id')
    proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
        dummy_transform)

    env = apiclient.Environment(
        [],  # packages
        pipeline_options,
        '2.0.0',  # any environment version
        FAKE_PIPELINE_URL,
        proto_pipeline,
        _sdk_image_overrides={
            '.*dummy.*': 'dummy_image', '.*test.*': 'test_default_image'
        })
    worker_pool = env.proto.workerPools[0]

    self.assertEqual(2, len(worker_pool.sdkHarnessContainerImages))

    images_from_proto = [
        sdk_info.containerImage
        for sdk_info in worker_pool.sdkHarnessContainerImages
    ]
    self.assertIn('test_default_image', images_from_proto)
Example #13
    def run_async(self, transform, options=None):
        """Run the given transform or callable with this runner.

    May return immediately, executing the pipeline in the background.
    The returned result object can be queried for progress, and
    `wait_until_finish` may be called to block until completion.
    """
        # Imported here to avoid circular dependencies.
        # pylint: disable=wrong-import-order, wrong-import-position
        from apache_beam import PTransform
        from apache_beam.pvalue import PBegin
        from apache_beam.pipeline import Pipeline
        p = Pipeline(runner=self, options=options)
        if isinstance(transform, PTransform):
            p | transform
        else:
            transform(PBegin(p))
        return p.run()
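A brief usage sketch for run_async, assuming a runner class that inherits the method above; the DirectRunner is used here purely for illustration (it executes synchronously rather than in the background):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.direct.direct_runner import DirectRunner

# Run a root transform without constructing the Pipeline by hand.
result = DirectRunner().run_async(
    beam.Create([1, 2, 3]), options=PipelineOptions())
result.wait_until_finish()  # block until the run completes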
Example #14
    def test_pcollectionview_not_recreated(self):
        pipeline = Pipeline('DirectRunner')
        value = pipeline | 'create1' >> Create([1, 2, 3])
        value2 = pipeline | 'create2' >> Create([(1, 1), (2, 2), (3, 3)])
        value3 = pipeline | 'create3' >> Create([(1, 1), (2, 2), (3, 3)])
        self.assertEqual(AsSingleton(value), AsSingleton(value))
        self.assertEqual(AsSingleton('new', value, default_value=1),
                         AsSingleton('new', value, default_value=1))
        self.assertNotEqual(AsSingleton(value),
                            AsSingleton('new', value, default_value=1))
        self.assertEqual(AsIter(value), AsIter(value))
        self.assertEqual(AsList(value), AsList(value))
        self.assertEqual(AsDict(value2), AsDict(value2))

        self.assertNotEqual(AsSingleton(value), AsSingleton(value2))
        self.assertNotEqual(AsIter(value), AsIter(value2))
        self.assertNotEqual(AsList(value), AsList(value2))
        self.assertNotEqual(AsDict(value2), AsDict(value3))
Example #15
    def test_ptransform_overrides(self):
        def my_par_do_matcher(applied_ptransform):
            return isinstance(applied_ptransform.transform, DoubleParDo)

        class MyParDoOverride(PTransformOverride):
            def get_matcher(self):
                return my_par_do_matcher

            def get_replacement_transform(self, ptransform):
                if isinstance(ptransform, DoubleParDo):
                    return TripleParDo()
                raise ValueError('Unsupported type of transform: %r' %
                                 ptransform)

        # Using following private variable for testing.
        DirectRunner._PTRANSFORM_OVERRIDES.append(MyParDoOverride())
        with Pipeline() as p:
            pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
            assert_that(pcoll, equal_to([3, 6, 9]))
Example #16
  def test_pipeline_sdk_not_overridden(self):
    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp',
        '--worker_harness_container_image=dummy_prefix/dummy_name:dummy_tag'
    ])

    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

    dummy_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=(
            beam_runner_api_pb2.DockerPayload(
                container_image='dummy_prefix/dummy_name:dummy_tag')
        ).SerializeToString())
    proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

    dummy_transform = beam_runner_api_pb2.PTransform(
        environment_id='dummy_env_id')
    proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
        dummy_transform)

    # Accessing non-public method for testing.
    apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
        proto_pipeline, dict(), pipeline_options)

    self.assertEqual(2, len(proto_pipeline.components.environments))

    from apache_beam.utils import proto_utils
    found_override = False
    for env in proto_pipeline.components.environments.values():
      docker_payload = proto_utils.parse_Bytes(
          env.payload, beam_runner_api_pb2.DockerPayload)
      if docker_payload.container_image.startswith(
          names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY):
        found_override = True

    self.assertFalse(found_override)
  def test_ptransform_overrides(self, file_system_override_mock):
    class MyParDoOverride(PTransformOverride):
      def matches(self, applied_ptransform):
        return isinstance(applied_ptransform.transform, DoubleParDo)

      def get_replacement_transform(self, ptransform):
        if isinstance(ptransform, DoubleParDo):
          return TripleParDo()
        raise ValueError('Unsupported type of transform: %r' % ptransform)

    def get_overrides(unused_pipeline_options):
      return [MyParDoOverride()]

    file_system_override_mock.side_effect = get_overrides

    # Specify the BundleBasedDirectRunner as it's the one patched above.
    with Pipeline(runner='BundleBasedDirectRunner') as p:
      pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
      assert_that(pcoll, equal_to([3, 6, 9]))
Example #18
    def test_remote_runner_display_data(self):
        remote_runner = DataflowRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions(self.default_properties))

        now = datetime.now()
        # pylint: disable=expression-not-assigned
        (p | ptransform.Create([1, 2, 3, 4, 5])
         | 'Do' >> SpecialParDo(SpecialDoFn(), now))

        # TODO(https://github.com/apache/beam/issues/18012) Enable runner API on
        # this test.
        p.run(test_runner_api=False)
        job_dict = json.loads(str(remote_runner.job))
        steps = [
            step for step in job_dict['steps']
            if len(step['properties'].get('display_data', [])) > 0
        ]
        step = steps[1]
        disp_data = step['properties']['display_data']
        nspace = SpecialParDo.__module__ + '.'
        expected_data = [{
            'type': 'TIMESTAMP',
            'namespace': nspace + 'SpecialParDo',
            'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
            'key': 'a_time'
        }, {
            'type': 'STRING',
            'namespace': nspace + 'SpecialParDo',
            'value': nspace + 'SpecialParDo',
            'key': 'a_class',
            'shortValue': 'SpecialParDo'
        }, {
            'type': 'INTEGER',
            'namespace': nspace + 'SpecialDoFn',
            'value': 42,
            'key': 'dofn_value'
        }]
        self.assertUnhashableCountEqual(disp_data, expected_data)
Example #19
    def test_ptransform_override_side_inputs(self):
        class MyParDoOverride(PTransformOverride):
            def matches(self, applied_ptransform):
                return (isinstance(applied_ptransform.transform, ParDo)
                        and isinstance(applied_ptransform.transform.fn,
                                       AddWithProductDoFn))

            def get_replacement_transform(self, transform):
                return AddThenMultiply()

        p = Pipeline()
        pcoll1 = p | 'pc1' >> beam.Create([2])
        pcoll2 = p | 'pc2' >> beam.Create([3])
        pcoll3 = p | 'pc3' >> beam.Create([4, 5, 6])
        result = pcoll3 | 'Operate' >> beam.ParDo(
            AddWithProductDoFn(), AsSingleton(pcoll1), AsSingleton(pcoll2))
        assert_that(result, equal_to([18, 21, 24]))

        p.replace_all([MyParDoOverride()])
        p.run()
Example #20
  def test_sdk_harness_container_image_overrides(self):
    test_environment = DockerEnvironment(
        container_image='dummy_container_image')
    proto_pipeline, _ = Pipeline().to_runner_api(
      return_context=True, default_environment=test_environment)

    # Accessing non-public method for testing.
    apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
        proto_pipeline, {'.*dummy.*': 'new_dummy_container_image'})

    self.assertEqual(1, len(proto_pipeline.components.environments))
    env = list(proto_pipeline.components.environments.values())[0]

    from apache_beam.utils import proto_utils
    docker_payload = proto_utils.parse_Bytes(
        env.payload, beam_runner_api_pb2.DockerPayload)

    # Container image should be overridden by the given override.
    self.assertEqual(
        docker_payload.container_image, 'new_dummy_container_image')
Example #21
    def test_dataflow_container_image_override(self):
        test_environment = DockerEnvironment(
            container_image='apache/beam_java11_sdk:x.yz.0')
        proto_pipeline, _ = Pipeline().to_runner_api(
            return_context=True, default_environment=test_environment)

        # Accessing non-public method for testing.
        apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
            proto_pipeline, dict())

        self.assertEqual(1, len(proto_pipeline.components.environments))
        env = list(proto_pipeline.components.environments.values())[0]

        from apache_beam.utils import proto_utils
        docker_payload = proto_utils.parse_Bytes(
            env.payload, beam_runner_api_pb2.DockerPayload)

        # Container image should be overridden by the given override.
        self.assertTrue(
            docker_payload.container_image.startswith(
                names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY))
Example #22
    def test_sdk_harness_container_image_overrides(self):
        if 'sdkHarnessContainerImages' not in dataflow.WorkerPool.__dict__:
            _LOGGER.warning(
                'Skipping test \'test_sdk_harness_container_image_overrides\' since '
                'Dataflow API WorkerPool does not have attribute '
                '\'sdkHarnessContainerImages\'')
            return
        pipeline_options = PipelineOptions([
            '--experiments=beam_fn_api',
            '--experiments=use_unified_worker',
            '--temp_location',
            'gs://any-location/temp',
            '--project',
            'dummy_project',
            '--sdk_harness_container_image_overrides',
            '.*dummy.*,new_dummy_container_image',
        ])

        pipeline = Pipeline(options=pipeline_options)

        test_environment = DockerEnvironment(
            container_image='dummy_container_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)
        dataflow_client = apiclient.DataflowApplicationClient(pipeline_options)

        # Accessing non-public method for testing.
        dataflow_client._apply_sdk_environment_overrides(proto_pipeline)

        self.assertEqual(1, len(proto_pipeline.components.environments))
        env = list(proto_pipeline.components.environments.values())[0]

        from apache_beam.utils import proto_utils
        docker_payload = proto_utils.parse_Bytes(
            env.payload, beam_runner_api_pb2.DockerPayload)

        # Container image should be overridden by the given override.
        self.assertEqual(docker_payload.container_image,
                         'new_dummy_container_image')
Example #23
    def test_ptransform_overrides(self, file_system_override_mock):
        def my_par_do_matcher(applied_ptransform):
            return isinstance(applied_ptransform.transform, DoubleParDo)

        class MyParDoOverride(PTransformOverride):
            def get_matcher(self):
                return my_par_do_matcher

            def get_replacement_transform(self, ptransform):
                if isinstance(ptransform, DoubleParDo):
                    return TripleParDo()
                    raise ValueError('Unsupported type of transform: %r' %
                                     ptransform)

        def get_overrides():
            return [MyParDoOverride()]

        file_system_override_mock.side_effect = get_overrides

        with Pipeline() as p:
            pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
            assert_that(pcoll, equal_to([3, 6, 9]))
Example #24
    def test_visitor_not_sorted(self):
        p = Pipeline()
        # pylint: disable=expression-not-assigned
        from apache_beam.testing.test_stream import TestStream
        p | TestStream().add_elements(['']) | beam.Map(lambda _: _)

        original_graph = p.to_runner_api(return_context=False)
        out_of_order_graph = p.to_runner_api(return_context=False)

        root_id = out_of_order_graph.root_transform_ids[0]
        root = out_of_order_graph.components.transforms[root_id]
        tmp = root.subtransforms[0]
        root.subtransforms[0] = root.subtransforms[1]
        root.subtransforms[1] = tmp

        p = beam.Pipeline().from_runner_api(out_of_order_graph,
                                            runner='BundleBasedDirectRunner',
                                            options=None)
        v_out_of_order = ConsumerTrackingPipelineVisitor()
        p.visit(v_out_of_order)

        p = beam.Pipeline().from_runner_api(original_graph,
                                            runner='BundleBasedDirectRunner',
                                            options=None)
        v_original = ConsumerTrackingPipelineVisitor()
        p.visit(v_original)

        # Convert to string to assert they are equal.
        out_of_order_labels = {
            str(k): [str(t) for t in v_out_of_order.value_to_consumers[k]]
            for k in v_out_of_order.value_to_consumers
        }

        original_labels = {
            str(k): [str(t) for t in v_original.value_to_consumers[k]]
            for k in v_original.value_to_consumers
        }
        self.assertDictEqual(out_of_order_labels, original_labels)
Example #25
def run():
    PROJECT_ID = 'electric-spark-266716'  # change to your project id
    BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'city-beam-dataflow'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    #create query to select all elements for cleansing
    sql = 'SELECT dt, AverageTemperature, AverageTemperatureUncertainty, City, Country, Latitude,\
     Longitude, major_city FROM kaggle_modeled.City as x'

    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    #read desired table from BigQuery
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    #write inputs to input.txt
    query_results | 'Write input' >> WriteToText(DIR_PATH + 'input.txt')

    # apply ParDo to filter out dates
    formatted_date_pcoll = query_results | 'Filter Dates' >> beam.ParDo(
        FilterDateFn())

    #write filtered dates to filtered.txt
    formatted_date_pcoll | 'Write filtered dates' >> WriteToText(
        DIR_PATH + 'filtered.txt')

    # group city records by (dt,city) tuple created
    grouped_city_pcoll = formatted_date_pcoll | 'Group by city, dt' >> beam.GroupByKey(
    )

    # display grouped city records
    grouped_city_pcoll | 'Write group by' >> WriteToText(DIR_PATH +
                                                         'grouped.txt')

    #remove duplicate city records
    distinct_city_pcoll = grouped_city_pcoll | 'Delete duplicate records' >> beam.ParDo(
        DedupCityRecordsFn())

    #write resulting PCollection to output.txt
    distinct_city_pcoll | 'Write output' >> WriteToText(DIR_PATH +
                                                        'output.txt')

    #create new table in BigQuery
    dataset_id = 'kaggle_modeled'
    table_id = 'City_Beam_DF'
    schema_id = 'dt:DATE,AverageTemperature:FLOAT,AverageTemperatureUncertainty:FLOAT,\
    City:STRING,Country:STRING,Latitude:STRING,Longitude:STRING,major_city:INTEGER'

    # write PCollection to new BQ table
    distinct_city_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
Example #26
 def test_eager_pipeline(self):
     p = Pipeline('EagerRunner')
     self.assertEqual([1, 4, 9],
                      p | Create([1, 2, 3]) | Map(lambda x: x * x))
Example #27
    def test_remote_runner_display_data(self):
        remote_runner = DataflowRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions(self.default_properties))

        # TODO: Should not subclass ParDo. Switch to PTransform as soon as
        # composite transforms support display data.
        class SpecialParDo(beam.ParDo):
            def __init__(self, fn, now):
                super(SpecialParDo, self).__init__(fn)
                self.fn = fn
                self.now = now

            def display_data(self):
                return {
                    'asubcomponent': self.fn,
                    'a_class': SpecialParDo,
                    'a_time': self.now
                }

        class SpecialDoFn(beam.DoFn):
            def display_data(self):
                return {'dofn_value': 42}

            def process(self):
                pass

        now = datetime.now()
        # pylint: disable=expression-not-assigned
        (p | ptransform.Create([1, 2, 3, 4, 5])
         | 'Do' >> SpecialParDo(SpecialDoFn(), now))

        p.run()
        job_dict = json.loads(str(remote_runner.job))
        steps = [
            step for step in job_dict['steps']
            if len(step['properties'].get('display_data', [])) > 0
        ]
        step = steps[1]
        disp_data = step['properties']['display_data']
        disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
        nspace = SpecialParDo.__module__ + '.'
        expected_data = [{
            'type': 'TIMESTAMP',
            'namespace': nspace + 'SpecialParDo',
            'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
            'key': 'a_time'
        }, {
            'type': 'STRING',
            'namespace': nspace + 'SpecialParDo',
            'value': nspace + 'SpecialParDo',
            'key': 'a_class',
            'shortValue': 'SpecialParDo'
        }, {
            'type': 'INTEGER',
            'namespace': nspace + 'SpecialDoFn',
            'value': 42,
            'key': 'dofn_value'
        }]
        expected_data = sorted(expected_data,
                               key=lambda x: x['namespace'] + x['key'])
        self.assertEqual(len(disp_data), 3)
        self.assertEqual(disp_data, expected_data)
 def setUp(self):
     self.pipeline = Pipeline(DirectRunner())
     self.visitor = ConsumerTrackingPipelineVisitor()
def run():
    PROJECT_ID = 'electric-spark-266716'  # change to your project id
    BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'population-statistics-beam-dataflow-2'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    #create query to select all elements for cleansing
    sql = 'SELECT * FROM kaggle2_modeled.Population_Statistics'

    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    #read desired table from BigQuery
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    #write inputs to input.txt
    query_results | 'Write input' >> WriteToText(DIR_PATH + 'input.txt')

    # apply ParDo to filter out dates
    transposed_date_pcoll = query_results | 'Transpose Dates' >> beam.ParDo(
        TransposeDateFn())

    #write filtered dates to filtered.txt
    transposed_date_pcoll | 'Write transpose Dates' >> WriteToText(
        DIR_PATH + 'transposed.txt')

    #flatten list to get individual records
    flatten_pcoll = transposed_date_pcoll | 'Flatten lists' >> beam.FlatMap(
        generate_elements)

    #write resulting PCollection to output_final_dates.txt
    flatten_pcoll | 'Write output' >> WriteToText(DIR_PATH +
                                                  'output_final_dates.txt')

    #create new table in BigQuery
    dataset_id = 'kaggle2_modeled'
    table_id = 'Population_Statistics_Beam_DF'
    schema_id = 'dt:DATE,countryName:STRING,countryCode:STRING, \
    metric:STRING,metricCode:STRING,statistic:FLOAT'

    # write PCollection to new BQ table
    flatten_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
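generate_elements is not included in this snippet. A minimal sketch, assuming the upstream TransposeDateFn emits a list (or other iterable) of row dictionaries per input element, so the FlatMap only needs to yield each one individually:

def generate_elements(elements):
    # Hypothetical sketch: unpack the per-element list into individual rows.
    for element in elements:
        yield element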
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'student-df1'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT sid, fname, lname, dob FROM college_modeled.Student'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write PCollection to log file
    query_results | 'Write log 1' >> WriteToText(DIR_PATH +
                                                 'query_results.txt')

    # apply ParDo to format the student's date of birth
    formatted_dob_pcoll = query_results | 'Format DOB' >> beam.ParDo(
        FormatDOBFn())

    # write PCollection to log file
    formatted_dob_pcoll | 'Write log 2' >> WriteToText(
        DIR_PATH + 'formatted_dob_pcoll.txt')

    # group students by sid
    grouped_student_pcoll = formatted_dob_pcoll | 'Group by sid' >> beam.GroupByKey(
    )

    # write PCollection to log file
    grouped_student_pcoll | 'Write log 3' >> WriteToText(
        DIR_PATH + 'grouped_student_pcoll.txt')

    # remove duplicate student records
    distinct_student_pcoll = grouped_student_pcoll | 'Dedup student' >> beam.ParDo(
        DedupStudentRecordsFn())

    # write PCollection to log file
    distinct_student_pcoll | 'Write log 4' >> WriteToText(
        DIR_PATH + 'distinct_student_pcoll.txt')

    dataset_id = 'college_modeled'
    table_id = 'Student_Beam_DF'
    schema_id = 'sid:STRING,fname:STRING,lname:STRING,dob:DATE,status:STRING'

    # write PCollection to new BQ table
    distinct_student_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    result = p.run()
    result.wait_until_finish()
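FormatDOBFn and DedupStudentRecordsFn are defined outside this snippet. A sketch of the key-based dedup pattern the pipeline relies on, assuming FormatDOBFn keys each record by sid so that GroupByKey can collect duplicates and DedupStudentRecordsFn then keeps a single record per key (the dob formatting itself is assumed):

import apache_beam as beam

class FormatDOBFn(beam.DoFn):
    # Hypothetical sketch: normalize dob, then key the record by sid for GroupByKey.
    def process(self, element):
        record = dict(element)
        # The actual dob reformatting (e.g. to 'YYYY-MM-DD') is assumed here.
        yield (record['sid'], record)

class DedupStudentRecordsFn(beam.DoFn):
    # Hypothetical sketch: after GroupByKey, emit one record per sid.
    def process(self, element):
        sid, records = element
        yield next(iter(records))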