def test_bad_path(self):
  dummy_sdk_file = tempfile.NamedTemporaryFile()
  remote_runner = DataflowRunner()
  pipeline = Pipeline(
      remote_runner,
      options=PipelineOptions([
          '--dataflow_endpoint=ignored',
          '--sdk_location=' + dummy_sdk_file.name,
          '--job_name=test-job',
          '--project=test-project',
          '--staging_location=ignored',
          '--temp_location=/dev/null',
          '--template_location=/bad/path',
          '--no_auth=True'
      ]))
  remote_runner.job = apiclient.Job(pipeline._options)
  with self.assertRaises(IOError):
    pipeline.run().wait_until_finish()
def test_environment_override_translation(self):
  self.default_properties.append('--experiments=beam_fn_api')
  self.default_properties.append('--worker_harness_container_image=FOO')
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))
  (p
   | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
   | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
   | ptransform.GroupByKey())
  p.run()
  self.assertEqual(
      list(remote_runner.proto_pipeline.components.environments.values()),
      [
          beam_runner_api_pb2.Environment(
              urn=common_urns.environments.DOCKER.urn,
              payload=beam_runner_api_pb2.DockerPayload(
                  container_image='FOO').SerializeToString())
      ])
def run():
  PROJECT_ID = 'corvid-276516'
  BUCKET = 'gs://covid-bucket19'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'format-codes--df'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  # Query to pass to the BigQuery source. The DirectRunner runs locally and
  # not in parallel across workers, so it shouldn't be used for more than
  # about 1000 records; the DataflowRunner handles the full table.
  sql = 'SELECT * FROM covid_staging.googleMobility ORDER BY date, country_region'
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  # Read the query results into a new PCollection.
  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

  # Apply a ParDo to reformat the country codes.
  format_alphaCode_pcoll = query_results | 'Change the country code for Greece, the UK, and Hong Kong. Drop Reunion' >> beam.ParDo(format_alphaCodeFn())

  # Write the PCollection to a log file.
  format_alphaCode_pcoll | 'Write log 1' >> WriteToText('geodist_beam.txt')

  dataset_id = 'covid_modeled'
  table_id = 'mobility_beam'
  schema_id = 'code:STRING, country:STRING, date:DATE, average_change:INTEGER, retail_and_recreation:INTEGER, grocery_and_pharmacy:INTEGER, parks:INTEGER, transit_stations:INTEGER, workplaces:INTEGER, residential:INTEGER'

  # Write the PCollection to a new BigQuery table.
  format_alphaCode_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
      batch_size=100)

  result = p.run()
  result.wait_until_finish()
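The format_alphaCodeFn DoFn is defined elsewhere in this project. A minimal sketch of one plausible implementation, assuming elements are dicts keyed by the mobility-report column names and that the remapped codes follow the Eurostat convention (both assumptions, inferred only from the transform label and the output schema):

import apache_beam as beam

class format_alphaCodeFn(beam.DoFn):
  # Illustrative remapping only: the transform label says Greece, the UK,
  # and Hong Kong get corrected codes and Reunion records are dropped.
  CODE_FIXES = {'GR': 'EL', 'GB': 'UK', 'HK': 'CN-HK'}

  def process(self, element):
    if element.get('country_region') == 'Reunion':
      return  # drop Reunion entirely
    code = element.get('country_region_code')
    element['country_region_code'] = self.CODE_FIXES.get(code, code)
    yield element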
def run():
  PROJECT_ID = 'starry-center-266501'  # change to your project id
  BUCKET = 'gs://imdb-beam'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'split-directors-df'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  sql = 'SELECT directors, tConst FROM imdb_modeled.Directs WHERE directors IN (SELECT DISTINCT nConst FROM imdb_modeled.People)'
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

  # write PCollection to log file
  query_results | 'Write log 1' >> WriteToText(DIR_PATH + 'query_results.txt')

  # apply ParDo to split up the comma-separated directors
  split_directors_pcoll = query_results | 'Split up directors' >> beam.ParDo(
      SplitDirectorsFn())

  # write PCollection to log file
  split_directors_pcoll | 'Write log 2' >> WriteToText(
      DIR_PATH + 'split_directors_pcoll.txt')

  dataset_id = 'imdb_modeled'
  table_id = 'Directs_Beam_DF'
  schema_id = 'director:STRING,tConst:STRING'

  # write PCollection to new BQ table
  split_directors_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

  result = p.run()
  result.wait_until_finish()
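SplitDirectorsFn is defined elsewhere; a minimal sketch under the assumption that directors holds a comma-separated list of nConst ids (inferred from the query and the 'director:STRING,tConst:STRING' output schema):

import apache_beam as beam

class SplitDirectorsFn(beam.DoFn):
  def process(self, element):
    # Assumed format: 'nm0000001,nm0000002,...'
    for director in element['directors'].split(','):
      yield {'director': director.strip(), 'tConst': element['tConst']}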
def test_visit_entire_graph(self):
  pipeline = Pipeline()
  pcoll1 = pipeline | 'pcoll' >> beam.Impulse()
  pcoll2 = pcoll1 | 'do1' >> FlatMap(lambda x: [x + 1])
  pcoll3 = pcoll2 | 'do2' >> FlatMap(lambda x: [x + 1])
  pcoll4 = pcoll2 | 'do3' >> FlatMap(lambda x: [x + 1])
  transform = PipelineTest.CustomTransform()
  pcoll5 = pcoll4 | transform

  visitor = PipelineTest.Visitor(visited=[])
  pipeline.visit(visitor)
  self.assertEqual({pcoll1, pcoll2, pcoll3, pcoll4, pcoll5},
                   set(visitor.visited))
  self.assertEqual(set(visitor.enter_composite), set(visitor.leave_composite))
  self.assertEqual(2, len(visitor.enter_composite))
  self.assertEqual(visitor.enter_composite[1].transform, transform)
  self.assertEqual(visitor.leave_composite[0].transform, transform)
def test_after_count(self):
  p = Pipeline('DirectRunner')
  result = (p
            | beam.Create([1, 2, 3, 4, 5, 10, 11])
            | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
            | beam.Map(lambda kt: TimestampedValue(kt, kt[1]))
            | beam.WindowInto(FixedWindows(10), trigger=AfterCount(3),
                              accumulation_mode=AccumulationMode.DISCARDING)
            | beam.GroupByKey()
            | beam.Map(lambda kv: ('%s-%s' % (kv[0], len(kv[1])), set(kv[1]))))
  assert_that(result, equal_to(
      list({
          'A-5': {1, 2, 3, 4, 5},
          # A-10, A-11 never emitted due to AfterCount(3) never firing.
          'B-4': {6, 7, 8, 9},
          'B-3': {10, 15, 16},
      }.items())))
def test_direct_runner_metrics(self):
  from apache_beam.metrics.metric import Metrics

  class MyDoFn(beam.DoFn):
    def start_bundle(self):
      count = Metrics.counter(self.__class__, 'bundles')
      count.inc()

    def finish_bundle(self):
      count = Metrics.counter(self.__class__, 'finished_bundles')
      count.inc()

    def process(self, element):
      count = Metrics.counter(self.__class__, 'elements')
      count.inc()
      distro = Metrics.distribution(self.__class__, 'element_dist')
      distro.update(element)
      return [element]

  runner = DirectRunner()
  p = Pipeline(runner, options=PipelineOptions(self.default_properties))
  pcoll = (p
           | ptransform.Create([1, 2, 3, 4, 5])
           | 'Do' >> beam.ParDo(MyDoFn()))
  assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
  result = p.run()
  result.wait_until_finish()
  metrics = result.metrics().query()
  namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

  hc.assert_that(
      metrics['counters'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'elements')), 5, 5),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'bundles')), 1, 1),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'finished_bundles')),
              1, 1)))
  hc.assert_that(
      metrics['distributions'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'element_dist')),
              DistributionResult(DistributionData(15, 5, 1, 5)),
              DistributionResult(DistributionData(15, 5, 1, 5)))))
def test_streaming_create_translation(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  remote_runner.job = apiclient.Job(p._options)
  # Performing configured PTransform overrides here.
  p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
  super(DataflowRunner, remote_runner).run(p)
  job_dict = json.loads(str(remote_runner.job))
  self.assertEqual(len(job_dict[u'steps']), 2)

  self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
  self.assertEqual(
      job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
      '_starting_signal/')
  self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
def test_ptransform_overrides(self):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return isinstance(applied_ptransform.transform, DoubleParDo)

    def get_replacement_transform(self, ptransform):
      if isinstance(ptransform, DoubleParDo):
        return TripleParDo()
      raise ValueError('Unsupported type of transform: %r' % ptransform)

  p = Pipeline()
  pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
  assert_that(pcoll, equal_to([3, 6, 9]))

  p.replace_all([MyParDoOverride()])
  p.run()
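DoubleParDo and TripleParDo are shared test fixtures that several of the override tests below also use. A plausible sketch, inferred from the expected output [3, 6, 9] for input [1, 2, 3] (the assertion reflects the pipeline after the override has been applied):

import apache_beam as beam

class DoubleParDo(beam.PTransform):
  def expand(self, pcoll):
    return pcoll | 'Double' >> beam.Map(lambda x: x * 2)

class TripleParDo(beam.PTransform):
  def expand(self, pcoll):
    # The override swaps DoubleParDo for this, so [1, 2, 3] -> [3, 6, 9].
    return pcoll | 'Triple' >> beam.Map(lambda x: x * 3)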
def test_no_group_by_key_directly_after_bigquery(self):
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner,
      options=PipelineOptions([
          '--dataflow_endpoint=ignored',
          '--job_name=test-job',
          '--project=test-project',
          '--staging_location=ignored',
          '--temp_location=/dev/null',
          '--no_auth=True'
      ]))
  rows = p | beam.io.Read(beam.io.BigQuerySource('dataset.faketable'))
  with self.assertRaises(ValueError,
                         msg=('Coder for the GroupByKey operation '
                              '"GroupByKey" is not a key-value coder: '
                              'RowAsDictJsonCoder')):
    unused_invalid = rows | beam.GroupByKey()
def test_ptransform_override_multiple_inputs(self):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return isinstance(applied_ptransform.transform, FlattenAndDouble)

    def get_replacement_transform(self, applied_ptransform):
      return FlattenAndTriple()

  p = Pipeline()
  pcoll1 = p | 'pc1' >> beam.Create([1, 2, 3])
  pcoll2 = p | 'pc2' >> beam.Create([4, 5, 6])
  pcoll3 = (pcoll1, pcoll2) | 'FlattenAndMultiply' >> FlattenAndDouble()
  assert_that(pcoll3, equal_to([3, 6, 9, 12, 15, 18]))

  p.replace_all([MyParDoOverride()])
  p.run()
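FlattenAndDouble and FlattenAndTriple are fixtures defined elsewhere. A sketch consistent with the test: each is a composite PTransform applied to a tuple of PCollections, and the asserted [3, 6, 9, 12, 15, 18] is the flattened input tripled, i.e. the post-override behavior:

import apache_beam as beam

class FlattenAndDouble(beam.PTransform):
  def expand(self, pcolls):
    # Never actually run in the test; replaced before p.run().
    return pcolls | beam.Flatten() | beam.Map(lambda x: x * 2)

class FlattenAndTriple(beam.PTransform):
  def expand(self, pcolls):
    return pcolls | beam.Flatten() | beam.Map(lambda x: x * 3)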
def test_default_environment_get_set(self):
  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp'
  ])
  pipeline = Pipeline(options=pipeline_options)
  pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

  test_environment = DockerEnvironment(container_image='test_default_image')
  proto_pipeline, _ = pipeline.to_runner_api(
      return_context=True, default_environment=test_environment)

  dummy_env = beam_runner_api_pb2.Environment(
      urn=common_urns.environments.DOCKER.urn,
      payload=(
          beam_runner_api_pb2.DockerPayload(
              container_image='dummy_image')).SerializeToString())
  proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

  dummy_transform = beam_runner_api_pb2.PTransform(
      environment_id='dummy_env_id')
  proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
      dummy_transform)

  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL,
      proto_pipeline,
      _sdk_image_overrides={
          '.*dummy.*': 'dummy_image', '.*test.*': 'test_default_image'
      })
  worker_pool = env.proto.workerPools[0]

  self.assertEqual(2, len(worker_pool.sdkHarnessContainerImages))

  images_from_proto = [
      sdk_info.containerImage
      for sdk_info in worker_pool.sdkHarnessContainerImages
  ]
  self.assertIn('test_default_image', images_from_proto)
def run_async(self, transform, options=None):
  """Run the given transform or callable with this runner.

  May return immediately, executing the pipeline in the background.
  The returned result object can be queried for progress, and
  `wait_until_finish` may be called to block until completion.
  """
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam import PTransform
  from apache_beam.pvalue import PBegin
  from apache_beam.pipeline import Pipeline

  p = Pipeline(runner=self, options=options)
  if isinstance(transform, PTransform):
    p | transform
  else:
    transform(PBegin(p))
  return p.run()
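A minimal usage sketch (names hypothetical): run_async accepts either a PTransform, which is applied to the pipeline root, or a callable that receives a PBegin:

import apache_beam as beam

# Assumes `runner` is an instance of the runner class defining run_async.
result = runner.run_async(
    lambda root: root | beam.Create([1, 2, 3]) | beam.Map(print))
result.wait_until_finish()  # block until the background execution completes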
def test_pcollectionview_not_recreated(self):
  pipeline = Pipeline('DirectRunner')
  value = pipeline | 'create1' >> Create([1, 2, 3])
  value2 = pipeline | 'create2' >> Create([(1, 1), (2, 2), (3, 3)])
  value3 = pipeline | 'create3' >> Create([(1, 1), (2, 2), (3, 3)])
  self.assertEqual(AsSingleton(value), AsSingleton(value))
  self.assertEqual(
      AsSingleton('new', value, default_value=1),
      AsSingleton('new', value, default_value=1))
  self.assertNotEqual(
      AsSingleton(value), AsSingleton('new', value, default_value=1))
  self.assertEqual(AsIter(value), AsIter(value))
  self.assertEqual(AsList(value), AsList(value))
  self.assertEqual(AsDict(value2), AsDict(value2))

  self.assertNotEqual(AsSingleton(value), AsSingleton(value2))
  self.assertNotEqual(AsIter(value), AsIter(value2))
  self.assertNotEqual(AsList(value), AsList(value2))
  self.assertNotEqual(AsDict(value2), AsDict(value3))
def test_ptransform_overrides(self):
  def my_par_do_matcher(applied_ptransform):
    return isinstance(applied_ptransform.transform, DoubleParDo)

  class MyParDoOverride(PTransformOverride):
    def get_matcher(self):
      return my_par_do_matcher

    def get_replacement_transform(self, ptransform):
      if isinstance(ptransform, DoubleParDo):
        return TripleParDo()
      raise ValueError('Unsupported type of transform: %r' % ptransform)

  # Using following private variable for testing.
  DirectRunner._PTRANSFORM_OVERRIDES.append(MyParDoOverride())
  with Pipeline() as p:
    pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
    assert_that(pcoll, equal_to([3, 6, 9]))
def test_pipeline_sdk_not_overridden(self):
  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp',
      '--worker_harness_container_image=dummy_prefix/dummy_name:dummy_tag'
  ])

  pipeline = Pipeline(options=pipeline_options)
  pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

  proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

  dummy_env = beam_runner_api_pb2.Environment(
      urn=common_urns.environments.DOCKER.urn,
      payload=(
          beam_runner_api_pb2.DockerPayload(
              container_image='dummy_prefix/dummy_name:dummy_tag')
      ).SerializeToString())
  proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

  dummy_transform = beam_runner_api_pb2.PTransform(
      environment_id='dummy_env_id')
  proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
      dummy_transform)

  # Accessing non-public method for testing.
  apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
      proto_pipeline, dict(), pipeline_options)

  self.assertEqual(2, len(proto_pipeline.components.environments))

  from apache_beam.utils import proto_utils
  found_override = False
  for env in proto_pipeline.components.environments.values():
    docker_payload = proto_utils.parse_Bytes(
        env.payload, beam_runner_api_pb2.DockerPayload)
    if docker_payload.container_image.startswith(
        names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY):
      found_override = True

  self.assertFalse(found_override)
def test_ptransform_overrides(self, file_system_override_mock):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return isinstance(applied_ptransform.transform, DoubleParDo)

    def get_replacement_transform(self, ptransform):
      if isinstance(ptransform, DoubleParDo):
        return TripleParDo()
      raise ValueError('Unsupported type of transform: %r' % ptransform)

  def get_overrides(unused_pipeline_options):
    return [MyParDoOverride()]

  file_system_override_mock.side_effect = get_overrides

  # Specify DirectRunner as it's the one patched above.
  with Pipeline(runner='BundleBasedDirectRunner') as p:
    pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
    assert_that(pcoll, equal_to([3, 6, 9]))
def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))
  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p
   | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))

  # TODO(https://github.com/apache/beam/issues/18012) Enable runner API on
  # this test.
  p.run(test_runner_api=False)
  job_dict = json.loads(str(remote_runner.job))
  steps = [
      step for step in job_dict['steps']
      if len(step['properties'].get('display_data', [])) > 0
  ]
  step = steps[1]
  disp_data = step['properties']['display_data']
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{
      'type': 'TIMESTAMP',
      'namespace': nspace + 'SpecialParDo',
      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
      'key': 'a_time'
  }, {
      'type': 'STRING',
      'namespace': nspace + 'SpecialParDo',
      'value': nspace + 'SpecialParDo',
      'key': 'a_class',
      'shortValue': 'SpecialParDo'
  }, {
      'type': 'INTEGER',
      'namespace': nspace + 'SpecialDoFn',
      'value': 42,
      'key': 'dofn_value'
  }]
  self.assertUnhashableCountEqual(disp_data, expected_data)
def test_ptransform_override_side_inputs(self):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return (
          isinstance(applied_ptransform.transform, ParDo) and
          isinstance(applied_ptransform.transform.fn, AddWithProductDoFn))

    def get_replacement_transform(self, transform):
      return AddThenMultiply()

  p = Pipeline()
  pcoll1 = p | 'pc1' >> beam.Create([2])
  pcoll2 = p | 'pc2' >> beam.Create([3])
  pcoll3 = p | 'pc3' >> beam.Create([4, 5, 6])
  result = pcoll3 | 'Operate' >> beam.ParDo(
      AddWithProductDoFn(), AsSingleton(pcoll1), AsSingleton(pcoll2))
  assert_that(result, equal_to([18, 21, 24]))

  p.replace_all([MyParDoOverride()])
  p.run()
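AddWithProductDoFn and AddThenMultiply are fixtures defined elsewhere. A sketch consistent with the assertion: with singleton side inputs a=2 and b=3, each x in [4, 5, 6] becomes (x + a) * b = [18, 21, 24], which is the replacement's arithmetic (the original DoFn never runs, since it is replaced before p.run()). How the replacement receives the side-input PCollections is an assumption here:

import apache_beam as beam
from apache_beam.pvalue import AsSingleton

class AddWithProductDoFn(beam.DoFn):
  def process(self, element, a, b):
    # Replaced before the pipeline runs; the arithmetic shown is a guess.
    yield element + a * b

class AddThenMultiply(beam.PTransform):
  def expand(self, pvalues):
    # Assumes the override hands over (main, side1, side2) as a tuple.
    return pvalues[0] | beam.Map(
        lambda x, a, b: (x + a) * b,
        AsSingleton(pvalues[1]),
        AsSingleton(pvalues[2]))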
def test_sdk_harness_container_image_overrides(self):
  test_environment = DockerEnvironment(
      container_image='dummy_container_image')
  proto_pipeline, _ = Pipeline().to_runner_api(
      return_context=True, default_environment=test_environment)

  # Accessing non-public method for testing.
  apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
      proto_pipeline, {'.*dummy.*': 'new_dummy_container_image'})

  self.assertEqual(1, len(proto_pipeline.components.environments))
  env = list(proto_pipeline.components.environments.values())[0]

  from apache_beam.utils import proto_utils
  docker_payload = proto_utils.parse_Bytes(
      env.payload, beam_runner_api_pb2.DockerPayload)

  # Container image should be overridden by the given override.
  self.assertEqual(
      docker_payload.container_image, 'new_dummy_container_image')
def test_dataflow_container_image_override(self):
  test_environment = DockerEnvironment(
      container_image='apache/beam_java11_sdk:x.yz.0')
  proto_pipeline, _ = Pipeline().to_runner_api(
      return_context=True, default_environment=test_environment)

  # Accessing non-public method for testing.
  apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
      proto_pipeline, dict())

  self.assertEqual(1, len(proto_pipeline.components.environments))
  env = list(proto_pipeline.components.environments.values())[0]

  from apache_beam.utils import proto_utils
  docker_payload = proto_utils.parse_Bytes(
      env.payload, beam_runner_api_pb2.DockerPayload)

  # Container image should be overridden by the given override.
  self.assertTrue(
      docker_payload.container_image.startswith(
          names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY))
def test_sdk_harness_container_image_overrides(self):
  if 'sdkHarnessContainerImages' not in dataflow.WorkerPool.__dict__:
    _LOGGER.warning(
        'Skipping test \'test_sdk_harness_container_image_overrides\' since '
        'Dataflow API WorkerPool does not have attribute '
        '\'sdkHarnessContainerImages\'')
    return
  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp',
      '--project',
      'dummy_project',
      '--sdk_harness_container_image_overrides',
      '.*dummy.*,new_dummy_container_image',
  ])
  pipeline = Pipeline(options=pipeline_options)
  test_environment = DockerEnvironment(
      container_image='dummy_container_image')
  proto_pipeline, _ = pipeline.to_runner_api(
      return_context=True, default_environment=test_environment)

  dataflow_client = apiclient.DataflowApplicationClient(pipeline_options)

  # Accessing non-public method for testing.
  dataflow_client._apply_sdk_environment_overrides(proto_pipeline)

  self.assertEqual(1, len(proto_pipeline.components.environments))
  env = list(proto_pipeline.components.environments.values())[0]

  from apache_beam.utils import proto_utils
  docker_payload = proto_utils.parse_Bytes(
      env.payload, beam_runner_api_pb2.DockerPayload)

  # Container image should be overridden by the given override.
  self.assertEqual(
      docker_payload.container_image, 'new_dummy_container_image')
def test_ptransform_overrides(self, file_system_override_mock):
  def my_par_do_matcher(applied_ptransform):
    return isinstance(applied_ptransform.transform, DoubleParDo)

  class MyParDoOverride(PTransformOverride):
    def get_matcher(self):
      return my_par_do_matcher

    def get_replacement_transform(self, ptransform):
      if isinstance(ptransform, DoubleParDo):
        return TripleParDo()
      raise ValueError('Unsupported type of transform: %r' % ptransform)

  def get_overrides():
    return [MyParDoOverride()]

  file_system_override_mock.side_effect = get_overrides

  with Pipeline() as p:
    pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
    assert_that(pcoll, equal_to([3, 6, 9]))
def test_visitor_not_sorted(self):
  p = Pipeline()
  # pylint: disable=expression-not-assigned
  from apache_beam.testing.test_stream import TestStream
  p | TestStream().add_elements(['']) | beam.Map(lambda _: _)

  original_graph = p.to_runner_api(return_context=False)
  out_of_order_graph = p.to_runner_api(return_context=False)

  root_id = out_of_order_graph.root_transform_ids[0]
  root = out_of_order_graph.components.transforms[root_id]
  tmp = root.subtransforms[0]
  root.subtransforms[0] = root.subtransforms[1]
  root.subtransforms[1] = tmp

  p = beam.Pipeline().from_runner_api(
      out_of_order_graph, runner='BundleBasedDirectRunner', options=None)
  v_out_of_order = ConsumerTrackingPipelineVisitor()
  p.visit(v_out_of_order)

  p = beam.Pipeline().from_runner_api(
      original_graph, runner='BundleBasedDirectRunner', options=None)
  v_original = ConsumerTrackingPipelineVisitor()
  p.visit(v_original)

  # Convert to string to assert they are equal.
  out_of_order_labels = {
      str(k): [str(t) for t in v_out_of_order.value_to_consumers[k]]
      for k in v_out_of_order.value_to_consumers
  }

  original_labels = {
      str(k): [str(t) for t in v_original.value_to_consumers[k]]
      for k in v_original.value_to_consumers
  }
  self.assertDictEqual(out_of_order_labels, original_labels)
def run():
  PROJECT_ID = 'electric-spark-266716'  # change to your project id
  BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'city-beam-dataflow'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  # create query to select all elements for cleansing
  sql = 'SELECT dt, AverageTemperature, AverageTemperatureUncertainty, City, ' \
        'Country, Latitude, Longitude, major_city FROM kaggle_modeled.City as x'
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  # read desired table from BigQuery
  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

  # write inputs to input.txt
  query_results | 'Write input' >> WriteToText(DIR_PATH + 'input.txt')

  # apply ParDo to filter out dates
  formatted_date_pcoll = query_results | 'Filter Dates' >> beam.ParDo(
      FilterDateFn())

  # write filtered dates to filtered.txt
  formatted_date_pcoll | 'Write filtered dates' >> WriteToText(
      DIR_PATH + 'filtered.txt')

  # group city records by the (dt, city) tuple created above
  grouped_city_pcoll = formatted_date_pcoll | 'Group by city, dt' >> beam.GroupByKey()

  # display grouped city records
  grouped_city_pcoll | 'Write group by' >> WriteToText(DIR_PATH + 'grouped.txt')

  # remove duplicate city records
  distinct_city_pcoll = grouped_city_pcoll | 'Delete duplicate records' >> beam.ParDo(
      DedupCityRecordsFn())

  # write resulting PCollection to output.txt
  distinct_city_pcoll | 'Write output' >> WriteToText(DIR_PATH + 'output.txt')

  # create new table in BigQuery
  dataset_id = 'kaggle_modeled'
  table_id = 'City_Beam_DF'
  schema_id = 'dt:DATE,AverageTemperature:FLOAT,AverageTemperatureUncertainty:FLOAT,' \
              'City:STRING,Country:STRING,Latitude:STRING,Longitude:STRING,major_city:INTEGER'

  # write PCollection to new BQ table
  distinct_city_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

  result = p.run()
  result.wait_until_finish()
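FilterDateFn and DedupCityRecordsFn are defined elsewhere. A rough sketch inferred only from the pipeline shape: the filter step must emit ((dt, City), record) pairs for GroupByKey to group on, and the dedup step collapses each group to one record. Field names are assumptions taken from the SELECT list:

import apache_beam as beam

class FilterDateFn(beam.DoFn):
  def process(self, element):
    # Key each record by (dt, City) so GroupByKey can find duplicates.
    yield ((element['dt'], element['City']), element)

class DedupCityRecordsFn(beam.DoFn):
  def process(self, element):
    (_dt, _city), records = element
    yield next(iter(records))  # keep a single record per (dt, City) key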
def test_eager_pipeline(self):
  p = Pipeline('EagerRunner')
  self.assertEqual([1, 4, 9], p | Create([1, 2, 3]) | Map(lambda x: x * x))
def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))

  # TODO: Should not subclass ParDo. Switch to PTransform as soon as
  # composite transforms support display data.
  class SpecialParDo(beam.ParDo):
    def __init__(self, fn, now):
      super(SpecialParDo, self).__init__(fn)
      self.fn = fn
      self.now = now

    def display_data(self):
      return {
          'asubcomponent': self.fn,
          'a_class': SpecialParDo,
          'a_time': self.now
      }

  class SpecialDoFn(beam.DoFn):
    def display_data(self):
      return {'dofn_value': 42}

    def process(self, element):
      pass

  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p
   | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))
  p.run()
  job_dict = json.loads(str(remote_runner.job))
  steps = [
      step for step in job_dict['steps']
      if len(step['properties'].get('display_data', [])) > 0
  ]
  step = steps[1]
  disp_data = step['properties']['display_data']
  disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{
      'type': 'TIMESTAMP',
      'namespace': nspace + 'SpecialParDo',
      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
      'key': 'a_time'
  }, {
      'type': 'STRING',
      'namespace': nspace + 'SpecialParDo',
      'value': nspace + 'SpecialParDo',
      'key': 'a_class',
      'shortValue': 'SpecialParDo'
  }, {
      'type': 'INTEGER',
      'namespace': nspace + 'SpecialDoFn',
      'value': 42,
      'key': 'dofn_value'
  }]
  expected_data = sorted(
      expected_data, key=lambda x: x['namespace'] + x['key'])
  self.assertEqual(len(disp_data), 3)
  self.assertEqual(disp_data, expected_data)
def setUp(self):
  self.pipeline = Pipeline(DirectRunner())
  self.visitor = ConsumerTrackingPipelineVisitor()
def run():
  PROJECT_ID = 'electric-spark-266716'  # change to your project id
  BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'population-statistics-beam-dataflow-2'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  # create query to select all elements for cleansing
  sql = 'SELECT * FROM kaggle2_modeled.Population_Statistics'
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  # read desired table from BigQuery
  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

  # write inputs to input.txt
  query_results | 'Write input' >> WriteToText(DIR_PATH + 'input.txt')

  # apply ParDo to transpose the per-year date columns into rows
  transposed_date_pcoll = query_results | 'Transpose Dates' >> beam.ParDo(
      TransposeDateFn())

  # write transposed dates to transposed.txt
  transposed_date_pcoll | 'Write transpose Dates' >> WriteToText(
      DIR_PATH + 'transposed.txt')

  # flatten lists to get individual records
  flatten_pcoll = transposed_date_pcoll | 'Flatten lists' >> beam.FlatMap(
      generate_elements)

  # write resulting PCollection to output.txt
  flatten_pcoll | 'Write output' >> WriteToText(
      DIR_PATH + 'output_final_dates.txt')

  # create new table in BigQuery
  dataset_id = 'kaggle2_modeled'
  table_id = 'Population_Statistics_Beam_DF'
  schema_id = 'dt:DATE,countryName:STRING,countryCode:STRING,' \
              'metric:STRING,metricCode:STRING,statistic:FLOAT'

  # write PCollection to new BQ table
  flatten_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

  result = p.run()
  result.wait_until_finish()
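TransposeDateFn and generate_elements live elsewhere. A rough sketch inferred from the pipeline shape: the transpose step turns one wide row (one column per year) into a list of narrow records, and generate_elements yields each record from that list so FlatMap emits them individually. The column layout is an assumption:

import apache_beam as beam

def generate_elements(elements):
  # Unnest the list produced by TransposeDateFn.
  for element in elements:
    yield element

class TransposeDateFn(beam.DoFn):
  def process(self, element):
    records = []
    for year in range(1960, 2020):  # assumed span of per-year columns
      value = element.get(str(year))
      if value is not None:
        records.append({
            'dt': '{}-01-01'.format(year),  # assumed date convention
            'countryName': element['countryName'],
            'countryCode': element['countryCode'],
            'metric': element['metric'],
            'metricCode': element['metricCode'],
            'statistic': float(value),
        })
    yield records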
def run():
  PROJECT_ID = 'cs327e-sp2020'  # change to your project id
  BUCKET = 'gs://beam-output-data'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'student-df1'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  sql = 'SELECT sid, fname, lname, dob FROM college_modeled.Student'
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

  # write PCollection to log file
  query_results | 'Write log 1' >> WriteToText(DIR_PATH + 'query_results.txt')

  # apply ParDo to format the student's date of birth
  formatted_dob_pcoll = query_results | 'Format DOB' >> beam.ParDo(
      FormatDOBFn())

  # write PCollection to log file
  formatted_dob_pcoll | 'Write log 2' >> WriteToText(
      DIR_PATH + 'formatted_dob_pcoll.txt')

  # group students by sid
  grouped_student_pcoll = formatted_dob_pcoll | 'Group by sid' >> beam.GroupByKey()

  # write PCollection to log file
  grouped_student_pcoll | 'Write log 3' >> WriteToText(
      DIR_PATH + 'grouped_student_pcoll.txt')

  # remove duplicate student records
  distinct_student_pcoll = grouped_student_pcoll | 'Dedup student' >> beam.ParDo(
      DedupStudentRecordsFn())

  # write PCollection to log file
  distinct_student_pcoll | 'Write log 4' >> WriteToText(
      DIR_PATH + 'distinct_student_pcoll.txt')

  dataset_id = 'college_modeled'
  table_id = 'Student_Beam_DF'
  schema_id = 'sid:STRING,fname:STRING,lname:STRING,dob:DATE,status:STRING'

  # write PCollection to new BQ table
  distinct_student_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

  result = p.run()
  result.wait_until_finish()
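FormatDOBFn and DedupStudentRecordsFn are defined elsewhere. A sketch inferred from the pipeline shape: the format step keys each record by sid (so GroupByKey can group duplicates) while normalizing dob, and the dedup step keeps one record per sid. The dob input format is an assumption:

import apache_beam as beam

class FormatDOBFn(beam.DoFn):
  def process(self, element):
    # Assumes dob arrives as 'MM/DD/YYYY'; BigQuery DATE wants 'YYYY-MM-DD'.
    month, day, year = element['dob'].split('/')
    element['dob'] = '{}-{:0>2}-{:0>2}'.format(year, month, day)
    yield (element['sid'], element)

class DedupStudentRecordsFn(beam.DoFn):
  def process(self, element):
    sid, records = element
    yield next(iter(records))  # keep one record per sid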