def test_full_completion(self):
  # Create dummy file and close it. Note that we need to do this because
  # Windows does not allow NamedTemporaryFiles to be reopened elsewhere
  # before the temporary file is closed.
  dummy_file = tempfile.NamedTemporaryFile(delete=False)
  dummy_file_name = dummy_file.name
  dummy_file.close()

  dummy_dir = tempfile.mkdtemp()

  remote_runner = DataflowRunner()
  pipeline = Pipeline(
      remote_runner,
      options=PipelineOptions([
          '--dataflow_endpoint=ignored',
          '--sdk_location=' + dummy_file_name,
          '--job_name=test-job',
          '--project=test-project',
          '--staging_location=' + dummy_dir,
          '--temp_location=/dev/null',
          '--template_location=' + dummy_file_name,
          '--no_auth=True'
      ]))

  pipeline | beam.Create([1, 2, 3]) | beam.Map(lambda x: x)  # pylint: disable=expression-not-assigned
  pipeline.run().wait_until_finish()

  with open(dummy_file_name) as template_file:
    saved_job_dict = json.load(template_file)
    self.assertEqual(
        saved_job_dict['environment']['sdkPipelineOptions']['options']
        ['project'], 'test-project')
    self.assertEqual(
        saved_job_dict['environment']['sdkPipelineOptions']['options']
        ['job_name'], 'test-job')

def test_bigquery_read_streaming_fail(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  _ = p | beam.io.Read(beam.io.BigQuerySource('some.table'))
  with self.assertRaisesRegex(ValueError,
                              r'source is not currently available'):
    p.run()

def test_min_cpu_platform_flag_is_propagated_to_experiments(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--min_cpu_platform=Intel Haswell')

  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  p.run()
  self.assertIn(
      'min_cpu_platform=Intel Haswell',
      remote_runner.job.options.view_as(DebugOptions).experiments)

def test_remote_runner_translation(self):
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))

  (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
   | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
   | ptransform.GroupByKey())
  p.run()

def test_use_fastavro_experiment_is_added_on_py3_and_onwards(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  p.run()

  self.assertEqual(
      sys.version_info[0] > 2,
      remote_runner.job.options.view_as(DebugOptions).lookup_experiment(
          'use_fastavro', False))

def test_use_fastavro_experiment_is_not_added_when_use_avro_is_present(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=use_avro')
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  p.run()

  debug_options = remote_runner.job.options.view_as(DebugOptions)
  self.assertFalse(debug_options.lookup_experiment('use_fastavro', False))

def test_dataflow_worker_jar_flag_non_fnapi_noop(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--experiment=some_other_experiment')
  self.default_properties.append('--dataflow_worker_jar=test.jar')

  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  p.run()

  experiments_for_job = (
      remote_runner.job.options.view_as(DebugOptions).experiments)
  self.assertIn('some_other_experiment', experiments_for_job)
  self.assertNotIn('use_staged_dataflow_worker_jar', experiments_for_job)

def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))

  # TODO: Should not subclass ParDo. Switch to PTransform as soon as
  # composite transforms support display data.
  class SpecialParDo(beam.ParDo):
    def __init__(self, fn, now):
      super(SpecialParDo, self).__init__(fn)
      self.fn = fn
      self.now = now

    # Make this a list to be accessible within closure
    def display_data(self):
      return {
          'asubcomponent': self.fn,
          'a_class': SpecialParDo,
          'a_time': self.now
      }

  class SpecialDoFn(beam.DoFn):
    def display_data(self):
      return {'dofn_value': 42}

    def process(self):
      pass

  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))
  p.run()

  job_dict = json.loads(str(remote_runner.job))
  steps = [
      step for step in job_dict['steps']
      if len(step['properties'].get('display_data', [])) > 0
  ]
  step = steps[1]
  disp_data = step['properties']['display_data']
  disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{
      'type': 'TIMESTAMP',
      'namespace': nspace + 'SpecialParDo',
      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
      'key': 'a_time'
  }, {
      'type': 'STRING',
      'namespace': nspace + 'SpecialParDo',
      'value': nspace + 'SpecialParDo',
      'key': 'a_class',
      'shortValue': 'SpecialParDo'
  }, {
      'type': 'INTEGER',
      'namespace': nspace + 'SpecialDoFn',
      'value': 42,
      'key': 'dofn_value'
  }]
  expected_data = sorted(
      expected_data, key=lambda x: x['namespace'] + x['key'])

  self.assertEqual(len(disp_data), 3)
  self.assertEqual(disp_data, expected_data)

def test_streaming_create_translation(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  p.run()

  job_dict = json.loads(str(remote_runner.job))
  self.assertEqual(len(job_dict[u'steps']), 2)

  self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
  self.assertEqual(
      job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
      '_starting_signal/')
  self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')

def test_streaming_engine_flag_adds_windmill_experiments(self):
  remote_runner = DataflowRunner()
  self.default_properties.append('--streaming')
  self.default_properties.append('--enable_streaming_engine')
  self.default_properties.append('--experiment=some_other_experiment')

  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  p.run()

  experiments_for_job = (
      remote_runner.job.options.view_as(DebugOptions).experiments)
  self.assertIn('enable_streaming_engine', experiments_for_job)
  self.assertIn('enable_windmill_service', experiments_for_job)
  self.assertIn('some_other_experiment', experiments_for_job)

def test_environment_override_translation(self):
  self.default_properties.append('--experiments=beam_fn_api')
  self.default_properties.append('--worker_harness_container_image=FOO')
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))
  (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
   | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
   | ptransform.GroupByKey())
  p.run()
  self.assertEqual(
      list(remote_runner.proto_pipeline.components.environments.values()),
      [
          beam_runner_api_pb2.Environment(
              urn=common_urns.environments.DOCKER.urn,
              payload=beam_runner_api_pb2.DockerPayload(
                  container_image='FOO').SerializeToString())
      ])

def test_bad_path(self):
  dummy_sdk_file = tempfile.NamedTemporaryFile()
  remote_runner = DataflowRunner()
  pipeline = Pipeline(
      remote_runner,
      options=PipelineOptions([
          '--dataflow_endpoint=ignored',
          '--sdk_location=' + dummy_sdk_file.name,
          '--job_name=test-job',
          '--project=test-project',
          '--staging_location=ignored',
          '--temp_location=/dev/null',
          '--template_location=/bad/path',
          '--no_auth=True'
      ]))
  remote_runner.job = apiclient.Job(pipeline._options)

  with self.assertRaises(IOError):
    pipeline.run().wait_until_finish()

def test_ptransform_override_multiple_inputs(self):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return isinstance(applied_ptransform.transform, FlattenAndDouble)

    def get_replacement_transform(self, applied_ptransform):
      return FlattenAndTriple()

  p = Pipeline()
  pcoll1 = p | 'pc1' >> beam.Create([1, 2, 3])
  pcoll2 = p | 'pc2' >> beam.Create([4, 5, 6])
  pcoll3 = (pcoll1, pcoll2) | 'FlattenAndMultiply' >> FlattenAndDouble()
  assert_that(pcoll3, equal_to([3, 6, 9, 12, 15, 18]))

  p.replace_all([MyParDoOverride()])
  p.run()

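# FlattenAndDouble and FlattenAndTriple are defined outside this excerpt.
# A hedged sketch of what they might look like, inferred only from the
# asserted output [3, 6, 9, 12, 15, 18] (the two inputs flattened, then
# tripled after the override replaces doubling); the class bodies here are
# assumptions, not the original definitions.
import apache_beam as beam

class FlattenAndDouble(beam.PTransform):
  """Flattens the input PCollections and doubles each element."""
  def expand(self, pcolls):
    return pcolls | beam.Flatten() | beam.Map(lambda x: x * 2)

class FlattenAndTriple(beam.PTransform):
  """Flattens the input PCollections and triples each element."""
  def expand(self, pcolls):
    return pcolls | beam.Flatten() | beam.Map(lambda x: x * 3)
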
def test_direct_runner_metrics(self):
  class MyDoFn(beam.DoFn):
    def start_bundle(self):
      count = Metrics.counter(self.__class__, 'bundles')
      count.inc()

    def finish_bundle(self):
      count = Metrics.counter(self.__class__, 'finished_bundles')
      count.inc()

    def process(self, element):
      gauge = Metrics.gauge(self.__class__, 'latest_element')
      gauge.set(element)
      count = Metrics.counter(self.__class__, 'elements')
      count.inc()
      distro = Metrics.distribution(self.__class__, 'element_dist')
      distro.update(element)
      return [element]

  runner = DirectRunner()
  p = Pipeline(runner, options=PipelineOptions(self.default_properties))
  pcoll = (p
           | ptransform.Create([1, 2, 3, 4, 5])
           | 'Do' >> beam.ParDo(MyDoFn()))
  assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
  result = p.run()
  result.wait_until_finish()
  metrics = result.metrics().query()
  namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

  hc.assert_that(
      metrics['counters'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'elements')), 5, 5),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'bundles')), 1, 1),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'finished_bundles')),
              1, 1)))
  hc.assert_that(
      metrics['distributions'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'element_dist')),
              DistributionResult(DistributionData(15, 5, 1, 5)),
              DistributionResult(DistributionData(15, 5, 1, 5)))))

  gauge_result = metrics['gauges'][0]
  hc.assert_that(
      gauge_result.key,
      hc.equal_to(MetricKey('Do', MetricName(namespace, 'latest_element'))))
  hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
  hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))

def test_ptransform_overrides(self):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return isinstance(applied_ptransform.transform, DoubleParDo)

    def get_replacement_transform(self, ptransform):
      if isinstance(ptransform, DoubleParDo):
        return TripleParDo()
      raise ValueError('Unsupported type of transform: %r' % ptransform)

  p = Pipeline()
  pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
  assert_that(pcoll, equal_to([3, 6, 9]))

  p.replace_all([MyParDoOverride()])
  p.run()

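# DoubleParDo and TripleParDo also live outside this excerpt. A minimal
# sketch consistent with the test above (Create([1, 2, 3]) is asserted
# equal to [3, 6, 9] once the override swaps doubling for tripling); the
# bodies are assumptions.
import apache_beam as beam

class DoubleParDo(beam.PTransform):
  """Doubles each element; replaced by TripleParDo in the test above."""
  def expand(self, pcoll):
    return pcoll | beam.Map(lambda x: x * 2)

class TripleParDo(beam.PTransform):
  """Triples each element."""
  def expand(self, pcoll):
    return pcoll | beam.Map(lambda x: x * 3)
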
def run():
  PROJECT_ID = 'electric-spark-266716'  # change to your project id
  BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'country-beam-dataflow'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  # Create query to select all elements for cleansing.
  sql = ('SELECT dt, AverageTemperature, AverageTemperatureUncertainty, '
         'Country FROM kaggle_modeled.Country as x')
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  # Read desired table from BigQuery.
  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

  # Write inputs to input.txt.
  query_results | 'Write input' >> WriteToText(DIR_PATH + 'input_country.txt')

  # Apply ParDo to filter out dates.
  formatted_country_pcoll = query_results | 'Filter Dates' >> beam.ParDo(
      FilterDateFn())

  # Display filtered countries.
  formatted_country_pcoll | 'Write filtered dates' >> WriteToText(
      DIR_PATH + 'output_country.txt')

  # Create new table in BigQuery.
  dataset_id = 'kaggle_modeled'
  table_id = 'Country_Beam_DF'
  schema_id = ('dt:DATE,AverageTemperature:FLOAT,'
               'AverageTemperatureUncertainty:FLOAT,Country:STRING')

  # Write PCollection to new BQ table.
  formatted_country_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

  result = p.run()
  result.wait_until_finish()

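# FilterDateFn is not shown in this excerpt. A hedged sketch of the shape
# such a cleansing DoFn could take; the exact filter condition is an
# assumption, since only the step name 'Filter Dates' and the dt:DATE
# output column are known.
import apache_beam as beam

class FilterDateFn(beam.DoFn):
  def process(self, element):
    # element is a dict row from the BigQuery read, e.g.
    # {'dt': ..., 'AverageTemperature': ...}; drop rows with no usable date.
    if element.get('dt') is not None:
      yield element
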
def run():
  PROJECT_ID = 'cs327e-sp2020'  # change to your project id
  BUCKET = 'gs://beam-output-data'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'takes-df'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  p = Pipeline(options=options)

  takes_sql = 'SELECT sid, cno, grade FROM college_workflow_modeled.Takes'
  class_sql = 'SELECT cid, cno FROM college_workflow_modeled.Class'

  takes_pcoll = p | 'Read from BQ Takes' >> beam.io.Read(
      beam.io.BigQuerySource(query=takes_sql, use_standard_sql=True))
  class_pcoll = p | 'Read from BQ Class' >> beam.io.Read(
      beam.io.BigQuerySource(query=class_sql, use_standard_sql=True))

  # Write PCollections to log files.
  takes_pcoll | 'Write log 1' >> WriteToText(
      DIR_PATH + 'takes_query_results.txt')
  class_pcoll | 'Write log 2' >> WriteToText(
      DIR_PATH + 'class_query_results.txt')

  # ParDo with side input.
  norm_takes_pcoll = takes_pcoll | 'Normalize Record' >> beam.ParDo(
      NormalizeDoFn(), beam.pvalue.AsList(class_pcoll))

  # Write PCollection to log file.
  norm_takes_pcoll | 'Write log 3' >> WriteToText(
      DIR_PATH + 'norm_takes_pcoll.txt')

  dataset_id = 'college_workflow_modeled'
  table_id = 'Takes_Beam_DF'
  schema_id = 'sid:STRING,cid:STRING,grade:STRING'

  # Write PCollection to new BQ table.
  norm_takes_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

  result = p.run()
  result.wait_until_finish()

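# NormalizeDoFn is defined elsewhere; the pipeline hands it the Class rows
# as a list side input (beam.pvalue.AsList(class_pcoll)). A minimal sketch
# of that side-input pattern, assuming the DoFn swaps each Takes row's cno
# for the matching cid (the mapping logic is an assumption inferred from
# the output schema sid/cid/grade).
import apache_beam as beam

class NormalizeDoFn(beam.DoFn):
  def process(self, element, class_rows):
    # class_rows is the materialized side input: every Class record.
    cno_to_cid = {row['cno']: row['cid'] for row in class_rows}
    cid = cno_to_cid.get(element['cno'])
    if cid is not None:
      yield {'sid': element['sid'], 'cid': cid, 'grade': element['grade']}
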
def test_direct_runner_metrics(self):
  class MyDoFn(beam.DoFn):
    def start_bundle(self):
      count = Metrics.counter(self.__class__, 'bundles')
      count.inc()

    def finish_bundle(self):
      count = Metrics.counter(self.__class__, 'finished_bundles')
      count.inc()

    def process(self, element):
      gauge = Metrics.gauge(self.__class__, 'latest_element')
      gauge.set(element)
      count = Metrics.counter(self.__class__, 'elements')
      count.inc()
      distro = Metrics.distribution(self.__class__, 'element_dist')
      distro.update(element)
      return [element]

  p = Pipeline(DirectRunner())
  pcoll = (p
           | beam.Create([1, 2, 3, 4, 5])
           | 'Do' >> beam.ParDo(MyDoFn()))
  assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
  result = p.run()
  result.wait_until_finish()
  metrics = result.metrics().query()
  namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

  hc.assert_that(
      metrics['counters'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'elements')), 5, 5),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'bundles')), 1, 1),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'finished_bundles')),
              1, 1)))
  hc.assert_that(
      metrics['distributions'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'element_dist')),
              DistributionResult(DistributionData(15, 5, 1, 5)),
              DistributionResult(DistributionData(15, 5, 1, 5)))))

  gauge_result = metrics['gauges'][0]
  hc.assert_that(
      gauge_result.key,
      hc.equal_to(MetricKey('Do', MetricName(namespace, 'latest_element'))))
  hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
  hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))

def run():
  PROJECT_ID = 'cs327e-sp2020'  # change to your project id
  BUCKET = 'gs://beam-output-data'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'student-df2'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  sql = ('SELECT sid, fname, lname, dob, status '
         'FROM college_workflow_modeled.Student')
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

  # Standardize the students' date of birth.
  formatted_dob_pcoll = query_results | 'Format DOB' >> beam.ParDo(
      FormatDOBFn())

  # Write PCollection to log file.
  formatted_dob_pcoll | 'Write log 1' >> WriteToText(
      DIR_PATH + 'formatted_dob_pcoll.txt')

  # Group students by sid.
  grouped_student_pcoll = (
      formatted_dob_pcoll | 'Group by sid' >> beam.GroupByKey())

  # Write PCollection to log file.
  # grouped_student_pcoll | 'Write log 2' >> WriteToText(
  #     DIR_PATH + 'grouped_student_pcoll.txt')

  # Remove duplicate student records.
  distinct_student_pcoll = (
      grouped_student_pcoll
      | 'Dedup student records' >> beam.ParDo(DedupStudentRecordsFn()))

  # Write PCollection to log file.
  distinct_student_pcoll | 'Write log 3' >> WriteToText(
      DIR_PATH + 'distinct_student_pcoll.txt')

  dataset_id = 'college_workflow_modeled'
  table_id = 'Student_Beam_DF'
  schema_id = 'sid:STRING,fname:STRING,lname:STRING,dob:DATE,status:STRING'

  # Write PCollection to new BQ table.
  distinct_student_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

  result = p.run()
  result.wait_until_finish()

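# FormatDOBFn and DedupStudentRecordsFn are not shown here. The GroupByKey
# between them implies FormatDOBFn emits (sid, record) pairs and the dedup
# DoFn keeps one record per sid; everything beyond that key-value shape is
# an assumption.
import apache_beam as beam

class FormatDOBFn(beam.DoFn):
  def process(self, element):
    # Key each student record by sid so duplicates group together
    # downstream (dob normalization would also happen here).
    yield (element['sid'], element)

class DedupStudentRecordsFn(beam.DoFn):
  def process(self, element):
    _sid, records = element
    # records is the iterable of grouped rows; keep the first one.
    yield next(iter(records))
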
def run(self, transform, options=None):
  """Run the given transform with this runner."""
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam.pipeline import Pipeline
  p = Pipeline(runner=self, options=options)
  p | transform
  return p.run()

def run():
  PROJECT_ID = 'studied-brand-266702'  # change to your project id
  BUCKET = 'gs://beam_cs327e_project'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'vaccination-df'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  # Run BigQuery query on dataset.
  sql = 'SELECT * FROM vaers_modeled.Vaccination'
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  input_pcoll = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

  # Write input PCollection to input.txt.
  input_pcoll | 'Write input_pcoll log 1' >> WriteToText(
      DIR_PATH + 'input_vaccination.txt')

  # Standardize unknown/empty V_FUNDBY, VAX_ROUTE and VAX_SITE attributes.
  formatted_vaccination_pcoll = (
      input_pcoll | 'Format Unknown Values' >> beam.ParDo(FormatUnknownFn()))

  # Write PCollection to log file.
  formatted_vaccination_pcoll | 'Write log 2' >> WriteToText(
      DIR_PATH + 'formatted_unknown_pcoll.txt')

  # Specify id and schema.
  dataset_id = 'vaers_modeled'
  table_id = 'Vaccination_Beam_DF'
  schema_id = ('VACCINATION_ID:INTEGER,VAERS_ID:INTEGER,VAX_DATE:DATE,'
               'VAX_ID:INTEGER,MANU_ID:INTEGER,V_ADMINBY:STRING,'
               'V_FUNDBY:STRING,VAX_ROUTE:STRING,VAX_SITE:STRING')

  # Write output PCollection to new BQ table.
  formatted_vaccination_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
      batch_size=int(100))

  result = p.run()
  result.wait_until_finish()

def run():
  PROJECT_ID = 'cs327e-sp2020'  # change to your project id
  BUCKET = 'gs://beam-output-data'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'teacher-df'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  sql = 'SELECT tid, instructor, dept FROM college_workflow_modeled.Teacher'
  query_results = p | 'Read from BigQuery' >> beam.io.Read(
      beam.io.BigQuerySource(query=sql, use_standard_sql=True))
  query_results | 'Write log 1' >> WriteToText('query_results.txt')

  teacher_pcoll = query_results | 'Standardize' >> beam.ParDo(
      StandardizeDoFn())
  teacher_pcoll | 'Write log 2' >> WriteToText('formatted_teacher_pcoll.txt')

  # Group records by tid.
  grouped_pcoll = teacher_pcoll | 'Group by tid' >> beam.GroupByKey()
  grouped_pcoll | 'Write log 3' >> WriteToText('grouped_teacher.txt')

  # Remove duplicates.
  distinct_pcoll = grouped_pcoll | 'Dedup' >> beam.ParDo(DedupRecordsDoFn())
  distinct_pcoll | 'Write log 4' >> WriteToText('distinct_teacher.txt')

  dataset_id = 'college_workflow_modeled'
  table_id = 'Teacher_Beam_DF'
  schema_id = 'tid:STRING,fname:STRING,lname:STRING,dept:STRING'

  distinct_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

  result = p.run()
  result.wait_until_finish()

def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(
      remote_runner, options=PipelineOptions(self.default_properties))

  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))

  # TODO(https://github.com/apache/beam/issues/18012): Enable runner API on
  # this test.
  p.run(test_runner_api=False)
  job_dict = json.loads(str(remote_runner.job))
  steps = [
      step for step in job_dict['steps']
      if len(step['properties'].get('display_data', [])) > 0
  ]
  step = steps[1]
  disp_data = step['properties']['display_data']
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{
      'type': 'TIMESTAMP',
      'namespace': nspace + 'SpecialParDo',
      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
      'key': 'a_time'
  }, {
      'type': 'STRING',
      'namespace': nspace + 'SpecialParDo',
      'value': nspace + 'SpecialParDo',
      'key': 'a_class',
      'shortValue': 'SpecialParDo'
  }, {
      'type': 'INTEGER',
      'namespace': nspace + 'SpecialDoFn',
      'value': 42,
      'key': 'dofn_value'
  }]
  self.assertUnhashableCountEqual(disp_data, expected_data)

def test_ptransform_override_side_inputs(self):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return (
          isinstance(applied_ptransform.transform, ParDo) and
          isinstance(applied_ptransform.transform.fn, AddWithProductDoFn))

    def get_replacement_transform(self, transform):
      return AddThenMultiply()

  p = Pipeline()
  pcoll1 = p | 'pc1' >> beam.Create([2])
  pcoll2 = p | 'pc2' >> beam.Create([3])
  pcoll3 = p | 'pc3' >> beam.Create([4, 5, 6])
  result = pcoll3 | 'Operate' >> beam.ParDo(
      AddWithProductDoFn(), AsSingleton(pcoll1), AsSingleton(pcoll2))
  assert_that(result, equal_to([18, 21, 24]))

  p.replace_all([MyParDoOverride()])
  p.run()

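# AddWithProductDoFn and AddThenMultiply come from the surrounding test
# module and are not shown here. A sketch consistent with the asserted
# output -- [4, 5, 6] with singleton side inputs 2 and 3 yields
# [18, 21, 24], i.e. (x + 2) * 3 -- though the exact bodies are
# assumptions. The replacement transform receives the main input and both
# side-input PCollections together, so it unpacks a 3-tuple.
import apache_beam as beam
from apache_beam.pvalue import AsSingleton

class AddWithProductDoFn(beam.DoFn):
  def process(self, element, addend, multiplier):
    # addend and multiplier arrive already unwrapped via AsSingleton.
    yield (element + addend) * multiplier

class AddThenMultiply(beam.PTransform):
  def expand(self, pvalueish):
    pcoll, addend, multiplier = pvalueish
    return pcoll | beam.Map(
        lambda x, a, m: (x + a) * m,
        AsSingleton(addend),
        AsSingleton(multiplier))
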
def run():
  PROJECT_ID = 'starry-center-266501'  # change to your project id
  BUCKET = 'gs://imdb-beam'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'format-date-df'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  sql = 'SELECT * FROM bollywood_modeled.bollywoodTitles'
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

  # Write PCollection to log file.
  query_results | 'Write log 1' >> WriteToText(DIR_PATH + 'query_results.txt')

  # Apply ParDo to format the release dates.
  formatDate_pcoll = query_results | 'Format the dates' >> beam.ParDo(
      FormatDateFn())

  # Write PCollection to log file.
  formatDate_pcoll | 'Write log 2' >> WriteToText(
      DIR_PATH + 'formatDate_pcoll.txt')

  dataset_id = 'bollywood_modeled'
  table_id = 'bollywoodTitles_Beam_DF'
  schema_id = 'title:STRING,releaseDate:DATE,croresGrossed:NUMERIC'

  # Write PCollection to new BQ table.
  formatDate_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

  result = p.run()
  result.wait_until_finish()

def run():
  PROJECT_ID = 'responsive-cab-267123'  # change to your project id
  BUCKET = 'gs://bmease_cs327e'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'foodmap-df'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Beam pipeline with the Dataflow options.
  p = Pipeline(options=options)

  # Select the products to match, excluding non-food aisles.
  sql = (
      "SELECT LOWER(product_name) AS product_name, product_id, p.aisle_id "
      "FROM instacart_modeled.Products p "
      "WHERE p.product_name NOT LIKE '%Filters%' "
      "AND p.aisle_id NOT IN (11,20,22,25,44,55,73,80,109,118,126,127,132,"
      "133,10,54,60,74,75,85,87,101,111,114,56,82,102)")
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

  # Write PCollection to input file.
  query_results | 'Write to input.txt' >> WriteToText(DIR_PATH + 'input.txt')

  # Apply ParDo to match each food to a product id.
  nom_match_pcoll = query_results | 'Food and matches from nom' >> beam.ParDo(
      MatchProductFn())

  # Write PCollection to output file.
  nom_match_pcoll | 'Write to output.txt' >> WriteToText(
      DIR_PATH + 'output.txt')

  dataset_id = 'USDA_ERS_modeled'
  table_id = 'Food_Map_Beam_DF'
  schema_id = 'food_id:INTEGER,product_id:INTEGER'

  # Write PCollection to new BQ table.
  nom_match_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
      batch_size=int(100))

  result = p.run()
  result.wait_until_finish()

def run():
  PROJECT_ID = 'swift-area-266618'  # change to your project id
  BUCKET = 'gs://nullbusters_data'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'student-df5'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  sql = (
      'SELECT imdb_title_id, title, original_title, year, genre, duration, '
      'country, language, director, writer, production_company, actors, '
      'description, avg_vote, votes, budget, usa_gross_income, '
      'worlwide_gross_income, metascore, reviews_from_users, '
      'reviews_from_critics '
      'FROM imdb_modeled.Movies '
      'WHERE usa_gross_income IS NOT NULL '
      'AND worlwide_gross_income IS NOT NULL')
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)
  query_results | 'Write log Input' >> WriteToText('input.txt')

  # Apply ParDo to format the movies' year fields as ints.
  formatted_year_pcoll = query_results | 'Format Years' >> beam.ParDo(
      FormatYearsFn())

  # Write PCollection to log file.
  formatted_year_pcoll | 'Write log Output' >> WriteToText(
      DIR_PATH + 'output.txt')

  dataset_id = 'imdb_modeled'
  table_id = 'Movies_Beam_DF'
  schema_id = (
      'imdb_title_id:STRING,title:STRING,original_title:STRING,'
      'year:INTEGER,genre:STRING,duration:INTEGER,country:STRING,'
      'language:STRING,director:STRING,writer:STRING,'
      'production_company:STRING,actors:STRING,description:STRING,'
      'avg_votes:FLOAT,votes:INTEGER,budget_currency:STRING,budget:INTEGER,'
      'usa_gross_income:INTEGER,worlwide_gross_income_currency:STRING,'
      'worlwide_gross_income:INTEGER,metascore:FLOAT,'
      'reviews_from_users:FLOAT,reviews_from_critics:FLOAT')

  # Write PCollection to new BQ table.
  formatted_year_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
      batch_size=int(100))

  result = p.run()
  result.wait_until_finish()

def run():
  PROJECT_ID = 'swift-area-266618'  # change to your project id
  BUCKET = 'gs://nullbusters_data'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'directors'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  sql = (
      'SELECT name, birth_name, height, bio, birth_details, birth_year, '
      'place_of_birth, death_details, death_year, spouses, divorces, '
      'children, known_for_titles, imdb_title_id, director_name_id, '
      'category, reason_of_death '
      'FROM imdb_modeled.Directors '
      'WHERE birth_year IS NOT NULL AND death_year IS NOT NULL')
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)
  query_results | 'Write log Input' >> WriteToText('input.txt')

  # Apply ParDo to format directors' birth and death years as ints.
  formatted_year_pcoll = query_results | 'Format Years' >> beam.ParDo(
      FormatYearsFn())

  # Write PCollection to log file.
  formatted_year_pcoll | 'Write log Output' >> WriteToText(
      DIR_PATH + 'output.txt')

  dataset_id = 'imdb_modeled'
  table_id = 'Directors_Beam_DF'
  schema_id = (
      'name:STRING,birth_name:STRING,height:FLOAT,bio:STRING,'
      'birth_details:STRING,birth_year:INTEGER,place_of_birth:STRING,'
      'death_details:STRING,death_year:INTEGER,spouses:INTEGER,'
      'divorces:INTEGER,children:STRING,known_for_titles:STRING,'
      'imdb_title_id:STRING,category:STRING,reason_of_death:STRING,'
      'director_name_id:STRING')

  # Write PCollection to new BQ table.
  formatted_year_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
      batch_size=int(100))

  result = p.run()
  result.wait_until_finish()

def test_direct_runner_metrics(self):
  from apache_beam.metrics.metric import Metrics

  class MyDoFn(beam.DoFn):
    def start_bundle(self):
      count = Metrics.counter(self.__class__, 'bundles')
      count.inc()

    def finish_bundle(self):
      count = Metrics.counter(self.__class__, 'finished_bundles')
      count.inc()

    def process(self, element):
      count = Metrics.counter(self.__class__, 'elements')
      count.inc()
      distro = Metrics.distribution(self.__class__, 'element_dist')
      distro.update(element)
      return [element]

  runner = DirectRunner()
  p = Pipeline(runner, options=PipelineOptions(self.default_properties))
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> beam.ParDo(MyDoFn()))
  result = p.run()
  result.wait_until_finish()
  metrics = result.metrics().query()
  namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

  hc.assert_that(
      metrics['counters'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'elements')), 5, 5),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'bundles')), 1, 1),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'finished_bundles')),
              1, 1)))
  hc.assert_that(
      metrics['distributions'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'element_dist')),
              DistributionResult(DistributionData(15, 5, 1, 5)),
              DistributionResult(DistributionData(15, 5, 1, 5)))))

def run(self, transform, options=None):
  """Run the given transform or callable with this runner."""
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam import PTransform
  from apache_beam.pvalue import PBegin
  from apache_beam.pipeline import Pipeline
  p = Pipeline(runner=self, options=options)
  if isinstance(transform, PTransform):
    p | transform
  else:
    transform(PBegin(p))
  return p.run()

def run():
  PROJECT_ID = 'spry-cosine-266801'
  BUCKET = 'gs://icyhot-pack_beam'
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  # Dataflow job names do not allow '_' or special characters.
  google_cloud_options.job_name = 'location-df'
  google_cloud_options.staging_location = BUCKET + '/staging'  # required
  google_cloud_options.temp_location = BUCKET + '/temp'  # required
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)

  sql = ('SELECT id, province_state, country_region '
         'FROM covid19_jhu_csse_modeled.location_id')
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

  # Format US records.
  formatted_us_pcoll = query_results | 'Format US' >> beam.ParDo(FormatUSFn())

  # Write PCollection to log file.
  formatted_us_pcoll | 'Write log 1' >> WriteToText(
      DIR_PATH + 'formatted_us_pcoll.txt')

  dataset_id = 'covid19_jhu_csse_modeled'
  table_id = 'location_id_Beam_DF'
  schema_id = 'id:INTEGER,province_state:STRING,country_region:STRING'

  # Write PCollection to new BQ table.
  formatted_us_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

  result = p.run()
  result.wait_until_finish()

def run():
  PROJECT_ID = 'cs327e-sp2020'  # change to your project id
  BUCKET = 'gs://beam-output-data'  # change to your bucket name
  DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
      '%Y_%m_%d_%H_%M_%S') + '/'

  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=None)

  # For Dataflow execution, set the project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.job_name = 'location'
  google_cloud_options.staging_location = BUCKET + '/staging'
  google_cloud_options.temp_location = BUCKET + '/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  p = Pipeline(options=options)

  sql = 'SELECT * FROM covid_19_modeled.Location_SQL_1'
  bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

  query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

  # Extract city from state.
  state_pcoll = query_results | 'Format State' >> beam.ParDo(FormatStateFn())

  grouped_pcoll = state_pcoll | 'Group Locations' >> beam.GroupByKey()

  unique_pcoll = grouped_pcoll | 'Remove Duplicates' >> beam.ParDo(
      RemoveDuplicatesFn())

  dataset_id = 'covid_19_modeled'
  table_id = 'Location_Beam_DF'
  schema_id = ('id:INTEGER,city:STRING,state:STRING,country:STRING,'
               'latitude:NUMERIC,longitude:NUMERIC,fips:INTEGER,'
               'admin2:STRING,combined_key:STRING')

  # Write PCollection to BQ table.
  unique_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
      dataset=dataset_id,
      table=table_id,
      schema=schema_id,
      project=PROJECT_ID,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

  result = p.run()
  result.wait_until_finish()

def run_async(self, transform, options=None):
  """Run the given transform or callable with this runner.

  May return immediately, executing the pipeline in the background.
  The returned result object can be queried for progress, and
  `wait_until_finish` may be called to block until completion.
  """
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam import PTransform
  from apache_beam.pvalue import PBegin
  from apache_beam.pipeline import Pipeline
  p = Pipeline(runner=self, options=options)
  if isinstance(transform, PTransform):
    p | transform
  else:
    transform(PBegin(p))
  return p.run()

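# A hedged usage sketch for run_async (the runner choice, transform, and
# options below are illustrative, not part of the source): kick off a
# pipeline that may execute in the background, then block only when the
# result is actually needed.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.direct.direct_runner import DirectRunner

runner = DirectRunner()
result = runner.run_async(
    beam.Create([1, 2, 3]) | beam.Map(lambda x: x * 2),
    options=PipelineOptions())
result.wait_until_finish()  # blocks until the run completes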