Example #1
def test_full_completion(self):
    # Create dummy file and close it.  Note that we need to do this because
    # Windows does not allow NamedTemporaryFiles to be reopened elsewhere
    # before the temporary file is closed.
    dummy_file = tempfile.NamedTemporaryFile(delete=False)
    dummy_file_name = dummy_file.name
    dummy_file.close()

    dummy_dir = tempfile.mkdtemp()

    remote_runner = DataflowRunner()
    pipeline = Pipeline(remote_runner,
                        options=PipelineOptions([
                            '--dataflow_endpoint=ignored',
                            '--sdk_location=' + dummy_file_name,
                            '--job_name=test-job',
                            '--project=test-project',
                            '--staging_location=' + dummy_dir,
                            '--temp_location=/dev/null',
                            '--template_location=' + dummy_file_name,
                            '--no_auth=True']))

    pipeline | beam.Create([1, 2, 3]) | beam.Map(lambda x: x) # pylint: disable=expression-not-assigned
    pipeline.run().wait_until_finish()
    with open(dummy_file_name) as template_file:
      saved_job_dict = json.load(template_file)
      self.assertEqual(
          saved_job_dict['environment']['sdkPipelineOptions']
          ['options']['project'], 'test-project')
      self.assertEqual(
          saved_job_dict['environment']['sdkPipelineOptions']
          ['options']['job_name'], 'test-job')
Example #2
 def test_biqquery_read_streaming_fail(self):
   remote_runner = DataflowRunner()
   self.default_properties.append("--streaming")
   p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
   _ = p | beam.io.Read(beam.io.BigQuerySource('some.table'))
   with self.assertRaisesRegexp(ValueError,
                                r'source is not currently available'):
     p.run()
Example #3
 def test_biqquery_read_streaming_fail(self):
     remote_runner = DataflowRunner()
     self.default_properties.append("--streaming")
     p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
     _ = p | beam.io.Read(beam.io.BigQuerySource('some.table'))
     with self.assertRaisesRegex(ValueError,
                                 r'source is not currently available'):
         p.run()
Example #4
  def test_min_cpu_platform_flag_is_propagated_to_experiments(self):
    remote_runner = DataflowRunner()
    self.default_properties.append('--min_cpu_platform=Intel Haswell')

    p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    p.run()
    self.assertIn('min_cpu_platform=Intel Haswell',
                  remote_runner.job.options.view_as(DebugOptions).experiments)
Example #5
  def test_remote_runner_translation(self):
    remote_runner = DataflowRunner()
    p = Pipeline(remote_runner,
                 options=PipelineOptions(self.default_properties))

    (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
     | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
     | ptransform.GroupByKey())
    p.run()
Example #6
    def test_remote_runner_translation(self):
        remote_runner = DataflowRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions(self.default_properties))

        (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
         | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
         | ptransform.GroupByKey())
        p.run()
Example #7
    def test_use_fastavro_experiment_is_added_on_py3_and_onwards(self):
        remote_runner = DataflowRunner()

        p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
        p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
        p.run()

        self.assertEqual(
            sys.version_info[0] > 2,
            remote_runner.job.options.view_as(DebugOptions).lookup_experiment(
                'use_fastavro', False))
Example #8
  def test_use_fastavro_experiment_is_not_added_when_use_avro_is_present(self):
    remote_runner = DataflowRunner()
    self.default_properties.append('--experiment=use_avro')

    p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    p.run()

    debug_options = remote_runner.job.options.view_as(DebugOptions)

    self.assertFalse(debug_options.lookup_experiment('use_fastavro', False))
Example #9
    def test_dataflow_worker_jar_flag_non_fnapi_noop(self):
        remote_runner = DataflowRunner()
        self.default_properties.append('--experiment=some_other_experiment')
        self.default_properties.append('--dataflow_worker_jar=test.jar')

        p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
        p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
        p.run()

        experiments_for_job = (
            remote_runner.job.options.view_as(DebugOptions).experiments)
        self.assertIn('some_other_experiment', experiments_for_job)
        self.assertNotIn('use_staged_dataflow_worker_jar', experiments_for_job)
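
The experiment assertions in the tests above all go through DebugOptions. As a side note, a minimal sketch of reading and writing experiments on a PipelineOptions object directly (assuming a Beam version that provides DebugOptions.add_experiment and lookup_experiment):

# Sketch only: manipulating experiments via DebugOptions outside a runner.
from apache_beam.options.pipeline_options import DebugOptions, PipelineOptions

options = PipelineOptions(['--experiment=some_other_experiment'])
debug_options = options.view_as(DebugOptions)
debug_options.add_experiment('use_avro')             # append if not already present
print(debug_options.lookup_experiment('use_avro'))   # True when the experiment is set
print(debug_options.experiments)                     # ['some_other_experiment', 'use_avro']
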
Example #10
  def test_remote_runner_display_data(self):
    remote_runner = DataflowRunner()
    p = Pipeline(remote_runner,
                 options=PipelineOptions(self.default_properties))

    # TODO: Should not subclass ParDo. Switch to PTransform as soon as
    # composite transforms support display data.
    class SpecialParDo(beam.ParDo):
      def __init__(self, fn, now):
        super(SpecialParDo, self).__init__(fn)
        self.fn = fn
        self.now = now

      # Make this a list to be accessible within closure
      def display_data(self):
        return {'asubcomponent': self.fn,
                'a_class': SpecialParDo,
                'a_time': self.now}

    class SpecialDoFn(beam.DoFn):
      def display_data(self):
        return {'dofn_value': 42}

      def process(self):
        pass

    now = datetime.now()
    # pylint: disable=expression-not-assigned
    (p | ptransform.Create([1, 2, 3, 4, 5])
     | 'Do' >> SpecialParDo(SpecialDoFn(), now))

    p.run()
    job_dict = json.loads(str(remote_runner.job))
    steps = [step
             for step in job_dict['steps']
             if len(step['properties'].get('display_data', [])) > 0]
    step = steps[1]
    disp_data = step['properties']['display_data']
    disp_data = sorted(disp_data, key=lambda x: x['namespace']+x['key'])
    nspace = SpecialParDo.__module__+ '.'
    expected_data = [{'type': 'TIMESTAMP', 'namespace': nspace+'SpecialParDo',
                      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
                      'key': 'a_time'},
                     {'type': 'STRING', 'namespace': nspace+'SpecialParDo',
                      'value': nspace+'SpecialParDo', 'key': 'a_class',
                      'shortValue': 'SpecialParDo'},
                     {'type': 'INTEGER', 'namespace': nspace+'SpecialDoFn',
                      'value': 42, 'key': 'dofn_value'}]
    expected_data = sorted(expected_data, key=lambda x: x['namespace']+x['key'])
    self.assertEqual(len(disp_data), 3)
    self.assertEqual(disp_data, expected_data)
Example #11
  def test_streaming_create_translation(self):
    remote_runner = DataflowRunner()
    self.default_properties.append("--streaming")
    p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    p.run()
    job_dict = json.loads(str(remote_runner.job))
    self.assertEqual(len(job_dict[u'steps']), 2)

    self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
    self.assertEqual(
        job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
        '_starting_signal/')
    self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
Example #12
    def test_streaming_create_translation(self):
        remote_runner = DataflowRunner()
        self.default_properties.append("--streaming")
        p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
        p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
        p.run()
        job_dict = json.loads(str(remote_runner.job))
        self.assertEqual(len(job_dict[u'steps']), 2)

        self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
        self.assertEqual(
            job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
            '_starting_signal/')
        self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
Example #13
    def test_streaming_engine_flag_adds_windmill_experiments(self):
        remote_runner = DataflowRunner()
        self.default_properties.append('--streaming')
        self.default_properties.append('--enable_streaming_engine')
        self.default_properties.append('--experiment=some_other_experiment')

        p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
        p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
        p.run()

        experiments_for_job = (
            remote_runner.job.options.view_as(DebugOptions).experiments)
        self.assertIn('enable_streaming_engine', experiments_for_job)
        self.assertIn('enable_windmill_service', experiments_for_job)
        self.assertIn('some_other_experiment', experiments_for_job)
Example #14
 def test_environment_override_translation(self):
   self.default_properties.append('--experiments=beam_fn_api')
   self.default_properties.append('--worker_harness_container_image=FOO')
   remote_runner = DataflowRunner()
   p = Pipeline(remote_runner,
                options=PipelineOptions(self.default_properties))
   (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
    | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
    | ptransform.GroupByKey())
   p.run()
   self.assertEqual(
       list(remote_runner.proto_pipeline.components.environments.values()),
       [beam_runner_api_pb2.Environment(
           urn=common_urns.environments.DOCKER.urn,
           payload=beam_runner_api_pb2.DockerPayload(
               container_image='FOO').SerializeToString())])
Example #15
    def test_bad_path(self):
        dummy_sdk_file = tempfile.NamedTemporaryFile()
        remote_runner = DataflowRunner()
        pipeline = Pipeline(
            remote_runner,
            options=PipelineOptions([
                '--dataflow_endpoint=ignored',
                '--sdk_location=' + dummy_sdk_file.name, '--job_name=test-job',
                '--project=test-project', '--staging_location=ignored',
                '--temp_location=/dev/null', '--template_location=/bad/path',
                '--no_auth=True'
            ]))
        remote_runner.job = apiclient.Job(pipeline._options)

        with self.assertRaises(IOError):
            pipeline.run().wait_until_finish()
Example #16
  def test_ptransform_override_multiple_inputs(self):
    class MyParDoOverride(PTransformOverride):
      def matches(self, applied_ptransform):
        return isinstance(applied_ptransform.transform, FlattenAndDouble)

      def get_replacement_transform(self, applied_ptransform):
        return FlattenAndTriple()

    p = Pipeline()
    pcoll1 = p | 'pc1' >> beam.Create([1, 2, 3])
    pcoll2 = p | 'pc2' >> beam.Create([4, 5, 6])
    pcoll3 = (pcoll1, pcoll2) | 'FlattenAndMultiply' >> FlattenAndDouble()
    assert_that(pcoll3, equal_to([3, 6, 9, 12, 15, 18]))

    p.replace_all([MyParDoOverride()])
    p.run()
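
FlattenAndDouble and FlattenAndTriple are not defined in this listing; presumably they are composite PTransforms applied to a tuple of PCollections, roughly along these lines (a sketch, not the original definitions):

import apache_beam as beam

class FlattenAndDouble(beam.PTransform):
  # Applied to a tuple of PCollections, e.g. (pcoll1, pcoll2).
  def expand(self, pcolls):
    return pcolls | beam.Flatten() | beam.Map(lambda x: x * 2)

class FlattenAndTriple(beam.PTransform):
  def expand(self, pcolls):
    return pcolls | beam.Flatten() | beam.Map(lambda x: x * 3)

With definitions of this shape, swapping Double for Triple via replace_all() before p.run() yields the tripled values asserted above.
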
Example #17
  def test_direct_runner_metrics(self):

    class MyDoFn(beam.DoFn):
      def start_bundle(self):
        count = Metrics.counter(self.__class__, 'bundles')
        count.inc()

      def finish_bundle(self):
        count = Metrics.counter(self.__class__, 'finished_bundles')
        count.inc()

      def process(self, element):
        gauge = Metrics.gauge(self.__class__, 'latest_element')
        gauge.set(element)
        count = Metrics.counter(self.__class__, 'elements')
        count.inc()
        distro = Metrics.distribution(self.__class__, 'element_dist')
        distro.update(element)
        return [element]

    runner = DirectRunner()
    p = Pipeline(runner,
                 options=PipelineOptions(self.default_properties))
    pcoll = (p | ptransform.Create([1, 2, 3, 4, 5])
             | 'Do' >> beam.ParDo(MyDoFn()))
    assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
    result = p.run()
    result.wait_until_finish()
    metrics = result.metrics().query()
    namespace = '{}.{}'.format(MyDoFn.__module__,
                               MyDoFn.__name__)

    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'elements')),
                5, 5),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'bundles')),
                1, 1),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'finished_bundles')),
                1, 1)))

    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'element_dist')),
                DistributionResult(DistributionData(15, 5, 1, 5)),
                DistributionResult(DistributionData(15, 5, 1, 5)))))

    gauge_result = metrics['gauges'][0]
    hc.assert_that(
        gauge_result.key,
        hc.equal_to(MetricKey('Do', MetricName(namespace, 'latest_element'))))
    hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
    hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))
Example #18
    def test_ptransform_overrides(self):
        class MyParDoOverride(PTransformOverride):
            def matches(self, applied_ptransform):
                return isinstance(applied_ptransform.transform, DoubleParDo)

            def get_replacement_transform(self, ptransform):
                if isinstance(ptransform, DoubleParDo):
                    return TripleParDo()
                raise ValueError('Unsupported type of transform: %r' %
                                 ptransform)

        p = Pipeline()
        pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
        assert_that(pcoll, equal_to([3, 6, 9]))

        p.replace_all([MyParDoOverride()])
        p.run()
Example #19
  def test_bad_path(self):
    dummy_sdk_file = tempfile.NamedTemporaryFile()
    remote_runner = DataflowRunner()
    pipeline = Pipeline(remote_runner,
                        options=PipelineOptions([
                            '--dataflow_endpoint=ignored',
                            '--sdk_location=' + dummy_sdk_file.name,
                            '--job_name=test-job',
                            '--project=test-project',
                            '--staging_location=ignored',
                            '--temp_location=/dev/null',
                            '--template_location=/bad/path',
                            '--no_auth=True']))
    remote_runner.job = apiclient.Job(pipeline._options)

    with self.assertRaises(IOError):
      pipeline.run().wait_until_finish()
Example #20
def run():
    PROJECT_ID = 'electric-spark-266716'  # change to your project id
    BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'country-beam-dataflow'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    #create query to select all elements for cleansing
    sql = 'SELECT dt, AverageTemperature, AverageTemperatureUncertainty, Country \
    FROM kaggle_modeled.Country as x'

    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    #read desired table from BigQuery
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    #write inputs to input.txt
    query_results | 'Write input' >> WriteToText(DIR_PATH +
                                                 'input_country.txt')

    # apply ParDo to filter out dates
    formatted_country_pcoll = query_results | 'Filter Dates' >> beam.ParDo(
        FilterDateFn())

    # display filtered countries
    formatted_country_pcoll | 'Write filtered dates' >> WriteToText(
        DIR_PATH + 'output_country.txt')

    #create new table in BigQuery
    dataset_id = 'kaggle_modeled'
    table_id = 'Country_Beam_DF'
    schema_id = 'dt:DATE,AverageTemperature:FLOAT,AverageTemperatureUncertainty:FLOAT,Country:STRING'

    # write PCollection to new BQ table
    formatted_country_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
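
FilterDateFn is defined elsewhere in the original script. As an illustration only, a DoFn of that shape might parse the dt field and keep rows within a date range (the field name is taken from the query above; the cutoff value is invented here):

import apache_beam as beam

class FilterDateFn(beam.DoFn):
    def process(self, element):
        # element is a dict from the BigQuery read, e.g. {'dt': '1900-01-01', ...}
        # keep only rows with a usable, sufficiently recent date (cutoff is illustrative)
        if element.get('dt') and str(element['dt']) >= '1900-01-01':
            yield element
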
Example #21
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'takes-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = Pipeline(options=options)

    takes_sql = 'SELECT sid, cno, grade FROM college_workflow_modeled.Takes'
    class_sql = 'SELECT cid, cno FROM college_workflow_modeled.Class'

    takes_pcoll = p | 'Read from BQ Takes' >> beam.io.Read(
        beam.io.BigQuerySource(query=takes_sql, use_standard_sql=True))
    class_pcoll = p | 'Read from BQ Class' >> beam.io.Read(
        beam.io.BigQuerySource(query=class_sql, use_standard_sql=True))

    # write PCollections to log files
    takes_pcoll | 'Write log 1' >> WriteToText(DIR_PATH +
                                               'takes_query_results.txt')
    class_pcoll | 'Write log 2' >> WriteToText(DIR_PATH +
                                               'class_query_results.txt')

    # ParDo with side-input
    norm_takes_pcoll = takes_pcoll | 'Normalize Record' >> beam.ParDo(
        NormalizeDoFn(), beam.pvalue.AsList(class_pcoll))

    # write PCollection to log file
    norm_takes_pcoll | 'Write log 3' >> WriteToText(DIR_PATH +
                                                    'norm_takes_pcoll.txt')

    dataset_id = 'college_workflow_modeled'
    table_id = 'Takes_Beam_DF'
    schema_id = 'sid:STRING,cid:STRING,grade:STRING'

    # write PCollection to new BQ table
    norm_takes_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
Example #22
  def test_direct_runner_metrics(self):

    class MyDoFn(beam.DoFn):
      def start_bundle(self):
        count = Metrics.counter(self.__class__, 'bundles')
        count.inc()

      def finish_bundle(self):
        count = Metrics.counter(self.__class__, 'finished_bundles')
        count.inc()

      def process(self, element):
        gauge = Metrics.gauge(self.__class__, 'latest_element')
        gauge.set(element)
        count = Metrics.counter(self.__class__, 'elements')
        count.inc()
        distro = Metrics.distribution(self.__class__, 'element_dist')
        distro.update(element)
        return [element]

    p = Pipeline(DirectRunner())
    pcoll = (p | beam.Create([1, 2, 3, 4, 5])
             | 'Do' >> beam.ParDo(MyDoFn()))
    assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
    result = p.run()
    result.wait_until_finish()
    metrics = result.metrics().query()
    namespace = '{}.{}'.format(MyDoFn.__module__,
                               MyDoFn.__name__)

    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'elements')),
                5, 5),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'bundles')),
                1, 1),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'finished_bundles')),
                1, 1)))

    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'element_dist')),
                DistributionResult(DistributionData(15, 5, 1, 5)),
                DistributionResult(DistributionData(15, 5, 1, 5)))))

    gauge_result = metrics['gauges'][0]
    hc.assert_that(
        gauge_result.key,
        hc.equal_to(MetricKey('Do', MetricName(namespace, 'latest_element'))))
    hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
    hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))
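
The query() call above returns every metric. Reusing result and namespace from this example, a narrower query is possible with MetricsFilter (a sketch; MetricsFilter lives in apache_beam.metrics.metric):

from apache_beam.metrics.metric import MetricsFilter

elements_filter = (MetricsFilter()
                   .with_namespace(namespace)
                   .with_name('elements'))
filtered = result.metrics().query(elements_filter)
print(filtered['counters'][0].committed)  # expected: 5 for the pipeline above
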
Example #23
def run():
    PROJECT_ID = 'cs327e-sp2020' # change to your project id
    BUCKET = 'gs://beam-output-data' # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'student-df2'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT sid, fname, lname, dob, status FROM college_workflow_modeled.Student'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # standardize the students' date of birth  
    formatted_dob_pcoll = query_results | 'Format DOB' >> beam.ParDo(FormatDOBFn())

    # write PCollection to log file
    formatted_dob_pcoll | 'Write log 1' >> WriteToText(DIR_PATH + 'formatted_dob_pcoll.txt')

    # group students by sid
    grouped_student_pcoll = formatted_dob_pcoll | 'Group by sid' >> beam.GroupByKey()

    # write PCollection to log file
    #grouped_student_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'grouped_student_pcoll.txt')

    # remove duplicate student records
    distinct_student_pcoll = grouped_student_pcoll | 'Dedup student records' >> beam.ParDo(DedupStudentRecordsFn())

    # write PCollection to log file
    distinct_student_pcoll | 'Write log 3' >> WriteToText(DIR_PATH + 'distinct_student_pcoll.txt')

    dataset_id = 'college_workflow_modeled'
    table_id = 'Student_Beam_DF'
    schema_id = 'sid:STRING,fname:STRING,lname:STRING,dob:DATE,status:STRING'

    # write PCollection to new BQ table
    distinct_student_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(dataset=dataset_id, 
                                                  table=table_id, 
                                                  schema=schema_id,
                                                  project=PROJECT_ID,
                                                  create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                  write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
         
    result = p.run()
    result.wait_until_finish()      
Example #24
 def run(self, transform, options=None):
   """Run the given transform with this runner.
   """
   # Imported here to avoid circular dependencies.
   # pylint: disable=wrong-import-order, wrong-import-position
   from apache_beam.pipeline import Pipeline
   p = Pipeline(runner=self, options=options)
   p | transform
   return p.run()
Example #25
 def run(self, transform, options=None):
     """Run the given transform with this runner.
 """
     # Imported here to avoid circular dependencies.
     # pylint: disable=wrong-import-order, wrong-import-position
     from apache_beam.pipeline import Pipeline
     p = Pipeline(runner=self, options=options)
     p | transform
     return p.run()
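
A hedged usage sketch of this helper: a composite transform can be handed straight to the runner, which wraps it in a fresh Pipeline exactly as shown above (DirectRunner is assumed here so the sketch runs locally):

import apache_beam as beam
from apache_beam.runners.direct.direct_runner import DirectRunner

# Create | Map composes into a single PTransform that run() applies to a new Pipeline.
result = DirectRunner().run(
    beam.Create([1, 2, 3]) | beam.Map(lambda x: x * 2))
result.wait_until_finish()
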
Example #26
def run():
    PROJECT_ID = 'studied-brand-266702'  # change to your project id
    BUCKET = 'gs://beam_cs327e_project'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'vaccination-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    # run BigQuery query on dataset
    sql = 'SELECT * FROM vaers_modeled.Vaccination'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    input_pcoll = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write input PCollection to input.txt
    input_pcoll | 'Write input_pcoll log 1' >> WriteToText(
        DIR_PATH + 'input_vaccination.txt')

    # standardize vaccination V_FUNDBY, VAX_ROUTE and VAX_SITE unknown/empty attribute
    formatted_vaccination_pcoll = input_pcoll | 'Format Unknown Values' >> beam.ParDo(
        FormatUnknownFn())

    # write PCollection to log file
    formatted_vaccination_pcoll | 'Write log 2' >> WriteToText(
        DIR_PATH + 'formatted_unknown_pcoll.txt')

    # specify id and schema
    dataset_id = 'vaers_modeled'
    table_id = 'Vaccination_Beam_DF'
    schema_id = 'VACCINATION_ID:INTEGER, VAERS_ID:INTEGER, VAX_DATE:DATE, VAX_ID:INTEGER, MANU_ID:INTEGER, V_ADMINBY:STRING, V_FUNDBY:STRING, VAX_ROUTE:STRING, VAX_SITE:STRING'

    # write output PCollection to new BQ table
    formatted_vaccination_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
Example #27
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'teacher-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT tid, instructor, dept FROM college_workflow_modeled.Teacher'
    query_results = p | 'Read from BigQuery' >> beam.io.Read(
        beam.io.BigQuerySource(query=sql, use_standard_sql=True))

    query_results | 'Write log 1' >> WriteToText('query_results.txt')

    teacher_pcoll = query_results | 'Standardize' >> beam.ParDo(
        StandardizeDoFn())

    teacher_pcoll | 'Write log 2' >> WriteToText('formatted_teacher_pcoll.txt')

    # group records by tid
    grouped_pcoll = teacher_pcoll | 'Group by tid' >> beam.GroupByKey()

    grouped_pcoll | 'Write log 3' >> WriteToText('grouped_teacher.txt')

    # remove duplicates
    distinct_pcoll = grouped_pcoll | 'Dedup' >> beam.ParDo(DedupRecordsDoFn())

    distinct_pcoll | 'Write log 4' >> WriteToText('distinct_teacher.txt')

    dataset_id = 'college_workflow_modeled'
    table_id = 'Teacher_Beam_DF'
    schema_id = 'tid:STRING,fname:STRING,lname:STRING,dept:STRING'

    distinct_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
Example #28
    def test_remote_runner_display_data(self):
        remote_runner = DataflowRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions(self.default_properties))

        now = datetime.now()
        # pylint: disable=expression-not-assigned
        (p | ptransform.Create([1, 2, 3, 4, 5])
         | 'Do' >> SpecialParDo(SpecialDoFn(), now))

        # TODO(https://github.com/apache/beam/issues/18012) Enable runner API on
        # this test.
        p.run(test_runner_api=False)
        job_dict = json.loads(str(remote_runner.job))
        steps = [
            step for step in job_dict['steps']
            if len(step['properties'].get('display_data', [])) > 0
        ]
        step = steps[1]
        disp_data = step['properties']['display_data']
        nspace = SpecialParDo.__module__ + '.'
        expected_data = [{
            'type': 'TIMESTAMP',
            'namespace': nspace + 'SpecialParDo',
            'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
            'key': 'a_time'
        }, {
            'type': 'STRING',
            'namespace': nspace + 'SpecialParDo',
            'value': nspace + 'SpecialParDo',
            'key': 'a_class',
            'shortValue': 'SpecialParDo'
        }, {
            'type': 'INTEGER',
            'namespace': nspace + 'SpecialDoFn',
            'value': 42,
            'key': 'dofn_value'
        }]
        self.assertUnhashableCountEqual(disp_data, expected_data)
Example #29
    def test_ptransform_override_side_inputs(self):
        class MyParDoOverride(PTransformOverride):
            def matches(self, applied_ptransform):
                return (isinstance(applied_ptransform.transform, ParDo)
                        and isinstance(applied_ptransform.transform.fn,
                                       AddWithProductDoFn))

            def get_replacement_transform(self, transform):
                return AddThenMultiply()

        p = Pipeline()
        pcoll1 = p | 'pc1' >> beam.Create([2])
        pcoll2 = p | 'pc2' >> beam.Create([3])
        pcoll3 = p | 'pc3' >> beam.Create([4, 5, 6])
        result = pcoll3 | 'Operate' >> beam.ParDo(
            AddWithProductDoFn(), AsSingleton(pcoll1), AsSingleton(pcoll2))
        assert_that(result, equal_to([18, 21, 24]))

        p.replace_all([MyParDoOverride()])
        p.run()
Example #30
def run():
    PROJECT_ID = 'starry-center-266501'  # change to your project id
    BUCKET = 'gs://imdb-beam'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'format-date-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT * FROM bollywood_modeled.bollywoodTitles'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write PCollection to log file
    query_results | 'Write log 1' >> WriteToText(DIR_PATH +
                                                 'query_results.txt')

    # apply ParDo to format the release dates
    formatDate_pcoll = query_results | 'Format the dates' >> beam.ParDo(
        FormatDateFn())

    # write PCollection to log file
    formatDate_pcoll | 'Write log 2' >> WriteToText(DIR_PATH +
                                                    'formatDate_pcoll.txt')

    dataset_id = 'bollywood_modeled'
    table_id = 'bollywoodTitles_Beam_DF'
    schema_id = 'title:STRING,releaseDate:DATE,croresGrossed:NUMERIC'

    # write PCollection to new BQ table
    formatDate_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    result = p.run()
    result.wait_until_finish()
Example #31
def run():
    PROJECT_ID = 'responsive-cab-267123'  # change to your project id
    BUCKET = 'gs://bmease_cs327e'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'
    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'foodmap-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'
    # Create beam pipeline using local runner
    p = Pipeline(options=options)

    # select lower-cased product names, ids and aisles, excluding certain aisle_ids
    sql = "SELECT LOWER(product_name) AS product_name, product_id,p.aisle_id FROM instacart_modeled.Products p WHERE p.product_name not like '%Filters%' and p.aisle_id NOT IN (11,20,22,25,44,55,73,80,109,118,126,127,132,133,10,54,60,74,75,85,87,101,111,114,56,82,102)"
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)
    # write PCollection to input file
    query_results | 'Write to input.txt' >> WriteToText(DIR_PATH + 'input.txt')

    # apply ParDo to match each product to a food, emitting (food_id, product_id) pairs
    nom_match_pcoll = query_results | 'Food and matches from nom' >> beam.ParDo(
        MatchProductFn())

    # write PCollection to output file
    nom_match_pcoll | 'Write to output.txt' >> WriteToText(DIR_PATH +
                                                           'output.txt')

    dataset_id = 'USDA_ERS_modeled'
    table_id = 'Food_Map_Beam_DF'
    schema_id = 'food_id:INTEGER,product_id:INTEGER'

    # write PCollection to new BQ table
    nom_match_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
Example #32
def run():
    PROJECT_ID = 'swift-area-266618'  # change to your project id
    BUCKET = 'gs://nullbusters_data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'student-df5'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT imdb_title_id, title, original_title, year, genre, duration, country, language, director, writer, production_company, actors, description, avg_vote, votes, budget, usa_gross_income, worlwide_gross_income, metascore, reviews_from_users, reviews_from_critics FROM imdb_modeled.Movies WHERE usa_gross_income IS NOT NULL and worlwide_gross_income IS NOT NULL'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    query_results | 'Write log Input' >> WriteToText('input.txt')

    # apply ParDo to format the year columns as ints
    formatted_year_pcoll = query_results | 'Format Years' >> beam.ParDo(
        FormatYearsFn())

    # write PCollection to log file
    formatted_year_pcoll | 'Write log Output' >> WriteToText(DIR_PATH +
                                                             'output.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Movies_Beam_DF'
    schema_id = 'imdb_title_id:STRING, title:STRING, original_title:STRING, year:INTEGER, genre:STRING, duration:INTEGER, country:STRING, language:STRING, director:STRING, writer:STRING, production_company:STRING, actors:STRING, description:STRING, avg_votes:FLOAT, votes:INTEGER, budget_currency:STRING, budget:INTEGER, usa_gross_income:INTEGER, worlwide_gross_income_currency:STRING, worlwide_gross_income:INTEGER, metascore:FLOAT, reviews_from_users:FLOAT, reviews_from_critics:FLOAT'

    # write PCollection to new BQ table
    formatted_year_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
Example #33
def run():
    PROJECT_ID = 'swift-area-266618'  # change to your project id
    BUCKET = 'gs://nullbusters_data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'directors'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT name, birth_name, height, bio, birth_details, birth_year, place_of_birth, death_details, death_year, spouses, divorces, children, known_for_titles, imdb_title_id, director_name_id, category, reason_of_death FROM imdb_modeled.Directors WHERE birth_year IS NOT NULL AND death_year IS NOT NULL'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    query_results | 'Write log Input' >> WriteToText('input.txt')

    # apply ParDo to format directors birth year and death years to be ints
    formatted_year_pcoll = query_results | 'Format Years' >> beam.ParDo(
        FormatYearsFn())

    # write PCollection to log file
    formatted_year_pcoll | 'Write log Output' >> WriteToText(DIR_PATH +
                                                             'output.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Directors_Beam_DF'
    schema_id = 'name:STRING,birth_name:STRING,height:FLOAT,bio:STRING,birth_details:STRING,birth_year:INTEGER,place_of_birth:STRING,death_details:STRING,death_year:INTEGER,spouses:INTEGER,divorces:INTEGER,children:STRING,known_for_titles:STRING,imdb_title_id:STRING,category:STRING,reason_of_death:STRING,director_name_id:STRING'

    # write PCollection to new BQ table
    formatted_year_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
Example #34
  def test_direct_runner_metrics(self):
    from apache_beam.metrics.metric import Metrics

    class MyDoFn(beam.DoFn):
      def start_bundle(self):
        count = Metrics.counter(self.__class__, 'bundles')
        count.inc()

      def finish_bundle(self):
        count = Metrics.counter(self.__class__, 'finished_bundles')
        count.inc()

      def process(self, element):
        count = Metrics.counter(self.__class__, 'elements')
        count.inc()
        distro = Metrics.distribution(self.__class__, 'element_dist')
        distro.update(element)
        return [element]

    runner = DirectRunner()
    p = Pipeline(runner,
                 options=PipelineOptions(self.default_properties))
    # pylint: disable=expression-not-assigned
    (p | ptransform.Create([1, 2, 3, 4, 5])
     | 'Do' >> beam.ParDo(MyDoFn()))
    result = p.run()
    result.wait_until_finish()
    metrics = result.metrics().query()
    namespace = '{}.{}'.format(MyDoFn.__module__,
                               MyDoFn.__name__)

    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'elements')),
                5, 5),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'bundles')),
                1, 1),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'finished_bundles')),
                1, 1)))
    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'element_dist')),
                DistributionResult(DistributionData(15, 5, 1, 5)),
                DistributionResult(DistributionData(15, 5, 1, 5)))))
Example #35
 def run(self, transform, options=None):
   """Run the given transform or callable with this runner.
   """
   # Imported here to avoid circular dependencies.
   # pylint: disable=wrong-import-order, wrong-import-position
   from apache_beam import PTransform
   from apache_beam.pvalue import PBegin
   from apache_beam.pipeline import Pipeline
   p = Pipeline(runner=self, options=options)
   if isinstance(transform, PTransform):
     p | transform
   else:
     transform(PBegin(p))
   return p.run()
Example #36
 def run(self, transform, options=None):
   """Run the given transform or callable with this runner.
   """
   # Imported here to avoid circular dependencies.
   # pylint: disable=wrong-import-order, wrong-import-position
   from apache_beam import PTransform
   from apache_beam.pvalue import PBegin
   from apache_beam.pipeline import Pipeline
   p = Pipeline(runner=self, options=options)
   if isinstance(transform, PTransform):
     p | transform
   else:
     transform(PBegin(p))
   return p.run()
Example #37
def run():
    PROJECT_ID = 'spry-cosine-266801'
    BUCKET = 'gs://icyhot-pack_beam'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'location-df'  # dataflow does not like '_' or special characters
    google_cloud_options.staging_location = BUCKET + '/staging'  #req*
    google_cloud_options.temp_location = BUCKET + '/temp'  #req*
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT id, province_state, country_region FROM covid19_jhu_csse_modeled.location_id'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # format US
    formatted_us_pcoll = query_results | 'Format US' >> beam.ParDo(
        FormatUSFn())

    # write PCollection to log file
    formatted_us_pcoll | 'Write log 1' >> WriteToText(DIR_PATH +
                                                      'formatted_us_pcoll.txt')

    dataset_id = 'covid19_jhu_csse_modeled'
    table_id = 'location_id_Beam_DF'
    schema_id = 'id:INTEGER, province_state:STRING, country_region:STRING'

    # write PCollection to new BQ table
    formatted_us_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
Example #38
def run():

    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'location'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = Pipeline(options=options)

    sql = 'SELECT * FROM covid_19_modeled.Location_SQL_1'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # extract city from state
    state_pcoll = query_results | 'Format State' >> beam.ParDo(FormatStateFn())

    grouped_pcoll = state_pcoll | 'Group Locations' >> beam.GroupByKey()

    unique_pcoll = grouped_pcoll | 'Remove Duplicates' >> beam.ParDo(
        RemoveDuplicatesFn())

    dataset_id = 'covid_19_modeled'
    table_id = 'Location_Beam_DF'
    schema_id = 'id:INTEGER,city:STRING,state:STRING,country:STRING,latitude:NUMERIC,longitude:NUMERIC,fips:INTEGER,admin2:STRING,combined_key:STRING'

    # write PCollection to BQ table
    unique_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
Example #39
  def run_async(self, transform, options=None):
    """Run the given transform or callable with this runner.

    May return immediately, executing the pipeline in the background.
    The returned result object can be queried for progress, and
    `wait_until_finish` may be called to block until completion.
    """
    # Imported here to avoid circular dependencies.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam import PTransform
    from apache_beam.pvalue import PBegin
    from apache_beam.pipeline import Pipeline
    p = Pipeline(runner=self, options=options)
    if isinstance(transform, PTransform):
      p | transform
    else:
      transform(PBegin(p))
    return p.run()
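
For the callable branch, a usage sketch (again assuming the DirectRunner): anything that accepts a PBegin and builds the pipeline from it can be submitted the same way.

import apache_beam as beam
from apache_beam.runners.direct.direct_runner import DirectRunner

def build(begin):  # receives PBegin(p) per the else-branch above
  return begin | beam.Create(['a', 'b']) | beam.Map(str.upper)

result = DirectRunner().run_async(build)
result.wait_until_finish()  # run_async may return before the pipeline completes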