Example #1
    def test_metric_filter_step_matching(self):
        name = MetricName('ns1', 'name1')
        filter = MetricsFilter().with_step('Step1')

        key = MetricKey('Step1', name)
        self.assertTrue(MetricResults.matches(filter, key))

        key = MetricKey('Step10', name)
        self.assertFalse(MetricResults.matches(filter, key))

        key = MetricKey('Step10/Step1', name)
        self.assertTrue(MetricResults.matches(filter, key))

        key = MetricKey('Top1/Outer1/Inner1', name)

        filter = MetricsFilter().with_step('Top1/Outer1/Inner1')
        self.assertTrue(MetricResults.matches(filter, key))

        filter = MetricsFilter().with_step('Top1/Outer1')
        self.assertTrue(MetricResults.matches(filter, key))

        filter = MetricsFilter().with_step('Outer1/Inner1')
        self.assertTrue(MetricResults.matches(filter, key))

        filter = MetricsFilter().with_step('Top1/Inner1')
        self.assertFalse(MetricResults.matches(filter, key))
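A minimal standalone sketch of the step-matching rule exercised above (assuming apache_beam is installed): with_step() appears to match the filter value against any contiguous run of '/'-separated components of the key's step, which is why 'Step10/Step1' matches 'Step1' above while 'Top1/Inner1' does not match 'Top1/Outer1/Inner1'.

# Sketch: MetricsFilter step matching on sub-paths (values are illustrative).
from apache_beam.metrics.execution import MetricKey
from apache_beam.metrics.metric import MetricResults, MetricsFilter
from apache_beam.metrics.metricbase import MetricName

key = MetricKey('Top1/Outer1/Inner1', MetricName('ns1', 'name1'))
print(MetricResults.matches(MetricsFilter().with_step('Outer1/Inner1'), key))  # True
print(MetricResults.matches(MetricsFilter().with_step('Top1/Inner1'), key))   # False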
Example #2
def word_count_with_metrics(text_input,
                            dataflow_pipeline,
                            text_output=beam_output):
    word_count(
        text_input=text_input,
        text_output=text_output,
        dataflow_pipeline=dataflow_pipeline,
    )
    result = dataflow_pipeline.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, "has_job")  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name("empty_lines")
        query_result = result.metrics().query(empty_lines_filter)
        if query_result["counters"]:
            empty_lines_counter = query_result["counters"][0]
            logging.info("number of empty lines: %d",
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name("word_len_dist")
        query_result = result.metrics().query(word_lengths_filter)
        if query_result["distributions"]:
            word_lengths_dist = query_result["distributions"][0]
            logging.info("average word length: %d",
                         word_lengths_dist.result.mean)
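The query pattern above recurs in most of the examples on this page: result.metrics().query() returns a dict keyed by 'counters', 'distributions' and 'gauges', and each MetricResult entry exposes .committed, .attempted and a convenience .result. A small reusable sketch (the helper name log_counter is hypothetical):

# Hypothetical helper wrapping the counter-query pattern used above.
import logging

from apache_beam.metrics.metric import MetricsFilter

def log_counter(pipeline_result, name):
    counters = pipeline_result.metrics().query(
        MetricsFilter().with_name(name))['counters']
    if counters:
        logging.info('%s = %d', name, counters[0].result)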
Example #3
    def test_counted_metrics(self):
        pipeline = TestPipeline()
        examples = [1, 5, 3, 10]
        pcoll = pipeline | 'start' >> beam.Create(examples)
        _ = pcoll | base.RunInference(FakeModelHandler())
        run_result = pipeline.run()
        run_result.wait_until_finish()

        metric_results = (run_result.metrics().query(
            MetricsFilter().with_name('num_inferences')))
        num_inferences_counter = metric_results['counters'][0]
        self.assertEqual(num_inferences_counter.committed, 4)

        inference_request_batch_size = run_result.metrics().query(
            MetricsFilter().with_name('inference_request_batch_size'))
        self.assertTrue(inference_request_batch_size['distributions'])
        self.assertEqual(
            inference_request_batch_size['distributions'][0].result.sum, 4)
        inference_request_batch_byte_size = run_result.metrics().query(
            MetricsFilter().with_name('inference_request_batch_byte_size'))
        self.assertTrue(inference_request_batch_byte_size['distributions'])
        self.assertGreaterEqual(
            inference_request_batch_byte_size['distributions'][0].result.sum,
            len(pickle.dumps(examples)))
        inference_request_batch_byte_size = run_result.metrics().query(
            MetricsFilter().with_name('model_byte_size'))
        self.assertTrue(inference_request_batch_byte_size['distributions'])
Example #4
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(
                  lambda word_ones: (word_ones[0], sum(word_ones[1]))))

    # Format the counts into a PCollection of strings.
    output = counts | 'format' >> beam.Map(
        lambda word_count: '%s: %s' % word_count)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(known_args.output)

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.committed)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.committed.mean)
Example #5
  def test_model_use_and_query_metrics(self):
    """DebuggingWordCount example snippets."""

    import re

    p = TestPipeline()  # Use TestPipeline for testing.
    words = p | beam.Create(['albert', 'sam', 'mark', 'sarah',
                             'swati', 'daniel', 'andrea'])

    # pylint: disable=unused-variable
    # [START metrics_usage_example]
    class FilterTextFn(beam.DoFn):
      """A DoFn that filters for a specific key based on a regex."""

      def __init__(self, pattern):
        self.pattern = pattern
        # A custom metric can track values in your pipeline as it runs. Create
        # custom metrics to count unmatched words, and know the distribution of
        # word lengths in the input PCollection.
        self.word_len_dist = Metrics.distribution(self.__class__,
                                                  'word_len_dist')
        self.unmatched_words = Metrics.counter(self.__class__,
                                               'unmatched_words')

      def process(self, element):
        word = element
        self.word_len_dist.update(len(word))
        if re.match(self.pattern, word):
          yield element
        else:
          self.unmatched_words.inc()

    filtered_words = (
        words | 'FilterText' >> beam.ParDo(FilterTextFn('s.*')))
    # [END metrics_usage_example]
    # pylint: enable=unused-variable

    # [START metrics_check_values_example]
    result = p.run()
    result.wait_until_finish()

    custom_distribution = result.metrics().query(
        MetricsFilter().with_name('word_len_dist'))['distributions']
    custom_counter = result.metrics().query(
        MetricsFilter().with_name('unmatched_words'))['counters']

    if custom_distribution:
      logging.info('The average word length was %d',
                   custom_distribution[0].committed.mean)
    if custom_counter:
      logging.info('There were %d words that did not match the filter.',
                   custom_counter[0].committed)
    # [END metrics_check_values_example]

    # There should be 4 words that did not match
    self.assertEqual(custom_counter[0].committed, 4)
    # The shortest word is 3 characters, the longest is 6
    self.assertEqual(custom_distribution[0].committed.min, 3)
    self.assertEqual(custom_distribution[0].committed.max, 6)
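Example #5 exercises a counter and a distribution; Example #20 further down also uses the third user metric type, a gauge, which records the latest reported value. A minimal DoFn sketch with a hypothetical class name:

# Sketch: a gauge metric inside a DoFn (TrackLatestFn is a made-up name).
import apache_beam as beam
from apache_beam.metrics.metric import Metrics

class TrackLatestFn(beam.DoFn):
    def __init__(self):
        self.latest_element = Metrics.gauge(self.__class__, 'latest_element')

    def process(self, element):
        self.latest_element.set(element)
        yield element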
Example #6
def run(argv=None, save_main_session=True):
    '''Main entry point; defines and runs the wordcount pipeline.'''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    processed_users = (lines | 'splits' >> beam.Map(split_and_lower)
                       | 'noNum' >> beam.Map(no_num_format)
                       | 'formatOut' >> beam.Map(format_output))
    processed_users | 'uniqueUser' >> beam.Distinct(
    ) | 'writeUnique' >> WriteToText(known_args.output,
                                     file_name_suffix='.csv')

    schema = avro.schema.parse(open("user.avsc", "rb").read())
    processed_users | 'avro_write' >> beam.io.avroio.WriteToAvro(
        'output_avro', schema, file_name_suffix='.avro')

    reader = DataFileReader(open("output_avro-00000-of-00001.avro", "rb"),
                            DatumReader())
    for user in reader:
        print(user)
    reader.close()

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)
Example #7
    def test_user_counter_using_pardo(self):
        class SomeDoFn(beam.DoFn):
            """A custom dummy DoFn using yield."""
            static_counter_elements = metrics.Metrics.counter(
                "SomeDoFn", 'metrics_static_counter_element')

            def __init__(self):
                self.user_counter_elements = metrics.Metrics.counter(
                    self.__class__, 'metrics_user_counter_element')

            def process(self, element):
                self.static_counter_elements.inc(2)
                self.user_counter_elements.inc()
                distro = Metrics.distribution(self.__class__, 'element_dist')
                distro.update(element)
                yield element

        pipeline = TestPipeline()
        nums = pipeline | 'Input' >> beam.Create([1, 2, 3, 4])
        results = nums | 'ApplyPardo' >> beam.ParDo(SomeDoFn())
        assert_that(results, equal_to([1, 2, 3, 4]))

        res = pipeline.run()
        res.wait_until_finish()

        # Verify static counter.
        metric_results = (res.metrics().query(MetricsFilter().with_metric(
            SomeDoFn.static_counter_elements)))
        outputs_static_counter = metric_results['counters'][0]

        self.assertEqual(outputs_static_counter.key.metric.name,
                         'metrics_static_counter_element')
        self.assertEqual(outputs_static_counter.committed, 8)

        # Verify user counter.
        metric_results = (res.metrics().query(
            MetricsFilter().with_name('metrics_user_counter_element')))
        outputs_user_counter = metric_results['counters'][0]

        self.assertEqual(outputs_user_counter.key.metric.name,
                         'metrics_user_counter_element')
        self.assertEqual(outputs_user_counter.committed, 4)

        # Verify user distribution counter.
        metric_results = res.metrics().query()
        matcher = MetricResultMatcher(
            step='ApplyPardo',
            namespace=hc.contains_string('SomeDoFn'),
            name='element_dist',
            committed=DistributionMatcher(
                sum_value=hc.greater_than_or_equal_to(0),
                count_value=hc.greater_than_or_equal_to(0),
                min_value=hc.greater_than_or_equal_to(0),
                max_value=hc.greater_than_or_equal_to(0)))
        hc.assert_that(metric_results['distributions'],
                       hc.contains_inanyorder(matcher))
Example #8
    def test_metric_filter_name_matching(self):
        filter = MetricsFilter().with_name('name1').with_namespace('ns1')
        name = MetricName('ns1', 'name1')
        key = MetricKey('step1', name)
        self.assertTrue(MetricResults.matches(filter, key))

        filter = MetricsFilter().with_name('name1')
        name = MetricName('ns1', 'name1')
        key = MetricKey('step1', name)
        self.assertTrue(MetricResults.matches(filter, key))
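Filter predicates compose: name, namespace and step constraints can be chained on one MetricsFilter, and the plural forms (with_names, with_namespaces, with_steps) accept several values at once. A short sketch with illustrative values:

# Sketch: chaining MetricsFilter predicates.
from apache_beam.metrics.metric import MetricsFilter

combined = (MetricsFilter()
            .with_namespace('ns1')
            .with_names(['name1', 'name2'])
            .with_step('step1'))
# result.metrics().query(combined) would then return only matching metrics.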
Example #9
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  known_args = PipelineOptions().view_as(WordcountOptions)
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)


  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return (word, sum(ones))

  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(str))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(known_args.output)

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')    # direct runner
      or result.has_job):               # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.result)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.result.mean)
Example #10
def get_distributions_metric(result, counter_name):
    metrics_filter = MetricsFilter().with_name(counter_name)
    query_result = result.metrics().query(metrics_filter)
    if query_result['distributions']:
        return query_result['distributions'][0].committed
    else:
        return None
Example #11
    def test_user_counter_using_pardo(self):
        class SomeDoFn(beam.DoFn):
            """A custom dummy DoFn using yield."""
            def __init__(self):
                self.user_counter_elements = metrics.Metrics.counter(
                    self.__class__, 'metrics_user_counter_element')

            def process(self, element):
                self.user_counter_elements.inc()
                yield element

        pipeline = TestPipeline()
        nums = pipeline | 'Input' >> beam.Create([1, 2, 3, 4])
        results = nums | 'ApplyPardo' >> beam.ParDo(SomeDoFn())
        assert_that(results, equal_to([1, 2, 3, 4]))

        res = pipeline.run()
        res.wait_until_finish()
        metric_results = (res.metrics().query(
            MetricsFilter().with_name('metrics_user_counter_element')))
        outputs_counter = metric_results['counters'][0]

        self.assertEqual(outputs_counter.key.metric.name,
                         'metrics_user_counter_element')
        self.assertEqual(outputs_counter.committed, 4)
Example #12
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', help='Input file to process')
    parser.add_argument('--output',
                        dest='output',
                        help='Output file to write results')
    args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    lines = p | 'read' >> ReadFromText(args.input)
    words = (lines
             | 'split' >> beam.ParDo(WordExtractingDoFn())
             | 'map' >> beam.Map(lambda x: (x, 1))
             | 'group' >> beam.GroupByKey()
             | 'count' >> beam.Map(
                 lambda word_ones: (word_ones[0], sum(word_ones[1]))))

    words | 'write' >> WriteToText(args.output)

    result = p.run()
    result.wait_until_finish()

    if not hasattr(result, 'has_job'):
        words = MetricsFilter().with_name('words')
        query_result = result.metrics().query(words)
        print(query_result['counters'][0].result)
Example #13
    def test_spanner_write_mutation_groups(self, mock_batch_snapshot_class,
                                           mock_batch_checkout):
        ks = spanner.KeySet(keys=[[1233], [1234]])
        mutation_groups = [
            MutationGroup([
                WriteMutation.insert("roles", ("key", "rolename"),
                                     [('9001233', "mutations-inset-1233")]),
                WriteMutation.insert("roles", ("key", "rolename"),
                                     [('9001234', "mutations-inset-1234")])
            ]),
            MutationGroup([
                WriteMutation.update(
                    "roles", ("key", "rolename"),
                    [('9001234', "mutations-inset-9001233-updated")])
            ]),
            MutationGroup([WriteMutation.delete("roles", ks)])
        ]

        p = TestPipeline()
        _ = (p
             | beam.Create(mutation_groups)
             | WriteToSpanner(project_id=TEST_PROJECT_ID,
                              instance_id=TEST_INSTANCE_ID,
                              database_id=_generate_database_name(),
                              max_batch_size_bytes=100))
        res = p.run()
        res.wait_until_finish()

        metric_results = res.metrics().query(
            MetricsFilter().with_name('SpannerBatches'))
        batches_counter = metric_results['counters'][0]

        self.assertEqual(batches_counter.committed, 3)
        self.assertEqual(batches_counter.attempted, 3)
Example #14
    def test_bigtable_write(self):
        number = self.number
        pipeline_args = self.test_pipeline.options_list
        pipeline_options = PipelineOptions(pipeline_args)

        with beam.Pipeline(options=pipeline_options) as pipeline:
            config_data = {
                'project_id': self.project,
                'instance_id': self.instance,
                'table_id': self.table
            }
            _ = (pipeline
                 | 'Generate Direct Rows' >> GenerateTestRows(
                     number, **config_data))

        assert pipeline.result.state == PipelineState.DONE

        read_rows = self.table.read_rows()
        assert len([_ for _ in read_rows]) == number

        if not hasattr(pipeline.result, 'has_job') or pipeline.result.has_job:
            read_filter = MetricsFilter().with_name('Written Row')
            query_result = pipeline.result.metrics().query(read_filter)
            if query_result['counters']:
                read_counter = query_result['counters'][0]

                logging.info('Number of Rows: %d', read_counter.committed)
                assert read_counter.committed == number
Example #15
def compute_stats(
    input_handle,
    stats_path,
    max_rows=None,
    for_eval=False,
    pipeline_args=None,
    publish_to_bq=None,
    metrics_dataset=None,
    metrics_table=None,
    project=None):
  """Computes statistics on the input data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    stats_path: Directory in which stats are materialized.
    max_rows: Number of rows to query from BigQuery
    for_eval: Query for eval set rows from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
  namespace = metrics_table
  pipeline = beam.Pipeline(argv=pipeline_args)
  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        namespace=namespace,
        filters=MetricsFilter().with_namespace(namespace),
    )

  query = taxi.make_sql(
      table_name=input_handle, max_rows=max_rows, for_eval=for_eval)
  raw_data = (
      pipeline
      | 'ReadBigQuery' >> ReadFromBigQuery(
          query=query, project=project, use_standard_sql=True)
      | 'Measure time: Start' >> beam.ParDo(MeasureTime(namespace))
      | 'ConvertToTFDVInput' >> beam.Map(
          lambda x:
          {key: np.asarray([x[key]])
           for key in x if x[key] is not None}))

  _ = (
      raw_data
      | 'GenerateStatistics' >> tfdv.GenerateStatistics()
      | 'Measure time: End' >> beam.ParDo(MeasureTime(namespace))
      | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
          stats_path,
          shard_name_template='',
          coder=beam.coders.ProtoCoder(
              statistics_pb2.DatasetFeatureStatisticsList)))
  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
Example #16
def main(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')

    parser.add_argument('project_id', help='Google Cloud project ID')
    parser.add_argument('subscription_name', help='Pub/Sub subscription name')

    known_args, pipeline_args = parser.parse_known_args(argv)

    dataflow_sub(known_args.project_id, known_args.subscription_name)

    p = build_pipeline(
        project_id=known_args.project_id,
        input_subscription=known_args.subscription_name,
        output_subscription=known_args.output,
        pipeline_args=pipeline_args,
    )

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)
Example #17
def get_pipeline_metric(results, metric_name, index=0, result_type='counters'):
    metric_filter = MetricsFilter().with_name(metric_name)
    query_result = results.metrics().query(metric_filter)
    try:
        return query_result[result_type][index].committed
    except IndexError:
        logging.info('No key in metrics for %s at index %s, returning 0',
                     metric_name, index)
        return 0
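A minimal end-to-end usage sketch for get_pipeline_metric above, assuming apache_beam with the default DirectRunner; the 'demo' namespace and 'processed' counter name are made up for illustration:

# Minimal pipeline that increments a user counter, then reads it back.
import apache_beam as beam
from apache_beam.metrics.metric import Metrics

def count_and_pass(x, counter=Metrics.counter('demo', 'processed')):
    counter.inc()
    return x

p = beam.Pipeline()
_ = p | beam.Create([1, 2, 3]) | beam.Map(count_and_pass)
pipeline_result = p.run()
pipeline_result.wait_until_finish()
print(get_pipeline_metric(pipeline_result, 'processed'))  # expect 3 on the DirectRunner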
Example #18
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--kind',
                        dest='kind',
                        required=True,
                        help='Datastore Kind')
    parser.add_argument('--namespace',
                        dest='namespace',
                        help='Datastore Namespace')
    parser.add_argument('--ancestor',
                        dest='ancestor',
                        default='root',
                        help='The ancestor key name for all entities.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    parser.add_argument('--read_only',
                        action='store_true',
                        help='Read an existing dataset, do not write first')
    parser.add_argument(
        '--num_shards',
        dest='num_shards',
        type=int,
        # If the system should choose automatically.
        default=0,
        help='Number of output shards')

    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)

    # Write to Datastore if the `read_only` option is not specified.
    if not known_args.read_only:
        write_to_datastore(gcloud_options.project, known_args,
                           pipeline_options)

    # Read entities from Datastore.
    result = read_from_datastore(gcloud_options.project, known_args,
                                 pipeline_options)

    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
        empty_lines_counter = query_result['counters'][0]
        logging.info('number of empty lines: %d',
                     empty_lines_counter.committed)
Example #19
  def testTelemetry(self, decode_examples: bool):
    example_path = self._get_output_data_dir('examples')
    self._prepare_multihead_examples(example_path)
    model_path = self._get_output_data_dir('model')
    self._build_multihead_model(model_path)
    inference_spec_type = model_spec_pb2.InferenceSpecType(
        saved_model_spec=model_spec_pb2.SavedModelSpec(
            model_path=model_path, signature_name=['classify_sum']))
    pipeline = self._make_beam_pipeline()
    _ = (
        pipeline
        | 'ReadExamples' >> beam.io.ReadFromTFRecord(example_path)
        | 'MaybeDecode' >> beam.Map(
            lambda x: x if decode_examples else tf.train.Example.FromString(x))
        | 'RunInference' >> run_inference.RunInferenceImpl(inference_spec_type))
    run_result = pipeline.run()
    run_result.wait_until_finish()

    num_inferences = run_result.metrics().query(
        MetricsFilter().with_name('num_inferences'))
    self.assertTrue(num_inferences['counters'])
    self.assertEqual(num_inferences['counters'][0].result, 2)
    inference_request_batch_size = run_result.metrics().query(
        MetricsFilter().with_name('inference_request_batch_size'))
    self.assertTrue(inference_request_batch_size['distributions'])
    self.assertEqual(
        inference_request_batch_size['distributions'][0].result.sum, 2)
    inference_request_batch_byte_size = run_result.metrics().query(
        MetricsFilter().with_name('inference_request_batch_byte_size'))
    self.assertTrue(inference_request_batch_byte_size['distributions'])
    self.assertEqual(
        inference_request_batch_byte_size['distributions'][0].result.sum,
        sum(element.ByteSize() for element in self._multihead_examples))
    inference_batch_latency_micro_secs = run_result.metrics().query(
        MetricsFilter().with_name('inference_batch_latency_micro_secs'))
    self.assertTrue(inference_batch_latency_micro_secs['distributions'])
    self.assertGreaterEqual(
        inference_batch_latency_micro_secs['distributions'][0].result.sum, 0)
    load_model_latency_milli_secs = run_result.metrics().query(
        MetricsFilter().with_name('load_model_latency_milli_secs'))
    self.assertTrue(load_model_latency_milli_secs['distributions'])
    self.assertGreaterEqual(
        load_model_latency_milli_secs['distributions'][0].result.sum, 0)
Example #20
    def test_direct_runner_metrics(self):
        class MyDoFn(beam.DoFn):
            def start_bundle(self):
                count = Metrics.counter(self.__class__, 'bundles')
                count.inc()

            def finish_bundle(self):
                count = Metrics.counter(self.__class__, 'finished_bundles')
                count.inc()

            def process(self, element):
                gauge = Metrics.gauge(self.__class__, 'latest_element')
                gauge.set(element)
                count = Metrics.counter(self.__class__, 'elements')
                count.inc()
                distro = Metrics.distribution(self.__class__, 'element_dist')
                distro.update(element)
                return [element]

        p = Pipeline(DirectRunner())
        pcoll = (p | beam.Create([1, 2, 3, 4, 5], reshuffle=False)
                 | 'Do' >> beam.ParDo(MyDoFn()))
        assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
        result = p.run()
        result.wait_until_finish()
        metrics = result.metrics().query(MetricsFilter().with_step('Do'))
        namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

        hc.assert_that(
            metrics['counters'],
            hc.contains_inanyorder(
                MetricResult(
                    MetricKey('Do', MetricName(namespace, 'elements')), 5, 5),
                MetricResult(MetricKey('Do', MetricName(namespace, 'bundles')),
                             1, 1),
                MetricResult(
                    MetricKey('Do', MetricName(namespace, 'finished_bundles')),
                    1, 1)))

        hc.assert_that(
            metrics['distributions'],
            hc.contains_inanyorder(
                MetricResult(
                    MetricKey('Do', MetricName(namespace, 'element_dist')),
                    DistributionResult(DistributionData(15, 5, 1, 5)),
                    DistributionResult(DistributionData(15, 5, 1, 5)))))

        gauge_result = metrics['gauges'][0]
        hc.assert_that(
            gauge_result.key,
            hc.equal_to(
                MetricKey('Do', MetricName(namespace, 'latest_element'))))
        hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
        hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))
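The DistributionData(sum, count, min, max) values asserted above surface on a DistributionResult as .sum, .count, .min, .max and a derived .mean; a small standalone sketch (assuming apache_beam):

# Sketch: inspecting a DistributionResult directly.
from apache_beam.metrics.cells import DistributionData, DistributionResult

dist = DistributionResult(DistributionData(15, 5, 1, 5))
print(dist.sum, dist.count, dist.min, dist.max, dist.mean)  # 15 5 1 5 3.0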
Example #21
def run(input_file, output_file):
    """Main entry point; defines and runs the wordcount pipeline."""

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'

    p = beam.Pipeline(options=options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(input_file)

    # Count the occurrences of each word.
    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(
                  lambda word_ones: (word_ones[0], sum(word_ones[1]))))

    # Format the counts into a PCollection of strings.
    output = counts | 'format' >> beam.Map(
        lambda word_count: '%s: %s' % word_count)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(output_file)

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    result.wait_until_finish()

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
        word_lengths_dist = query_result['distributions'][0]
        print('average word length: %d' % word_lengths_dist.committed.mean)
    num_words_filter = MetricsFilter().with_name('num_words')
    query_result = result.metrics().query(num_words_filter)
    if query_result['counters']:
        total_words = query_result['counters'][0]
        print('Number of total words: ' + str(total_words.committed))
Example #22
    def test_timing_metrics(self):
        pipeline = TestPipeline()
        examples = [1, 5, 3, 10]
        pcoll = pipeline | 'start' >> beam.Create(examples)
        fake_clock = FakeClock()
        _ = pcoll | base.RunInference(FakeModelHandler(clock=fake_clock),
                                      clock=fake_clock)
        res = pipeline.run()
        res.wait_until_finish()

        metric_results = (res.metrics().query(
            MetricsFilter().with_name('inference_batch_latency_micro_secs')))
        batch_latency = metric_results['distributions'][0]
        self.assertEqual(batch_latency.result.count, 3)
        self.assertEqual(batch_latency.result.mean, 3000)

        metric_results = (res.metrics().query(
            MetricsFilter().with_name('load_model_latency_milli_secs')))
        load_model_latency = metric_results['distributions'][0]
        self.assertEqual(load_model_latency.result.count, 1)
        self.assertEqual(load_model_latency.result.mean, 500)
Example #23
def get_counter_values(pipeline_result, names, wait_until_finish=True):
    if wait_until_finish:
        pipeline_result.wait_until_finish()
    counter_values = dict()
    for name in names:
        counter = pipeline_result.metrics().query(
            MetricsFilter().with_name(name)
        )['counters']
        assert len(counter) <= 1
        if len(counter) == 1:
            counter_values[name] = counter[0].committed
    return counter_values
Example #24
    def testWriteSplitCounter(self):
        count = 10

        def Pipeline(root):
            data = [tf.train.Example()] * count
            _ = (root
                 | beam.Create(data)
                 | base_example_gen_executor._WriteSplit(
                     self._output_data_dir))

        run_result = direct_runner.DirectRunner().run(Pipeline)
        run_result.wait_until_finish()

        num_instances = run_result.metrics().query(
            MetricsFilter().with_name('num_instances'))
        self.assertTrue(num_instances['counters'])
        self.assertEqual(len(num_instances['counters']), 1)
        self.assertEqual(num_instances['counters'][0].result, count)
Example #25
    def testWriteSplitCounter_WithTFRECORDS_GZIP(self):
        count = 10

        def Pipeline(root):
            data = [tf.train.Example()] * count
            _ = (root
                 | beam.Create(data)
                 | write_split.WriteSplit(
                     self._output_data_dir,
                     example_gen_pb2.FORMAT_TFRECORDS_GZIP))

        run_result = direct_runner.DirectRunner().run(Pipeline)
        run_result.wait_until_finish()

        num_instances = run_result.metrics().query(
            MetricsFilter().with_name('num_instances'))

        self.assertTrue(
            fileio.exists(
                os.path.join(self._output_data_dir,
                             'data_tfrecord-00000-of-00001.gz')))
        self.assertTrue(num_instances['counters'])
        self.assertEqual(len(num_instances['counters']), 1)
        self.assertEqual(num_instances['counters'][0].result, count)
Example #26
def process_tfma(schema_file,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None,
                 publish_to_bq=False,
                 project=None,
                 metrics_table=None,
                 metrics_dataset=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation. This can only be set if input_csv is
      None.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.
    publish_to_bq:
    project:
    metrics_dataset:
    metrics_table:

  Raises:
    ValueError: if input_csv and big_query_table are not specified correctly.
  """

  if big_query_table is None:
    raise ValueError(
        '--big_query_table should be provided.')

  slice_spec = [
      tfma.slicer.SingleSliceSpec(),
      tfma.slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]
  metrics_namespace = metrics_table

  schema = taxi.read_schema(schema_file)

  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_dir,
      add_metrics_callbacks=[
          tfma.post_export_metrics.calibration_plot_and_prediction_histogram(),
          tfma.post_export_metrics.auc_plots()
      ])

  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        filters=MetricsFilter().with_namespace(metrics_namespace)
    )

  pipeline = beam.Pipeline(argv=pipeline_args)

  query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_data = (
      pipeline
      | 'ReadBigQuery' >> ReadFromBigQuery(query=query, project=project,
                                           use_standard_sql=True)
      | 'Measure time: Start' >> beam.ParDo(MeasureTime(metrics_namespace))
      | 'CleanData' >> beam.Map(lambda x: (
          taxi.clean_raw_data_dict(x, raw_feature_spec))))

  # Examples must be in clean tf-example format.
  coder = taxi.make_proto_coder(schema)
  # Prepare arguments for Extract, Evaluate and Write steps
  extractors = tfma.default_extractors(
      eval_shared_model=eval_shared_model,
      slice_spec=slice_spec,
      desired_batch_size=None,
      materialize=False)

  evaluators = tfma.default_evaluators(
      eval_shared_model=eval_shared_model,
      desired_batch_size=None,
      num_bootstrap_samples=1)
  _ = (
      raw_data
      | 'ToSerializedTFExample' >> beam.Map(coder.encode)
      | 'Extract Results' >> tfma.InputsToExtracts()
      | 'Extract and evaluate' >> tfma.ExtractAndEvaluate(
          extractors=extractors,
          evaluators=evaluators)
      | 'Map Evaluations to PCollection' >> MapEvalToPCollection()
      | 'Measure time: End' >> beam.ParDo(
          MeasureTime(metrics_namespace))
  )
  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
Example #27
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Ensure that the experiment flag is set explicitly by the user.
  debug_options = pipeline_options.view_as(DebugOptions)
  use_fn_api = (
      debug_options.experiments and 'beam_fn_api' in debug_options.experiments)
  assert use_fn_api, 'Enable the beam_fn_api experiment in order to run this example.'

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(str))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group_and_sum' >> beam.CombinePerKey(sum))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %s' % (word, count)

  # pylint: disable=unused-variable
  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned

  # TODO(BEAM-2887): Enable after the issue is fixed.
  # output | 'write' >> WriteToText(known_args.output)

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')    # direct runner
      or result.has_job):               # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.committed)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.committed.mean)
Example #28
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None,
                   publish_to_bq=False,
                   project=None,
                   metrics_table=None,
                   metrics_dataset=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: An file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(
                key)] = transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        return outputs

    namespace = metrics_table
    metrics_monitor = None
    if publish_to_bq:
        metrics_monitor = MetricsReader(
            publish_to_bq=publish_to_bq,
            project_name=project,
            bq_table=metrics_table,
            bq_dataset=metrics_dataset,
            namespace=namespace,
            filters=MetricsFilter().with_namespace(namespace))
    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    pipeline = beam.Pipeline(argv=pipeline_args)
    with tft_beam.Context(temp_dir=working_dir):
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> ReadFromBigQuery(
                query=query, project=project, use_standard_sql=True)
            | 'Measure time: start' >> beam.ParDo(MeasureTime(namespace)))
        decode_transform = beam.Map(taxi.clean_raw_data_dict,
                                    raw_feature_spec=raw_feature_spec)

        if transform_dir is None:
            decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
            transform_fn = (
                (decoded_data, raw_data_metadata) |
                ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

            _ = (
                transform_fn |
                ('WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)))
        else:
            transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

        # Shuffling the data before materialization will improve Training
        # effectiveness downstream. Here we shuffle the raw_data (as opposed to
        # decoded data) since it has a compact representation.
        shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle(
        )

        decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
        (transformed_data, transformed_metadata) = (
            ((decoded_data, raw_data_metadata), transform_fn)
            | 'Transform' >> tft_beam.TransformDataset())

        coder = example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema)
        _ = (transformed_data
             | 'SerializeExamples' >> beam.Map(coder.encode)
             | 'Measure time: end' >> beam.ParDo(MeasureTime(namespace))
             | 'WriteExamples' >> beam.io.WriteToTFRecord(
                 os.path.join(working_dir, outfile_prefix),
                 file_name_suffix='.gz'))
    result = pipeline.run()
    result.wait_until_finish()
    if metrics_monitor:
        metrics_monitor.publish_metrics(result)
Example #29
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    parser.add_argument('--format',
                        dest='format',
                        default='text',
                        help='Supported output file formats: %s.' % FORMATS)
    known_args, pipeline_args = parser.parse_known_args(argv)

    if known_args.format not in FORMATS:
        raise ValueError('--format should be one of: %s' % FORMATS)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_text(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    # Format the counts into a PCollection of dictionary strings.

    def format_dict(word_count):
        (word, count) = word_count
        row = dict(zip(HEADER, [word, count]))
        return row

    if known_args.format == 'text':
        output = counts | 'format text' >> beam.Map(format_text)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write text' >> WriteToText(known_args.output)
    elif known_args.format == 'avro':
        output = counts | 'format avro' >> beam.Map(format_dict)

        schema = avro.schema.parse(json.dumps(AVRO_SCHEMA))

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write avro' >> WriteToAvro(
            file_path_prefix=known_args.output,
            schema=schema,
            codec=DEFAULT_CODEC)
    else:
        output = counts | 'format parquet' >> beam.Map(format_dict)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write parquet' >> WriteToParquet(
            file_path_prefix=known_args.output,
            schema=PARQUET_SCHEMA,
            codec=DEFAULT_CODEC)

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)
Example #30
        if y < 200.0:
            self.word_counter.set(y)
            yield element


class Print_Row(beam.DoFn):
    def process(self, element):
        print(element)


# Running locally in the DirectRunner.
with beam.Pipeline() as pipeline:
    (pipeline
     | 'Read lines' >> beam.io.ReadFromText(file_in)
     | 'Par Do' >> beam.ParDo(Head())
     | 'Par D1' >> beam.ParDo(Split())
     | 'Par D2' >> beam.ParDo(Filter())
     | 'Par D3' >> beam.Map(print))
pr = pipeline.run()
pr.wait_until_finish()
empty_lines_filter = MetricsFilter().with_name('empty_lines')
query_result = pr.metrics().query(empty_lines_filter)
print(query_result)

word_lengths_filter = MetricsFilter().with_name('word_lengths')
query_result = pr.metrics().query(word_lengths_filter)
print(query_result)

tot_len_filter = MetricsFilter().with_name('total_words')
query_result = pr.metrics().query(tot_len_filter)
print(query_result)
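
# Rather than printing raw query results, individual values can be pulled out
# as in the earlier examples; a sketch reusing this example's 'total_words'
# counter name (guarded in case the counter was never populated).
total_words_counters = pr.metrics().query(
    MetricsFilter().with_name('total_words'))['counters']
if total_words_counters:
    print('total words:', total_words_counters[0].result)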