Code example #1
    def test_compute_top_sessions(self):
        p = df.Pipeline('DirectPipelineRunner')
        edits = p | df.Create('create', self.EDITS)
        result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

        df.assert_that(result, df.equal_to(self.EXPECTED))
        p.run()
Code example #2
  def test_compute_top_sessions(self):
    p = df.Pipeline('DirectPipelineRunner')
    edits = p | df.Create('create', self.EDITS)
    result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

    df.assert_that(result, df.equal_to(self.EXPECTED))
    p.run()
Code example #3
 def test_basics(self):
     p = df.Pipeline('DirectPipelineRunner')
     rows = (p | df.Create('create', [{
         'month': 1,
         'day': 1,
         'tornado': False
     }, {
         'month': 1,
         'day': 2,
         'tornado': True
     }, {
         'month': 1,
         'day': 3,
         'tornado': True
     }, {
         'month': 2,
         'day': 1,
         'tornado': True
     }]))
     results = bigquery_tornadoes.count_tornadoes(rows)
     df.assert_that(
         results,
         df.equal_to([{
             'month': 1,
             'tornado_count': 2
         }, {
             'month': 2,
             'tornado_count': 1
         }]))
     p.run()
Code example #4
  def test_create_groups(self):
    p = df.Pipeline('DirectPipelineRunner')

    group_ids_pcoll = p | df.Create('create_group_ids', ['A', 'B', 'C'])
    corpus_pcoll = p | df.Create('create_corpus',
                                 [{'f': 'corpus1'},
                                  {'f': 'corpus2'},
                                  {'f': 'corpus3'}])
    words_pcoll = p | df.Create('create_words', [{'f': 'word1'},
                                                 {'f': 'word2'},
                                                 {'f': 'word3'}])
    ignore_corpus_pcoll = p | df.Create('create_ignore_corpus', ['corpus1'])
    ignore_word_pcoll = p | df.Create('create_ignore_word', ['word1'])

    groups = bigquery_side_input.create_groups(group_ids_pcoll, corpus_pcoll,
                                               words_pcoll, ignore_corpus_pcoll,
                                               ignore_word_pcoll)

    def group_matcher(actual):
      self.assertEqual(len(actual), 3)
      for group in actual:
        self.assertEqual(len(group), 3)
        self.assertTrue(group[1].startswith('corpus'))
        self.assertNotEqual(group[1], 'corpus1')
        self.assertTrue(group[2].startswith('word'))
        self.assertNotEqual(group[2], 'word1')

    df.assert_that(groups, group_matcher)
    p.run()
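
The group_matcher above shows that df.assert_that accepts not only df.equal_to but any callable matcher; the callable is invoked with the materialized contents of the PCollection. A minimal sketch of the same pattern (the data and matcher here are illustrative, not taken from any of the projects above):

import google.cloud.dataflow as df

def check_has_three_elements(actual):
  # assert_that passes the PCollection's contents to the matcher as a list.
  assert len(actual) == 3, 'expected 3 elements, got %d' % len(actual)

p = df.Pipeline('DirectPipelineRunner')
values = p | df.Create('create', [1, 2, 3])
df.assert_that(values, check_has_three_elements)
p.run()  # Running the pipeline is what actually triggers the check.
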
Code example #5
 def test_basic(self):
   """Test that the correct result is returned for a simple dataset."""
   results = self._get_result_for_month(1)
   df.assert_that(
       results,
       df.equal_to([{'year': 2010, 'month': 1, 'day': 1, 'mean_temp': 3},
                    {'year': 2012, 'month': 1, 'day': 2, 'mean_temp': 3}]))
   results.pipeline.run()
Code example #6
 def test_basics(self):
   p = df.Pipeline('DirectPipelineRunner')
   rows = (p | df.Create('create', [
       {'month': 1, 'day': 1, 'tornado': False},
       {'month': 1, 'day': 2, 'tornado': True},
       {'month': 1, 'day': 3, 'tornado': True},
       {'month': 2, 'day': 1, 'tornado': True}]))
   results = bigquery_tornadoes.count_tornadoes(rows)
   df.assert_that(results, df.equal_to([{'month': 1, 'tornado_count': 2},
                                        {'month': 2, 'tornado_count': 1}]))
   p.run()
Code example #7
    def test_combine_per_key_with_custom_callable(self):
        """CombinePerKey using a custom function reducing iterables."""
        def multiply(values):
            result = 1
            for v in values:
                result *= v
            return result

        result = (df.Pipeline(runner=df.runners.DirectPipelineRunner())
                  | df.Create(CombinersTest.SAMPLE_DATA)
                  | df.CombinePerKey(multiply))

        df.assert_that(result, df.equal_to([('a', 6), ('b', 200), ('c', 100)]))
        result.pipeline.run()
Code example #8
    def test_combine_per_key_with_callable(self):
        """CombinePerKey using a standard callable reducing iterables.

        A common case for Dataflow combiners is to sum (or max or min) over the
        values of each key. Such standard functions can be used directly as
        combiner functions. In fact, any function "reducing" an iterable to a
        single value can be used.
        """
        result = (df.Pipeline(runner=df.runners.DirectPipelineRunner())
                  | df.Create(CombinersTest.SAMPLE_DATA)
                  | df.CombinePerKey(sum))

        df.assert_that(result, df.equal_to([('a', 6), ('b', 30), ('c', 100)]))
        result.pipeline.run()
Code example #9
  def test_combine_per_key_with_callable(self):
    """CombinePerKey using a standard callable reducing iterables.

    A common case for Dataflow combiners is to sum (or max or min) over the
    values of each key. Such standard functions can be used directly as combiner
    functions. In fact, any function "reducing" an iterable to a single value
    can be used.
    """
    result = (
        df.Pipeline(runner=df.runners.DirectPipelineRunner())
        | df.Create(CombinersTest.SAMPLE_DATA)
        | df.CombinePerKey(sum))

    df.assert_that(result, df.equal_to([('a', 6), ('b', 30), ('c', 100)]))
    result.pipeline.run()
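
As the docstring above notes, any function that reduces an iterable to a single value can serve as the combiner, including built-ins such as max or min. A minimal sketch under the assumption that SAMPLE_DATA is a list of (key, value) pairs like [('a', 1), ('a', 2), ('a', 3), ('b', 10), ('b', 20), ('c', 100)] (inferred from the expected sums and products in the tests above; the actual class attribute is not shown in these snippets):

import google.cloud.dataflow as df

# Illustrative data consistent with the expected results asserted above.
SAMPLE_DATA = [('a', 1), ('a', 2), ('a', 3), ('b', 10), ('b', 20), ('c', 100)]

result = (
    df.Pipeline(runner=df.runners.DirectPipelineRunner())
    | df.Create(SAMPLE_DATA)
    | df.CombinePerKey(max))  # Built-in max reduces each key's values.

df.assert_that(result, df.equal_to([('a', 3), ('b', 20), ('c', 100)]))
result.pipeline.run()
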
Code example #10
  def test_combine_per_key_with_custom_callable(self):
    """CombinePerKey using a custom function reducing iterables."""
    def multiply(values):
      result = 1
      for v in values:
        result *= v
      return result

    result = (
        df.Pipeline(runner=df.runners.DirectPipelineRunner())
        | df.Create(CombinersTest.SAMPLE_DATA)
        | df.CombinePerKey(multiply))

    df.assert_that(result, df.equal_to([('a', 6), ('b', 200), ('c', 100)]))
    result.pipeline.run()
Code example #11
 def test_tfidf_transform(self):
   p = df.Pipeline('DirectPipelineRunner')
   uri_to_line = p | df.Create(
       'create sample',
       [('1.txt', 'abc def ghi'),
        ('2.txt', 'abc def'),
        ('3.txt', 'abc')])
   result = (
       uri_to_line
       | tfidf.TfIdf()
       | df.Map('flatten', lambda (word, (uri, tfidf)): (word, uri, tfidf)))
   df.assert_that(result, df.equal_to(EXPECTED_RESULTS))
   # Run the pipeline. Note that the assert_that above adds to the pipeline
   # a check that the result PCollection contains expected values. To actually
   # trigger the check the pipeline must be run.
   p.run()
Code example #12
def run(argv=None):
    """Runs the debugging wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file[pattern] into a PCollection, count the occurrences of
    # each word and filter by a list of words.
    filtered_words = (
        p | df.io.Read('read', df.io.TextFileSource(known_args.input))
        | CountWords()
        | df.ParDo('FilterText', FilterTextFn('Flourish|stomach')))

    # assert_that is a convenient PTransform that checks a PCollection has an
    # expected value. Asserts are best used in unit tests with small data sets,
    # but one is demonstrated here as a teaching tool.
    #
    # Note assert_that does not provide any output and that successful completion
    # of the Pipeline implies that the expectations were met. Learn more at
    # https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to
    # test your pipeline.
    df.assert_that(filtered_words,
                   df.equal_to([('Flourish', 3), ('stomach', 1)]))

    # Format the counts into a PCollection of strings and write the output using a
    # "Write" transform that has side effects.
    # pylint: disable=unused-variable
    output = (filtered_words
              | df.Map('format', lambda (word, c): '%s: %s' % (word, c))
              | df.io.Write('write', df.io.TextFileSink(known_args.output)))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
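
The comments above recommend assert_that mainly for unit tests over small data sets. A minimal sketch of that style of test, combining only transforms already shown in these examples (the word list and expected counts are illustrative):

import google.cloud.dataflow as df

def test_count_small_dataset():
  p = df.Pipeline('DirectPipelineRunner')
  counts = (p
            | df.Create('create', ['cat', 'dog', 'cat'])
            | df.combiners.Count.PerElement())
  df.assert_that(counts, df.equal_to([('cat', 2), ('dog', 1)]))
  p.run()
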
Code example #13
def run(argv=None):
  """Runs the debugging wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  # Read the text file[pattern] into a PCollection, count the occurrences of
  # each word and filter by a list of words.
  filtered_words = (
      p | df.io.Read('read', df.io.TextFileSource(known_args.input))
      | CountWords() | df.ParDo('FilterText', FilterTextFn('Flourish|stomach')))

  # assert_that is a convenient PTransform that checks a PCollection has an
  # expected value. Asserts are best used in unit tests with small data sets,
  # but one is demonstrated here as a teaching tool.
  #
  # Note assert_that does not provide any output and that successful completion
  # of the Pipeline implies that the expectations were met. Learn more at
  # https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to
  # test your pipeline.
  df.assert_that(filtered_words, df.equal_to([('Flourish', 3), ('stomach', 1)]))

  # Format the counts into a PCollection of strings and write the output using a
  # "Write" transform that has side effects.
  # pylint: disable=unused-variable
  output = (filtered_words
            | df.Map('format', lambda (word, c): '%s: %s' % (word, c))
            | df.io.Write('write', df.io.TextFileSink(known_args.output)))

  # Actually run the pipeline (all operations above are deferred).
  p.run()
Code example #14
  def test_pardo_side_input(self):
    p = df.Pipeline('DirectPipelineRunner')
    words = p | df.Create('start', ['a', 'bb', 'ccc', 'dddd'])

    # [START model_pardo_side_input]
    # Callable takes additional arguments.
    def filter_using_length(word, lower_bound, upper_bound=float('inf')):
      if lower_bound <= len(word) <= upper_bound:
        yield word

    # Construct a deferred side input.
    avg_word_len = words | df.Map(len) | df.CombineGlobally(df.combiners.MeanCombineFn())

    # Call with explicit side inputs.
    small_words = words | df.FlatMap('small', filter_using_length, 0, 3)

    # A single deferred side input.
    larger_than_average = words | df.FlatMap('large',
                                             filter_using_length,
                                             lower_bound=pvalue.AsSingleton(avg_word_len))

    # Mix and match.
    small_but_nontrivial = words | df.FlatMap(filter_using_length,
                                              lower_bound=2,
                                              upper_bound=pvalue.AsSingleton(avg_word_len))
    # [END model_pardo_side_input]

    df.assert_that(small_words, df.equal_to(['a', 'bb', 'ccc']))
    df.assert_that(larger_than_average, df.equal_to(['ccc', 'dddd']),
                   label='larger_than_average')
    df.assert_that(small_but_nontrivial, df.equal_to(['bb']),
                   label='small_but_not_trivial')
    p.run()
Code example #15
File: snippets_test.py  Project: obulpathi/cloud
    def test_pardo_side_input(self):
        p = df.Pipeline("DirectPipelineRunner")
        words = p | df.Create("start", ["a", "bb", "ccc", "dddd"])

        # [START model_pardo_side_input]
        # Callable takes additional arguments.
        def filter_using_length(word, lower_bound, upper_bound=float("inf")):
            if lower_bound <= len(word) <= upper_bound:
                yield word

        # Construct a deferred side input.
        avg_word_len = words | df.Map(len) | df.CombineGlobally(df.combiners.MeanCombineFn())

        # Call with explicit side inputs.
        small_words = words | df.FlatMap("small", filter_using_length, 0, 3)

        # A single deferred side input.
        larger_than_average = words | df.FlatMap(
            "large", filter_using_length, lower_bound=pvalue.AsSingleton(avg_word_len)
        )

        # Mix and match.
        small_but_nontrivial = words | df.FlatMap(
            filter_using_length, lower_bound=2, upper_bound=pvalue.AsSingleton(avg_word_len)
        )
        # [END model_pardo_side_input]

        df.assert_that(small_words, df.equal_to(["a", "bb", "ccc"]))
        df.assert_that(larger_than_average, df.equal_to(["ccc", "dddd"]), label="larger_than_average")
        df.assert_that(small_but_nontrivial, df.equal_to(["bb"]), label="small_but_not_trivial")
        p.run()
Code example #16
    def test_create_groups(self):
        p = df.Pipeline('DirectPipelineRunner')

        group_ids_pcoll = p | df.Create('create_group_ids', ['A', 'B', 'C'])
        corpus_pcoll = p | df.Create('create_corpus', [{
            'f': 'corpus1'
        }, {
            'f': 'corpus2'
        }, {
            'f': 'corpus3'
        }])
        words_pcoll = p | df.Create('create_words', [{
            'f': 'word1'
        }, {
            'f': 'word2'
        }, {
            'f': 'word3'
        }])
        ignore_corpus_pcoll = p | df.Create('create_ignore_corpus',
                                            ['corpus1'])
        ignore_word_pcoll = p | df.Create('create_ignore_word', ['word1'])

        groups = bigquery_side_input.create_groups(group_ids_pcoll,
                                                   corpus_pcoll, words_pcoll,
                                                   ignore_corpus_pcoll,
                                                   ignore_word_pcoll)

        def group_matcher(actual):
            self.assertEqual(len(actual), 3)
            for group in actual:
                self.assertEqual(len(group), 3)
                self.assertTrue(group[1].startswith('corpus'))
                self.assertNotEqual(group[1], 'corpus1')
                self.assertTrue(group[2].startswith('word'))
                self.assertNotEqual(group[2], 'word1')

        df.assert_that(groups, group_matcher)
        p.run()
Code example #17
    def test_pardo_side_input(self):
        p = df.Pipeline('DirectPipelineRunner')
        words = p | df.Create('start', ['a', 'bb', 'ccc', 'dddd'])

        # [START model_pardo_side_input]
        # Callable takes additional arguments.
        def filter_using_length(word, lower_bound, upper_bound=float('inf')):
            if lower_bound <= len(word) <= upper_bound:
                yield word

        # Construct a deferred side input.
        avg_word_len = words | df.Map(len) | df.CombineGlobally(
            df.combiners.MeanCombineFn())

        # Call with explicit side inputs.
        small_words = words | df.FlatMap('small', filter_using_length, 0, 3)

        # A single deferred side input.
        larger_than_average = words | df.FlatMap(
            'large',
            filter_using_length,
            lower_bound=pvalue.AsSingleton(avg_word_len))

        # Mix and match.
        small_but_nontrivial = words | df.FlatMap(
            filter_using_length,
            lower_bound=2,
            upper_bound=pvalue.AsSingleton(avg_word_len))
        # [END model_pardo_side_input]

        df.assert_that(small_words, df.equal_to(['a', 'bb', 'ccc']))
        df.assert_that(larger_than_average,
                       df.equal_to(['ccc', 'dddd']),
                       label='larger_than_average')
        df.assert_that(small_but_nontrivial,
                       df.equal_to(['bb']),
                       label='small_but_not_trivial')
        p.run()
Code example #18
File: snippets.py  Project: MMMdata/DataflowPythonSDK
def examples_wordcount_debugging(renames):
  """DebuggingWordCount example snippets.

  URL:
  https://cloud.google.com/dataflow/examples/wordcount-example#DebuggingWordCount
  """
  import re

  import google.cloud.dataflow as df
  from google.cloud.dataflow.utils.options import PipelineOptions

  # [START example_wordcount_debugging_logging]
  # [START example_wordcount_debugging_aggregators]
  import logging

  class FilterTextFn(df.DoFn):
    """A DoFn that filters for a specific key based on a regular expression."""

    # A custom aggregator can track values in your pipeline as it runs. Create
    # custom aggregators matched_words and unmatched_words.
    matched_words = df.Aggregator('matched_words')
    unmatched_words = df.Aggregator('unmatched_words')

    def __init__(self, pattern):
      self.pattern = pattern

    def process(self, context):
      word, _ = context.element
      if re.match(self.pattern, word):
        # Log at INFO level each element we match. When executing this pipeline
        # using the Dataflow service, these log lines will appear in the Cloud
        # Logging UI.
        logging.info('Matched %s', word)

        # Add 1 to the custom aggregator matched_words
        context.aggregate_to(self.matched_words, 1)
        yield context.element
      else:
        # Log at the "DEBUG" level each element that is not matched. Different
        # log levels can be used to control the verbosity of logging providing
        # an effective mechanism to filter less important information. Note
        # currently only "INFO" and higher level logs are emitted to the Cloud
        # Logger. This log message will not be visible in the Cloud Logger.
        logging.debug('Did not match %s', word)

        # Add 1 to the custom aggregator unmatched_words
        context.aggregate_to(self.unmatched_words, 1)
  # [END example_wordcount_debugging_logging]
  # [END example_wordcount_debugging_aggregators]

  p = df.Pipeline(options=PipelineOptions())
  filtered_words = (
      p
      | df.io.Read(df.io.TextFileSource(
          'gs://dataflow-samples/shakespeare/kinglear.txt'))
      | df.FlatMap('ExtractWords', lambda x: re.findall(r'[A-Za-z\']+', x))
      | df.combiners.Count.PerElement()
      | df.ParDo('FilterText', FilterTextFn('Flourish|stomach')))

  # [START example_wordcount_debugging_assert]
  df.assert_that(filtered_words, df.equal_to([('Flourish', 3), ('stomach', 1)]))
  # [END example_wordcount_debugging_assert]

  output = (filtered_words
            | df.Map('format', lambda (word, c): '%s: %s' % (word, c))
            | df.io.Write(
                'write', df.io.TextFileSink('gs://my-bucket/counts.txt')))

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()
Code example #19
    logging.info('Creating temp file: %s', path)
    with open(path, 'w') as f:
      f.write(contents)

  def test_tfidf_transform(self):
    p = df.Pipeline('DirectPipelineRunner')
    uri_to_line = p | df.Create(
        'create sample',
        [('1.txt', 'abc def ghi'),
         ('2.txt', 'abc def'),
         ('3.txt', 'abc')])
    result = (
        uri_to_line
        | tfidf.TfIdf()
        | df.Map('flatten', lambda (word, (uri, tfidf)): (word, uri, tfidf)))
    df.assert_that(result, df.equal_to(EXPECTED_RESULTS))
    # Run the pipeline. Note that the assert_that above adds to the pipeline
    # a check that the result PCollection contains expected values. To actually
    # trigger the check the pipeline must be run.
    p.run()

  def test_basics(self):
    # Setup the files with expected content.
    temp_folder = tempfile.mkdtemp()
    self.create_file(os.path.join(temp_folder, '1.txt'), 'abc def ghi')
    self.create_file(os.path.join(temp_folder, '2.txt'), 'abc def')
    self.create_file(os.path.join(temp_folder, '3.txt'), 'abc')
    tfidf.run([
        '--uris=%s/*' % temp_folder,
        '--output', os.path.join(temp_folder, 'result')])
    # Parse result file and compare.
Code example #20
 def test_basic_empty_missing(self):
   """Test that the correct empty result is returned for a missing month."""
   results = self._get_result_for_month(4)
   df.assert_that(results, df.equal_to([]))
   results.pipeline.run()
Code example #21
def run(argv=None, assert_results=None):

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_email',
        required=True,
        help='Email database, with each line formatted as "name<TAB>email".')
    parser.add_argument(
        '--input_phone',
        required=True,
        help='Phonebook, with each line formatted as "name<TAB>phone number".')
    parser.add_argument(
        '--input_snailmail',
        required=True,
        help='Address database, with each line formatted as "name<TAB>address".'
    )
    parser.add_argument('--output_tsv',
                        required=True,
                        help='Tab-delimited output file.')
    parser.add_argument('--output_stats',
                        required=True,
                        help='Output file for statistics about the input.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Helper: read a tab-separated key-value mapping from a text file, escape all
    # quotes/backslashes, and convert it to a PCollection of (key, value) pairs.
    def read_kv_textfile(label, textfile):
        return (p
                | df.io.Read('read_%s' % label, textfile)
                | df.Map('backslash_%s' % label,
                         lambda x: re.sub(r'\\', r'\\\\', x))
                | df.Map('escape_quotes_%s' % label,
                         lambda x: re.sub(r'"', r'\"', x))
                | df.Map('split_%s' % label, lambda x: re.split(r'\t+', x, 1)))

    # Read input databases.
    email = read_kv_textfile('email',
                             df.io.TextFileSource(known_args.input_email))
    phone = read_kv_textfile('phone',
                             df.io.TextFileSource(known_args.input_phone))
    snailmail = read_kv_textfile(
        'snailmail', df.io.TextFileSource(known_args.input_snailmail))

    # Group together all entries under the same name.
    grouped = (email, phone, snailmail) | df.CoGroupByKey('group_by_name')

    # Prepare tab-delimited output; something like this:
    # "name"<TAB>"email_1,email_2"<TAB>"phone"<TAB>"first_snailmail_only"
    tsv_lines = grouped | df.Map(lambda
                                 (name, (email, phone, snailmail)): '\t'.join([
                                     '"%s"' % name,
                                     '"%s"' % ','.join(email),
                                     '"%s"' % ','.join(phone),
                                     '"%s"' % next(iter(snailmail), '')
                                 ]))

    # Compute some stats about our database of people.
    luddites = grouped | df.Filter(  # People without email.
        lambda (name, (email, phone, snailmail)): not next(iter(email), None))
    writers = grouped | df.Filter(  # People without phones.
        lambda (name, (email, phone, snailmail)): not next(iter(phone), None))
    nomads = grouped | df.Filter(  # People without addresses.
        lambda (name,
                (email, phone, snailmail)): not next(iter(snailmail), None))

    num_luddites = luddites | df.combiners.Count.Globally('luddites')
    num_writers = writers | df.combiners.Count.Globally('writers')
    num_nomads = nomads | df.combiners.Count.Globally('nomads')

    # Write tab-delimited output.
    # pylint: disable=expression-not-assigned
    tsv_lines | df.io.Write('write_tsv',
                            df.io.TextFileSink(known_args.output_tsv))

    # TODO(silviuc): Move the assert_results logic to the unit test.
    if assert_results is not None:
        expected_luddites, expected_writers, expected_nomads = assert_results
        df.assert_that(num_luddites,
                       df.equal_to([expected_luddites]),
                       label='assert:luddites')
        df.assert_that(num_writers,
                       df.equal_to([expected_writers]),
                       label='assert:writers')
        df.assert_that(num_nomads,
                       df.equal_to([expected_nomads]),
                       label='assert:nomads')
    # Execute pipeline.
    p.run()
Code example #22
File: mergecontacts.py  Project: obulpathi/cloud
def run(argv=None, assert_results=None):

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_email", required=True, help='Email database, with each line formatted as "name<TAB>email".'
    )
    parser.add_argument(
        "--input_phone", required=True, help='Phonebook, with each line formatted as "name<TAB>phone number".'
    )
    parser.add_argument(
        "--input_snailmail", required=True, help='Address database, with each line formatted as "name<TAB>address".'
    )
    parser.add_argument("--output_tsv", required=True, help="Tab-delimited output file.")
    parser.add_argument("--output_stats", required=True, help="Output file for statistics about the input.")
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Helper: read a tab-separated key-value mapping from a text file, escape all
    # quotes/backslashes, and convert it to a PCollection of (key, value) pairs.
    def read_kv_textfile(label, textfile):
        return (
            p
            | df.io.Read("read_%s" % label, textfile)
            | df.Map("backslash_%s" % label, lambda x: re.sub(r"\\", r"\\\\", x))
            | df.Map("escape_quotes_%s" % label, lambda x: re.sub(r'"', r"\"", x))
            | df.Map("split_%s" % label, lambda x: re.split(r"\t+", x, 1))
        )

    # Read input databases.
    email = read_kv_textfile("email", df.io.TextFileSource(known_args.input_email))
    phone = read_kv_textfile("phone", df.io.TextFileSource(known_args.input_phone))
    snailmail = read_kv_textfile("snailmail", df.io.TextFileSource(known_args.input_snailmail))

    # Group together all entries under the same name.
    grouped = (email, phone, snailmail) | df.CoGroupByKey("group_by_name")

    # Prepare tab-delimited output; something like this:
    # "name"<TAB>"email_1,email_2"<TAB>"phone"<TAB>"first_snailmail_only"
    tsv_lines = grouped | df.Map(
        lambda (name, (email, phone, snailmail)): "\t".join(
            ['"%s"' % name, '"%s"' % ",".join(email), '"%s"' % ",".join(phone), '"%s"' % next(iter(snailmail), "")]
        )
    )

    # Compute some stats about our database of people.
    luddites = grouped | df.Filter(  # People without email.
        lambda (name, (email, phone, snailmail)): not next(iter(email), None)
    )
    writers = grouped | df.Filter(  # People without phones.
        lambda (name, (email, phone, snailmail)): not next(iter(phone), None)
    )
    nomads = grouped | df.Filter(  # People without addresses.
        lambda (name, (email, phone, snailmail)): not next(iter(snailmail), None)
    )

    num_luddites = luddites | df.combiners.Count.Globally("luddites")
    num_writers = writers | df.combiners.Count.Globally("writers")
    num_nomads = nomads | df.combiners.Count.Globally("nomads")

    # Write tab-delimited output.
    # pylint: disable=expression-not-assigned
    tsv_lines | df.io.Write("write_tsv", df.io.TextFileSink(known_args.output_tsv))

    # TODO(silviuc): Move the assert_results logic to the unit test.
    if assert_results is not None:
        expected_luddites, expected_writers, expected_nomads = assert_results
        df.assert_that(num_luddites, df.equal_to([expected_luddites]), label="assert:luddites")
        df.assert_that(num_writers, df.equal_to([expected_writers]), label="assert:writers")
        df.assert_that(num_nomads, df.equal_to([expected_nomads]), label="assert:nomads")
    # Execute pipeline.
    p.run()
Code example #23
 def test_basic_empty(self):
   """Test that the correct empty result is returned for a simple dataset."""
   results = self._get_result_for_month(3)
   df.assert_that(results, df.equal_to([]))
   results.pipeline.run()