Example #1
    def preprocess(self, input_path, input_dict, output_path):
        """

    Args:
      input_path: Input specified as uri to CSV file. Each line of csv file
                  contains colon-separated GCS uri to an image and labels
      input_dict: Input dictionary. Specified as text file uri.
                  Each line of the file stores one label.
    """
        opt = self.pipeline_options.view_as(PrepareImagesOptions)
        p = df.Pipeline(options=self.pipeline_options)

        # Read input data.
        csv_data = df.io.TextFileSource(input_path,
                                        strip_trailing_newlines=True)
        dict_data = df.io.TextFileSource(input_dict,
                                         strip_trailing_newlines=True)
        labels = (p | df.Read(StageName.READ_DICTIONARY, dict_data))
        content = (p | df.Read(StageName.READ_CSV, csv_data)
                   | df.Map(StageName.PARSE_CSV,
                            lambda line: csv.reader([line]).next())
                   | df.ParDo(StageName.EXTRACT_LABEL_IDS,
                              ExtractLabelIdsDoFn(), df.pvalue.AsIter(labels))
                   | df.ParDo(StageName.READ_IMAGE, ExtractImageDoFn()))

        # Process input data using common transformations.
        image_graph_uri = os.path.join(opt.input_data_location,
                                       Default.IMAGE_GRAPH_FILENAME)
        examples = (
            content
            | df.ParDo(
                StageName.CONVERT_IMAGE,
                ResizeImageDoFn(Default.IMAGE_TYPE, opt.max_image_width,
                                opt.max_image_height))
            | df.ParDo(
                StageName.ENCODE_EXAMPLE,
                EncodeExampleDoFn(image_graph_uri,
                                  opt.image_graph_jpeg_input_tensor,
                                  opt.image_graph_output_tensor,
                                  opt.training_data_percentage)))

        # Write in JSON format to Text file.
        # Remove redundant whitespace for more compact representation.
        # Images/labels are base64 encoded so will not contain spaces.
        to_json = lambda x: re.sub(r'\s+', ' ',
                                   json_format.MessageToJson(x[0]))

        for dataset in Dataset.ALL:
            _ = (examples
                 | df.Filter(StageName.FILTER + dataset,
                             lambda x, dataset=dataset: x[1] == dataset)
                 | df.Map(StageName.TO_JSON + dataset, to_json)
                 | df.Write(
                     StageName.SAVE + dataset,
                     df.io.TextFileSink('{}.{}.json'.format(
                         output_path, dataset),
                                        num_shards=opt.output_shard_count)))

        # Execute the pipeline.
        p.run()
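In the pipeline above, the labels PCollection is handed to ExtractLabelIdsDoFn as an iterable side input via df.pvalue.AsIter. A minimal sketch of that mechanism, reusing the Create/assert_that pattern from the test examples further down (KeepKnownLabelsDoFn and its data are invented for illustration):

class KeepKnownLabelsDoFn(df.DoFn):
    def process(self, context, known_labels):
        # The side input arrives as an extra argument to process(),
        # materialized as an iterable over the labels PCollection.
        if context.element in set(known_labels):
            yield context.element

p = df.Pipeline(options=PipelineOptions([]))
labels = p | df.Create('labels', ['cat', 'dog'])
words = p | df.Create('words', ['cat', 'fish', 'dog'])
kept = words | df.ParDo(KeepKnownLabelsDoFn(), df.pvalue.AsIter(labels))
assert_that(kept, equal_to(['cat', 'dog']))
p.run()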
Example #2
def run(argv=None):
    """Build and run the pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        required=True,
        help='Input PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
    parser.add_argument(
        '--output_topic',
        required=True,
        help='Output PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Read from the input PubSub topic into a PCollection.
    lines = p | df.io.Read('read', df.io.PubSubSource(known_args.input_topic))

    # Count the occurrences of each word (grouped within fixed windows).
    transformed = (lines
                   | (df.FlatMap('split', lambda x: re.findall(
                       r'[A-Za-z\']+', x)).with_output_types(unicode))
                   | df.Map('pair_with_one', lambda x: (x, 1))
                   | df.WindowInto(window.FixedWindows(15, 0))
                   | df.GroupByKey('group')
                   | df.Map('count', lambda (word, ones): (word, sum(ones)))
                   | df.Map('format', lambda tup: '%s: %d' % tup))

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    transformed | df.io.Write('pubsub_write',
                              df.io.PubSubSink(known_args.output_topic))

    p.run()
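Ignoring the windowing and the PubSub I/O, the core counting chain can be exercised locally with the Create/assert_that pattern used in the test examples below (a sketch; imports assumed as in those tests):

p = df.Pipeline(options=PipelineOptions([]))
words = p | df.Create(['to', 'be', 'or', 'not', 'to', 'be'])
counts = (words
          | df.Map('pair_with_one', lambda x: (x, 1))
          | df.GroupByKey('group')
          | df.Map('count', lambda (word, ones): (word, sum(ones))))
assert_that(counts, equal_to([('be', 2), ('not', 1), ('or', 1), ('to', 2)]))
p.run()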
Example #3
 def read_kv_textfile(label, textfile):
     return (p
             | df.io.Read('read_%s' % label, textfile)
             | df.Map('backslash_%s' % label,
                      lambda x: re.sub(r'\\', r'\\\\', x))
             | df.Map('escape_quotes_%s' % label,
                      lambda x: re.sub(r'"', r'\"', x))
             | df.Map('split_%s' % label, lambda x: re.split(r'\t+', x, 1)))
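The three Map stages simply escape backslashes and double quotes, then split each line on tabs. Their effect on a single line, in plain Python (the sample data is invented for illustration):

import re

line = 'some_key\tC:\\temp says "hi"'
escaped = re.sub(r'\\', r'\\\\', line)     # double every backslash
escaped = re.sub(r'"', r'\"', escaped)     # put a backslash before each quote
key, rest = re.split(r'\t+', escaped, 1)   # 'some_key', the escaped remainder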
Example #4
    def _run_repeat_test_good(self, repeat):
        # As a positional argument.
        result = ['a', 'bb', 'c'] | df.Map(repeat, 3)
        self.assertEqual(['aaa', 'bbbbbb', 'ccc'], sorted(result))

        # As a keyword argument.
        result = ['a', 'bb', 'c'] | df.Map(repeat, times=3)
        self.assertEqual(['aaa', 'bbbbbb', 'ccc'], sorted(result))
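The repeat callables exercised by this test are defined elsewhere in the test module; a minimal variant consistent with the definitions shown in Examples #13 and #14 might look like this (the default value is an assumption):

@typehints.with_input_types(str, int)
def repeat(s, times=3):
    return s * times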
Example #5
 def _run_repeat_test_bad(self, repeat):
     # Various mismatches.
     with self.assertRaises(typehints.TypeCheckError):
         ['a', 'bb', 'c'] | df.Map(repeat, 'z')
     with self.assertRaises(typehints.TypeCheckError):
         ['a', 'bb', 'c'] | df.Map(repeat, times='z')
     with self.assertRaises(typehints.TypeCheckError):
         ['a', 'bb', 'c'] | df.Map(repeat, 3, 4)
     if not inspect.getargspec(repeat).defaults:
         with self.assertRaises(typehints.TypeCheckError):
             ['a', 'bb', 'c'] | df.Map(repeat)
Example #6
def run(argv=None):
    # pylint: disable=expression-not-assigned

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file pattern to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file pattern to write results to.')
    parser.add_argument('--checksum_output',
                        required=True,
                        help='Checksum output file pattern.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file[pattern] into a PCollection.
    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

    # Split each line into a (key, value) pair, group by key, and re-emit the records.
    output = (lines
              | df.Map('split', lambda x: (x[:10], x[10:99]))
              | df.GroupByKey('group')
              | df.FlatMap('format', lambda (key, vals):
                           ['%s%s' % (key, val) for val in vals]))

    input_csum = (lines
                  | df.Map('input-csum', crc32line)
                  | df.CombineGlobally('combine-input-csum', sum)
                  | df.Map('hex-format', lambda x: '%x' % x))
    input_csum | df.io.Write(
        'write-input-csum',
        df.io.TextFileSink(known_args.checksum_output + '-input'))

    # Write the output using a "Write" transform that has side effects.
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))
    # Write the output checksum
    output_csum = (output
                   | df.Map('output-csum', crc32line)
                   | df.CombineGlobally('combine-output-csum', sum)
                   | df.Map('hex-format-output', lambda x: '%x' % x))
    output_csum | df.io.Write(
        'write-output-csum',
        df.io.TextFileSink(known_args.checksum_output + '-output'))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
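The crc32line helper is not shown in this snippet. A plausible sketch, assuming it computes a per-line CRC32 masked to an unsigned 32-bit value so the CombineGlobally sum is stable:

import zlib

def crc32line(line):
    # Hypothetical helper: unsigned CRC32 of one line of text.
    return zlib.crc32(line) & 0xffffffff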
Example #7
    def test_deferred_side_input_iterable(self):
        @typehints.with_input_types(str, typehints.Iterable[str])
        def concat(glue, items):
            return glue.join(sorted(items))

        p = df.Pipeline(options=PipelineOptions([]))
        main_input = p | df.Create(['a', 'bb', 'c'])
        side_input = p | df.Create('side', ['x', 'y', 'z'])
        result = main_input | df.Map(concat, pvalue.AsIter(side_input))
        assert_that(result, equal_to(['xayaz', 'xbbybbz', 'xcycz']))
        p.run()

        bad_side_input = p | df.Create('bad_side', [1, 2, 3])
        with self.assertRaises(typehints.TypeCheckError):
            main_input | df.Map('fail', concat, pvalue.AsIter(bad_side_input))
Example #8
    def test_non_function(self):
        result = ['a', 'bb', 'c'] | df.Map(str.upper)
        self.assertEqual(['A', 'BB', 'C'], sorted(result))

        result = ['xa', 'bbx', 'xcx'] | df.Map(str.strip, 'x')
        self.assertEqual(['a', 'bb', 'c'], sorted(result))

        result = ['1', '10', '100'] | df.Map(int)
        self.assertEqual([1, 10, 100], sorted(result))

        result = ['1', '10', '100'] | df.Map(int, 16)
        self.assertEqual([1, 16, 256], sorted(result))

        with self.assertRaises(typehints.TypeCheckError):
            [1, 2, 3] | df.Map(str.upper)
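The extra argument to df.Map is forwarded to the callable, so df.Map(int, 16) parses each string as hexadecimal; in plain Python:

assert [int(x, 16) for x in ['1', '10', '100']] == [1, 16, 256]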
Example #9
    def test_pardo_using_map(self):
        words = ['aa', 'bbb', 'c']
        # [START model_pardo_using_map]
        word_lengths = words | df.Map(len)
        # [END model_pardo_using_map]

        self.assertEqual({2, 3, 1}, set(word_lengths))
Example #10
 def test_runtime_checks_on(self):
     p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)
     with self.assertRaises(typehints.TypeCheckError):
         # [START type_hints_runtime_on]
         p.options.view_as(TypeOptions).runtime_type_check = True
         p | df.Create(['a']) | df.Map(lambda x: 3).with_output_types(str)
         p.run()
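For contrast, a sketch of the passing case: declaring the output type that the lambda actually produces lets the runtime check succeed (same setup and imports as above):

p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)
p.options.view_as(TypeOptions).runtime_type_check = True
# pylint: disable=expression-not-assigned
p | df.Create(['a']) | df.Map(lambda x: 3).with_output_types(int)
p.run()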
Example #11
def run(argv=None):

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))
    counts = (lines
              | (df.ParDo('split',
                          WordExtractingDoFn()).with_output_types(unicode))
              | df.Map('pair_with_one', lambda x: (x, 1))
              | df.GroupByKey('group')
              | df.Map('count', lambda (word, ones): (word, sum(ones))))
    output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c))
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))

    result = p.run()
    empty_line_values = result.aggregated_values(empty_line_aggregator)
    logging.info('number of empty lines: %d', sum(empty_line_values.values()))
    word_length_values = result.aggregated_values(average_word_size_aggregator)
    logging.info('average word lengths: %s', word_length_values.values())
Example #12
def run(argv=sys.argv[1:]):
    """Runs the workflow computing total points from a collection of matches."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Register the custom coder for the Player class, so that it will be used in
    # the computation.
    coders.registry.register_coder(Player, PlayerCoder)

    (p  # pylint: disable=expression-not-assigned
     | df.io.Read('read', df.io.TextFileSource(known_args.input))
     # The get_players function is annotated with a type hint above, so the type
     # system knows the output type of the following operation is a key-value pair
     # of a Player and an int. Please see the documentation for details on
     # types that are inferred automatically as well as other ways to specify
     # type hints.
     | df.Map('get players', get_players)
     # The output type hint of the previous step is used to infer that the key
     # type of the following operation is the Player type. Since a custom coder
     # is registered for the Player class above, a PlayerCoder will be used to
     # encode Player objects as keys for this combine operation.
     | df.CombinePerKey(sum) | df.Map(lambda (k, v): '%s,%d' % (k.name, v))
     | df.io.Write('write', df.io.TextFileSink(known_args.output)))
    p.run()
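The get_players function referenced above is not included in this snippet. A plausible sketch, modeled on parse_player_and_score from Example #30 (the record format and the helper body are assumptions):

@df.typehints.with_output_types(df.typehints.Tuple[Player, int])
def get_players(descriptor):
    # Assumed record format: "name,team,score".
    name, team, score = descriptor.split(',')
    return Player(team, name), int(score)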
Example #13
    def test_bad_main_input(self):
        @typehints.with_input_types(str, int)
        def repeat(s, times):
            return s * times

        with self.assertRaises(typehints.TypeCheckError):
            [1, 2, 3] | df.Map(repeat, 3)
Example #14
    def test_deferred_side_inputs(self):
        @typehints.with_input_types(str, int)
        def repeat(s, times):
            return s * times

        p = df.Pipeline(options=PipelineOptions([]))
        main_input = p | df.Create(['a', 'bb', 'c'])
        side_input = p | df.Create('side', [3])
        result = main_input | df.Map(repeat, pvalue.AsSingleton(side_input))
        assert_that(result, equal_to(['aaa', 'bbbbbb', 'ccc']))
        p.run()

        bad_side_input = p | df.Create('bad_side', ['z'])
        with self.assertRaises(typehints.TypeCheckError):
            main_input | df.Map('again', repeat,
                                pvalue.AsSingleton(bad_side_input))
Example #15
    def test_varargs_side_input_hint(self):
        @typehints.with_input_types(str, int)
        def repeat(s, *times):
            return s * times[0]

        result = ['a', 'bb', 'c'] | df.Map(repeat, 3)
        self.assertEqual(['aaa', 'bbbbbb', 'ccc'], sorted(result))
Example #16
def run(argv=None):
  """Build and run the pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_topic', dest='input_topic', required=True,
      help='Input PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
  parser.add_argument(
      '--output_topic', dest='output_topic', required=True,
      help='Output PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  # Read from the input PubSub topic into a PCollection.
  lines = p | df.io.Read(
      'read', df.io.PubSubSource(known_args.input_topic))

  # Capitalize the characters in each line.
  transformed = (lines
                 | (df.Map('capitalize', lambda x: x.upper())))

  # Write to PubSub.
  # pylint: disable=expression-not-assigned
  transformed | df.io.Write(
      'pubsub_write', df.io.PubSubSink(known_args.output_topic))

  p.run()
Example #17
 def apply(self, pcoll):
     return (pcoll
             | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x)
                           ).with_output_types(unicode))
             | df.Map('pair_with_one', lambda x: (x, 1))
             | df.GroupByKey('group')
             | df.Map('count', lambda (word, ones): (word, sum(ones))))
Example #18
    def test_loose_bounds(self):
        @typehints.with_input_types(typehints.Union[int, float, long])
        @typehints.with_output_types(basestring)
        def format_number(x):
            return '%g' % x

        result = [1, 2, 3] | df.Map(format_number)
        self.assertEqual(['1', '2', '3'], sorted(result))
Example #19
    def test_pardo_with_label(self):
        words = ['aa', 'bbc', 'defg']
        # [START model_pardo_with_label]
        result = words | df.Map('CountUniqueLetters',
                                lambda word: len(set(word)))
        # [END model_pardo_with_label]

        self.assertEqual({1, 2, 4}, set(result))
Example #20
    def test_bad_types(self):
        p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)

        # [START type_hints_missing_define_numbers]
        numbers = p | df.Create(['1', '2', '3'])
        # [END type_hints_missing_define_numbers]

        # Consider the following code.
        # [START type_hints_missing_apply]
        evens = numbers | df.Filter(lambda x: x % 2 == 0)
        # [END type_hints_missing_apply]

        # Now suppose numbers was defined as in the snippet above.
        # When running this pipeline, you'd get a runtime error,
        # possibly on a remote machine, possibly very late.

        with self.assertRaises(TypeError):
            p.run()

        # To catch this early, we can assert what types we expect.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_takes]
            p.options.view_as(TypeOptions).pipeline_type_check = True
            evens = numbers | df.Filter(lambda x: x % 2 == 0).with_input_types(
                int)
            # [END type_hints_takes]

        # Type hints can be declared on DoFns and callables as well, rather
        # than where they're used, to be more self-contained.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_do_fn]
            @df.typehints.with_input_types(int)
            class FilterEvensDoFn(df.DoFn):
                def process(self, context):
                    if context.element % 2 == 0:
                        yield context.element

            evens = numbers | df.ParDo(FilterEvensDoFn())
            # [END type_hints_do_fn]

        words = p | df.Create('words', ['a', 'bb', 'c'])
        # Output types can be asserted as well, and hints can be attached to
        # whole transforms. This helps document the contract and checks it at
        # pipeline construction time.
        # [START type_hints_transform]
        T = df.typehints.TypeVariable('T')

        @df.typehints.with_input_types(T)
        @df.typehints.with_output_types(df.typehints.Tuple[int, T])
        class MyTransform(df.PTransform):
            def apply(self, pcoll):
                return pcoll | df.Map(lambda x: (len(x), x))

        words_with_lens = words | MyTransform()
        # [END type_hints_transform]

        with self.assertRaises(typehints.TypeCheckError):
            words_with_lens | df.Map(lambda x: x).with_input_types(
                df.typehints.Tuple[int, int])
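One way to satisfy the declared input type is to convert the string elements to ints before filtering (a sketch, assuming the inputs are numeric strings as in the Create above):

evens = (numbers
         | df.Map('to_int', int)
         | df.Filter(lambda x: x % 2 == 0).with_input_types(int))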
Example #21
def read_documents(pipeline, uris):
    """Read the documents at the provided uris and returns (uri, line) pairs."""
    pcolls = []
    for uri in uris:
        pcolls.append(pipeline
                      | df.io.Read('read: %s' % uri, df.io.TextFileSource(uri))
                      | df.Map('withkey: %s' %
                               uri, lambda v, uri: (uri, v), uri))
    return pcolls | df.Flatten('flatten read pcolls')
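Note that uri is passed to df.Map as an extra argument rather than being captured by the lambda; binding the current value this way avoids the usual late-binding pitfall with closures created in a loop. A plain-Python illustration of the difference:

fns_late = [lambda v: (uri, v) for uri in ['a.txt', 'b.txt']]
fns_bound = [lambda v, uri=uri: (uri, v) for uri in ['a.txt', 'b.txt']]
assert fns_late[0]('x') == ('b.txt', 'x')    # closure sees the final uri
assert fns_bound[0]('x') == ('a.txt', 'x')   # value bound at definition time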
Example #22
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/5: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to BlockingDataflowPipelineRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectPipelineRunner',
        # CHANGE 3/5: Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=SET_YOUR_PROJECT_ID_HERE',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file[pattern] into a PCollection.
    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

    # Count the occurrences of each word.
    counts = (lines
              | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x)).
                 with_output_types(unicode))
              | df.Map('pair_with_one', lambda x: (x, 1))
              | df.GroupByKey('group')
              | df.Map('count', lambda (word, ones): (word, sum(ones))))

    # Format the counts into a PCollection of strings.
    output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
Example #23
def run(argv=None):
    """Runs the workflow counting the long words and short words separately."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output prefix for files to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    lines = p | df.Read('read', df.io.TextFileSource(known_args.input))

    # with_outputs allows accessing the side outputs of a DoFn.
    split_lines_result = (
        lines
        | df.ParDo(SplitLinesToWordsFn()).with_outputs(
            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS,
            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
            main='words'))

    # split_lines_result is an object of type DoOutputsTuple. It supports
    # accessing the results in several alternative ways.
    words, _, _ = split_lines_result
    short_words = split_lines_result[
        SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS]
    character_count = split_lines_result.tag_character_count

    # pylint: disable=expression-not-assigned
    (character_count
     | df.Map('pair_with_key', lambda x: ('chars_temp_key', x))
     | df.GroupByKey()
     | df.Map('count chars', lambda (_, counts): sum(counts))
     | df.Write('write chars',
                df.io.TextFileSink(known_args.output + '-chars')))

    # pylint: disable=expression-not-assigned
    (short_words
     | CountWords('count short words')
     | df.Write('write short words',
                df.io.TextFileSink(known_args.output + '-short-words')))

    # pylint: disable=expression-not-assigned
    (words
     | CountWords('count words')
     | df.Write('write words',
                df.io.TextFileSink(known_args.output + '-words')))

    p.run()
Example #24
def generate_julia_set_colors(pipeline, c, n, max_iterations):
    """Compute julia set coordinates for each point in our set."""
    def point_set(n):
        for x in range(n):
            for y in range(n):
                yield (x, y)

    julia_set_colors = (pipeline
                        | df.Create('add points', point_set(n))
                        | df.Map(get_julia_set_point_color, c, n,
                                 max_iterations))

    return julia_set_colors
Example #25
def filter_cold_days(input_data, month_filter):
    """Workflow computing rows in a specific month with low temperatures.

  Args:
    input_data: a PCollection of dictionaries representing table rows. Each
      dictionary must have the keys 'year', 'month', 'day', and 'mean_temp'.
    month_filter: an int representing the month for which colder-than-average
      days should be returned.

  Returns:
    A PCollection of dictionaries with the same keys described above. Each
      row represents a day in the specified month where temperatures were
      colder than the global mean temperature in the entire dataset.
  """

    # Project to only the desired fields from a complete input row.
    # E.g., SELECT f1, f2, f3, ... FROM InputTable.
    projection_fields = ['year', 'month', 'day', 'mean_temp']
    fields_of_interest = (
        input_data
        | df.Map('projected',
                 lambda row: {f: row[f]
                              for f in projection_fields}))

    # Compute the global mean temperature.
    global_mean = AsSingleton(
        fields_of_interest
        | df.Map('extract mean', lambda row: row['mean_temp'])
        | df.combiners.Mean.Globally('global mean'))

    # Filter to the rows representing days in the month of interest
    # in which the mean daily temperature is below the global mean.
    return (
        fields_of_interest
        | df.Filter('desired month', lambda row: row['month'] == month_filter)
        | df.Filter('below mean', lambda row, mean: row['mean_temp'] < mean,
                    global_mean))
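The AsSingleton value is computed by the pipeline itself and is only materialized when the Filter runs. A minimal sketch of the same pattern on plain numbers, using the Create/assert_that style of the test examples (imports assumed as in the surrounding snippets):

p = df.Pipeline(options=PipelineOptions([]))
temps = p | df.Create([1.0, 5.0, 9.0])
mean = AsSingleton(temps | df.combiners.Mean.Globally('mean'))
below = temps | df.Filter('below mean', lambda x, m: x < m, mean)
assert_that(below, equal_to([1.0]))
p.run()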
Example #26
  def Count(label, pcoll, factor=1):
    """Count as a decorated function with a side input.

    Args:
      label: optional label for this transform
      pcoll: the PCollection passed in from the previous transform
      factor: the amount by which to count

    Returns:
      A PCollection counting the number of times each unique element occurs.
    """
    return (
        pcoll
        | df.Map('Init', lambda v: (v, factor))
        | df.CombinePerKey(sum))
Example #27
def run(argv=None):

    parser = argparse.ArgumentParser()
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)
    # A hundred work items of a hundred thousand tries each.
    (p  # pylint: disable=expression-not-assigned
     | df.Create('Initialize', [100000] * 100).with_output_types(int)
     | df.Map('Run trials', run_trials)
     | df.CombineGlobally('Sum', combine_results).without_defaults()
     | df.io.Write('Write',
                   df.io.TextFileSink(known_args.output, coder=JsonCoder())))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
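The run_trials and combine_results helpers (and JsonCoder) are defined elsewhere in the sample. Plausible sketches of the two computations, assuming a Monte Carlo estimate of pi (the exact shapes of the helpers are assumptions):

import random

def run_trials(tries):
    # Hypothetical: count random points in the unit square that fall
    # inside the quarter circle of radius 1.
    inside = 0
    for _ in range(tries):
        x, y = random.random(), random.random()
        if x * x + y * y <= 1.0:
            inside += 1
    return tries, inside

def combine_results(results):
    # Hypothetical: total the per-bundle counts and estimate pi.
    total = sum(r[0] for r in results)
    inside = sum(r[1] for r in results)
    return total, inside, 4.0 * inside / total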
Example #28
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file[pattern] into a PCollection.
    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

    # Count the occurrences of each word.
    counts = (lines
              | (df.ParDo('split',
                          WordExtractingDoFn()).with_output_types(unicode))
              | df.Map('pair_with_one', lambda x: (x, 1))
              | df.GroupByKey('group')
              | df.Map('count', lambda (word, ones): (word, sum(ones))))

    # Format the counts into a PCollection of strings.
    output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    empty_line_values = result.aggregated_values(empty_line_aggregator)
    logging.info('number of empty lines: %d', sum(empty_line_values.values()))
    word_length_values = result.aggregated_values(average_word_size_aggregator)
    logging.info('average word lengths: %s', word_length_values.values())
Example #29
    def test_pardo_side_input(self):
        p = df.Pipeline('DirectPipelineRunner')
        words = p | df.Create('start', ['a', 'bb', 'ccc', 'dddd'])

        # [START model_pardo_side_input]
        # Callable takes additional arguments.
        def filter_using_length(word, lower_bound, upper_bound=float('inf')):
            if lower_bound <= len(word) <= upper_bound:
                yield word

        # Construct a deferred side input.
        avg_word_len = words | df.Map(len) | df.CombineGlobally(
            df.combiners.MeanCombineFn())

        # Call with explicit side inputs.
        small_words = words | df.FlatMap('small', filter_using_length, 0, 3)

        # A single deferred side input.
        larger_than_average = words | df.FlatMap(
            'large',
            filter_using_length,
            lower_bound=pvalue.AsSingleton(avg_word_len))

        # Mix and match.
        small_but_nontrivial = words | df.FlatMap(
            filter_using_length,
            lower_bound=2,
            upper_bound=pvalue.AsSingleton(avg_word_len))
        # [END model_pardo_side_input]

        df.assert_that(small_words, df.equal_to(['a', 'bb', 'ccc']))
        df.assert_that(larger_than_average,
                       df.equal_to(['ccc', 'dddd']),
                       label='larger_than_average')
        df.assert_that(small_but_nontrivial,
                       df.equal_to(['bb']),
                       label='small_but_not_trivial')
        p.run()
Example #30
    def test_deterministic_key(self):
        p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)
        lines = [
            'banana,fruit,3', 'kiwi,fruit,2', 'kiwi,fruit,2', 'zucchini,veg,3'
        ]

        # [START type_hints_deterministic_key]
        class Player(object):
            def __init__(self, team, name):
                self.team = team
                self.name = name

        class PlayerCoder(df.coders.Coder):
            def encode(self, player):
                return '%s:%s' % (player.team, player.name)

            def decode(self, s):
                return Player(*s.split(':'))

            def is_deterministic(self):
                return True

        df.coders.registry.register_coder(Player, PlayerCoder)

        def parse_player_and_score(csv):
            name, team, score = csv.split(',')
            return Player(team, name), int(score)

        totals = (lines
                  | df.Map(parse_player_and_score)
                  | df.CombinePerKey(sum).with_input_types(
                      df.typehints.Tuple[Player, int]))
        # [END type_hints_deterministic_key]

        self.assertEquals({('banana', 3), ('kiwi', 4), ('zucchini', 3)},
                          set(totals | df.Map(lambda (k, v): (k.name, v))))