def run(argv=None):
    """Runs the workflow computing total points from a collection of matches."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Register the custom coder for the Player class, so that it will be used in
    # the computation.
    coders.registry.register_coder(Player, PlayerCoder)

    (p  # pylint: disable=expression-not-assigned
     | df.io.Read('read', df.io.TextFileSource(known_args.input))
     # The get_players function is annotated with a type hint above, so the type
     # system knows the output type of the following operation is a key-value pair
     # of a Player and an int. Please see the documentation for details on
     # types that are inferred automatically as well as other ways to specify
     # type hints.
     | df.Map('get players', get_players)
     # The output type hint of the previous step is used to infer that the key
     # type of the following operation is the Player type. Since a custom coder
     # is registered for the Player class above, a PlayerCoder will be used to
     # encode Player objects as keys for this combine operation.
     | df.CombinePerKey(sum)
     | df.Map(lambda (k, v): '%s,%d' % (k.name, v))
     | df.io.Write('write', df.io.TextFileSink(known_args.output)))
    p.run()
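Player and PlayerCoder are assumed to be defined elsewhere in this module; a
minimal sketch, consistent with the deterministic-key example later in this
collection:

class Player(object):
    def __init__(self, team, name):
        self.team = team
        self.name = name


class PlayerCoder(coders.Coder):
    """Encodes a Player as 'team:name' so it can serve as a grouping key."""

    def encode(self, player):
        return '%s:%s' % (player.team, player.name)

    def decode(self, s):
        return Player(*s.split(':'))

    def is_deterministic(self):
        # Keys must encode deterministically for CombinePerKey to group them.
        return True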
Example No. 2
    def test_combine_per_key(self):
        player_accuracies = [('cat', 1), ('cat', 5), ('cat', 9), ('cat', 1),
                             ('dog', 5), ('dog', 2)]
        # [START combine_per_key]
        avg_accuracy_per_player = player_accuracies | df.CombinePerKey(
            df.combiners.MeanCombineFn())
        # [END combine_per_key]
        self.assertEqual({('cat', 4.0), ('dog', 3.5)},
                         set(avg_accuracy_per_player))
    def test_combine_per_key_with_custom_callable(self):
        """CombinePerKey using a custom function reducing iterables."""
        def multiply(values):
            result = 1
            for v in values:
                result *= v
            return result

        result = (df.Pipeline(runner=df.runners.DirectPipelineRunner())
                  | df.Create(CombinersTest.SAMPLE_DATA)
                  | df.CombinePerKey(multiply))

        df.assert_that(result, df.equal_to([('a', 6), ('b', 200), ('c', 100)]))
        result.pipeline.run()
    def test_combine_per_key_with_callable(self):
        """CombinePerKey using a standard callable reducing iterables.

        A common case for Dataflow combiners is to sum (or max or min) over
        the values of each key. Such standard functions can be used directly
        as combiner functions. In fact, any function "reducing" an iterable
        to a single value can be used (see the sketch after these tests).
        """
        result = (df.Pipeline(runner=df.runners.DirectPipelineRunner())
                  | df.Create(CombinersTest.SAMPLE_DATA)
                  | df.CombinePerKey(sum))

        df.assert_that(result, df.equal_to([('a', 6), ('b', 30), ('c', 100)]))
        result.pipeline.run()
    def test_top_shorthands(self):
        pipeline = Pipeline('DirectPipelineRunner')

        pcoll = pipeline | Create('start', [6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
        result_top = pcoll | df.CombineGlobally('top', combiners.Largest(5))
        result_bot = pcoll | df.CombineGlobally('bot', combiners.Smallest(4))
        assert_that(result_top,
                    equal_to([[9, 6, 6, 5, 3]]),
                    label='assert:top')
        assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')

        pcoll = pipeline | Create(
            'start-perkey', [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
        result_ktop = pcoll | df.CombinePerKey('top-perkey',
                                               combiners.Largest(5))
        result_kbot = pcoll | df.CombinePerKey('bot-perkey',
                                               combiners.Smallest(4))
        assert_that(result_ktop,
                    equal_to([('a', [9, 6, 6, 5, 3])]),
                    label='k:top')
        assert_that(result_kbot,
                    equal_to([('a', [0, 1, 1, 1])]),
                    label='k:bot')
        pipeline.run()
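As noted in the docstring above, any callable that reduces an iterable to a
single value can serve as a combiner. A minimal sketch using the builtin max,
applied eagerly to an in-memory list just like the tests above:

per_key_maxima = [('a', 1), ('a', 3), ('b', 10)] | df.CombinePerKey(max)
# per_key_maxima contains [('a', 3), ('b', 10)]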
Example No. 6
def Count(label, pcoll, factor=1):
    """Count as a decorated function with a side input.

    Args:
      label: optional label for this transform
      pcoll: the PCollection passed in from the previous transform
      factor: the amount by which each occurrence is counted

    Returns:
      A PCollection counting the number of times each unique element occurs.
    """
    return (
        pcoll
        | df.Map('Init', lambda v: (v, factor))
        | df.CombinePerKey(sum))
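The decorator itself is not shown in this fragment; "Count as a decorated
function" suggests the SDK's ptransform_fn wrapper. Assuming that, the label
and the factor side input are bound at construction time and pcoll by the pipe:

# Hypothetical usage; 'words' is any PCollection of hashable elements.
counts = words | Count('count words', 2)  # each occurrence contributes 2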
Example No. 7
def run(argv=None):
    """Runs the workflow computing total points from a collection of matches."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)
    (p  # pylint: disable=expression-not-assigned
     | df.io.Read('read',
                  df.io.TextFileSource(known_args.input, coder=JsonCoder()))
     | df.FlatMap('points', compute_points)
     | df.CombinePerKey(sum)
     | df.io.Write('write',
                   df.io.TextFileSink(known_args.output, coder=JsonCoder())))
    p.run()
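JsonCoder is not defined in this fragment; a minimal sketch, assuming each
record is a JSON-serializable value stored one per line:

import json


class JsonCoder(object):
    """A coder mapping each record to and from its JSON text form."""

    def encode(self, x):
        return json.dumps(x)

    def decode(self, x):
        return json.loads(x)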
def count_tornadoes(input_data):
    """Workflow computing the number of tornadoes for each month that had one.

    Args:
      input_data: a PCollection of dictionaries representing table rows. Each
        dictionary will have a 'month' and a 'tornado' key as described in the
        module comment.

    Returns:
      A PCollection of dictionaries containing 'month' and 'tornado_count'
      keys. Months without tornadoes are skipped.
    """

    return (input_data
            | df.FlatMap(
                'months with tornadoes', lambda row: [(int(row['month']), 1)]
                if row['tornado'] else [])
            | df.CombinePerKey('monthly count', sum)
            | df.Map('format', lambda (k, v): {
                'month': k,
                'tornado_count': v
            }))
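A hypothetical local run of the workflow above, feeding it a small in-memory
table built with df.Create:

p = df.Pipeline('DirectPipelineRunner')
rows = p | df.Create('make rows', [
    {'month': '1', 'tornado': True},
    {'month': '1', 'tornado': False},
    {'month': '2', 'tornado': True},
])
counts = count_tornadoes(rows)  # e.g. {'month': 1, 'tornado_count': 1}
p.run()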
Example No. 9
    def test_deterministic_key(self):
        p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)
        lines = [
            'banana,fruit,3', 'kiwi,fruit,2', 'kiwi,fruit,2', 'zucchini,veg,3'
        ]

        # [START type_hints_deterministic_key]
        class Player(object):
            def __init__(self, team, name):
                self.team = team
                self.name = name

        class PlayerCoder(df.coders.Coder):
            def encode(self, player):
                return '%s:%s' % (player.team, player.name)

            def decode(self, s):
                return Player(*s.split(':'))

            def is_deterministic(self):
                return True

        df.coders.registry.register_coder(Player, PlayerCoder)

        def parse_player_and_score(csv):
            name, team, score = csv.split(',')
            return Player(team, name), int(score)

        totals = (lines
                  | df.Map(parse_player_and_score)
                  | df.CombinePerKey(sum).with_input_types(
                      df.typehints.Tuple[Player, int]))
        # [END type_hints_deterministic_key]

        self.assertEqual({('banana', 3), ('kiwi', 4), ('zucchini', 3)},
                         set(totals | df.Map(lambda (k, v): (k.name, v))))
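Returning True from is_deterministic is what makes this safe: grouping
operations such as CombinePerKey compare keys by their encoded bytes, so equal
Player objects must always encode to identical strings.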
import json
import logging

# The pre-Beam Dataflow SDK import convention used throughout these examples.
import google.cloud.dataflow as df

input_files = 'gs://silviuc-dataflow/demo/datain*'
output_prefix = 'gs://silviuc-dataflow/demo/results/out'

# job_name, project, staging_location and temp_location are assumed to be
# defined earlier in the original script.
args = [
    '--job_name', job_name, '--project', project, '--staging_location',
    staging_location, '--temp_location', temp_location, '--num_workers', '5',
    '--no_save_main_session', '--pipeline_type_check', '--runner',
    'BlockingDataflowPipelineRunner'
]
# ------------------------------ Remote execution -----------------------


def parse_record(e):
    r = json.loads(e)
    return r['ProductName'], r['Price']


p = df.Pipeline(argv=args)

(p
 | df.io.Read(df.io.TextFileSource(input_files))
 | df.Map(parse_record)
 | df.CombinePerKey(sum)
 | df.io.Write(df.io.TextFileSink(output_prefix)))

logging.getLogger().setLevel(logging.INFO)

# BlockingDataflowPipelineRunner submits the job to the Cloud Dataflow service
# and waits for it to finish.
p.run()
Example No. 11
def Count(label, pcoll):      # pylint: disable=invalid-name,unused-argument
    """Count as a decorated function."""
    return (
        pcoll
        | df.Map('Init', lambda v: (v, 1))
        | df.CombinePerKey(sum))
Example No. 12
# The enclosing class was dropped from this fragment; subclassing
# df.PTransform and overriding apply() is the assumed wrapper.
class Count(df.PTransform):
    """Count as a PTransform subclass."""

    def apply(self, pcoll):
        return (
            pcoll
            | df.Map('Init', lambda v: (v, 1))
            | df.CombinePerKey(sum))
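A hypothetical application of the composite transform above:

counts = words | Count()  # 'words' is any PCollection of hashable elements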