Example #1
def test_aggregation(self):

    mean = combiners.MeanCombineFn()
    mean.__name__ = 'mean'
    counter_types = [
        (sum, int, 6),
        (min, int, 0),
        (max, int, 3),
        (mean, int, 1),
        (sum, float, 6.0),
        (min, float, 0.0),
        (max, float, 3.0),
        (mean, float, 1.5),
        (any, int, True),
        (all, float, False),
    ]
    aggregators = [
        Aggregator('%s_%s' % (f.__name__, t.__name__), f, t)
        for f, t, _ in counter_types
    ]

    class UpdateAggregators(beam.DoFn):
        def process(self, context):
            for a in aggregators:
                context.aggregate_to(a, context.element)

    p = TestPipeline()
    p | beam.Create([0, 1, 2, 3]) | beam.ParDo(UpdateAggregators())  # pylint: disable=expression-not-assigned
    res = p.run()
    for (_, _, expected), a in zip(counter_types, aggregators):
        # dict.values() is not subscriptable on Python 3; materialize it first.
        actual = list(res.aggregated_values(a).values())[0]
        self.assertEqual(expected, actual)
        self.assertEqual(type(expected), type(actual))
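
Note: this test exercises Beam's old Aggregator API, which was later removed from the Python SDK. A rough modern equivalent can be sketched with the current Metrics API; the namespace and metric names below are illustrative, not from the original test.

import apache_beam as beam
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.testing.test_pipeline import TestPipeline

class UpdateMetrics(beam.DoFn):
    def __init__(self):
        # 'example' / 'total' are illustrative names, not from the test above.
        self.total = Metrics.counter('example', 'total')

    def process(self, element):
        self.total.inc(element)
        yield element

p = TestPipeline()
_ = p | beam.Create([0, 1, 2, 3]) | beam.ParDo(UpdateMetrics())
result = p.run()
result.wait_until_finish()
counters = result.metrics().query(
    MetricsFilter().with_name('total'))['counters']
print(counters[0].committed)  # 6 on the direct runner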
Example #2
def test_global_fanout(self):
    with TestPipeline() as p:
        result = (p
                  | beam.Create(range(100))
                  | beam.CombineGlobally(
                      combine.MeanCombineFn()).with_fanout(11))
        assert_that(result, equal_to([49.5]))
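
Beam also ships a higher-level wrapper around MeanCombineFn; a minimal sketch of the same computation using the built-in beam.combiners.Mean.Globally() transform:

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as p:
    # Mean.Globally wraps MeanCombineFn in a ready-made PTransform.
    result = p | beam.Create(range(100)) | beam.combiners.Mean.Globally()
    assert_that(result, equal_to([49.5]))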
Example #3
def test_hot_key_fanout(self):
    with TestPipeline() as p:
        result = (p
                  | beam.Create(
                      itertools.product(['hot', 'cold'], range(10)))
                  | beam.CombinePerKey(combine.MeanCombineFn())
                  .with_hot_key_fanout(lambda key: (key == 'hot') * 5))
        assert_that(result, equal_to([('hot', 4.5), ('cold', 4.5)]))
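
The fanout callable above maps 'hot' to a fanout of 5 and 'cold' to 0 (no fanout). with_hot_key_fanout also accepts a plain integer, applied uniformly to every key; a minimal self-contained sketch of that variant:

import itertools
import apache_beam as beam
from apache_beam.transforms import combiners as combine
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as p:
    result = (p
              | beam.Create(itertools.product(['hot', 'cold'], range(10)))
              | beam.CombinePerKey(combine.MeanCombineFn())
              .with_hot_key_fanout(5))  # same fanout for every key
    assert_that(result, equal_to([('hot', 4.5), ('cold', 4.5)]))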
Example #4
def test_tuple_combine_fn(self):
    with TestPipeline() as p:
        result = (p
                  | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
                  | beam.CombineGlobally(
                      combine.TupleCombineFn(max, combine.MeanCombineFn(),
                                             sum)).without_defaults())
        assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
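
TupleCombineFn applies its i-th combiner to the i-th slot of each input tuple: max over the first column, the mean over the second, sum over the third. A quick way to check the expected tuple outside a pipeline, assuming the CombineFn.apply convenience helper present in recent Beam versions:

from apache_beam.transforms import combiners as combine

fn = combine.TupleCombineFn(max, combine.MeanCombineFn(), sum)
print(fn.apply([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)]))
# -> ('c', 37.0, 99.0)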
Example #5
def test_tuple_combine_fn_without_defaults(self):
    with TestPipeline() as p:
        result = (p
                  | Create([1, 1, 2, 3])
                  | beam.CombineGlobally(
                      combine.TupleCombineFn(
                          min, combine.MeanCombineFn(),
                          max).with_common_input()).without_defaults())
        assert_that(result, equal_to([(1, 7.0 / 4, 3)]))
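
with_common_input feeds every input element to each sub-combiner instead of splitting a tuple across them, so min, mean, and max all see [1, 1, 2, 3]. The same check outside a pipeline, again assuming CombineFn.apply:

from apache_beam.transforms import combiners as combine

fn = combine.TupleCombineFn(
    min, combine.MeanCombineFn(), max).with_common_input()
print(fn.apply([1, 1, 2, 3]))
# -> (1, 1.75, 3)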
Example #6
def test_MeanCombineFn_combine(self):
  with TestPipeline() as p:
    input = (
        p
        | beam.Create([('a', 1), ('a', 1), ('a', 4), ('b', 1), ('b', 13)]))
    # The mean of all values regardless of key.
    global_mean = (
        input
        | beam.Values()
        | beam.CombineGlobally(combine.MeanCombineFn()))

    # The (key, mean) pairs for all keys.
    mean_per_key = (input | beam.CombinePerKey(combine.MeanCombineFn()))

    expected_mean_per_key = [('a', 2), ('b', 7)]
    assert_that(global_mean, equal_to([4]), label='global mean')
    assert_that(
        mean_per_key, equal_to(expected_mean_per_key), label='mean per key')
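
Under the hood, MeanCombineFn accumulates a (sum, count) pair (accumulator layout per the Beam source; worth re-checking against your SDK version). A minimal sketch of the CombineFn lifecycle that reproduces the 'a' mean above:

from apache_beam.transforms import combiners as combine

fn = combine.MeanCombineFn()
acc = fn.create_accumulator()      # (sum, count) == (0, 0)
for value in [1, 1, 4]:            # the 'a' values from the test above
    acc = fn.add_input(acc, value)
print(fn.extract_output(acc))      # (1 + 1 + 4) / 3 == 2.0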
Example #7
def test_basic_combiners_display_data(self):
  transform = beam.CombineGlobally(
      combine.TupleCombineFn(max, combine.MeanCombineFn(), sum))
  dd = DisplayData.create_from(transform)
  expected_items = [
      DisplayDataItemMatcher('combine_fn', combine.TupleCombineFn),
      DisplayDataItemMatcher('combiners', "['max', 'MeanCombineFn', 'sum']")
  ]
  hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #8
def test_MeanCombineFn_combine_empty(self):
    # Compute the mean over an empty PCollection. MeanCombineFn returns
    # float('NaN') for an empty input, which str() formats as 'nan'.

    with TestPipeline() as p:
        input = (p | beam.Create([]))

        # Compute the mean of all values in the PCollection, then format
        # it. Since the PCollection is empty, the mean is float('NaN'),
        # which is formatted as the string 'nan'.
        global_mean = (input
                       | beam.Values()
                       | beam.CombineGlobally(combine.MeanCombineFn())
                       | beam.Map(str))

        mean_per_key = (input
                        | beam.CombinePerKey(combine.MeanCombineFn()))

        # We can't compare one float('NaN') with another float('NaN'),
        # but we can compare one 'nan' string with another string.
        assert_that(global_mean, equal_to(['nan']), label='global mean')
        assert_that(mean_per_key, equal_to([]), label='mean per key')
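
The NaN comes straight from the CombineFn: extracting output from a fresh, empty accumulator yields float('NaN'), whose str() form is 'nan'. A two-line check, assuming the same MeanCombineFn behavior as in the Beam source:

from apache_beam.transforms import combiners as combine

fn = combine.MeanCombineFn()
print(fn.extract_output(fn.create_accumulator()))       # nan
print(str(fn.extract_output(fn.create_accumulator())))  # 'nan'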
Example #9
def test_hot_key_fanout_sharded(self):
  # Lots of elements with the same key with varying/no fanout.
  with TestPipeline() as p:
    elements = [(None, e) for e in range(1000)]
    random.shuffle(elements)
    shards = [p | "Shard%s" % shard >> beam.Create(elements[shard::20])
              for shard in range(20)]
    result = (
        shards
        | beam.Flatten()
        | beam.CombinePerKey(combine.MeanCombineFn()).with_hot_key_fanout(
            lambda key: random.randrange(0, 5)))
    assert_that(result, equal_to([(None, 499.5)]))
Example #10
def test_str(self):
    basic = Aggregator('a-name')
    self.assertEqual('<Aggregator a-name SumInt64Fn(int)>', str(basic))

    for_max = Aggregator('max-name', max)
    self.assertEqual('<Aggregator max-name MaxInt64Fn(int)>', str(for_max))

    for_float = Aggregator('f-name', sum, float)
    self.assertEqual('<Aggregator f-name SumFloatFn(float)>',
                     str(for_float))

    for_mean = Aggregator('m-name', combiners.MeanCombineFn(), float)
    self.assertEqual('<Aggregator m-name MeanFloatFn(float)>',
                     str(for_mean))
Example #11
def run():
    print("Town of Squirreliwink Bureau Of Tolls and Nuts Affair\n\n[PART-4]")

    # parse command line args:
    #   - parse both beam args and known script args
    parser = argparse.ArgumentParser(
        description="Town of Squirreliwink Bureau Of Tolls and Nuts Affair")
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        default='./data/input',
                        help='Input folder')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        default='./data/output',
                        help='Output folder')
    known_args, beam_args = parser.parse_known_args(sys.argv)

    # delete previous run files
    delete_files(os.path.join(known_args.output, "report*"))

    # construct pipeline and run
    options = PipelineOptions(beam_args)
    with beam.Pipeline(options=options) as pipeline:
        # create a PCollection of nut prices
        logger.info("creating nut prices side input")
        nut_prices = (pipeline
                      | beam.Create([('cornsilk', 2.0), ('slate_gray', 3.5),
                                     ('navajo_white', 7.0)]))

        # read toll records and pass in nut prices as a side input;
        # a (k, v) tuple PCollection can be passed as a {k: v} dict with beam.pvalue.AsDict()
        logger.info("reading toll records")
        records = (pipeline
                   | beam.io.ReadFromText(os.path.join(known_args.input,
                                                       'tollbooth_logs.csv'),
                                          skip_header_lines=1)
                   | beam.Map(parse_csv)
                   | beam.ParDo(PrepareAndAddTotalsWithSideInput(),
                                nut_prices=beam.pvalue.AsDict(nut_prices)))
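        # (For reference: beam.pvalue.AsDict materializes the (k, v)
        # PCollection as a plain {k: v} dict argument inside the DoFn,
        # so nut_prices can be indexed like a normal dict.)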

        # multi-key, multi-value combine using beam.combiners.TupleCombineFn():
        # first normalize rows into ((license_plate, month),
        # (1, total, cornsilk, slate_gray, navajo_white, total)) tuples,
        # then apply a tuple of combiners over the value tuple
        records = (records
                   | beam.Map(key_by_license_plate_month)
                   | beam.CombinePerKey(
                       beam.combiners.TupleCombineFn(combine.CountCombineFn(),
                                                     sum, sum, sum, sum,
                                                     combine.MeanCombineFn())))

        # read the Squirreliwink population file;
        # the file consists of newline-delimited JSON rows, each read as a dict
        logger.info("reading Squirreliwink's residents file")
        residents = (pipeline
                     | "residents" >> beam.io.ReadFromText(
                         os.path.join(known_args.input,
                                      'squirreliwink_population.json'))
                     | beam.Map(lambda line: json.loads(line)))

        # key residents by their license plate
        logger.info("key residents by license_plate")
        residents_by_plate = (
            residents | beam.Map(lambda element: (element['car'], element)))

        # lookup residents by their license plate using SideInputs
        records = (
            records
            | beam.Map(
                lambda e, lookup: (
                    # add family_name and address from resident lookup to the keys tuple.
                    # Remember e[0][0] (first value in the keys tuple) should contain our license_plate info
                    (e[0] + tuple(v for k, v in lookup[e[0][0]].items() if k in
                                  ('family_name', 'address'))),
                    e[1]),
                lookup=beam.pvalue.AsDict(residents_by_plate)
            )  # pass in residents info as a SideInput
        )

        # (records | beam.Map(print))

        # output to a newline-delimited JSON file
        logger.info("writing records to a newline-delimited JSON file")
        (records
         | beam.Map(
             lambda e: e[0] + e[1]
         )  # flatten ((keys), (values)) tuple into a single tuple (keys + values)
         | beam.Map(lambda t: dict(
             zip(  # stitch up the results as a dict, adding back column names
                 ('license_plate', 'month', 'family_name', 'address',
                  'visit_count', 'total', 'cornsilk', 'slate_gray',
                  'navajo_white', 'avg_total'), t)))
         | beam.Map(lambda d: json.dumps(d, ensure_ascii=False)
                    )  # serialize each dict as a JSON line
         | beam.io.WriteToText(os.path.join(known_args.output, "report"),
                               file_name_suffix='.json'))
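
For reference, the per-key combine in the middle of this pipeline can be sanity-checked outside Beam. A minimal sketch with made-up value tuples (the real ones come from key_by_license_plate_month, defined elsewhere in the script), assuming the CombineFn.apply convenience helper:

import apache_beam as beam
from apache_beam.transforms import combiners as combine

fn = beam.combiners.TupleCombineFn(
    combine.CountCombineFn(), sum, sum, sum, sum, combine.MeanCombineFn())
values = [  # (1, total, cornsilk, slate_gray, navajo_white, total)
    (1, 10.0, 2.0, 3.5, 4.5, 10.0),
    (1, 6.0, 0.0, 3.5, 2.5, 6.0),
]
print(fn.apply(values))
# -> (2, 16.0, 2.0, 7.0, 7.0, 8.0): visit_count, total, per-nut sums, avg_total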