Example #1
  def test_rewindow_regroup(self):
    with TestPipeline() as p:
      grouped = (p
                 | Create(range(5))
                 | Map(lambda t: TimestampedValue(('key', t), t))
                 | 'window' >> WindowInto(FixedWindows(5, offset=3))
                 | GroupByKey()
                 | MapTuple(lambda k, vs: (k, sorted(vs))))
      # Both of these group-and-ungroup sequences should be idempotent.
      regrouped1 = (grouped
                    | 'w1' >> WindowInto(FixedWindows(5, offset=3))
                    | 'g1' >> GroupByKey()
                    | FlatMapTuple(lambda k, vs: [(k, v) for v in vs]))
      regrouped2 = (grouped
                    | FlatMapTuple(lambda k, vs: [(k, v) for v in vs])
                    | 'w2' >> WindowInto(FixedWindows(5, offset=3))
                    | 'g2' >> GroupByKey()
                    | MapTuple(lambda k, vs: (k, sorted(vs))))
      with_windows = Map(lambda e, w=beam.DoFn.WindowParam: (e, w))
      expected = [(('key', [0, 1, 2]), IntervalWindow(-2, 3)),
                  (('key', [3, 4]), IntervalWindow(3, 8))]

      assert_that(grouped | 'ww' >> with_windows, equal_to(expected))
      assert_that(
          regrouped1 | 'ww1' >> with_windows, equal_to(expected), label='r1')
      assert_that(
          regrouped2 | 'ww2' >> with_windows, equal_to(expected), label='r2')
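For reference, the IntervalWindow boundaries asserted above follow directly from the fixed-window arithmetic. The helper below is an illustrative re-derivation of that assignment (plain Python, not Beam's implementation):

def fixed_window(t, size=5, offset=3):
    # Start of the window containing timestamp t, per the FixedWindows model:
    # windows are [start, start + size) with starts congruent to offset mod size.
    start = t - (t - offset) % size
    return (start, start + size)

# Timestamps 0..2 land in (-2, 3) and 3..4 in (3, 8), matching `expected` above.
assert [fixed_window(t) for t in range(5)] == [(-2, 3)] * 3 + [(3, 8)] * 2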
Example #2
  def test_window_assignment_through_multiple_gbk_idempotency(self):
    with TestPipeline() as p:
      pcoll = self.timestamped_key_values(p, 'key', 0, 2, 4)
      result = (pcoll
                | 'window' >> WindowInto(FixedWindows(2))
                | 'gbk' >> GroupByKey()
                | 'same window' >> WindowInto(FixedWindows(2))
                | 'another gbk' >> GroupByKey()
                | 'same window again' >> WindowInto(FixedWindows(2))
                | 'gbk again' >> GroupByKey())

      assert_that(result, equal_to([('key', [[[0]]]),
                                    ('key', [[[2]]]),
                                    ('key', [[[4]]])]))
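Each GroupByKey wraps the values it collects in a new iterable, so running the data through three of them nests the single value per window three levels deep ([[[0]]] rather than [0]). A minimal standalone sketch of that nesting, assuming the usual TestPipeline/DirectRunner semantics:

from apache_beam import Create, GroupByKey
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

def test_nested_grouping_sketch():
    with TestPipeline() as p:
        nested = (p
                  | Create([('key', 0)])
                  | 'g1' >> GroupByKey()   # -> ('key', [0])
                  | 'g2' >> GroupByKey())  # -> ('key', [[0]])
        assert_that(nested, equal_to([('key', [[0]])]))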
Example #3
    def expand(self, pcoll):
        # This composite transform involves the following steps:
        #   1. Create a singleton of the user-provided `query` and apply a ``ParDo``
        #   that splits the query into `num_splits` sub-queries, assigning each
        #   split query a unique `int` key. The resulting output is of type
        #   ``PCollection[(int, Query)]``.
        #
        #   If the value of `num_splits` is less than or equal to 0, then the
        #   number of splits will be computed dynamically based on the size of the
        #   data for the `query`.
        #
        #   2. The resulting ``PCollection`` is sharded using a ``GroupByKey``
        #   operation. The queries are extracted from the (int, Iterable[Query])
        #   pairs and flattened to output a ``PCollection[Query]``. (A generic
        #   version of this reshard pattern is sketched after this example.)
        #
        #   3. In the third step, a ``ParDo`` reads entities for each query and
        #   outputs a ``PCollection[Entity]``.

        queries = (pcoll.pipeline
                   | 'UserQuery' >> Create([self._query])
                   | 'SplitQuery' >> ParDo(
                       ReadFromDatastore.SplitQueryFn(
                           self._project, self._query,
                           self._datastore_namespace, self._num_splits)))

        sharded_queries = (queries
                           | GroupByKey()
                           | Values()
                           | 'Flatten' >> FlatMap(lambda x: x))

        entities = sharded_queries | 'Read' >> ParDo(
            ReadFromDatastore.ReadFn(self._project, self._datastore_namespace))
        return entities
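Steps 1 and 2 above boil down to a generic "assign keys, group, flatten" reshard. A self-contained sketch of that pattern with plain Beam transforms (the Reshard name, shard count and data are illustrative, not part of the Datastore connector):

import random

import apache_beam as beam

class Reshard(beam.PTransform):
    """Redistributes elements by grouping them under random shard keys."""
    def __init__(self, num_shards=4):
        self._num_shards = num_shards

    def expand(self, pcoll):
        num_shards = self._num_shards
        return (pcoll
                | 'AssignShard' >> beam.Map(
                    lambda x: (random.randint(0, num_shards - 1), x))
                | 'Group' >> beam.GroupByKey()
                | 'DropKeys' >> beam.Values()
                | 'Flatten' >> beam.FlatMap(lambda values: values))

with beam.Pipeline() as p:
    _ = (p | beam.Create(range(20)) | Reshard(num_shards=4) | beam.Map(print))

In current Beam versions, beam.Reshuffle() provides this kind of redistribution directly.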
Example #4
 def test_timestamped_value(self):
   with TestPipeline() as p:
     result = (p
               | 'start' >> Create([(k, k) for k in range(10)])
               | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
               | 'w' >> WindowInto(FixedWindows(5))
               | Map(lambda v: ('key', v))
               | GroupByKey())
     assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                   ('key', [5, 6, 7, 8, 9])]))
Example #5
 def test_sliding_windows(self):
   with TestPipeline() as p:
     pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
     result = (pcoll
               | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
               | GroupByKey()
               | reify_windows)
     expected = [('key @ [-2.0, 2.0)', [1]),
                 ('key @ [0.0, 4.0)', [1, 2, 3]),
                 ('key @ [2.0, 6.0)', [2, 3])]
     assert_that(result, equal_to(expected))
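The expected intervals follow from how sliding-window assignment works: an element at timestamp t belongs to every window of length `size` whose start lies in (t - size, t], stepping back by `period`. An illustrative re-derivation in plain Python (not Beam's implementation):

def sliding_window_bounds(t, size=4, period=2, offset=0):
    start = t - (t - offset) % period   # latest window start containing t
    bounds = []
    while start > t - size:
        bounds.append((start, start + size))
        start -= period
    return bounds

# Matches the windows asserted above for timestamps 1, 2 and 3.
assert sliding_window_bounds(1) == [(0, 4), (-2, 2)]
assert sliding_window_bounds(2) == [(2, 6), (0, 4)]
assert sliding_window_bounds(3) == [(2, 6), (0, 4)]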
Example #6
    def test_window_assignment_idempotency(self):
        with TestPipeline() as p:
            pcoll = self.timestamped_key_values(p, 'key', 0, 2, 4)
            result = (pcoll
                      | 'window' >> WindowInto(FixedWindows(2))
                      | 'same window' >> WindowInto(FixedWindows(2))
                      | 'same window again' >> WindowInto(FixedWindows(2))
                      | GroupByKey())

            assert_that(result,
                        equal_to([('key', [0]), ('key', [2]), ('key', [4])]))
Example #7
 def test_sessions(self):
     with TestPipeline() as p:
         pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
         result = (pcoll
                   | 'w' >> WindowInto(Sessions(10))
                   | GroupByKey()
                   | sort_values
                   | reify_windows)
         expected = [('key @ [1.0, 13.0)', [1, 2, 3]),
                     ('key @ [20.0, 45.0)', [20, 27, 35])]
         assert_that(result, equal_to(expected))
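Session windows start as [t, t + gap) for each element and merge while they overlap, which is how 1, 2, 3 collapse into [1.0, 13.0) and 20, 27, 35 into [20.0, 45.0). A small standalone re-derivation (illustrative arithmetic, not Beam's merging code):

def merge_sessions(timestamps, gap=10):
    proto_windows = sorted((t, t + gap) for t in timestamps)
    merged = [proto_windows[0]]
    for start, end in proto_windows[1:]:
        last_start, last_end = merged[-1]
        if start < last_end:                      # overlapping sessions merge
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))
    return merged

assert merge_sessions([1, 2, 3]) == [(1, 13)]
assert merge_sessions([20, 35, 27]) == [(20, 45)]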
Example #8
 def test_custom_windows(self):
   with TestPipeline() as p:
     pcoll = self.timestamped_key_values(p, 'key', 0, 1, 2, 3, 4, 5, 6)
     # pylint: disable=abstract-class-instantiated
     result = (pcoll
               | 'custom window' >> WindowInto(TestCustomWindows())
               | GroupByKey()
               | 'sort values' >> MapTuple(lambda k, vs: (k, sorted(vs))))
     assert_that(result, equal_to([('key', [0, 1, 2]),
                                   ('key', [3, 4]),
                                   ('key', [5]),
                                   ('key', [6])]))
Example #9
 def test_rewindow(self):
   with TestPipeline() as p:
     result = (p
               | Create([(k, k) for k in range(10)])
               | Map(lambda x_t1: TimestampedValue(x_t1[0], x_t1[1]))
               | 'window' >> WindowInto(SlidingWindows(period=2, size=6))
               # Per the model, each element is now duplicated across
               # three windows. Rewindowing must preserve this duplication.
               | 'rewindow' >> WindowInto(FixedWindows(5))
               | 'rewindow2' >> WindowInto(FixedWindows(5))
               | Map(lambda v: ('key', v))
               | GroupByKey())
     assert_that(result, equal_to([('key', sorted([0, 1, 2, 3, 4] * 3)),
                                   ('key', sorted([5, 6, 7, 8, 9] * 3))]))
Example #10
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--startDate',
                        dest='startDate',
                        type=parse_string_date,
                        default=(datetime.now() - timedelta(days=1)).replace(
                            hour=0,
                            minute=0,
                            second=0,
                            microsecond=0),
                        help='Start date.')
    parser.add_argument('--endDate',
                        dest='endDate',
                        type=parse_string_date,
                        default=datetime.now().replace(
                            hour=23,
                            minute=59,
                            second=59,
                            microsecond=999999),
                        help='End date.')

    known_args, pipeline_args = parser.parse_known_args(argv)


    time_boundaries_list = collection_range_timestamps(startDate=known_args.startDate,
                                                       endDate=known_args.endDate,
                                                       delta=timedelta(hours=1),
                                                       return_as_list=True)

    # time_boundaries_list = collection_range_timestamps(startDate=datetime(2019, 1, 29, 0, 0, 0),
    #                                                   endDate=datetime(2019, 1, 29, 1, 0, 0),
    #                                                   delta=timedelta(hours=1),
    #                                                   return_as_list=True)


    pg = PostgresDb()

    # prepare pipelines and collections
    pipeline_options = PipelineOptions(pipeline_args)

    heroes = get_heroes()

    # process the source data, one bulk of time boundaries per pipeline run
    for time_boundaries_bulk_list in list_chunks(time_boundaries_list, 10):
        with beam.Pipeline(options=pipeline_options) as p:
            t_boundaries_sources = (p | 'next_time_boundaries_bulk' >> beam.Create(time_boundaries_bulk_list))

            (t_boundaries_sources
             | 'sql_prepare_matches_players' >> beam.ParDo(DoFnBuidQueryMatchesPlayers())
             | 'sql_execute_matches_players' >> beam.ParDo(DoFnExecuteSql(table_tag='matches_players', pg_db=pg))
             | 'matches_group_by_match_id' >> GroupByKey()
             | 'enrich_and_split_and_save' >> beam.ParDo(DoFnEnrichSplitByHeroes())
             | 'matches_group_by_hero_id' >> GroupByKey()
             | 'update_hero_impact' >> beam.ParDo(DoFnDBUpdateHeroImpact(pg_db=pg))
            )

    # aggregations
    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'last_patches' >> collection_patches(pg_db=pg)
         | 'kv_patches_heroes' >> beam.ParDo(DoFnKVHeroesPatches(heroes=heroes))
         | 'patches_group_by_hero_id' >> GroupByKey()
         | 'sql_heroes_patches' >> beam.ParDo(DoFnHeroesPatchData(pg_db=pg))
         | 'heroes_patches_group_by_key' >> GroupByKey()
         | 'calc_total_impact' >> beam.ParDo(DoFnCalcTotalImpact(pg_db=pg))
         | 'aggregate_tuples' >> GroupByKey()
         | 'update_hero_patch_impact' >> beam.ParDo(DoFnDBUpdateHeroPatchImpact(pg_db=pg))
        )
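The pipeline above alternates ParDo and GroupByKey, which only works because every stage feeding a GroupByKey emits (key, value) tuples. A minimal, self-contained sketch of that handshake (the DoFn, field names and data are illustrative, not the project's actual classes):

import apache_beam as beam

class KeyByMatchId(beam.DoFn):
    # Stand-in for a stage like DoFnExecuteSql: emit (key, value) pairs so the
    # following GroupByKey can collect all rows of a match together.
    def process(self, row):
        yield (row['match_id'], row)

with beam.Pipeline() as p:
    (p
     | beam.Create([{'match_id': 1, 'hero_id': 7},
                    {'match_id': 1, 'hero_id': 8}])
     | beam.ParDo(KeyByMatchId())
     | beam.GroupByKey()                 # -> (1, [<row>, <row>])
     | beam.MapTuple(lambda match_id, rows: (match_id, len(list(rows))))
     | beam.Map(print))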
Example #11
def run(argv=None):
    # TODO: DROP indexes on purch log in DB

    parser = argparse.ArgumentParser()
    parser.add_argument('--startDate',
                        dest='startDate',
                        type=parse_string_date,
                        default=(datetime.now() - timedelta(days=30)).replace(
                            hour=0, minute=0, second=0, microsecond=0),
                        help='Start date.')
    parser.add_argument('--endDate',
                        dest='endDate',
                        type=parse_string_date,
                        default=datetime.now().replace(hour=23,
                                                       minute=59,
                                                       second=59,
                                                       microsecond=999999),
                        help='End date.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    pg = PostgresDb()
    pipeline_options = PipelineOptions(pipeline_args)

    # clear previously dumped rows from the staging table
    pg.truncate_table_by_delete(
        table_name='stage.tmp_items_mean_purchase_time')

    time_boundaries_list = collection_range_timestamps(
        startDate=known_args.startDate,
        endDate=known_args.endDate,
        delta=timedelta(hours=1),
        return_as_list=True)

    #time_boundaries_list = collection_range_timestamps(startDate=datetime(2019, 1, 29, 0, 0, 0),
    #                                                   endDate=datetime(2019, 1, 29, 1, 0, 0),
    #                                                   delta=timedelta(hours=1),
    #                                                   return_as_list=True)

    # process the source data, one bulk of time boundaries per pipeline run
    for time_boundaries_bulk_list in list_chunks(time_boundaries_list, 15):
        with beam.Pipeline(options=pipeline_options) as p:
            t_boundaries_sources = (p
                                    | 'next_time_boundaries_bulk' >>
                                    beam.Create(time_boundaries_bulk_list))

            purch_log_data = (
                t_boundaries_sources
                | 'sql_prepare_purchase_log' >> beam.ParDo(DoFnQueryPurchLog())
                | 'sql_execute_purchase_log' >> beam.ParDo(
                    DoFnExecuteSql(table_tag='purchase_log', pg_db=pg)))

            matches_players_data = (
                t_boundaries_sources
                | 'sql_prepare_matches_players' >> beam.ParDo(
                    DoFnQueryMatchesPlayers())
                | 'sql_execute_matches_players' >> beam.ParDo(
                    DoFnExecuteSql(table_tag='matches_players', pg_db=pg)))

            ({
                'purch_log': purch_log_data,
                'matches_players': matches_players_data
            }
             | 'group_by_match_id_player_num' >> beam.CoGroupByKey()
             | 'retrieve_heroes_items_purch_times' >> beam.ParDo(
                 DoFnRetrieveHeroesItemsPurchTime())
             | 'group_by_heroes' >> GroupByKey()
             | 'dump_stat_source' >> beam.ParDo(DoFnDumpStatSource(pg_db=pg)))

    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'heroes_collection' >> collection_heroes()
         | 'calculate_purch_statistics' >> beam.ParDo(
             DoFnCalculatePurchStatistics(pg_db=pg)))
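The dict of PCollections fed into beam.CoGroupByKey() above joins the two keyed streams: each output element is (key, {'purch_log': [...], 'matches_players': [...]}), with one group of values per tag. A minimal standalone sketch of that join (keys, tags and data here are illustrative):

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as p:
    purch_log = p | 'purch' >> beam.Create(
        [((1, 0), 'blink_dagger'), ((1, 0), 'boots')])
    matches_players = p | 'players' >> beam.Create([((1, 0), 'hero_7')])
    joined = ({'purch_log': purch_log, 'matches_players': matches_players}
              | beam.CoGroupByKey()
              | beam.MapTuple(lambda key, groups: (
                  key,
                  sorted(groups['purch_log']),
                  sorted(groups['matches_players']))))
    assert_that(joined, equal_to(
        [((1, 0), ['blink_dagger', 'boots'], ['hero_7'])]))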