def test_rewindow_regroup(self):
    """Re-windowing and re-grouping grouped data must be idempotent.

    The values 0..4 are keyed by 'key', timestamped with their own value,
    put into fixed windows of size 5 with offset 3, grouped and sorted.
    Two follow-up branches repeat the windowing and grouping: one regroups
    the grouped elements directly and unwraps the extra nesting, the other
    ungroups first and then groups again. All three collections are
    asserted to hold the same values in the same windows.
    """
    with TestPipeline() as pipeline:
        timestamped = (
            pipeline
            | Create(range(5))
            | Map(lambda t: TimestampedValue(('key', t), t)))
        grouped = (
            timestamped
            | 'window' >> WindowInto(FixedWindows(5, offset=3))
            | GroupByKey()
            | MapTuple(lambda key, values: (key, sorted(values))))

        # Branch 1: group the already grouped elements again, then strip
        # the additional list level introduced by the second GroupByKey.
        regrouped1 = (
            grouped
            | 'w1' >> WindowInto(FixedWindows(5, offset=3))
            | 'g1' >> GroupByKey()
            | FlatMapTuple(
                lambda key, groups: [(key, group) for group in groups]))

        # Branch 2: explode back into single key/value pairs, then window,
        # group and sort exactly as was done for `grouped`.
        regrouped2 = (
            grouped
            | FlatMapTuple(
                lambda key, values: [(key, value) for value in values])
            | 'w2' >> WindowInto(FixedWindows(5, offset=3))
            | 'g2' >> GroupByKey()
            | MapTuple(lambda key, values: (key, sorted(values))))

        # Pairs every element with the window it was assigned to.
        with_windows = Map(
            lambda element, window=beam.DoFn.WindowParam: (element, window))
        expected = [
            (('key', [0, 1, 2]), IntervalWindow(-2, 3)),
            (('key', [3, 4]), IntervalWindow(3, 8)),
        ]

        assert_that(grouped | 'ww' >> with_windows, equal_to(expected))
        assert_that(
            regrouped1 | 'ww1' >> with_windows,
            equal_to(expected),
            label='r1')
        assert_that(
            regrouped2 | 'ww2' >> with_windows,
            equal_to(expected),
            label='r2')
def test_window_assignment_through_multiple_gbk_idempotency(self):
    """Repeating the same window assignment between GroupByKeys is stable.

    The input is windowed into FixedWindows(2) and grouped three times in
    a row. Each value is expected to remain alone in its group, wrapped in
    one additional list per GroupByKey.
    """
    with TestPipeline() as pipeline:
        keyed_values = self.timestamped_key_values(pipeline, 'key', 0, 2, 4)

        grouped_once = (
            keyed_values
            | 'window' >> WindowInto(FixedWindows(2))
            | 'gbk' >> GroupByKey())
        grouped_twice = (
            grouped_once
            | 'same window' >> WindowInto(FixedWindows(2))
            | 'another gbk' >> GroupByKey())
        result = (
            grouped_twice
            | 'same window again' >> WindowInto(FixedWindows(2))
            | 'gbk again' >> GroupByKey())

        expected = [
            ('key', [[[0]]]),
            ('key', [[[2]]]),
            ('key', [[[4]]]),
        ]
        assert_that(result, equal_to(expected))
def expand(self, pcoll):
    """Expand this composite transform into its three read stages.

    1. A singleton containing the user-provided `query` is created and a
       ``ParDo`` splits it into `num_splits` queries, assigning each split
       query a unique `int` key. The output is of the type
       ``PCollection[(int, Query)]``.

       If `num_splits` is less than or equal to 0, the number of splits is
       computed dynamically based on the size of the data for the `query`.

    2. The keyed queries are sharded with a ``GroupByKey``. The queries are
       then extracted from the ``(int, Iterable[Query])`` pairs and
       flattened into a ``PCollection[Query]``.

    3. A final ``ParDo`` reads the entities for each query and outputs a
       ``PCollection[Entity]``.

    Args:
        pcoll: Input whose ``pipeline`` attribute is used as the root the
            query singleton is created on.

    Returns:
        The collection produced by the 'Read' step.
    """
    # Step 1: wrap the query in a singleton and split it into keyed parts.
    user_query = pcoll.pipeline | 'UserQuery' >> Create([self._query])
    split_fn = ReadFromDatastore.SplitQueryFn(
        self._project,
        self._query,
        self._datastore_namespace,
        self._num_splits)
    queries = user_query | 'SplitQuery' >> ParDo(split_fn)

    # Step 2: shard by key, drop the keys and flatten the grouped queries.
    grouped_queries = queries | GroupByKey()
    query_groups = grouped_queries | Values()
    sharded_queries = query_groups | 'Flatten' >> FlatMap(
        lambda group: group)

    # Step 3: read the entities for every individual query.
    read_fn = ReadFromDatastore.ReadFn(
        self._project, self._datastore_namespace)
    return sharded_queries | 'Read' >> ParDo(read_fn)
def test_timestamped_value(self):
    """TimestampedValue timestamps drive the fixed-window assignment.

    Ten (value, timestamp) pairs with value == timestamp are turned into
    TimestampedValues, windowed into FixedWindows(5), keyed by 'key' and
    grouped. The expected result is one group for 0..4 and one for 5..9.
    """
    with TestPipeline() as pipeline:
        pairs = pipeline | 'start' >> Create([(n, n) for n in range(10)])
        stamped = pairs | Map(
            lambda pair: TimestampedValue(pair[0], pair[1]))
        windowed = stamped | 'w' >> WindowInto(FixedWindows(5))
        keyed = windowed | Map(lambda value: ('key', value))
        result = keyed | GroupByKey()

        expected = [
            ('key', [0, 1, 2, 3, 4]),
            ('key', [5, 6, 7, 8, 9]),
        ]
        assert_that(result, equal_to(expected))
def test_sliding_windows(self):
    """Sliding windows place each element in every overlapping window.

    Elements built by ``self.timestamped_key_values`` for key 'key' and
    the values 1, 2, 3 are windowed with SlidingWindows(period=2, size=4),
    grouped, and rendered through ``reify_windows``. The assertion expects
    the windows [-2, 2), [0, 4) and [2, 6).
    """
    with TestPipeline() as pipeline:
        keyed_values = self.timestamped_key_values(pipeline, 'key', 1, 2, 3)
        windowed = keyed_values | 'w' >> WindowInto(
            SlidingWindows(period=2, size=4))
        result = windowed | GroupByKey() | reify_windows

        expected = [
            ('key @ [-2.0, 2.0)', [1]),
            ('key @ [0.0, 4.0)', [1, 2, 3]),
            ('key @ [2.0, 6.0)', [2, 3]),
        ]
        assert_that(result, equal_to(expected))
def test_window_assignment_idempotency(self):
    """Applying the same WindowInto repeatedly does not change grouping.

    The input is assigned to FixedWindows(2) three times in a row before
    a single GroupByKey. Each of the values 0, 2 and 4 is expected to end
    up in a group of its own.
    """
    with TestPipeline() as pipeline:
        keyed_values = self.timestamped_key_values(pipeline, 'key', 0, 2, 4)

        windowed = keyed_values | 'window' >> WindowInto(FixedWindows(2))
        rewindowed = windowed | 'same window' >> WindowInto(FixedWindows(2))
        rewindowed_again = (
            rewindowed
            | 'same window again' >> WindowInto(FixedWindows(2)))
        result = rewindowed_again | GroupByKey()

        expected = [('key', [0]), ('key', [2]), ('key', [4])]
        assert_that(result, equal_to(expected))
def test_sessions(self):
    """Session windowing merges nearby elements into a single window.

    Elements for the values 1, 2, 3, 20, 35 and 27 are windowed with
    Sessions(10), grouped, sorted via ``sort_values`` and rendered through
    ``reify_windows``. Two sessions are expected: [1, 13) and [20, 45).
    """
    with TestPipeline() as pipeline:
        keyed_values = self.timestamped_key_values(
            pipeline, 'key', 1, 2, 3, 20, 35, 27)
        grouped = (
            keyed_values
            | 'w' >> WindowInto(Sessions(10))
            | GroupByKey())
        result = grouped | sort_values | reify_windows

        expected = [
            ('key @ [1.0, 13.0)', [1, 2, 3]),
            ('key @ [20.0, 45.0)', [20, 27, 35]),
        ]
        assert_that(result, equal_to(expected))
def test_custom_windows(self):
    """A user-defined WindowFn can be used to window and group elements.

    Elements for the values 0..6 are windowed with ``TestCustomWindows``,
    grouped, and each group is sorted. The assertion expects the groups
    [0, 1, 2], [3, 4], [5] and [6].
    """
    with TestPipeline() as pipeline:
        keyed_values = self.timestamped_key_values(
            pipeline, 'key', 0, 1, 2, 3, 4, 5, 6)

        # pylint: disable=abstract-class-instantiated
        windowed = keyed_values | 'custom window' >> WindowInto(
            TestCustomWindows())
        result = (
            windowed
            | GroupByKey()
            | 'sort values' >> MapTuple(
                lambda key, values: (key, sorted(values))))

        expected = [
            ('key', [0, 1, 2]),
            ('key', [3, 4]),
            ('key', [5]),
            ('key', [6]),
        ]
        assert_that(result, equal_to(expected))
def test_rewindow(self):
    """Re-windowing keeps the duplicates created by sliding windows.

    Ten values timestamped with their own value are first assigned to
    SlidingWindows(period=2, size=6) and then re-windowed twice into
    FixedWindows(5). After keying and grouping, every value is expected
    to appear three times in its fixed window.
    """
    with TestPipeline() as pipeline:
        stamped = (
            pipeline
            | Create([(n, n) for n in range(10)])
            | Map(lambda pair: TimestampedValue(pair[0], pair[1])))

        # Per the model, each element is now duplicated across three
        # windows. Rewindowing must preserve this duplication.
        slid = stamped | 'window' >> WindowInto(
            SlidingWindows(period=2, size=6))
        rewindowed = (
            slid
            | 'rewindow' >> WindowInto(FixedWindows(5))
            | 'rewindow2' >> WindowInto(FixedWindows(5)))

        result = (
            rewindowed
            | Map(lambda value: ('key', value))
            | GroupByKey())

        first_window = sorted([0, 1, 2, 3, 4] * 3)
        second_window = sorted([5, 6, 7, 8, 9] * 3)
        assert_that(
            result,
            equal_to([('key', first_window), ('key', second_window)]))
def run(argv=None):
    """Run the hero-impact pipelines for the requested date range.

    The date range is turned into hourly time boundaries, which are
    processed in bulks (chunk argument 10), one Beam pipeline per bulk.
    Once every bulk has run, a single aggregation pipeline is executed.

    Args:
        argv: Command-line arguments. ``--startDate`` (default: midnight
            of the previous day) and ``--endDate`` (default: the last
            microsecond of the current day) are consumed here; all
            remaining arguments are forwarded to Beam as pipeline
            options. ``None`` lets argparse fall back to ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--startDate',
        dest='startDate',
        type=parse_string_date,
        default=(datetime.now() - timedelta(days=1)).replace(
            hour=0, minute=0, second=0, microsecond=0),
        help='Start date.')
    parser.add_argument(
        '--endDate',
        dest='endDate',
        type=parse_string_date,
        default=datetime.now().replace(
            hour=23, minute=59, second=59, microsecond=999999),
        help='End date.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    time_boundaries_list = collection_range_timestamps(
        startDate=known_args.startDate,
        endDate=known_args.endDate,
        delta=timedelta(hours=1),
        return_as_list=True)

    pg = PostgresDb()

    # Prepare pipelines and collections.
    # The leftover arguments must be passed positionally: the first
    # parameter of PipelineOptions is `flags`. Passed as the keyword
    # `pipeline_args`, they were treated as an option override and Beam
    # parsed sys.argv instead, ignoring an explicitly supplied `argv`.
    pipeline_options = PipelineOptions(pipeline_args)
    heroes = get_heroes()

    # Process the source data, one pipeline per bulk of time boundaries.
    for time_boundaries_bulk_list in list_chunks(time_boundaries_list, 10):
        with beam.Pipeline(options=pipeline_options) as p:
            t_boundaries_sources = (
                p
                | 'next_time_boundaries_bulk' >> beam.Create(
                    time_boundaries_bulk_list))

            (t_boundaries_sources
             | 'sql_prepare_matches_players' >> beam.ParDo(
                 DoFnBuidQueryMatchesPlayers())
             | 'sql_execute_matches_players' >> beam.ParDo(
                 DoFnExecuteSql(table_tag='matches_players', pg_db=pg))
             | 'matches_group_by_match_id' >> GroupByKey()
             | 'enrich_and_split_and_save' >> beam.ParDo(
                 DoFnEnrichSplitByHeroes())
             | 'matches_group_by_hero_id' >> GroupByKey()
             | 'update_hero_impact' >> beam.ParDo(
                 DoFnDBUpdateHeroImpact(pg_db=pg)))

    # Aggregations over the data written by the bulk pipelines above.
    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'last_patches' >> collection_patches(pg_db=pg)
         | 'kv_patches_heroes' >> beam.ParDo(
             DoFnKVHeroesPatches(heroes=heroes))
         | 'patches_group_by_hero_id' >> GroupByKey()
         | 'sql_heroes_patches' >> beam.ParDo(
             DoFnHeroesPatchData(pg_db=pg))
         | 'heroes_patches_group_by_key' >> GroupByKey()
         | 'calc_total_impact' >> beam.ParDo(
             DoFnCalcTotalImpact(pg_db=pg))
         | 'aggregate_tuples' >> GroupByKey()
         | 'update_hero_patch_impact' >> beam.ParDo(
             DoFnDBUpdateHeroPatchImpact(pg_db=pg)))
def run(argv=None):
    """Run the item purchase-time pipelines for the requested date range.

    The staging table ``stage.tmp_items_mean_purchase_time`` is emptied
    first. The date range is then turned into hourly time boundaries that
    are processed in bulks (chunk argument 15), one Beam pipeline per
    bulk. Afterwards a final pipeline runs over the heroes collection.

    Args:
        argv: Command-line arguments. ``--startDate`` (default: midnight
            30 days ago) and ``--endDate`` (default: the last microsecond
            of the current day) are consumed here; the remaining
            arguments are handed to Beam as pipeline options.
    """
    # TODO: DROP indexes on purch log in DB
    default_start = (datetime.now() - timedelta(days=30)).replace(
        hour=0, minute=0, second=0, microsecond=0)
    default_end = datetime.now().replace(
        hour=23, minute=59, second=59, microsecond=999999)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--startDate',
        dest='startDate',
        type=parse_string_date,
        default=default_start,
        help='Start date.')
    parser.add_argument(
        '--endDate',
        dest='endDate',
        type=parse_string_date,
        default=default_end,
        help='End date.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pg = PostgresDb()
    pipeline_options = PipelineOptions(pipeline_args)

    # Clean dumped files.
    pg.truncate_table_by_delete(
        table_name='stage.tmp_items_mean_purchase_time')

    # For a quick manual run the range can be narrowed, e.g.
    # startDate=datetime(2019, 1, 29, 0, 0, 0) and
    # endDate=datetime(2019, 1, 29, 1, 0, 0).
    time_boundaries_list = collection_range_timestamps(
        startDate=known_args.startDate,
        endDate=known_args.endDate,
        delta=timedelta(hours=1),
        return_as_list=True)

    # Process the source data, one pipeline per bulk of time boundaries.
    for time_boundaries_bulk_list in list_chunks(time_boundaries_list, 15):
        with beam.Pipeline(options=pipeline_options) as bulk_pipeline:
            t_boundaries_sources = (
                bulk_pipeline
                | 'next_time_boundaries_bulk' >> beam.Create(
                    time_boundaries_bulk_list))

            purch_log_data = (
                t_boundaries_sources
                | 'sql_prepare_purchase_log' >> beam.ParDo(
                    DoFnQueryPurchLog())
                | 'sql_execute_purchase_log' >> beam.ParDo(
                    DoFnExecuteSql(table_tag='purchase_log', pg_db=pg)))

            matches_players_data = (
                t_boundaries_sources
                | 'sql_prepare_matches_players' >> beam.ParDo(
                    DoFnQueryMatchesPlayers())
                | 'sql_execute_matches_players' >> beam.ParDo(
                    DoFnExecuteSql(table_tag='matches_players', pg_db=pg)))

            # Both branches are co-grouped under these tags.
            tagged_inputs = {
                'purch_log': purch_log_data,
                'matches_players': matches_players_data,
            }
            (tagged_inputs
             | 'group_by_match_id_player_num' >> beam.CoGroupByKey()
             | 'retrieve_heroes_items_purch_times' >> beam.ParDo(
                 DoFnRetrieveHeroesItemsPurchTime())
             | 'group_by_heroes' >> GroupByKey()
             | 'dump_stat_source' >> beam.ParDo(
                 DoFnDumpStatSource(pg_db=pg)))

    # Final pass over the heroes collection.
    with beam.Pipeline(options=pipeline_options) as stats_pipeline:
        (stats_pipeline
         | 'heroes_collection' >> collection_heroes()
         | 'calculate_purch_statistics' >> beam.ParDo(
             DoFnCalculatePurchStatistics(pg_db=pg)))