def test_model_other_composite_triggers(self):
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(StandardOptions).streaming = True
  with TestPipeline(options=pipeline_options) as p:
    test_stream = (
        TestStream()
        .advance_watermark_to(10)
        .add_elements(['a', 'a'])
        .add_elements(['a', 'b', 'b'])
        .advance_processing_time(60)
        .add_elements(['a'] * 100))
    pcollection = (
        p
        | test_stream
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))
    counts = (
        # [START model_other_composite_triggers]
        pcollection | WindowInto(
            FixedWindows(1 * 60),
            trigger=Repeatedly(
                AfterAny(AfterCount(100), AfterProcessingTime(1 * 60))),
            accumulation_mode=AccumulationMode.DISCARDING)
        # [END model_other_composite_triggers]
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(
            lambda word_ones: (word_ones[0], sum(word_ones[1]))))
    assert_that(counts, equal_to([('a', 3), ('b', 2), ('a', 100)]))
def test_fixed_after_first(self):
  self.run_trigger_simple(
      FixedWindows(10),  # pyformat break
      AfterAny(AfterCount(2), AfterWatermark()),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b'), (3, 'c')],
      {IntervalWindow(0, 10): [set('ab')]},
      1,
      2)
  self.run_trigger_simple(
      FixedWindows(10),  # pyformat break
      AfterAny(AfterCount(5), AfterWatermark()),
      AccumulationMode.ACCUMULATING,
      [(1, 'a'), (2, 'b'), (3, 'c')],
      {IntervalWindow(0, 10): [set('abc')]},
      1,
      2,
      late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
def test_trigger_encoding(self):
  for trigger_fn in (DefaultTrigger(),
                     AfterAll(AfterCount(1), AfterCount(10)),
                     AfterAny(AfterCount(10), AfterCount(100)),
                     AfterWatermark(early=AfterCount(1000)),
                     AfterWatermark(early=AfterCount(1000), late=AfterCount(1)),
                     Repeatedly(AfterCount(100)),
                     trigger.OrFinally(AfterCount(3), AfterCount(10))):
    context = pipeline_context.PipelineContext()
    self.assertEqual(
        trigger_fn,
        TriggerFn.from_runner_api(trigger_fn.to_runner_api(context), context))
def test_repeatedly_after_first(self):
  self.run_trigger_simple(
      FixedWindows(100),  # pyformat break
      Repeatedly(AfterAny(AfterCount(3), AfterWatermark())),
      AccumulationMode.ACCUMULATING,
      zip(range(7), 'abcdefg'),
      {IntervalWindow(0, 100): [
          set('abc'),
          set('abcdef'),
          set('abcdefg'),
          set('abcdefgx'),
          set('abcdefgxy'),
          set('abcdefgxyz')]},
      1,
      late_data=zip(range(3), 'xyz'))
def run(argv=None):
  from apache_beam.transforms.window import TimestampedValue, FixedWindows

  pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'

  with beam.Pipeline(options=get_pipeline_options()) as pipeline:
    logging.info("pubsub_input_topic = {}".format(pubsub_input_topic))

    json_messages = (
        pipeline
        | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(
            topic=pubsub_input_topic).with_output_types(bytes)
        | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message))

    window_size_s = 30
    allowed_lateness_s = 60
    high_confidence_faces_grouped_by_emotion_count_per_window = (
        json_messages
        | 'ParseJsonMessage' >> beam.Map(parse_jsons)
        | 'FilterHighFaceConfidence' >> beam.ParDo(
            FilterHighConfidenceFacesDoFn())
        | 'FlatMapFacesWithHighEmotionLikelihood' >> beam.FlatMap(
            get_faces_with_high_emotion_likelihood)
        | 'UseCustomTimestamp' >> beam.Map(
            lambda face_info: TimestampedValue(
                face_info, face_info['ts_seconds']))
        | 'WindowFaceInfo' >> beam.WindowInto(
            FixedWindows(window_size_s, 0),
            trigger=AfterWatermark(
                early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
            allowed_lateness=allowed_lateness_s,
            accumulation_mode=AccumulationMode.DISCARDING)
        | 'PairEmotionWithFace' >> beam.Map(
            lambda face_info: (face_info['emotion'], face_info))
        | 'GroupByEmotion' >> beam.GroupByKey()
        | 'FormatOutputForBigQuery' >> beam.ParDo(FormatFaceInfoPerWindow()))

    log_p_collection(
        high_confidence_faces_grouped_by_emotion_count_per_window,
        "OutputToBigQuery")

    high_confidence_faces_grouped_by_emotion_count_per_window | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
        bq_faces_windowed_table_name,
        schema={"fields": bq_faces_windowed_table_schema},
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

  # The `with` block runs the pipeline and waits for it to finish on exit;
  # an explicit pipeline.run() here would launch the job a second time.
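# The decode_message helper referenced above is not shown in this snippet.
# A minimal sketch, assuming the Pub/Sub payload is a UTF-8 encoded JSON
# string (only the name comes from the pipeline; the body is an assumption):
def decode_message(message_bytes):
  # Hypothetical: Pub/Sub delivers raw bytes; decode them to a JSON string
  # for the downstream ParseJsonMessage step.
  return message_bytes.decode('utf-8')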
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # 128 KiB
   | WithKeys('')
   | WindowInto(
       GlobalWindows(),
       trigger=Repeatedly(
           AfterAny(AfterCount(BATCH_SIZE),
                    AfterProcessingTime(BUFFERING_SECS))),
       accumulation_mode=AccumulationMode.DISCARDING)
   | GroupByKey()
   | Map(lambda kv: logging.info(
       'key: %s, value count: %s', kv[0], len(kv[1]))))

  result = p.run()
  result.wait_until_finish()
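# The make_large_elements helper used above is not shown. A minimal sketch,
# assuming it pads each integer out to the "128 KiB" the inline comment
# mentions (the name and size come from the snippet; the body is an
# assumption):
def make_large_elements(i):
  # Hypothetical: yield one ~128 KiB string per input element so the
  # buffered GroupByKey batches carry a realistic payload.
  yield '%d:%s' % (i, 'x' * (128 * 1024))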
  # (Tail of the calculateProfit helper used in the pipeline below.)
  return [(Store_id, Store_location, Product_id, Product_category,
           sold_unit, buy_rate, sell_price, profit, transaction_date)]


############# Create Pipeline ###########
stream_data = (
    p
    | 'Read from PubSub' >> beam.io.ReadFromPubSub(subscription=inputs_pattern)
    | 'Remove space in the Data' >> beam.Map(lambda row: row.strip())
    | 'Split Data' >> beam.Map(lambda row: row.decode().split(','))
    | 'Calculate Profit' >> beam.Map(calculateProfit)
    | 'Apply custom timestamp' >> beam.Map(custom_timestamp)
    | 'Make Key value' >> beam.Map(lambda row: (row[:-2], row[-1]))
    | 'Set Fixed Window of 30 sec' >> beam.WindowInto(
        window.FixedWindows(30),
        trigger=Repeatedly(AfterAny(AfterCount(5), AfterProcessingTime(10))),
        accumulation_mode=AccumulationMode.DISCARDING)
    | 'Combine Result of 30 Sec' >> beam.CombinePerKey(sum)
    | 'Format result and append time' >> beam.ParDo(BuildRecordFn())
    | 'Prepare data for BigQuery' >> beam.Map(covert_to_dict)
    # | 'Write to Text' >> beam.io.WriteToText(outputs_prefix)
    | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        table='sales', dataset='beam', project='beam-290211'))

p.run().wait_until_finish()

if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
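# BuildRecordFn, referenced in the pipeline above, is defined elsewhere. A
# plausible sketch of the 'Format result and append time' step, assuming it
# stamps each combined (key, sum) pair with its window boundary (the class
# name comes from the pipeline; the body is an assumption):
class BuildRecordFn(beam.DoFn):
  def process(self, element, window=beam.DoFn.WindowParam):
    # Hypothetical: append the window's start time so each output row
    # records which 30-second window it belongs to.
    window_start = window.start.to_utc_datetime().isoformat()
    yield element + (window_start,)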
def test_after_any_different_reasons(self):
  self._test(
      AfterAny(Repeatedly(AfterCount(2)), AfterProcessingTime()),
      0,
      DataLossReason.MAY_FINISH | DataLossReason.CONDITION_NOT_GUARANTEED)

def test_after_any_same_reason(self):
  self._test(
      AfterAny(AfterCount(1), AfterProcessingTime()),
      0,
      DataLossReason.MAY_FINISH)

def test_after_any_some_unsafe(self):
  self._test(
      AfterAny(AfterCount(1), DefaultTrigger()),
      0,
      DataLossReason.NO_POTENTIAL_LOSS)

def test_after_any_all_safe(self):
  self._test(
      AfterAny(Repeatedly(AfterCount(42)), DefaultTrigger()),
      0,
      DataLossReason.NO_POTENTIAL_LOSS)

def test_after_any_one_may_finish(self):
  self._test(
      AfterAny(AfterCount(42), DefaultTrigger()),
      0,
      DataLossReason.MAY_FINISH)
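# The _test helper these cases call is not part of the excerpt. A plausible
# sketch, assuming Beam's TriggerFn.may_lose_data(windowing) API (the actual
# helper in trigger_test.py may differ):
def _test(self, trigger_fn, lateness, expected):
  # Hypothetical: wrap the trigger in a global-window Windowing with the
  # given allowed lateness, then compare the data-loss analysis result
  # against the expected DataLossReason flags.
  windowing = Windowing(
      GlobalWindows(), triggerfn=trigger_fn, allowed_lateness=lateness)
  self.assertEqual(trigger_fn.may_lose_data(windowing), expected)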