def expand(self, pcoll):
    """Apply the session-timing transform chain to ``pcoll``.

    Stages, in order: keep elements accepted by ``self.filter_event``, map
    each through ``tupleByIdAndEnvironment``, assign session windows whose
    gap is ``self.session_window``, group by key, and flat-map every group
    through ``self.calculate_time``.

    Args:
        pcoll: Input PCollection of events.

    Returns:
        The PCollection produced by ``self.calculate_time``.
    """
    relevant_events = (
        pcoll | 'Filter Relevant Events' >> beam.Filter(self.filter_event))

    # Presumably yields (key, value) pairs, since GroupByKey follows --
    # confirm against tupleByIdAndEnvironment.
    keyed_events = (
        relevant_events
        | 'Key by ID and Environment' >> beam.Map(tupleByIdAndEnvironment))

    # The step label embeds the gap so differently configured instances
    # get distinct names.
    sessioned_events = (
        keyed_events
        | 'Window {}'.format(self.session_window) >> beam.WindowInto(
            window.Sessions(self.session_window)))

    grouped_events = sessioned_events | beam.GroupByKey()

    return (
        grouped_events
        | 'Calculate Time Between' >> beam.FlatMap(self.calculate_time))
def analyze(args, opts):
    """Run the trace-assembly pipeline.

    Reads JSON-decoded records from ``args.input``, keys each one by its
    ``'trace'`` field, groups them inside session windows with a gap of 10,
    hands every group to ``AssembleTrace`` and writes the results to
    ``args.output``.

    Args:
        args: Object exposing ``input`` and ``output`` attributes.
        opts: Pipeline options forwarded to ``beam.Pipeline``.
    """
    with beam.Pipeline(options=opts) as pipeline:
        records = pipeline | ReadFromText(args.input, coder=JsonCoder())

        keyed_by_trace = records | beam.Map(
            lambda record: (record['trace'], record))
        sessioned = keyed_by_trace | beam.WindowInto(window.Sessions(10))
        grouped = sessioned | beam.GroupByKey()
        assembled = grouped | beam.ParDo(AssembleTrace())

        assembled | WriteToText(args.output)
def test_setting_session_windows(self):
    """Check that 10-second session windows merge nearby timestamps."""
    pipeline = TestPipeline()
    raw_values = pipeline | beam.Create([2, 11, 16, 27])

    # Every value doubles as its own event timestamp; all share the key 'k'.
    items = raw_values | 'key' >> beam.Map(
        lambda value: beam.window.TimestampedValue(('k', value), value))

    # [START setting_session_windows]
    from apache_beam import window
    session_windowed_items = (
        items | 'window' >> beam.WindowInto(window.Sessions(10)))
    # [END setting_session_windows]

    per_session_totals = (
        session_windowed_items
        | 'group' >> beam.GroupByKey()
        | 'combine' >> beam.CombineValues(sum))
    totals_only = per_session_totals | 'unkey' >> beam.Map(
        lambda keyed_total: keyed_total[1])

    # 2, 11 and 16 are each within 10 of a neighbour, so they share one
    # session (sum 29); 27 is 11 away from 16 and forms its own session.
    beam.assert_that(totals_only, beam.equal_to([29, 27]))
    pipeline.run()
import csv
from datetime import datetime, timezone

file_in = 'tags.csv'
skip_head = "userId,movieId,tag,timestamp"


class ParseNewMovies(beam.DoFn):
    """Parse one line of the tags CSV into a timestamped keyed element."""

    def process(self, element):
        """Yield ``(tag, (movieId, formatted_time))`` for one CSV line.

        The header line and blank lines produce no output. The emitted
        element carries the row's epoch value as its event timestamp, so
        downstream event-time windowing (the session windows below) groups
        by when the tag was made rather than by the source's default
        timestamp.

        Args:
            element: One raw text line, columns
                ``userId,movieId,tag,timestamp``.

        Yields:
            ``window.TimestampedValue`` wrapping
            ``(tag, (movieId, 'YYYY-MM-DD HH:MM:SS'))`` with the time
            rendered in UTC.
        """
        if element == skip_head:
            return

        # csv.reader honours quoting, so a tag that itself contains commas
        # does not shift the timestamp column the way str.split(",") would.
        row = next(csv.reader([element]), None)
        if not row:
            return  # blank line

        epoch_seconds = int(row[3])
        tagged_at = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
        formatted = tagged_at.strftime('%Y-%m-%d %H:%M:%S')

        yield window.TimestampedValue(
            (row[2], (row[1], formatted)), epoch_seconds)


with beam.Pipeline() as pipeline:
    item = (
        pipeline
        | 'Read lines' >> beam.io.ReadFromText(file_in)
        | 'Par D1' >> beam.ParDo(ParseNewMovies())
    )

    # Count tag occurrences per key within 10-minute-gap sessions and print
    # each (tag, count) result.
    x = (
        item
        | 'Par D3' >> beam.WindowInto(window.Sessions(10 * 60))
        | 'Par D2' >> beam.combiners.Count.PerKey()
        | 'Par D4' >> beam.Map(print)
    )
def expand(self, pcoll):
    """Count occurrences of each distinct element per session window.

    Args:
        pcoll: Input PCollection.

    Returns:
        The output of ``combiners.Count.PerElement()`` applied after
        windowing ``pcoll`` into sessions with a gap of
        ``ONE_HOUR_IN_SECONDS``.
    """
    session_windows = window.Sessions(gap_size=ONE_HOUR_IN_SECONDS)
    windowed = (
        pcoll | 'ComputeSessionsWindow' >> beam.WindowInto(session_windows))
    return windowed | combiners.Count.PerElement()
def main(argv=None):
    """Build and run the Pub/Sub ride pipeline.

    Reads JSON messages from a Pub/Sub topic and fans them out into three
    branches, each of which publishes JSON-encoded bytes to the output
    topic:

    1. every message keyed by ``ride_id`` inside 60-second session windows;
    2. the sum of ``meter_increment`` over 60-second fixed windows, wrapped
       as ``{"dollar_run_rate_per_minute": ...}`` and passed through
       ``Enrich``;
    3. per-``ride_id`` session results combined by ``PickupFn``, keeping
       only those whose ``ride_status`` is ``"pickup"``.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read
            ``sys.argv``. ``--input_topic`` / ``--output_topic`` override
            the module-level ``READ_TOPIC`` / ``TOPIC`` defaults, and every
            other argument is forwarded to ``PipelineOptions``.
    """
    def json_parser(x):
        return json.loads(x)

    def bye(x):
        # Pass-through step that logs each outgoing payload.
        logging.info('outing: %s', x)
        return x

    def to_json_bytes(x):
        return json.dumps(x, indent=2).encode('utf-8')

    def early_firings(delay):
        # Speculative panes before the watermark: repeatedly fire once at
        # least one element has arrived AND `delay` of processing time
        # has elapsed.
        return tr.Repeatedly(
            tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(delay)))

    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output_topic")
    # parse_known_args returns (namespace, leftover_args); the leftovers are
    # the runner/pipeline flags.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Fall back to the module-level topics when the flags are not supplied.
    input_topic = known_args.input_topic or READ_TOPIC
    output_topic = known_args.output_topic or TOPIC

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    data = (
        p
        | 'ReadData' >> beam.io.ReadFromPubSub(
            topic=input_topic).with_output_types(bytes)
        | "JSONParse" >> beam.Map(json_parser))

    # Branch 1: raw messages keyed by ride, in session windows.
    (data
     | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
     | "Windowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=early_firings(2)),
         accumulation_mode=tr.AccumulationMode.DISCARDING,
         allowed_lateness=0)
     | 'ToBytes' >> beam.Map(to_json_bytes)
     | 'Bye' >> beam.Map(bye)
     | 'WriteToPubSub' >> beam.io.WriteToPubSub(output_topic))

    # Branch 2: summed meter increments per fixed window.
    (data
     | "SlidWindowing" >> beam.WindowInto(
         window.FixedWindows(60),
         trigger=tr.AfterWatermark(
             early=early_firings(1),
             late=tr.Repeatedly(tr.AfterCount(1))),
         allowed_lateness=300,
         accumulation_mode=tr.AccumulationMode.ACCUMULATING)
     | "Extract" >> beam.Map(lambda x: x["meter_increment"])
     | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
     | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
     | "Enrich with time data" >> beam.ParDo(Enrich())
     | "ToBytesCount" >> beam.Map(to_json_bytes)
     | 'Bye2' >> beam.Map(bye)
     | "WriteCount" >> beam.io.WriteToPubSub(output_topic))

    # Branch 3: per-ride session results, pickups only.
    (data
     | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
     | "SessionWindowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=early_firings(1)),
         accumulation_mode=tr.AccumulationMode.ACCUMULATING,
         allowed_lateness=0)
     | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
     | "Discarding Key" >> beam.Map(lambda x: x[1])
     # Filter (not Map) so non-pickup rides are dropped instead of being
     # turned into None and published as JSON "null".
     | "Filter not pickup" >> beam.Filter(
         lambda x: str(x["ride_status"]) == "pickup")
     | "ToBytesPickup" >> beam.Map(to_json_bytes)
     | 'Bye3' >> beam.Map(bye)
     | "WritePickup" >> beam.io.WriteToPubSub(output_topic))

    result = p.run()
    result.wait_until_finish()