Example 1
def expand(self, pcoll):
    return (
        pcoll
        | 'Filter Relevant Events' >> beam.Filter(self.filter_event)
        | 'Key by ID and Environment' >> beam.Map(tupleByIdAndEnvironment)
        | 'Window {}'.format(self.session_window) >> beam.WindowInto(
            window.Sessions(self.session_window))
        | beam.GroupByKey()
        | 'Calculate Time Between' >> beam.FlatMap(self.calculate_time)
    )
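This expand() is the body of a composite PTransform whose helpers are not in the excerpt. A minimal sketch of an enclosing class it could belong to follows; the class name, event schema, and the bodies of filter_event, calculate_time, and tupleByIdAndEnvironment are assumptions for illustration, not the original code:

import apache_beam as beam
from apache_beam.transforms import window


def tupleByIdAndEnvironment(event):
    # Assumed: key each event dict by (id, environment).
    return ((event['id'], event['environment']), event)


class TimeBetweenEvents(beam.PTransform):
    # Hypothetical name for the enclosing composite transform.
    def __init__(self, session_window=30 * 60):
        super().__init__()
        self.session_window = session_window

    def filter_event(self, event):
        # Assumed: only events carrying a timestamp are relevant.
        return 'timestamp' in event

    def calculate_time(self, keyed_events):
        key, events = keyed_events
        stamps = sorted(e['timestamp'] for e in events)
        # Emit the gap between consecutive events within each session.
        for earlier, later in zip(stamps, stamps[1:]):
            yield key, later - earlier

    def expand(self, pcoll):
        return (
            pcoll
            | 'Filter Relevant Events' >> beam.Filter(self.filter_event)
            | 'Key by ID and Environment' >> beam.Map(tupleByIdAndEnvironment)
            | 'Window {}'.format(self.session_window) >> beam.WindowInto(
                window.Sessions(self.session_window))
            | beam.GroupByKey()
            | 'Calculate Time Between' >> beam.FlatMap(self.calculate_time)
        )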
Example 2
import apache_beam as beam
from apache_beam.io import ReadFromText, WriteToText
from apache_beam.transforms import window


def analyze(args, opts):
    """Core of the pipeline: assemble events into per-trace sessions."""
    with beam.Pipeline(options=opts) as p:
        lines = p | ReadFromText(args.input, coder=JsonCoder())
        output = (lines
                  | beam.Map(lambda x: (x['trace'], x))   # key each record by trace id
                  | beam.WindowInto(window.Sessions(10))  # 10-second session gap
                  | beam.GroupByKey()
                  | beam.ParDo(AssembleTrace()))
        output | WriteToText(args.output)
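JsonCoder and AssembleTrace come from elsewhere in the original project. A plausible minimal JsonCoder, consistent with how it is used here (each input line decoded into a dict), might look like the sketch below; AssembleTrace would be a project-specific DoFn that stitches the grouped events of one trace back together:

import json

import apache_beam as beam


class JsonCoder(beam.coders.Coder):
    # Hypothetical: one JSON object per text line.
    def encode(self, value):
        return json.dumps(value).encode('utf-8')

    def decode(self, value):
        return json.loads(value)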
Example 3
# Requires: import apache_beam as beam
#           from apache_beam.testing.test_pipeline import TestPipeline
#           from apache_beam.testing.util import assert_that, equal_to
def test_setting_session_windows(self):
    p = TestPipeline()
    unkeyed_items = p | beam.Create([2, 11, 16, 27])
    items = (unkeyed_items
             | 'key' >> beam.Map(
                 lambda x: beam.window.TimestampedValue(('k', x), x)))
    # [START setting_session_windows]
    from apache_beam import window
    session_windowed_items = (
        items | 'window' >> beam.WindowInto(window.Sessions(10)))
    # [END setting_session_windows]
    summed = (session_windowed_items
              | 'group' >> beam.GroupByKey()
              | 'combine' >> beam.CombineValues(sum))
    unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
    # Timestamps 2, 11 and 16 each fall within 10s of the previous one, so
    # they merge into a single session summing to 29; 27 starts its own.
    assert_that(unkeyed, equal_to([29, 27]))
    p.run()
Example 4

import apache_beam as beam
from apache_beam.transforms import window
from datetime import datetime

file_in = 'tags.csv'
skip_head = "userId,movieId,tag,timestamp"


class ParseNewMovies(beam.DoFn):
    def process(self, element):
        # Skip the CSV header row.
        if element != skip_head:
            user_id, movie_id, tag, raw_ts = element.split(",")
            ts = int(raw_ts)
            tag_time = datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            # Attach the parsed epoch seconds as the element's event timestamp,
            # so the session windowing downstream operates on real event times
            # rather than the single default timestamp of a batch read.
            yield window.TimestampedValue((tag, (movie_id, tag_time)), ts)

with beam.Pipeline() as pipeline:
    item = (
        pipeline
        | 'Read lines' >> beam.io.ReadFromText(file_in)
        | 'Par D1' >> beam.ParDo(ParseNewMovies())
    )
    x = (
        item
        | 'Par D3' >> beam.WindowInto(window.Sessions(10 * 60))  # 10-minute gap
        | 'Par D2' >> beam.combiners.Count.PerKey()
        | 'Par D4' >> beam.Map(print)
    )
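For reference, the header row fixes the column order. A hypothetical tags.csv data line and the keyed element ParseNewMovies would emit for it (values invented for illustration):

    15,339,funny,1138537770
    ('funny', ('339', '2006-01-29 12:29:30'))

Count.PerKey then prints one (tag, count) pair per tag for every 10-minute session of tagging activity.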

Example 5
def expand(self, pcoll):
    return (pcoll
            # Sessions close after a one-hour gap; then count how many
            # times each element occurs within each session.
            | 'ComputeSessionsWindow' >> beam.WindowInto(
                window.Sessions(gap_size=ONE_HOUR_IN_SECONDS))
            | combiners.Count.PerElement())
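This expand() leans on module-level context that the excerpt omits; presumably something along these lines:

import apache_beam as beam
from apache_beam.transforms import combiners, window

ONE_HOUR_IN_SECONDS = 3600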
Example 6

import argparse
import json
import logging

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms import trigger as tr
from apache_beam.transforms import window

# READ_TOPIC and TOPIC are module-level Pub/Sub topic paths of the form
# 'projects/<project>/topics/<name>'; their definitions are not shown.


def main(argv=None):
    def json_parser(x):
        # Decode one Pub/Sub message payload into a dict.
        return json.loads(x)

    def bye(x):
        # Log each element as it flows past, then pass it through unchanged.
        logging.info('outing: %s', x)
        return x

    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output_topic")
    # parse_known_args returns (namespace, leftover args); the leftovers are
    # what PipelineOptions expects.
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    data = (p
            | 'ReadData' >>
            beam.io.ReadFromPubSub(topic=READ_TOPIC).with_output_types(bytes)
            | "JSONParse" >> beam.Map(json_parser))

    # Per-ride session windows (60s gap) with early firings roughly every
    # two seconds of processing time; results go straight back to Pub/Sub.
    (data
     | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
     | "Windowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(2)))),
         accumulation_mode=tr.AccumulationMode.DISCARDING,
         allowed_lateness=0)
     | 'ToBytes' >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye' >> beam.Map(bye)
     | 'WriteToPubSub' >> beam.io.WriteToPubSub(TOPIC))

    # Fixed 60-second windows: sum the meter increments into a global
    # dollar-per-minute run rate, with both early and late firings.
    (data
     | "SlidWindowing" >> beam.WindowInto(
         window.FixedWindows(60),
         trigger=(tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
                                    late=tr.Repeatedly(tr.AfterCount(1)))),
         allowed_lateness=300,
         accumulation_mode=tr.AccumulationMode.ACCUMULATING)
     | "Extract" >> beam.Map(lambda x: x["meter_increment"])
     | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
     | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
     | "Enrich with time data" >> beam.ParDo(Enrich())
     | "ToBytesCount" >>
     beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye2' >> beam.Map(bye)
     | "WriteCount" >> beam.io.WriteToPubSub(TOPIC))

    # Per-ride session windows again, this time combined with PickupFn so
    # that only each ride's "pickup" record is kept and written out.
    (data
     | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
     | "SessionWindowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1)))),
         accumulation_mode=tr.AccumulationMode.ACCUMULATING,
         allowed_lateness=0)
     | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
     | "Discarding Key" >> beam.Map(lambda x: x[1])
     | "Filter not pickup" >>
     beam.Map(lambda x: x if str(x["ride_status"]) == "pickup" else None)
     | "ToBytesPickup" >>
     beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye3' >> beam.Map(bye)
     | "WritePickup" >> beam.io.WriteToPubSub(TOPIC))

    result = p.run()
    result.wait_until_finish()
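Enrich and PickupFn are referenced above but not defined in the excerpt. A rough sketch of what they might do, inferred only from how they are used (both bodies are guesses, not the original code):

import apache_beam as beam


class Enrich(beam.DoFn):
    # Hypothetical: stamp each summary record with its window's end time.
    def process(self, element, window=beam.DoFn.WindowParam):
        element["timestamp"] = window.end.to_utc_datetime().isoformat()
        yield element


class PickupFn(beam.CombineFn):
    # Hypothetical: reduce a ride's session to a single representative
    # event (here, simply the last one seen; Beam does not guarantee
    # input order, so a real implementation would compare event times).
    def create_accumulator(self):
        return {}

    def add_input(self, accumulator, input):
        return input

    def merge_accumulators(self, accumulators):
        accumulators = list(accumulators)
        return accumulators[-1] if accumulators else {}

    def extract_output(self, accumulator):
        return accumulator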