def ComputeLongestWord(input):
    """Return a PCollection holding the longest word seen in ``input``.

    Each input line is split into words (runs of ASCII letters and
    apostrophes).  The words are placed in the global window with an
    accumulating trigger that has an early firing after every element, and
    the single word with the greatest ``len`` is selected per pane.

    Args:
        input: PCollection of text lines.

    Returns:
        PCollection of words; each fired pane contributes its top-1 word.
    """
    words = input | "Tokenize" >> beam.FlatMap(
        lambda line: re.findall(r'[A-Za-z\']+', line))

    # Global window, early firing after each element, panes accumulate.
    windowed_words = words | beam.WindowInto(
        window.GlobalWindows(),
        trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
        allowed_lateness=window.Duration.of(0))

    # Top.Of yields a one-element list per pane; no default for empty input.
    longest_as_list = windowed_words | "MaxLength" >> beam.combiners.Top.Of(
        1, key=len).without_defaults()

    # Unwrap the one-element list back into individual words.
    return longest_as_list | "Flatten" >> beam.FlatMap(lambda x: x)
def test_setting_global_window(self):
    """Check that values keyed under one key sum to 56 in the global window.

    The four values 2, 11, 16 and 27 are keyed under ``'k'``, each stamped
    with itself as its event timestamp, re-windowed into the single global
    window, grouped and summed.  The expected total is 2 + 11 + 16 + 27 = 56.
    """
    p = TestPipeline()
    unkeyed_items = p | beam.Create([2, 11, 16, 27])

    # Attach key 'k' and use the value itself as the element timestamp.
    items = unkeyed_items | 'key' >> beam.Map(
        lambda x: beam.window.TimestampedValue(('k', x), x))

    # [START setting_global_window]
    from apache_beam import window
    global_windowed_items = (
        items | 'window' >> beam.WindowInto(window.GlobalWindows()))
    # [END setting_global_window]

    grouped = global_windowed_items | 'group' >> beam.GroupByKey()
    summed = grouped | 'combine' >> beam.CombineValues(sum)

    # Drop the key so only the per-key total remains.
    unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
    beam.assert_that(unkeyed, beam.equal_to([56]))
    p.run()
def SportTrackerCalc(input):
    """Compute total elapsed time and travelled distance for each workout.

    Elements of ``input`` are ``(key, position)`` pairs where ``position`` is
    indexable as ``(latitude_degrees, longitude_degrees, timestamp)``.
    Positions are grouped per key in the global window, ordered by
    timestamp, and the time and distance between consecutive positions are
    summed.

    Args:
        input: keyed PCollection of track positions.

    Returns:
        PCollection of ``(key, total_time, total_distance_in_meters)``
        tuples.  ``total_time`` is in whatever unit the timestamps use.
        Because the trigger has an early firing and accumulates panes, a key
        may be emitted more than once with progressively updated totals.
    """
    import math

    # The original called this EARTH_DIAMETER, but 6 371 000 m is the Earth's
    # mean *radius*, and the maths below needs a radius (chord on a sphere).
    EARTH_RADIUS = 6_371_000  # meters

    def chord_on_unit_circle(delta_radians):
        # Chord length subtended by the angle on a unit circle.
        # Mathematically equal to sqrt(2 * (1 - cos(d))), but that form
        # suffers catastrophic cancellation for the tiny angles between
        # nearby GPS fixes; 2 * |sin(d / 2)| keeps full precision.
        return 2 * abs(math.sin(delta_radians / 2))

    def distance(p1, p2):
        delta_latitude = math.radians(p1[0] - p2[0])
        delta_longitude = math.radians(p1[1] - p2[1])
        latitude_chord = chord_on_unit_circle(delta_latitude)
        longitude_chord = chord_on_unit_circle(delta_longitude)
        # NOTE: the longitude term is not scaled by cos(latitude), so this is
        # an approximation that over-estimates east-west distance away from
        # the equator.  Kept as in the original to preserve its results.
        return EARTH_RADIUS * math.sqrt(
            latitude_chord * latitude_chord
            + longitude_chord * longitude_chord)

    def compute_metrics(key, track_positions):
        previous = None
        total_time = 0
        total_distance = 0
        # Grouped values arrive in no particular order; sort by timestamp.
        for position in sorted(track_positions, key=lambda x: x[2]):
            if previous is not None:
                total_distance += distance(previous, position)
                total_time += position[2] - previous[2]
            previous = position
        return (key, total_time, total_distance)

    return (
        input
        | beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.AfterWatermark(
                early=trigger.AfterProcessingTime(10)),
            timestamp_combiner=window.TimestampCombiner.OUTPUT_AT_LATEST,
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
            allowed_lateness=window.Duration.of(0))
        | "GroupByWorkout" >> beam.GroupByKey()
        | "ComputeMetrics" >> beam.Map(lambda x: compute_metrics(*x)))
def run(argv=None):
    """Count occurrences of each time-of-day string found in a log file.

    Reads text lines from ``--input``, extracts every substring that looks
    like a clock time (``H:MM``, ``HH:MM`` or ``HH:MM:SS``, optionally
    followed by ``am``/``AM``/``pm``/``PM``), counts how often each distinct
    string occurs and writes ``"<time>: <count>"`` lines to ``--output``.

    Args:
        argv: command-line arguments to parse; ``None`` lets ``argparse``
            fall back to ``sys.argv``.  Arguments not recognised here are
            forwarded to Beam as pipeline options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://loganalysis/error_log.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        default='gs://loganalysis/output',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Fixed job settings, appended after any user-supplied pipeline args.
    pipeline_args.extend([
        '--runner=DirectRunner',
        '--project=springmldemoproject',
        '--staging_location=gs://loganalysis/staging',
        '--temp_location=gs://loganalysis/temp',
        '--job_name=log-job',
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # One capturing group, so re.findall returns plain strings.
    # The original raw string contained `\\s?`, which matches a literal
    # backslash followed by an optional "s", so the am/pm suffix could never
    # match.  `\s?` (optional whitespace) is what the pattern needs.
    time_pattern = (
        r'((?:(?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):(?:[0-5][0-9])'
        r'(?::[0-5][0-9])?(?:\s?(?:am|AM|pm|PM))?)')

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(known_args.input)

        # NOTE(review): `unicode` is not a Python 3 builtin; presumably it is
        # imported elsewhere in this file (e.g. from past.builtins) - confirm.
        counts = (
            lines
            | 'window' >> beam.WindowInto(window.GlobalWindows())
            | 'Split' >> (
                beam.FlatMap(lambda x: re.findall(time_pattern, x))
                .with_output_types(unicode))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | 'GroupAndSum' >> beam.CombinePerKey(sum))

        def format_result(time_count):
            (time, count) = time_count
            return '%s: %s' % (time, count)

        output = counts | 'Format' >> beam.Map(format_result)
        output | WriteToText(known_args.output)
def usage():
    """Write a usage message to stderr and exit with status 1."""
    sys.stderr.write("Usage: %s <input_file>\n" % (sys.argv[0], ))
    sys.exit(1)


def readToStream(filename):
    """Build a TestStream that emits every line of ``filename``.

    The file is read in full and split on ``"\\n"``, and each piece is
    stripped of surrounding whitespace.  All pieces are added to the stream,
    and the watermark is then advanced to infinity.

    Args:
        filename: path of the text file to read.

    Returns:
        The configured ``TestStream``.
    """
    # Use a context manager so the file handle is closed deterministically,
    # and materialise the lines as a list before the file is closed.
    with open(filename, "r") as source:
        lines = [line.strip() for line in source.read().split("\n")]
    return TestStream() \
        .add_elements(lines) \
        .advance_watermark_to_infinity()


if len(sys.argv) < 2:
    usage()

input_file = sys.argv[1]

# Streaming word count: one final pane when the watermark passes the end of
# the global window, i.e. once the TestStream advances it to infinity.
with beam.Pipeline(options=PipelineOptions(["--streaming"])) as p:
    (p | readToStream(input_file)
     | beam.WindowInto(
         window.GlobalWindows(),
         trigger=trigger.AfterWatermark(),
         accumulation_mode=trigger.AccumulationMode.DISCARDING,
         allowed_lateness=window.Duration.of(0))
     | "Tokenize" >> beam.FlatMap(
         lambda line: re.findall(r'[A-Za-z\']+', line))
     | "CountWords" >> beam.combiners.Count.PerElement()
     | "PrintOutput" >> beam.ParDo(lambda c: print(c)))
import csv
from datetime import datetime, timezone

file_in = 'tags.csv'
# Exact text of the CSV header line; it is skipped while parsing.
skip_head = "userId,movieId,tag,timestamp"


class ParseNewMovies(beam.DoFn):
    """Parse one CSV line into ``(tag, (movieId, 'YYYY-MM-DD HH:MM:SS'))``."""

    def process(self, element):
        """Yield a keyed record for ``element`` unless it is the header line.

        Args:
            element: one text line with the columns
                ``userId,movieId,tag,timestamp``; the timestamp is an integer
                number of seconds since the Unix epoch.

        Yields:
            ``(tag, (movieId, formatted_utc_time))``.
        """
        if element == skip_head:
            return
        # csv.reader instead of str.split(","): a quoted tag that itself
        # contains a comma would otherwise shift the columns and make the
        # timestamp conversion fail or read the wrong field.
        fields = next(csv.reader([element]))
        epoch_seconds = int(fields[3])
        # Timezone-aware replacement for the deprecated
        # datetime.utcfromtimestamp(); the formatted text is the same.
        tagged_at = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
        yield fields[2], (fields[1], tagged_at.strftime('%Y-%m-%d %H:%M:%S'))


with beam.Pipeline() as pipeline:
    item = (
        pipeline
        | 'Read lines' >> beam.io.ReadFromText(file_in)
        | 'Par D1' >> beam.ParDo(ParseNewMovies())
    )
    # Count how many records share each tag and print the (tag, count) pairs.
    x = (
        item
        | 'Par D3' >> beam.WindowInto(window.GlobalWindows())
        | 'Par D2' >> beam.combiners.Count.PerKey()
        | 'Par D4' >> beam.Map(print)
    )
def ComputeBoxedMetrics(input, duration):
    """Re-window ``input`` into the global window and apply ``ToMetricFn``.

    Args:
        input: PCollection to process.
        duration: value passed unchanged to the ``ToMetricFn`` constructor.

    Returns:
        The PCollection produced by ``beam.ParDo(ToMetricFn(duration))``.
    """
    globally_windowed = input | beam.WindowInto(window.GlobalWindows())
    return globally_windowed | beam.ParDo(ToMetricFn(duration))