Example #1
0
def ComputeLongestWord(input):
  """Emit the longest word found so far in a PCollection of text lines.

  Every line is split into runs of ASCII letters and apostrophes. The
  tokens are placed in the global window with an early firing after each
  element (accumulating panes, zero allowed lateness), and the single
  longest token of every fired pane is emitted.

  Args:
    input: PCollection whose elements are strings (they are passed to
      ``re.findall``).

  Returns:
    PCollection of the words selected by the top-1-by-length combiner.
  """
  def tokenize(line):
    return re.findall(r'[A-Za-z\']+', line)

  # Global window that fires early on every element and keeps accumulating,
  # so each pane reflects everything seen so far.
  eager_global_window = beam.WindowInto(
      window.GlobalWindows(),
      trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
      accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
      allowed_lateness=window.Duration.of(0))

  words = input | "Tokenize" >> beam.FlatMap(tokenize)
  windowed_words = words | eager_global_window
  # without_defaults() is needed because the combine is not applied in a
  # default-triggered global window.
  top_word_lists = windowed_words | "MaxLength" >> beam.combiners.Top.Of(
      1, key=len).without_defaults()
  # Top.Of produces a list per pane; unwrap it into individual words.
  return top_word_lists | "Flatten" >> beam.FlatMap(lambda top: top)
Example #2
0
 def test_setting_global_window(self):
     """Sum keyed values after explicitly assigning them to the global window.

     Four values are keyed with 'k' and timestamped with their own value,
     windowed into GlobalWindows, grouped and summed; the only expected
     output is 56 (2 + 11 + 16 + 27).
     """
     pipeline = TestPipeline()

     def as_timestamped_pair(value):
         # Key every value with 'k' and reuse the value as its timestamp.
         return beam.window.TimestampedValue(('k', value), value)

     numbers = pipeline | beam.Create([2, 11, 16, 27])
     items = numbers | 'key' >> beam.Map(as_timestamped_pair)
     # The region between the START/END markers is kept verbatim because
     # such markers usually delimit a snippet extracted into documentation.
     # [START setting_global_window]
     from apache_beam import window
     session_windowed_items = (
         items | 'window' >> beam.WindowInto(window.GlobalWindows()))
     # [END setting_global_window]
     grouped = session_windowed_items | 'group' >> beam.GroupByKey()
     summed = grouped | 'combine' >> beam.CombineValues(sum)
     # Drop the key so only the per-key total is compared.
     unkeyed = summed | 'unkey' >> beam.Map(lambda keyed_sum: keyed_sum[1])
     beam.assert_that(unkeyed, beam.equal_to([56]))
     pipeline.run()
def SportTrackerCalc(input):
    """Compute elapsed time and travelled distance per workout.

    The input is windowed into the global window with an early firing based
    on processing time (accumulating panes, zero allowed lateness), grouped
    by key, and each group's positions are reduced to totals.

    Args:
        input: PCollection of ``(key, position)`` pairs. Each position is
            indexed as ``position[0]`` (latitude, degrees), ``position[1]``
            (longitude, degrees) and ``position[2]`` (timestamp used for
            ordering and for time differences).

    Returns:
        PCollection of ``(key, total_time, total_distance)`` tuples, with
        ``total_time`` in the unit of the position timestamps and
        ``total_distance`` in meters.
    """

    import math

    # Mean Earth radius in meters. The value 6 371 000 m is the radius (not
    # the diameter); it scales the unit-sphere chord lengths computed below.
    EARTH_RADIUS = 6_371_000

    def calculateDelta(deltaLatLon):
        # Chord length on the unit circle for an angle given in radians:
        # sqrt(2 * (1 - cos(a))) == 2 * |sin(a / 2)|.
        return math.sqrt(2 * (1 - math.cos(deltaLatLon)))

    def distance(p1, p2):
        # Convert the coordinate differences from degrees to radians.
        deltaLatitude = (p1[0] - p2[0]) * math.pi / 180
        deltaLongitude = (p1[1] - p2[1]) * math.pi / 180
        latitudeIncInMeters = calculateDelta(deltaLatitude)
        longitudeIncInMeters = calculateDelta(deltaLongitude)
        # Combine both components as legs of a right triangle. The longitude
        # component is not scaled by latitude, so this is an approximation.
        return EARTH_RADIUS * math.sqrt(
            latitudeIncInMeters * latitudeIncInMeters +
            longitudeIncInMeters * longitudeIncInMeters)

    def computeMetrics(key, trackPositions):
        last = None
        totalTime = 0
        totalDistance = 0
        # Walk the positions in timestamp order, accumulating the deltas
        # between consecutive points.
        for p in sorted(trackPositions, key=lambda x: x[2]):
            if last is not None:
                totalDistance += distance(last, p)
                totalTime += p[2] - last[2]
            last = p
        return (key, totalTime, totalDistance)

    return (input
            | beam.WindowInto(
                window.GlobalWindows(),
                trigger=trigger.AfterWatermark(
                    early=trigger.AfterProcessingTime(10)),
                timestamp_combiner=window.TimestampCombiner.OUTPUT_AT_LATEST,
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                allowed_lateness=window.Duration.of(0))
            | "GroupByWorkout" >> beam.GroupByKey()
            | "ComputeMetrics" >> beam.Map(lambda x: computeMetrics(*x)))
def run(argv=None):
    """Count how often each time-of-day string occurs in a text file.

    Reads the input file line by line, extracts every time-like token
    (``H:MM``, ``HH:MM`` or ``HH:MM:SS``, optionally followed by am/pm),
    counts the occurrences of each distinct token and writes
    ``"<time>: <count>"`` lines to the output location.

    Args:
        argv: Command-line arguments. ``--input`` and ``--output`` are
            consumed here; everything else is forwarded to the pipeline
            options. ``None`` lets argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://loganalysis/error_log.txt',
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        default='gs://loganalysis/output',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # Fixed job settings, appended after any caller-supplied pipeline args.
    pipeline_args.extend([
        '--runner=DirectRunner',
        '--project=springmldemoproject',
        '--staging_location=gs://loganalysis/staging',
        '--temp_location=gs://loganalysis/temp',
        '--job_name=log-job',
    ])

    # The optional am/pm suffix uses `\s?`. Inside a raw string a doubled
    # backslash (`\\s?`) matches a literal backslash, which meant the suffix
    # could never match ordinary text such as "10:30 pm".
    time_pattern = (
        r'((?:(?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):(?:[0-5][0-9])'
        r'(?::[0-5][0-9])?(?:\s?(?:am|AM|pm|PM))?)')

    pipeline_options = PipelineOptions(pipeline_args)
    # Pickle the main session so module-level names are available to the
    # functions shipped with the pipeline.
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(known_args.input)
        counts = (
            lines
            | 'window' >> beam.WindowInto(window.GlobalWindows())
            # `str` is the text type on Python 3; the Python 2 name
            # `unicode` is undefined there without a compatibility shim.
            | 'Split' >> (beam.FlatMap(
                lambda x: re.findall(time_pattern, x)).with_output_types(str))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | 'GroupAndSum' >> beam.CombinePerKey(sum))

        def format_result(time_count):
            (time, count) = time_count
            return '%s: %s' % (time, count)

        output = counts | 'Format' >> beam.Map(format_result)

        output | WriteToText(known_args.output)
Example #5
0

def usage():
    """Print the command-line usage to stderr and exit with status 1."""
    message = "Usage: %s <input_file>\n" % (sys.argv[0], )
    sys.stderr.write(message)
    raise SystemExit(1)


def readToStream(filename):
    """Build a TestStream that replays the lines of a text file.

    Args:
        filename: Path of the text file to read.

    Returns:
        The object returned by ``TestStream().add_elements(...)
        .advance_watermark_to_infinity()``, holding every line of the file
        with surrounding whitespace stripped.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(filename, "r") as source:
        content = source.read()
    # split("\n") (rather than splitlines()) keeps the final empty element
    # produced by a trailing newline. The list is materialized so the stream
    # receives a concrete sequence instead of a one-shot iterator.
    lines = [line.strip() for line in content.split("\n")]
    return TestStream() \
        .add_elements(lines) \
        .advance_watermark_to_infinity()


# Script entry: require the input file as the first positional argument.
if len(sys.argv) <= 1:
    usage()

input_file = sys.argv[1]

# Word count over the file replayed as a stream: results are emitted once,
# when the watermark passes the end of the global window.
with beam.Pipeline(options=PipelineOptions(["--streaming"])) as p:
    lines = p | readToStream(input_file)
    windowed_lines = lines | beam.WindowInto(
        window.GlobalWindows(),
        trigger=trigger.AfterWatermark(),
        accumulation_mode=trigger.AccumulationMode.DISCARDING,
        allowed_lateness=window.Duration.of(0))
    words = windowed_lines | "Tokenize" >> beam.FlatMap(
        lambda line: re.findall(r'[A-Za-z\']+', line))
    word_counts = words | "CountWords" >> beam.combiners.Count.PerElement()
    # Each (word, count) pair is printed to stdout; nothing is emitted
    # downstream.
    word_counts | "PrintOutput" >> beam.ParDo(lambda c: print(c))
from datetime import datetime

# Path of the CSV file read by the pipeline below.
file_in = 'tags.csv'
# Header row of that CSV; ParseNewMovies skips any line equal to it.
skip_head = "userId,movieId,tag,timestamp"


class ParseNewMovies(beam.DoFn):
    """Turn a ``userId,movieId,tag,timestamp`` CSV line into a keyed record."""

    def process(self, element):
        """Parse one CSV line and emit ``(tag, (movieId, formatted_time))``.

        Args:
            element: One text line of the CSV file.

        Yields:
            A ``(tag, (movieId, 'YYYY-MM-DD HH:MM:SS'))`` tuple, where the
            time is the UTC rendering of the line's epoch-seconds timestamp.
            Nothing is yielded for the header line.

        Raises:
            IndexError: If the line has fewer than four fields.
            ValueError: If the fourth field is not an integer.
        """
        # Imported here so this method does not depend on additional
        # module-level imports.
        import csv
        from datetime import timezone

        if element == skip_head:
            return

        # csv.reader honours quoting, so a tag such as "funny, dark" stays a
        # single field instead of shifting the timestamp column.
        fields = next(csv.reader([element]))
        epoch_seconds = int(fields[3])
        # Timezone-aware replacement for the deprecated utcfromtimestamp();
        # the formatted text is the same UTC wall-clock time.
        tagged_at = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
        yield fields[2], (fields[1], tagged_at.strftime('%Y-%m-%d %H:%M:%S'))

# Count how many parsed records share each key and print the counts.
with beam.Pipeline() as pipeline:
    raw_lines = pipeline | 'Read lines' >> beam.io.ReadFromText(file_in)
    # Keyed records produced by ParseNewMovies (the header line is dropped).
    item = raw_lines | 'Par D1' >> beam.ParDo(ParseNewMovies())

    globally_windowed = item | 'Par D3' >> beam.WindowInto(
        window.GlobalWindows())
    per_key_counts = (
        globally_windowed | 'Par D2' >> beam.combiners.Count.PerKey())
    # Each (key, count) pair is written to stdout.
    x = per_key_counts | 'Par D4' >> beam.Map(print)

def ComputeBoxedMetrics(input, duration):
    """Run ``ToMetricFn(duration)`` over ``input`` in the global window.

    Args:
        input: PCollection to process.
        duration: Value passed unchanged to the ``ToMetricFn`` constructor.

    Returns:
        The PCollection produced by the ``ToMetricFn`` ParDo.
    """
    globally_windowed = input | beam.WindowInto(window.GlobalWindows())
    return globally_windowed | beam.ParDo(ToMetricFn(duration))