def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic',
                        dest='topic',
                        default=default_topic)
    parser.add_argument('--bucket',
                        dest='bucket',
                        default=default_bucket)

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project={}'.format(project), '--streaming',
        '--experiments=allow_non_updatable_job'
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    class DiffOutputsFn(beam.DoFn):
        # These tags will be used to tag the outputs of this DoFn.
        OUTPUT_TAG_BUY = 'buy'
        OUTPUT_TAG_SELL = 'sell'
        OUTPUT_TAG_ERROR = 'error'

        def process(self, element):
            dictionary = yaml.safe_load(element)
            dictionary['timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            if dictionary['type'] == 'buy':
                dictionary.pop('type')
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_BUY, dictionary)
            elif dictionary['type'] == 'sell':
                dictionary.pop('type')
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_SELL, dictionary)
            else:
                # we don't drop the key here, since we want to know where the mistake was
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_ERROR, dictionary)

    def string_join(elements):
        string = str(elements)
        return string.replace('},', '};')

    with beam.Pipeline(options=pipeline_options) as p:

        diff_outputs = (
            p
            | "ReadTopic" >> beam.io.ReadFromPubSub(topic=known_args.topic)
            | "SplitOutputs" >> beam.ParDo(DiffOutputsFn()).with_outputs(
                DiffOutputsFn.OUTPUT_TAG_BUY,
                DiffOutputsFn.OUTPUT_TAG_SELL,
                DiffOutputsFn.OUTPUT_TAG_ERROR))

        buy = (diff_outputs.buy | "WindowBuy" >> beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
               | "CombineBuy" >> beam.CombineGlobally(string_join).without_defaults()
               | "WriteToGCSBuy" >> WriteToText(file_path_prefix=known_args.bucket + 'buy/'))

        sell = (diff_outputs.sell | "WindowSell" >> beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
                | "CombineSell" >> beam.CombineGlobally(string_join).without_defaults()
                | "WriteToGCSSell" >> WriteToText(file_path_prefix=known_args.bucket + 'sell/'))

        # We want to know what 'type' gave the error, so no ParDo here
        error = (diff_outputs.error | "WindowError" >> beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
                 | "CombineError" >> beam.CombineGlobally(string_join).without_defaults()
                 | "WriteToGCSError" >> WriteToText(file_path_prefix=known_args.bucket + 'error/'))
Example #2
def run(argv=None):
    parser = argparse.ArgumentParser()

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--project={}'.format(project)])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    from apache_beam import pvalue

    class DiffOutputsFn(beam.DoFn):
        # These tags will be used to tag the outputs of this DoFn.
        OUTPUT_TAG_CS_BOOKINGS = 'tag_cs_bookings'
        OUTPUT_TAG_USERS = 'tag_users'

        def process(self, element):
            # Receives a single element (a message) and routes bookings
            # (identified by a 'vehicle_id' field) and user records to
            # separate tagged outputs.
            if 'vehicle_id' in element:
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_CS_BOOKINGS, element)
            else:
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_USERS, element)

    def string_join(elements):
        return '\n'.join(elements)

    with beam.Pipeline(options=pipeline_options) as p:

        diff_outputs = (
            p | "ReadTopic" >> beam.io.ReadFromPubSub(topic=default_topic)
            | "SplitOutputs" >> beam.ParDo(DiffOutputsFn()).with_outputs(
                DiffOutputsFn.OUTPUT_TAG_CS_BOOKINGS,
                DiffOutputsFn.OUTPUT_TAG_USERS))
        cs_bookings_p = (diff_outputs.tag_cs_bookings
                         | "Windowing_cs_bookings" >> beam.WindowInto(
                             window.FixedWindows(WINDOW_LENGTH))
                         | "Combine_cs_bookings" >>
                         beam.CombineGlobally(string_join).without_defaults()
                         | "WriteGCSCommon_cs_bookings" >> WriteToText(
                             file_path_prefix=default_bucket + 'cs_bookings/',
                             file_name_suffix='cs_booking_YYYYMMDDHH'))

        users_p = (diff_outputs.tag_users
                   | "Windowing_users" >> beam.WindowInto(
                       window.FixedWindows(WINDOW_LENGTH))
                   | "Combine_users" >>
                   beam.CombineGlobally(string_join).without_defaults()
                   | "WriteGCSCommon_users" >> WriteToText(
                       file_path_prefix=default_bucket + 'users/',
                       file_name_suffix='users_YYYYMMDDHH'))
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic', dest='topic', default=default_topic)
    parser.add_argument('--bucket', dest='bucket', default=default_bucket)

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project={}'.format(project), '--streaming',
        '--experiments=allow_non_updatable_job'
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    def add_key(element):
        # makes the string a dict and returns the key and the rest of the dict
        parsed_dict = yaml.safe_load(element)
        key = parsed_dict[GROUP_BY_KEY]
        parsed_dict.pop(GROUP_BY_KEY)
        return (key, parsed_dict)

    with beam.Pipeline(options=pipeline_options) as p:
        (p | "ReadTopic" >> beam.io.ReadFromPubSub(topic=known_args.topic)
         | "AddKey" >> beam.Map(lambda element: add_key(element))
         | "Window" >> beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
         | "GroupByKey" >> beam.GroupByKey()
         | "WriteToGCS" >> WriteToText(known_args.bucket))
def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output")
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = beam.Pipeline(options=PipelineOptions())

    csv_lines = (p
                 | 'ReadData' >>
                 beam.io.ReadFromPubSub(topic=TOPIC).with_output_types(bytes)
                 | "Decode" >> beam.Map(lambda x: x.decode('utf-8'))
                 | "Clean Data" >> beam.Map(regex_clean)
                 | 'ParseCSV' >> beam.ParDo(Split()))

    table1 = (csv_lines
              | 'WriteToBigQuery1' >> beam.io.WriteToBigQuery(
                  'my-gce-project1:china.POC_BEAM',
                  schema=schema,
                  write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    table2 = (csv_lines
              | 'Collect services' >> beam.ParDo(collectservices())
              | 'window' >> beam.WindowInto(window.FixedWindows(30))
              | 'Sum services' >> beam.Map(
                  lambda service_amounts: (service_amounts[0],
                                           sum(service_amounts[1])))
              | 'WriteToBigQuery2' >> beam.io.WriteToBigQuery(
                  'my-gce-project1:china.services',
                  schema=schema2,
                  write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    result = p.run()
    result.wait_until_finish()
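regex_clean, Split, collectservices, schema, schema2 and TOPIC are defined elsewhere and are not shown above. A minimal, hypothetical sketch of what the Split DoFn might look like; the column names and order are assumptions:

import apache_beam as beam


class Split(beam.DoFn):
    # Hypothetical: turns one cleaned CSV line into a dict matching the
    # BigQuery schema that the pipeline above is assumed to use.
    def process(self, element):
        service, amount = element.split(',')[:2]  # assumed column order
        yield {'service': service, 'amount': float(amount)}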
def main():
    # bq_source = BigQuerySource(query="""
    #                            SELECT created_at, text
    #                            FROM got_sentiment.got_tweets
    #                            """,
    #                            validate=False, coder=None,
    #                            use_standard_sql=True, flatten_results=True,
    #                            kms_key=None)

    # Removed attributes from ReadFromPubSub:
    #                              with_attributes=False,
    #                             timestamp_attribute='created_at'

    # Create the Pipeline with the specified options.
    with Pipeline(options=options) as p:
        results = (
            p | 'read_from_topic' >> ReadFromPubSub(topic=PUBSUB_TOPIC)
            | 'Window' >> WindowInto(window.FixedWindows(60))
            | 'Emit_needed_values' >> FlatMap(emit_values, entity_map)
            | 'Combine' >> CombinePerKey(EntityScoreCombine())
            | 'Add Window Timestamp' >> beam.ParDo(AddWindowTimestampFn())
            | 'FormatForWrite' >> Map(format_for_write)
            | 'Write' >> WriteToBigQuery('streaming_scores',
                                         dataset=BQ_DATASET,
                                         project=PROJECT_ID,
                                         create_disposition='CREATE_IF_NEEDED',
                                         write_disposition='WRITE_APPEND',
                                         batch_size=20))
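emit_values, entity_map, EntityScoreCombine, AddWindowTimestampFn and format_for_write are not shown. One plausible sketch of AddWindowTimestampFn, assuming it stamps each (entity, score) pair with the end of its fixed window via DoFn.WindowParam; the field names are assumptions:

import apache_beam as beam


class AddWindowTimestampFn(beam.DoFn):
    def process(self, element, window=beam.DoFn.WindowParam):
        entity, score = element
        yield {
            'entity': entity,
            'score': score,
            # IntervalWindow.end is a Timestamp; convert it to a datetime.
            'window_end': window.end.to_utc_datetime().isoformat(),
        }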
Example #6
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""


  pubsubTopicName = "projects/data-qe-da7e1252/topics/sk-firewall-json"

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      # CHANGE 1/5: The Google Cloud Storage path is required
                      # for outputting the results.
                      #default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
                      #default="/Users/skanabargi/python/stream/output",
                      default='gs://data-qe-da7e1252/tmp/sk_out',
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_args.extend([
      # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
      # run your pipeline on the Google Cloud Dataflow Service.
      '--runner=DataflowRunner',
      # CHANGE 3/5: Your project ID is required in order to run your pipeline on
      # the Google Cloud Dataflow Service.
      '--project=data-qe-da7e1252',
      # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
      # files.
      #'--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
      '--staging_location=gs://data-qe-da7e1252/tmp/stage/',
      # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
      # files.
      #'--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
      '--temp_location=gs://data-qe-da7e1252/tmp/local',
      '--experiments=allow_non_updatable_job',
      '--job_name=sk-pubsub-to-gcs-5',
  ])

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  with beam.Pipeline(options=pipeline_options) as p:

    # Read the text file[pattern] into a PCollection.
    #lines = p | ReadFromText(known_args.input)
    lines = p | beam.io.ReadFromPubSub(topic=pubsubTopicName)



    # Window the messages into fixed 60-second windows.
    output = (lines | 'window' >> beam.WindowInto(window.FixedWindows(60)))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'writeTOGcs' >> WriteToText(known_args.output)
def SportTrackerMotivation(input, shortDuration, longDuration):

    boxed = input | "ComputeMetrics" >> ComputeBoxedMetrics(shortDuration)
    shortAverage = (
        boxed
        | "shortWindow" >> beam.WindowInto(window.FixedWindows(shortDuration))
        | "shortAverage" >> CalculateAveragePace())
    longAverage = (
        boxed
        | "longWindow" >> beam.WindowInto(
            window.SlidingWindows(longDuration, shortDuration))
        | "longAverage" >> CalculateAveragePace()
        |
        "longIntoFixed" >> beam.WindowInto(window.FixedWindows(shortDuration)))
    return ((shortAverage, longAverage)
            | beam.CoGroupByKey()
            | beam.FlatMap(asMotivation))
Example #8
 def expand(self, pcoll):
     return (
         pcoll
         # Assigns window info to each Pub/Sub message based on its
         # publish timestamp
         | "Window into Fixed Intervals" >> beam.WindowInto(
             window.FixedWindows(self.window_size))
         | "Add timestamps to messages" >> beam.ParDo(AddTimestamps()))
 def expand(self, pcoll):
     return (pcoll
             | 'TopPerMonthWindow' >> beam.WindowInto(
                 window.FixedWindows(size=THIRTY_DAYS_IN_SECONDS))
             | 'Top' >> combiners.core.CombineGlobally(
                 combiners.TopCombineFn(
                     10, lambda first, second: first[1] < second[1])).
             without_defaults())
Example #10
def examples_wordcount_streaming(argv):
  import re

  import apache_beam as beam
  from apache_beam import window
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.options.pipeline_options import StandardOptions

  # Parse out arguments.
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output_topic',
      required=True,
      help=(
          'Output PubSub topic of the form '
          '"projects/<PROJECT>/topic/<TOPIC>".'))
  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument(
      '--input_topic',
      help=(
          'Input PubSub topic of the form '
          '"projects/<PROJECT>/topics/<TOPIC>".'))
  group.add_argument(
      '--input_subscription',
      help=(
          'Input PubSub subscription of the form '
          '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(StandardOptions).streaming = True

  with TestPipeline(options=pipeline_options) as p:
    # [START example_wordcount_streaming_read]
    # Read from Pub/Sub into a PCollection.
    if known_args.input_subscription:
      lines = p | beam.io.ReadFromPubSub(
          subscription=known_args.input_subscription)
    else:
      lines = p | beam.io.ReadFromPubSub(topic=known_args.input_topic)
    # [END example_wordcount_streaming_read]

    output = (
        lines
        | 'DecodeUnicode' >> beam.Map(lambda encoded: encoded.decode('utf-8'))
        | 'ExtractWords' >>
        beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
        | 'PairWithOnes' >> beam.Map(lambda x: (x, 1))
        | beam.WindowInto(window.FixedWindows(15, 0))
        | 'Group' >> beam.GroupByKey()
        |
        'Sum' >> beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1])))
        |
        'Format' >> beam.Map(lambda word_and_count: '%s: %d' % word_and_count))

    # [START example_wordcount_streaming_write]
    # Write to Pub/Sub
    output | beam.io.WriteStringsToPubSub(known_args.output_topic)
Example #11
def run(argv=None):
    known_args, pipeline_args = _parse_user_args(argv)

    options = get_pipeline_options(pipeline_args)

    # Load the BigQuery schema from a JSON file of field definitions
    with open(known_args.schema_path, "r") as schema_file:
        schema = '{"fields": ' + schema_file.read() + '}'

    schema = parse_table_schema_from_json(schema)

    with beam.Pipeline(options=options) as p:
        # Get message from pubsub and split it by identifier
        formated_messages = (
            p
            | "Read from PubSub" >> beam.io.ReadFromPubSub(known_args.topic)
            | "Windowing" >> beam.WindowInto(window.FixedWindows(30))
            | "Decoder" >> beam.Map(lambda e: e.decode())
            | "Split into List" >> beam.ParDo(SplitWords(",")))

        # Pipeline split:
        # 1. Write to FS
        # 2. Snooze for 10 sec, and change data locally

        # Write to FS
        writer_messages = (
            formated_messages
            | "Write to FS" >> beam.ParDo(WriteToFS())
            | "Get FS keys" >> beam.Map(lambda val: (val["uniqe_id"], val)))

        # Snooze for 10 sec, and change data locally
        do_something_that_takes_time = (
            formated_messages
            | "Snooze For 10 Seconds" >> beam.ParDo(Snooze())
            | "Add Data" >> beam.ParDo(ChangeData("changed!"))
            |
            "Get Update keys" >> beam.Map(lambda val: (val["uniqe_id"], val)))

        # Pipeline group by id and update data in FS after changed locally
        results = ((writer_messages, do_something_that_takes_time)
                   | "Group by key" >> beam.CoGroupByKey()
                   | "Update FS" >> beam.ParDo(UpdateToFS()))

        # Write updated data to Big Query
        (results
         | "Read Document From FS" >> beam.ParDo(ReadFromFS())
         | "Format For BQ" >> beam.ParDo(FormatForBQ())
         | "Write to BigQuery" >> beam.io.WriteToBigQuery("saar.messaging",
                                                          schema=schema))
Example #12
def side_input_slow_update(
    src_file_pattern,
    first_timestamp,
    last_timestamp,
    interval,
    sample_main_input_elements,
    main_input_windowing_interval):
  # [START SideInputSlowUpdateSnip1]
  from apache_beam.transforms.periodicsequence import PeriodicImpulse
  from apache_beam.transforms.window import TimestampedValue
  from apache_beam.transforms import window

  # from apache_beam.utils.timestamp import MAX_TIMESTAMP
  # last_timestamp = MAX_TIMESTAMP to go on indefinitely

  # Any user-defined function.
  # cross join is used as an example.
  def cross_join(left, rights):
    for x in rights:
      yield (left, x)

  # Create pipeline.
  pipeline_options = PipelineOptions()
  p = beam.Pipeline(options=pipeline_options)
  side_input = (
      p
      | 'PeriodicImpulse' >> PeriodicImpulse(
          first_timestamp, last_timestamp, interval, True)
      | 'MapToFileName' >> beam.Map(lambda x: src_file_pattern + str(x))
      | 'ReadFromFile' >> beam.io.ReadAllFromText())

  main_input = (
      p
      | 'MpImpulse' >> beam.Create(sample_main_input_elements)
      |
      'MapMpToTimestamped' >> beam.Map(lambda src: TimestampedValue(src, src))
      | 'WindowMpInto' >> beam.WindowInto(
          window.FixedWindows(main_input_windowing_interval)))

  result = (
      main_input
      | 'ApplyCrossJoin' >> beam.FlatMap(
          cross_join, rights=beam.pvalue.AsIter(side_input)))
  # [END SideInputSlowUpdateSnip1]

  return p, result
Example #13
 def test_setting_fixed_windows(self):
     p = TestPipeline()
     unkeyed_items = p | beam.Create([22, 33, 55, 100, 115, 120])
     items = (unkeyed_items
              | 'key' >> beam.Map(lambda x: beam.window.TimestampedValue(
                  ('k', x), x)))
     # [START setting_fixed_windows]
     from apache_beam import window
     fixed_windowed_items = (
         items | 'window' >> beam.WindowInto(window.FixedWindows(60)))
     # [END setting_fixed_windows]
     summed = (fixed_windowed_items
               | 'group' >> beam.GroupByKey()
               | 'combine' >> beam.CombineValues(sum))
     unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
     from apache_beam.testing.util import assert_that, equal_to
     assert_that(unkeyed, equal_to([110, 215, 120]))
     p.run()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic',
                        dest='topic',
                        default=default_topic)
    parser.add_argument('--bucket',
                        dest='bucket',
                        default=default_bucket)

    class WriteToSeparateFiles(beam.DoFn):
        def __init__(self, outdir):
            self.outdir = outdir

        def process(self, element):
            now = datetime.now()
            writer = filesystems.FileSystems.create(
                path=self.outdir + '{}/{}/{}/{}:{}-report.json'.format(now.year, now.month, now.day, now.hour,
                                                                       now.minute))
            writer.write(element)
            writer.close()

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project={}'.format(project), '--streaming',
        '--experiments=allow_non_updatable_job'
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    def string_join(elements):
        string = str(elements)
        return string.replace('},', '};')

    with beam.Pipeline(options=pipeline_options) as p:
        (p | "ReadTopic" >> beam.io.ReadFromPubSub(topic=known_args.topic)
         | "Window" >> beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
         | "Combine" >> beam.CombineGlobally(string_join).without_defaults()
         | "WriteToGCSwithDate" >> beam.ParDo(WriteToSeparateFiles(known_args.bucket)))
def main(argv=None):
    def json_parser(x):
        parsed = json.loads(x)
        return parsed

    def bye(x):
        logging.info('outing: %s', x)
        return x

    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output_topic")
    known_args = parser.parse_known_args(argv)

    p = beam.Pipeline(options=PipelineOptions())

    data = (p
            | 'ReadData' >>
            beam.io.ReadFromPubSub(topic=READ_TOPIC).with_output_types(bytes)
            | "JSONParse" >> beam.Map(json_parser))

    (data
     | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
     | "Windowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(2)))),
         accumulation_mode=tr.AccumulationMode.DISCARDING,
         allowed_lateness=0)
     | 'ToBytes' >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye' >> beam.Map(bye)
     | 'WriteToPubSub' >> beam.io.WriteToPubSub(TOPIC))

    (data
     | "SlidWindowing" >> beam.WindowInto(
         window.FixedWindows(60),
         trigger=(tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
                                    late=tr.Repeatedly(tr.AfterCount(1)))),
         allowed_lateness=300,
         accumulation_mode=tr.AccumulationMode.ACCUMULATING)
     | "Extract" >> beam.Map(lambda x: x["meter_increment"])
     | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
     | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
     | "Enrich with time data" >> beam.ParDo(Enrich())
     | "ToBytesCount" >>
     beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye2' >> beam.Map(bye)
     | "WriteCount" >> beam.io.WriteToPubSub(TOPIC))

    (data
     | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
     | "SessionWindowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1)))),
         accumulation_mode=tr.AccumulationMode.ACCUMULATING,
         allowed_lateness=0)
     | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
     | "Discarding Key" >> beam.Map(lambda x: x[1])
     | "Filter not pickup" >>
     beam.Map(lambda x: x if str(x["ride_status"]) == "pickup" else None)
     | "ToBytesPickup" >>
     beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye3' >> beam.Map(bye)
     | "WritePickup" >> beam.io.WriteToPubSub(TOPIC))

    result = p.run()
    result.wait_until_finish()
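Enrich and PickupFn are defined elsewhere. A hypothetical sketch of Enrich, assuming it adds window timing information to the per-minute dollar run rate before publishing; the field names are assumptions:

import apache_beam as beam


class Enrich(beam.DoFn):
    def process(self, element, window=beam.DoFn.WindowParam):
        # Attach the boundaries of the current fixed window to the record.
        element['window_start'] = window.start.to_utc_datetime().isoformat()
        element['window_end'] = window.end.to_utc_datetime().isoformat()
        yield element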
Example #16
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('--subscription',
                        dest='subscription',
                        default='projects/ilan-uzan/subscriptions/test',
                        help='Input Pub/Sub subscription')

    parser.add_argument('--table_spec',
                        dest='table_spec',
                        default='ilan-uzan:test.count_and_mean',
                        help='Destination BigQuery table.')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(StandardOptions).streaming = True

    def within_limit(x, limit):
        return x['duration'] <= limit

    class CountAndMeanFn(beam.CombineFn):
        def create_accumulator(self):
            return 0.0, 0

        def add_input(self, sum_count, input):
            (sum, count) = sum_count
            return sum + input['duration'], count + 1

        def merge_accumulators(self, accumulators):
            sums, counts = zip(*accumulators)
            return sum(sums), sum(counts)

        def extract_output(self, sum_count):
            (sum, count) = sum_count

            return {
                'processing_time':
                datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
                'count':
                count,
                'mean':
                sum / count if count else float('NaN')
            }

    with beam.Pipeline(options=pipeline_options) as p:
        table_schema = {
            'fields': [{
                'name': 'processing_time',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }, {
                'name': 'count',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            }, {
                'name': 'mean',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            }]
        }

        (p
         | 'Read from pubsub' >>
         beam.io.ReadFromPubSub(subscription=known_args.subscription)
         | 'To Json' >> beam.Map(lambda e: json.loads(e.decode('utf-8')))
         | 'Filter' >> beam.Filter(within_limit, 100)
         | 'Window' >> beam.WindowInto(window.FixedWindows(60))
         | 'Calculate Metrics' >> beam.CombineGlobally(
             CountAndMeanFn()).without_defaults()
         | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
             known_args.table_spec,
             schema=table_schema,
             method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
             triggering_frequency=1,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
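If CountAndMeanFn were defined at module level, its accumulator logic could be sanity-checked without running the pipeline; a small sketch of such a check:

# Quick local check of the CombineFn (assumes the class is importable):
fn = CountAndMeanFn()
acc = fn.create_accumulator()
acc = fn.add_input(acc, {'duration': 30})
acc = fn.add_input(acc, {'duration': 50})
acc = fn.merge_accumulators([acc, fn.create_accumulator()])
print(fn.extract_output(acc))  # count == 2, mean == 40.0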
import csv
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam import window
from datetime import datetime

file_in = 'tags.csv'
skip_head = "userId,movieId,tag,timestamp"


class ParseNewMovies(beam.DoFn):
    def process(self, element):
        if (element != skip_head):
            z = element.split(",")
            y = int(z[3])
            i = datetime.utcfromtimestamp(y)
            x = i.strftime('%Y-%m-%d %H:%M:%S')
            yield {
                'userId': z[0],
                'movieID': z[1],
                'tag': z[2],
                'timestamp': x
            }


with beam.Pipeline() as pipeline:
    item = (pipeline
            | 'Read lines' >> beam.io.ReadFromText(file_in)
            | 'Par D1' >> beam.ParDo(ParseNewMovies()))
    x = (item | 'Par D3' >> beam.WindowInto(window.FixedWindows(5))
         | 'Par D4' >> beam.Map(print))
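ReadFromText is a bounded source and does not give elements event-time timestamps taken from the data, so the 5-second fixed windows above are not driven by the parsed 'timestamp' field. A sketch of assigning event-time timestamps before windowing; the helper name is hypothetical and the strptime format mirrors ParseNewMovies:

from datetime import datetime

from apache_beam import window


def with_event_time(record):
    # Convert the formatted timestamp back to epoch seconds and attach it
    # as the element's event-time timestamp.
    ts = datetime.strptime(record['timestamp'], '%Y-%m-%d %H:%M:%S')
    return window.TimestampedValue(record, ts.timestamp())

# usage (hypothetical): item | 'AddEventTime' >> beam.Map(with_event_time)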
    profit = (int(sell_price) - int(buy_rate)) * products_count
    elements.append(str(profit))
    return elements


pubsub_data = (
    p
    | 'Read from pub sub' >> beam.io.ReadFromPubSub(
        subscription=input_subscription, timestamp_attribute=1553578219)
    # STR_2,Mumbai,PR_265,Cosmetics,8,39,66,1553578219/r/n
    | 'Remove extra chars' >> beam.Map(lambda data: (data.rstrip().lstrip(
    )))  # STR_2,Mumbai,PR_265,Cosmetics,8,39,66,1553578219
    | 'Split Row' >> beam.Map(lambda row: row.split(
        ','))  # [STR_2,Mumbai,PR_265,Cosmetics,8,39,66,1553578219]
    | 'Filter By Country' >>
    beam.Filter(lambda elements:
                (elements[1] == "Mumbai" or elements[1] == "Bangalore"))
    | 'Create Profit Column' >> beam.Map(
        calculateProfit
    )  # [STR_2,Mumbai,PR_265,Cosmetics,8,39,66,1553578219,27]
    | 'Form Key Value pair' >> beam.Map(
        lambda elements: (elements[0], int(elements[7])))  # STR_2 27
    | 'Window' >> beam.WindowInto(window.FixedWindows(20))
    | 'Sum values' >> beam.CombinePerKey(sum)
    | 'Encode to byte string' >> beam.Map(
        encode_byte_string)  #Pubsub takes data in form of byte strings 
    | 'Write to pus sub' >> beam.io.WriteToPubSub(output_topic))

result = p.run()
result.wait_until_finish()
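calculateProfit and encode_byte_string are only partially shown above. A minimal sketch of encode_byte_string, assuming it simply serializes each element, since Pub/Sub payloads must be bytes:

def encode_byte_string(element):
    # Pub/Sub takes data in the form of byte strings.
    return str(element).encode('utf-8')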
Example #19
        return [(Store_id, Store_location, Product_id, Product_category,
                 sold_unit, buy_rate, sell_price, profit, transaction_date)]


#############Create Pipeline ###########
stream_data = (
    p
    | 'Read from PubSub' >> beam.io.ReadFromPubSub(subscription=inputs_pattern)
    |
    'Remove space in the Data ' >> beam.Map(lambda row: row.lstrip().rstrip())
    | 'Split Data ' >> beam.Map(lambda row: row.decode().split(','))
    | 'Calculate Profit' >> beam.Map(calculateProfit)
    | 'Apply custom timestamp' >> beam.Map(custom_timestamp)
    | 'Make Key value' >> beam.Map(lambda row: (row[:-2], row[-1]))
    | 'Set Fixed Window of 30 sec' >> beam.WindowInto(
        window.FixedWindows(30),
        trigger=Repeatedly(AfterAny(AfterCount(5), AfterProcessingTime(10))),
        accumulation_mode=AccumulationMode.DISCARDING)
    | 'Combine Result of 30 Sec' >> beam.CombinePerKey(sum)
    | 'Format result and append time' >> beam.ParDo(BuildRecordFn())
    | 'Prepare data for BigQuery' >> beam.Map(covert_to_dict)
    #|'Write to Text'>>beam.io.WriteToText(outputs_prefix)
    | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        table='sales', dataset='beam', project='beam-290211'))

p.run().wait_until_finish()

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
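custom_timestamp, BuildRecordFn and covert_to_dict are not shown. A plausible sketch of BuildRecordFn, assuming it appends the end of the 30-second window to each combined (key, profit) pair; the key layout is an assumption:

import apache_beam as beam


class BuildRecordFn(beam.DoFn):
    # Sketch only: assumes the key is a tuple of the leading CSV fields.
    def process(self, element, window=beam.DoFn.WindowParam):
        key, profit = element
        window_end = window.end.to_utc_datetime().strftime('%Y-%m-%d %H:%M:%S')
        yield list(key) + [profit, window_end]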
Example #20
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""


  pubsubTopicName = "projects/data-qe-da7e1252/topics/sk-firewall-json"
  bigqueryTableID = "data-qe-da7e1252:dataflow_to_bigquery.emp"
  outputTable = "data-qe-da7e1252:dataflow_to_bigquery.emp"
  # gcsfile = "gs://data-qe-da7e1252/tmp/sanjeev/source/emp.parquet"
  # gcsfile = "gs://data-qe-da7e1252/tmp/sanjeev/source/parquet/emp*"
  gcsfile = "/Users/skanabargi/dataSource/sample/emp.parquet"



  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      default='gs://data-qe-da7e1252/tmp/sk_out',
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_args.extend([
     # '--runner=DataflowRunner',
      '--project=data-qe-da7e1252',
      '--staging_location=gs://data-qe-da7e1252/tmp/stage/',
      '--temp_location=gs://data-qe-da7e1252/tmp/local',
      '--experiments=allow_non_updatable_job',
      '--job_name=sk-pubsub-to-gcs-10',
      '--streaming'
  ])

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  #pipeline_options.view_as(StandardOptions).streaming = True
  with beam.Pipeline(options=pipeline_options) as p:

    # Read the text file[pattern] into a PCollection.
    #lines = p | ReadFromText(known_args.input)
    #p | "Read parque file" >> beam.io.ReadFromParquet(gcsfile, validate=True) | "SK_COLLECT " >> beam.WindowInto(window.FixedWindows(60*5)) | "Write data " >> WriteToText("gs://data-qe-da7e1252/tmp/sk_out")
    p | "Read parque file" >> beam.io.ReadFromParquet(gcsfile, validate=True) | "windowing" >> beam.WindowInto(window.FixedWindows(60*5)) | "Write data " >> WriteToText("gs://data-qe-da7e1252/tmp/sk_out")
Example #21
import logging

import apache_beam as beam
from apache_beam import window

#import pipeline options.
from apache_beam.options.pipeline_options import PipelineOptions

#Set log level to info
root = logging.getLogger()
root.setLevel(logging.INFO)

#Create a pipeline,
plOps = beam.Pipeline(options=PipelineOptions())


transactions = (
    plOps
    | 'Read from pubsub' >> beam.io.ReadFromPubSub(
        subscription='projects/beam-project-241218/subscriptions/test-subscription')
    | 'Create Window' >> beam.WindowInto(window.FixedWindows(5))
    | 'Counting Lines' >> beam.CombineGlobally(
        beam.combiners.CountCombineFn()).without_defaults())

(transactions | 'Print transactions' >>
 beam.ParDo(lambda s: logging.info('Transactions in window = %s', s)))
# Run the pipeline
result = plOps.run()
#  wait until pipeline processing is complete
result.wait_until_finish()
#import apache beam library
import logging

import apache_beam as beam
from apache_beam import window

#import pipeline options.
from apache_beam.options.pipeline_options import PipelineOptions

#Set log level to info
root = logging.getLogger()
root.setLevel(logging.INFO)

#Create a pipeline,
plOps = beam.Pipeline(options=PipelineOptions())

transactions = (
    plOps
    | 'Read from pubsub' >> beam.io.ReadFromPubSub(
        subscription=
        'projects/universal-code-210021/subscriptions/test-subscription')
    | 'Create Window' >> beam.WindowInto(window.FixedWindows(5))
    | 'Counting Lines ' >> beam.CombineGlobally(
        beam.combiners.CountCombineFn()).without_defaults())

(transactions | 'Print transactions' >>
 beam.ParDo(lambda s: logging.info('Transactions in window = %s', s)))
# Run the pipeline
result = plOps.run()
#  wait until pipeline processing is complete
result.wait_until_finish()