def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic', dest='topic', default=default_topic)
    parser.add_argument('--bucket', dest='bucket', default=default_bucket)
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project={}'.format(project),
        '--streaming',
        '--experiments=allow_non_updatable_job'
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    class DiffOutputsFn(beam.DoFn):
        # These tags will be used to tag the outputs of this DoFn.
        OUTPUT_TAG_BUY = 'buy'
        OUTPUT_TAG_SELL = 'sell'
        OUTPUT_TAG_ERROR = 'error'

        def process(self, element):
            # safe_load avoids the deprecated plain yaml.load call.
            dictionary = yaml.safe_load(element)
            dictionary['timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            if dictionary['type'] == 'buy':
                dictionary.pop('type')
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_BUY, dictionary)
            elif dictionary['type'] == 'sell':
                dictionary.pop('type')
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_SELL, dictionary)
            else:
                # We don't drop the key here, since we want to know where the mistake was.
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_ERROR, dictionary)

    def string_join(elements):
        string = str(elements)
        return string.replace('},', '};')

    with beam.Pipeline(options=pipeline_options) as p:
        diff_outputs = (
            p
            | "ReadTopic" >> beam.io.ReadFromPubSub(topic=known_args.topic)
            | "SplitOutputs" >> beam.ParDo(DiffOutputsFn()).with_outputs(
                DiffOutputsFn.OUTPUT_TAG_BUY,
                DiffOutputsFn.OUTPUT_TAG_SELL,
                DiffOutputsFn.OUTPUT_TAG_ERROR))

        buy = (
            diff_outputs.buy
            | "WindowBuy" >> beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
            | "CombineBuy" >> beam.CombineGlobally(string_join).without_defaults()
            | "WriteToGCSBuy" >> WriteToText(file_path_prefix=known_args.bucket + 'buy/'))

        sell = (
            diff_outputs.sell
            | "WindowSell" >> beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
            | "CombineSell" >> beam.CombineGlobally(string_join).without_defaults()
            | "WriteToGCSSell" >> WriteToText(file_path_prefix=known_args.bucket + 'sell/'))

        # We want to know which 'type' caused the error, so no key-dropping ParDo here.
        error = (
            diff_outputs.error
            | "WindowError" >> beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
            | "CombineError" >> beam.CombineGlobally(string_join).without_defaults()
            | "WriteToGCSError" >> WriteToText(file_path_prefix=known_args.bucket + 'error/'))
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--project={}'.format(project)])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    class DiffOutputsFn(beam.DoFn):
        # These tags will be used to tag the outputs of this DoFn.
        OUTPUT_TAG_CS_BOOKINGS = 'tag_cs_bookings'
        OUTPUT_TAG_USERS = 'tag_users'

        def process(self, element):
            # Routes each element to the bookings or users output,
            # depending on whether it contains 'vehicle_id'.
            from apache_beam import pvalue
            if 'vehicle_id' in element:
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_CS_BOOKINGS, element)
            else:
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_USERS, element)

    def string_join(elements):
        return str('\n'.join(elements))

    with beam.Pipeline(options=pipeline_options) as p:
        diff_outputs = (
            p
            | "ReadTopic" >> beam.io.ReadFromPubSub(topic=default_topic)
            | "SplitOutputs" >> beam.ParDo(DiffOutputsFn()).with_outputs(
                DiffOutputsFn.OUTPUT_TAG_CS_BOOKINGS,
                DiffOutputsFn.OUTPUT_TAG_USERS))

        cs_bookings_p = (
            diff_outputs.tag_cs_bookings
            | "Windowing_cs_bookings" >> beam.WindowInto(
                window.FixedWindows(WINDOW_LENGTH))
            | "Combine_cs_bookings" >> beam.CombineGlobally(string_join).without_defaults()
            | "WriteGCSCommon_cs_bookings" >> WriteToText(
                file_path_prefix=default_bucket + 'cs_bookings/',
                file_name_suffix='cs_booking_YYYYMMDDHH'))

        users_p = (
            diff_outputs.tag_users
            | "Windowing_users" >> beam.WindowInto(
                window.FixedWindows(WINDOW_LENGTH))
            | "Combine_users" >> beam.CombineGlobally(string_join).without_defaults()
            | "WriteGCSCommon_users" >> WriteToText(
                file_path_prefix=default_bucket + 'users/',
                file_name_suffix='users_YYYYMMDDHH'))
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic', dest='topic', default=default_topic)
    parser.add_argument('--bucket', dest='bucket', default=default_bucket)
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project={}'.format(project),
        '--streaming',
        '--experiments=allow_non_updatable_job'
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    def add_key(element):
        # Parses the string into a dict and returns the key and the rest of the dict.
        parsed_dict = yaml.safe_load(element)
        key = parsed_dict[GROUP_BY_KEY]
        parsed_dict.pop(GROUP_BY_KEY)
        return (key, parsed_dict)

    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | "ReadTopic" >> beam.io.ReadFromPubSub(topic=known_args.topic)
         | "AddKey" >> beam.Map(add_key)
         | "Window" >> beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
         | "GroupByKey" >> beam.GroupByKey()
         | "WriteToGCS" >> WriteToText(known_args.bucket))
def main(argv=None): parser = argparse.ArgumentParser() parser.add_argument("--input_topic") parser.add_argument("--output") known_args = parser.parse_known_args(argv) p = beam.Pipeline(options=PipelineOptions()) csv_lines = (p | 'ReadData' >> beam.io.ReadFromPubSub(topic=TOPIC).with_output_types(bytes) | "Decode" >> beam.Map(lambda x: x.decode('utf-8')) | "Clean Data" >> beam.Map(regex_clean) | 'ParseCSV' >> beam.ParDo(Split())) table1 = (csv_lines | 'WriteToBigQuery1' >> beam.io.WriteToBigQuery( 'my-gce-project1:china.POC_BEAM', schema=schema, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) table2 = (csv_lines | 'Collect services' >> beam.ParDo(collectservices()) | 'window' >> beam.WindowInto(window.FixedWindows(30)) | 'Sum services' >> beam.Map(lambda (services, amount): (service, sum(amount))) | 'WriteToBigQuery2' >> beam.io.WriteToBigQuery( 'my-gce-project1:china.services', schema=schema2, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) result = p.run() result.wait_until_finish()
def main():
    # bq_source = BigQuerySource(query="""
    #                            SELECT created_at, text
    #                            FROM got_sentiment.got_tweets
    #                            """,
    #                            validate=False, coder=None,
    #                            use_standard_sql=True, flatten_results=True,
    #                            kms_key=None)

    # Removed attributes from ReadFromPubSub:
    #     with_attributes=False,
    #     timestamp_attribute='created_at'

    # Create the Pipeline with the specified options.
    with Pipeline(options=options) as p:
        results = (
            p
            | 'read_from_topic' >> ReadFromPubSub(topic=PUBSUB_TOPIC)
            | 'Window' >> WindowInto(window.FixedWindows(60))
            | 'Emit_needed_values' >> FlatMap(emit_values, entity_map)
            | 'Combine' >> CombinePerKey(EntityScoreCombine())
            | 'Add Window Timestamp' >> beam.ParDo(AddWindowTimestampFn())
            | 'FormatForWrite' >> Map(format_for_write)
            | 'Write' >> WriteToBigQuery(
                'streaming_scores',
                dataset=BQ_DATASET,
                project=PROJECT_ID,
                create_disposition='CREATE_IF_NEEDED',
                write_disposition='WRITE_APPEND',
                batch_size=20))
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" pubsubTopicName = "projects/data-qe-da7e1252/topics/sk-firewall-json" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', # CHANGE 1/5: The Google Cloud Storage path is required # for outputting the results. #default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX', #default="/Users/skanabargi/python/stream/output", default='gs://data-qe-da7e1252/tmp/sk_out', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_args.extend([ # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to # run your pipeline on the Google Cloud Dataflow Service. '--runner=DataflowRunner', # CHANGE 3/5: Your project ID is required in order to run your pipeline on # the Google Cloud Dataflow Service. '--project=data-qe-da7e1252', # CHANGE 4/5: Your Google Cloud Storage path is required for staging local # files. #'--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY', '--staging_location=gs://data-qe-da7e1252/tmp/stage/', # CHANGE 5/5: Your Google Cloud Storage path is required for temporary # files. #'--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY', '--temp_location=gs://data-qe-da7e1252/tmp/local', '--experiments=allow_non_updatable_job', '--job_name=sk-pubsub-to-gcs-5', ]) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as(StandardOptions).streaming = True with beam.Pipeline(options=pipeline_options) as p: # Read the text file[pattern] into a PCollection. #lines = p | ReadFromText(known_args.input) lines = p | beam.io.ReadFromPubSub(topic=pubsubTopicName) # Count the occurrences of each word. output = ( lines | 'window' >> beam.WindowInto(window.FixedWindows(60))) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'writeTOGcs' >> WriteToText(known_args.output)
def SportTrackerMotivation(input, shortDuration, longDuration):
    boxed = input | "ComputeMetrics" >> ComputeBoxedMetrics(shortDuration)

    shortAverage = (
        boxed
        | "shortWindow" >> beam.WindowInto(window.FixedWindows(shortDuration))
        | "shortAverage" >> CalculateAveragePace())

    longAverage = (
        boxed
        | "longWindow" >> beam.WindowInto(
            window.SlidingWindows(longDuration, shortDuration))
        | "longAverage" >> CalculateAveragePace()
        | "longIntoFixed" >> beam.WindowInto(window.FixedWindows(shortDuration)))

    return ((shortAverage, longAverage)
            | beam.CoGroupByKey()
            | beam.FlatMap(asMotivation))
def expand(self, pcoll):
    return (
        pcoll
        # Assigns window info to each Pub/Sub message based on its
        # publish timestamp.
        | "Window into Fixed Intervals" >> beam.WindowInto(
            window.FixedWindows(self.window_size))
        | "Add timestamps to messages" >> beam.ParDo(AddTimestamps()))
def expand(self, pcoll):
    return (
        pcoll
        | 'TopPerMonthWindow' >> beam.WindowInto(
            window.FixedWindows(size=THIRTY_DAYS_IN_SECONDS))
        | 'Top' >> combiners.core.CombineGlobally(
            combiners.TopCombineFn(
                10, lambda first, second: first[1] < second[1])).without_defaults())
def examples_wordcount_streaming(argv):
    import re

    import apache_beam as beam
    from apache_beam import window
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.options.pipeline_options import StandardOptions

    # Parse out arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_topic',
        required=True,
        help=(
            'Output PubSub topic of the form '
            '"projects/<PROJECT>/topic/<TOPIC>".'))
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '--input_topic',
        help=(
            'Input PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=(
            'Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
        # [START example_wordcount_streaming_read]
        # Read from Pub/Sub into a PCollection.
        if known_args.input_subscription:
            lines = p | beam.io.ReadFromPubSub(
                subscription=known_args.input_subscription)
        else:
            lines = p | beam.io.ReadFromPubSub(topic=known_args.input_topic)
        # [END example_wordcount_streaming_read]

        output = (
            lines
            # Map (not FlatMap) so each message decodes to one string element.
            | 'DecodeUnicode' >> beam.Map(lambda encoded: encoded.decode('utf-8'))
            | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
            | 'PairWithOnes' >> beam.Map(lambda x: (x, 1))
            | beam.WindowInto(window.FixedWindows(15, 0))
            | 'Group' >> beam.GroupByKey()
            | 'Sum' >> beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1])))
            | 'Format' >> beam.Map(lambda word_and_count: '%s: %d' % word_and_count))

        # [START example_wordcount_streaming_write]
        # Write to Pub/Sub
        output | beam.io.WriteStringsToPubSub(known_args.output_topic)
def run(argv=None):
    known_args, pipeline_args = _parse_user_args(argv)
    options = get_pipeline_options(pipeline_args)

    # Load schema
    schema = '{"fields": ' + open(known_args.schema_path, "r").read() + '}'
    schema = parse_table_schema_from_json(schema)

    with beam.Pipeline(options=options) as p:
        # Get the message from Pub/Sub and split it by identifier.
        formated_messages = (
            p
            | "Read from PubSub" >> beam.io.ReadFromPubSub(known_args.topic)
            | "Windowing" >> beam.WindowInto(window.FixedWindows(30))
            | "Decoder" >> beam.Map(lambda e: e.decode())
            | "Split into List" >> beam.ParDo(SplitWords(",")))

        # Pipeline split:
        #   1. Write to FS
        #   2. Snooze for 10 sec, and change data locally

        # Write to FS
        writer_messages = (
            formated_messages
            | "Write to FS" >> beam.ParDo(WriteToFS())
            | "Get FS keys" >> beam.Map(lambda val: (val["uniqe_id"], val)))

        # Snooze for 10 sec, and change data locally
        do_something_that_takes_time = (
            formated_messages
            | "Snooze For 10 Seconds" >> beam.ParDo(Snooze())
            | "Add Data" >> beam.ParDo(ChangeData("changed!"))
            | "Get Update keys" >> beam.Map(lambda val: (val["uniqe_id"], val)))

        # Group by id and update the data in FS after it changed locally.
        results = (
            (writer_messages, do_something_that_takes_time)
            | "Group by key" >> beam.CoGroupByKey()
            | "Update FS" >> beam.ParDo(UpdateToFS()))

        # Write updated data to BigQuery.
        (results
         | "Read Document From FS" >> beam.ParDo(ReadFromFS())
         | "Format For BQ" >> beam.ParDo(FormatForBQ())
         | "Write to BigQuery" >> beam.io.WriteToBigQuery("saar.messaging",
                                                          schema=schema))
def side_input_slow_update(
        src_file_pattern,
        first_timestamp,
        last_timestamp,
        interval,
        sample_main_input_elements,
        main_input_windowing_interval):
    # [START SideInputSlowUpdateSnip1]
    from apache_beam.transforms.periodicsequence import PeriodicImpulse
    from apache_beam.transforms.window import TimestampedValue
    from apache_beam.transforms import window

    # from apache_beam.utils.timestamp import MAX_TIMESTAMP
    # last_timestamp = MAX_TIMESTAMP to go on indefinitely

    # Any user-defined function.
    # cross join is used as an example.
    def cross_join(left, rights):
        for x in rights:
            yield (left, x)

    # Create pipeline.
    pipeline_options = PipelineOptions()
    p = beam.Pipeline(options=pipeline_options)
    side_input = (
        p
        | 'PeriodicImpulse' >> PeriodicImpulse(
            first_timestamp, last_timestamp, interval, True)
        | 'MapToFileName' >> beam.Map(lambda x: src_file_pattern + str(x))
        | 'ReadFromFile' >> beam.io.ReadAllFromText())

    main_input = (
        p
        | 'MpImpulse' >> beam.Create(sample_main_input_elements)
        | 'MapMpToTimestamped' >> beam.Map(lambda src: TimestampedValue(src, src))
        | 'WindowMpInto' >> beam.WindowInto(
            window.FixedWindows(main_input_windowing_interval)))

    result = (
        main_input
        | 'ApplyCrossJoin' >> beam.FlatMap(
            cross_join, rights=beam.pvalue.AsIter(side_input)))
    # [END SideInputSlowUpdateSnip1]

    return p, result
def test_setting_fixed_windows(self):
    # assert_that and equal_to live in apache_beam.testing.util in current Beam releases.
    from apache_beam.testing.util import assert_that, equal_to

    p = TestPipeline()
    unkeyed_items = p | beam.Create([22, 33, 55, 100, 115, 120])
    items = (
        unkeyed_items
        | 'key' >> beam.Map(
            lambda x: beam.window.TimestampedValue(('k', x), x)))
    # [START setting_fixed_windows]
    from apache_beam import window
    fixed_windowed_items = (
        items | 'window' >> beam.WindowInto(window.FixedWindows(60)))
    # [END setting_fixed_windows]
    summed = (
        fixed_windowed_items
        | 'group' >> beam.GroupByKey()
        | 'combine' >> beam.CombineValues(sum))
    unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
    assert_that(unkeyed, equal_to([110, 215, 120]))
    p.run()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic', dest='topic', default=default_topic)
    parser.add_argument('--bucket', dest='bucket', default=default_bucket)

    class WriteToSeparateFiles(beam.DoFn):
        def __init__(self, outdir):
            self.outdir = outdir

        def process(self, element):
            now = datetime.now()
            writer = filesystems.FileSystems.create(
                path=self.outdir + '{}/{}/{}/{}:{}-report.json'.format(
                    now.year, now.month, now.day, now.hour, now.minute))
            writer.write(element)
            writer.close()

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project={}'.format(project),
        '--streaming',
        '--experiments=allow_non_updatable_job'
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    def string_join(elements):
        string = str(elements)
        return string.replace('},', '};')

    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | "ReadTopic" >> beam.io.ReadFromPubSub(topic=known_args.topic)
         | "Window" >> beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
         | "Combine" >> beam.CombineGlobally(string_join).without_defaults()
         | "WriteToGCSwithDate" >> beam.ParDo(WriteToSeparateFiles(known_args.bucket)))
def main(argv=None):

    def json_parser(x):
        parsed = json.loads(x)
        return parsed

    def bye(x):
        logging.info('outing: %s', x)
        return x

    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output_topic")
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = beam.Pipeline(options=PipelineOptions())

    data = (
        p
        | 'ReadData' >> beam.io.ReadFromPubSub(topic=READ_TOPIC).with_output_types(bytes)
        | "JSONParse" >> beam.Map(json_parser))

    (data
     | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
     | "Windowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(
             early=tr.Repeatedly(
                 tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(2)))),
         accumulation_mode=tr.AccumulationMode.DISCARDING,
         allowed_lateness=0)
     | 'ToBytes' >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye' >> beam.Map(bye)
     | 'WriteToPubSub' >> beam.io.WriteToPubSub(TOPIC))

    (data
     | "FixedWindowing" >> beam.WindowInto(
         window.FixedWindows(60),
         trigger=tr.AfterWatermark(
             early=tr.Repeatedly(
                 tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
             late=tr.Repeatedly(tr.AfterCount(1))),
         allowed_lateness=300,
         accumulation_mode=tr.AccumulationMode.ACCUMULATING)
     | "Extract" >> beam.Map(lambda x: x["meter_increment"])
     | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
     | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
     | "Enrich with time data" >> beam.ParDo(Enrich())
     | "ToBytesCount" >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye2' >> beam.Map(bye)
     | "WriteCount" >> beam.io.WriteToPubSub(TOPIC))

    (data
     | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
     | "SessionWindowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(
             early=tr.Repeatedly(
                 tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1)))),
         accumulation_mode=tr.AccumulationMode.ACCUMULATING,
         allowed_lateness=0)
     | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
     | "Discarding Key" >> beam.Map(lambda x: x[1])
     # Use Filter (not Map) so non-pickup records are dropped instead of becoming None.
     | "Filter not pickup" >> beam.Filter(lambda x: str(x["ride_status"]) == "pickup")
     | "ToBytesPickup" >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye3' >> beam.Map(bye)
     | "WritePickup" >> beam.io.WriteToPubSub(TOPIC))

    result = p.run()
    result.wait_until_finish()
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('--subscription',
                        dest='subscription',
                        default='projects/ilan-uzan/subscriptions/test',
                        help='Input Pub/Sub subscription')
    parser.add_argument('--table_spec',
                        dest='table_spec',
                        default='ilan-uzan:test.count_and_mean',
                        help='Destination BigQuery table.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(StandardOptions).streaming = True

    def within_limit(x, limit):
        return x['duration'] <= limit

    class CountAndMeanFn(beam.CombineFn):
        def create_accumulator(self):
            return 0.0, 0

        def add_input(self, sum_count, input):
            (sum, count) = sum_count
            return sum + input['duration'], count + 1

        def merge_accumulators(self, accumulators):
            sums, counts = zip(*accumulators)
            return sum(sums), sum(counts)

        def extract_output(self, sum_count):
            (sum, count) = sum_count
            return {
                'processing_time': datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
                'count': count,
                'mean': sum / count if count else float('NaN')
            }

    with beam.Pipeline(options=pipeline_options) as p:
        table_schema = {
            'fields': [{
                'name': 'processing_time', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'
            }, {
                'name': 'count', 'type': 'FLOAT', 'mode': 'NULLABLE'
            }, {
                'name': 'mean', 'type': 'FLOAT', 'mode': 'NULLABLE'
            }]
        }

        (p
         | 'Read from pubsub' >> beam.io.ReadFromPubSub(subscription=known_args.subscription)
         | 'To Json' >> beam.Map(lambda e: json.loads(e.decode('utf-8')))
         | 'Filter' >> beam.Filter(within_limit, 100)
         | 'Window' >> beam.WindowInto(window.FixedWindows(60))
         | 'Calculate Metrics' >> beam.CombineGlobally(CountAndMeanFn()).without_defaults()
         | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
             known_args.table_spec,
             schema=table_schema,
             method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
             triggering_frequency=1,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
import csv
from datetime import datetime

import apache_beam as beam
from apache_beam import window
from apache_beam.options.pipeline_options import PipelineOptions

file_in = 'tags.csv'
skip_head = "userId,movieId,tag,timestamp"


class ParseNewMovies(beam.DoFn):
    def process(self, element):
        if element != skip_head:
            z = element.split(",")
            y = int(z[3])
            i = datetime.utcfromtimestamp(y)
            x = i.strftime('%Y-%m-%d %H:%M:%S')
            yield {
                'userId': z[0],
                'movieID': z[1],
                'tag': z[2],
                'timestamp': x
            }


with beam.Pipeline() as pipeline:
    item = (
        pipeline
        | 'Read lines' >> beam.io.ReadFromText(file_in)
        | 'Par D1' >> beam.ParDo(ParseNewMovies()))

    x = (
        item
        | 'Par D3' >> beam.WindowInto(window.FixedWindows(5))
        | 'Par D4' >> beam.Map(print))
    profit = (int(sell_price) - int(buy_rate)) * products_count
    elements.append(str(profit))
    return elements


pubsub_data = (
    p
    | 'Read from pub sub' >> beam.io.ReadFromPubSub(
        subscription=input_subscription,
        timestamp_attribute=1553578219)  # STR_2,Mumbai,PR_265,Cosmetics,8,39,66,1553578219\r\n
    | 'Remove extra chars' >> beam.Map(
        lambda data: data.rstrip().lstrip())  # STR_2,Mumbai,PR_265,Cosmetics,8,39,66,1553578219
    | 'Split Row' >> beam.Map(
        lambda row: row.split(','))  # [STR_2,Mumbai,PR_265,Cosmetics,8,39,66,1553578219]
    | 'Filter By Country' >> beam.Filter(
        lambda elements: elements[1] == "Mumbai" or elements[1] == "Bangalore")
    | 'Create Profit Column' >> beam.Map(
        calculateProfit)  # [STR_2,Mumbai,PR_265,Cosmetics,8,39,66,1553578219,27]
    | 'Form Key Value pair' >> beam.Map(
        lambda elements: (elements[0], int(elements[7])))  # (STR_2, 27)
    | 'Window' >> beam.WindowInto(window.FixedWindows(20))
    | 'Sum values' >> beam.CombinePerKey(sum)
    | 'Encode to byte string' >> beam.Map(
        encode_byte_string)  # Pub/Sub takes data in the form of byte strings
    | 'Write to pub sub' >> beam.io.WriteToPubSub(output_topic))

result = p.run()
result.wait_until_finish()
    return [(Store_id, Store_location, Product_id, Product_category, sold_unit,
             buy_rate, sell_price, profit, transaction_date)]


############# Create Pipeline ###########
stream_data = (
    p
    | 'Read from PubSub' >> beam.io.ReadFromPubSub(subscription=inputs_pattern)
    | 'Remove space in the Data' >> beam.Map(lambda row: row.lstrip().rstrip())
    | 'Split Data' >> beam.Map(lambda row: row.decode().split(','))
    | 'Calculate Profit' >> beam.Map(calculateProfit)
    | 'Apply custom timestamp' >> beam.Map(custom_timestamp)
    | 'Make Key value' >> beam.Map(lambda row: (row[:-2], row[-1]))
    | 'Set Fixed Window of 30 sec' >> beam.WindowInto(
        window.FixedWindows(30),
        trigger=Repeatedly(AfterAny(AfterCount(5), AfterProcessingTime(10))),
        accumulation_mode=AccumulationMode.DISCARDING)
    | 'Combine Result of 30 Sec' >> beam.CombinePerKey(sum)
    | 'Format result and append time' >> beam.ParDo(BuildRecordFn())
    | 'Prepare data for BigQuery' >> beam.Map(covert_to_dict)
    #| 'Write to Text' >> beam.io.WriteToText(outputs_prefix)
    | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        table='sales', dataset='beam', project='beam-290211'))

p.run().wait_until_finish()


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" pubsubTopicName = "projects/data-qe-da7e1252/topics/sk-firewall-json" bigqueryTableID = "data-qe-da7e1252:dataflow_to_bigquery.emp" outputTable = "data-qe-da7e1252:dataflow_to_bigquery.emp" # gcsfile = "gs://data-qe-da7e1252/tmp/sanjeev/source/emp.parquet"; # gcsfile = "gs://data-qe-da7e1252/tmp/sanjeev/source/parquet/emp*"; gcsfile = "/Users/skanabargi/dataSource/sample/emp.parquet"; parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', default='gs://data-qe-da7e1252/tmp/sk_out', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_args.extend([ # '--runner=DataflowRunner', '--project=data-qe-da7e1252', '--staging_location=gs://data-qe-da7e1252/tmp/stage/', '--temp_location=gs://data-qe-da7e1252/tmp/local', '--experiments=allow_non_updatable_job', '--job_name=sk-pubsub-to-gcs-10', '--streaming' ]) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True #pipeline_options.view_as(StandardOptions).streaming = True with beam.Pipeline(options=pipeline_options) as p: # Read the text file[pattern] into a PCollection. #lines = p | ReadFromText(known_args.input) #p | "Read parque file" >> beam.io.ReadFromParquet(gcsfile, validate=True) | "SK_COLLECT " >> beam.WindowInto(window.FixedWindows(60*5)) | "Write data " >> WriteToText("gs://data-qe-da7e1252/tmp/sk_out") p | "Read parque file" >> beam.io.ReadFromParquet(gcsfile, validate=True) | "windowing" >> beam.WindowInto(window.FixedWindows(60*5)) | "Write data " >> WriteToText("gs://data-qe-da7e1252/tmp/sk_out")
import logging

import apache_beam as beam
from apache_beam import window
#import pipeline options.
from apache_beam.options.pipeline_options import PipelineOptions

#Set log level to info
root = logging.getLogger()
root.setLevel(logging.INFO)

#Create a pipeline
plOps = beam.Pipeline(options=PipelineOptions())

transactions = (
    plOps
    | 'Read from pubsub' >> beam.io.ReadFromPubSub(
        subscription='projects/beam-project-241218/subscriptions/test-subscription')
    | 'Create Window' >> beam.WindowInto(window.FixedWindows(5))
    | 'Counting Lines' >> beam.CombineGlobally(
        beam.combiners.CountCombineFn()).without_defaults())

(transactions
 | 'Print transactions' >> beam.ParDo(
     lambda s: logging.info('Transactions in window = %s', s)))

# Run the pipeline
result = plOps.run()
# wait until pipeline processing is complete
result.wait_until_finish()
#import apache beam library
import logging

import apache_beam as beam
from apache_beam import window
#import pipeline options.
from apache_beam.options.pipeline_options import PipelineOptions

#Set log level to info
root = logging.getLogger()
root.setLevel(logging.INFO)

#Create a pipeline
plOps = beam.Pipeline(options=PipelineOptions())

transactions = (
    plOps
    | 'Read from pubsub' >> beam.io.ReadFromPubSub(
        subscription='projects/universal-code-210021/subscriptions/test-subscription')
    | 'Create Window' >> beam.WindowInto(window.FixedWindows(5))
    | 'Counting Lines' >> beam.CombineGlobally(
        beam.combiners.CountCombineFn()).without_defaults())

(transactions
 | 'Print transactions' >> beam.ParDo(
     lambda s: logging.info('Transactions in window = %s', s)))

# Run the pipeline
result = plOps.run()
# wait until pipeline processing is complete
result.wait_until_finish()