def run(argv=None, comments=None):
    """Run the beam pipeline.

    Args:
        argv: (optional) the command line flags to parse.
        comments: (optional) a list of comment JSON objects to process. Used in
            unit tests to avoid requiring a BigQuery source.
    """
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    if comments is not None:
        comments = p | ("Read in-memory comments") >> beam.Create(comments)
    else:
        comments = p | ("Read " + args.reddit_table) >> Read(
            BigQuerySource(args.reddit_table))

    comments |= (
        "Normalise comments" >> beam.Map(
            partial(normalise_comment, max_length=args.max_length)))

    thread_id_to_comments = comments | (
        "Key by thread id" >> beam.Map(
            lambda comment: (comment.thread_id, comment)))
    threads = thread_id_to_comments | (
        "Group comments by thread ID" >> beam.GroupByKey())
    threads = threads | ("Get threads" >> beam.Map(lambda t: t[1]))

    examples = threads | (
        "Create {} examples".format(args.dataset_format) >> beam.FlatMap(
            partial(create_examples,
                    parent_depth=args.parent_depth,
                    min_length=args.min_length,
                    format=args.dataset_format,
                    )))
    examples = _shuffle(examples)

    # [START dataflow_molecules_split_to_train_and_eval_datasets]
    # Split the dataset into a training set and an evaluation set
    assert 0 < (100 - args.train_split * 100) < 100, (
        'eval_percent must be in the range (0, 100)')
    eval_percent = 100 - args.train_split * 100
    train_dataset, eval_dataset = (
        examples
        | 'Split dataset' >> beam.Partition(
            lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
    # [END dataflow_molecules_split_to_train_and_eval_datasets]

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps

    serialized_train_examples = train_dataset | (
        "serialize {} examples".format('train') >> beam.Map(serialize_fn))
    (
        serialized_train_examples | ("write " + 'train')
        >> write_sink(
            os.path.join(args.output_dir, 'train'),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        )
    )

    serialized_test_examples = eval_dataset | (
        "serialize {} examples".format('valid') >> beam.Map(serialize_fn))
    (
        serialized_test_examples | ("write " + 'valid')
        >> write_sink(
            os.path.join(args.output_dir, 'valid'),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        )
    )

    result = p.run()
    result.wait_until_finish()
def test_create_application_client(self):
    pipeline_options = PipelineOptions()
    apiclient.DataflowApplicationClient(pipeline_options)
def test_interpreter_version_check_passes_with_experiment(self):
    pipeline_options = PipelineOptions(
        ["--experiment=use_unsupported_python_version"])
    apiclient._verify_interpreter_version_is_supported(pipeline_options)
import re
import argparse
import typing

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

parser = argparse.ArgumentParser()
parser.add_argument('--input_topic',
                    required=True,
                    help=('Input Pub/Sub topic to read from'))
parser.add_argument('--output_path',
                    required=True,
                    help=('Output path to google cloud storage'))
path_args, pipeline_args = parser.parse_known_args()

options = PipelineOptions(pipeline_args)


class ParseApacheServerLog(beam.DoFn):

    def process(self, element):
        HOST = r'^(?P<host>.*?)'
        SPACE = r'\s'
        IDENTITY = r'\S+'
        USER = r'\S+'
        TIME = r'(?P<time>\[.*?\])'
        REQUEST = r'\"(?P<request>.*?)\"'
        STATUS = r'(?P<status>\d{3})'
        SIZE = r'(?P<size>\S+)'
        REGEX = (HOST + SPACE + IDENTITY + SPACE + USER + SPACE + TIME +
                 SPACE + REQUEST + SPACE + STATUS + SPACE + SIZE + SPACE)
        match = re.search(REGEX, element)
        if match:
            # Emit the named groups (host, time, request, status, size).
            yield match.groupdict()
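The options and the ParseApacheServerLog DoFn above are defined but never wired into a pipeline in this snippet. A minimal local check of the DoFn, assuming a hypothetical sample log line and illustrative transform labels (not part of the original):

# Hypothetical smoke test of ParseApacheServerLog with an in-memory log line.
# The trailing space in the sample matters because REGEX ends with a \s token.
sample = ('127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] '
          '"GET /apache_pb.gif HTTP/1.0" 200 2326 ')

with beam.Pipeline(options=PipelineOptions()) as p:
    (p
     | 'CreateSample' >> beam.Create([sample])
     | 'ParseLine' >> beam.ParDo(ParseApacheServerLog())
     | 'PrintParsed' >> beam.Map(print))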
def run(argv=None): """ This funciton parses the command line arguments and runs the Beam Pipeline. Args: argv: list containing the commandline arguments for this call of the script. """ schema_inferred = False data_args, pipeline_args = parse_data_generator_args(argv) data_args, schema_inferred = fetch_schema(data_args, schema_inferred) pipeline_options = PipelineOptions(pipeline_args) data_gen = DataGenerator(bq_schema_filename=data_args.schema_file, input_bq_table=data_args.input_bq_table, hist_bq_table=data_args.hist_bq_table, p_null=data_args.p_null, n_keys=data_args.n_keys, min_date=data_args.min_date, max_date=data_args.max_date, only_pos=data_args.only_pos, max_int=data_args.max_int, max_float=data_args.max_float, float_precision=data_args.float_precision, write_disp=data_args.write_disp, key_skew=data_args.key_skew) # Initiate the pipeline using the pipeline arguments passed in from the # command line. This includes information including where Dataflow should # store temp files, and what the project id is and what runner to use. p = beam.Pipeline(options=pipeline_options) rows = ( p | 'Read Histogram Table.' >> beam.io.Read( beam.io.BigQuerySource(data_gen.hist_bq_table)) | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen)) | 'Parse Json Strings' >> beam.FlatMap(lambda row: [json.loads(row)])) if data_args.primary_key_cols: rows |= EnforcePrimaryKeys(data_args.primary_key_col) if data_args.csv_schema_order: (rows | 'Order fields for CSV writing.' >> beam.FlatMap( lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))]) | 'Write to GCS' >> beam.io.textio.WriteToText( file_path_prefix=data_args.output_prefix, file_name_suffix='.csv') ) if data_args.avro_schema_file: fastavro_avsc = fastavro.schema.load_schema(data_args.avro_schema_file) (rows # Need to convert time stamps from strings to timestamp-micros | 'Fix date and time Types for Avro.' >> beam.FlatMap(lambda row: fix_record_for_avro(row, fastavro_avsc)) | 'Write to Avro.' >> beam.io.avroio.WriteToAvro( file_path_prefix=data_args.output_prefix, codec='null', file_name_suffix='.avro', use_fastavro=True, schema=fastavro_avsc)) if data_args.output_bq_table: (rows | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery( # The table name is a required argument for the BigQuery sink. # In this case we use the value passed in from the command line. data_args.output_bq_table, schema=None if schema_inferred else data_gen.get_bq_schema(), # Creates the table in BigQuery if it does not yet exist. create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=data_gen.write_disp, # Use the max recommended batch size. batch_size=500)) p.run().wait_until_finish()
def del_unwanted_cols(data):
    """Delete the unwanted columns"""
    del data['cast']
    del data['date_added']
    del data['listed_in']
    del data['description']
    return data


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args()

    p = beam.Pipeline(options=PipelineOptions())

    (p
     | 'ReadData' >> beam.io.ReadFromText(
         'gs://nf-bucket-test/batch/netflix_titles.csv', skip_header_lines=1)
     | 'SplitData' >> beam.Map(lambda x: x.split(','))
     | 'FormatToDict' >> beam.Map(
         lambda x: {
             "show_id": x[0],
             "type": x[1],
             "title": x[2],
             "director": x[3],
             "country": x[4],
             "release_year": x[5],
             "rating": x[6],
             "duration": x[7]
         }))

    p.run()
# If a header is not provided, assume the first line in a file
# to be the header.
skip_header_lines = 1 if column_names is None else 0
_ = (p
     | 'ReadData' >> beam.io.textio.ReadFromText(
         file_pattern=input_path, skip_header_lines=skip_header_lines)
     | 'DecodeData' >> csv_decoder.DecodeCSV(column_names=column_names)
     | 'GenerateStatistics' >> tfdv.GenerateStatistics()
     | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
         output_path,
         shard_name_template='',
         coder=beam.coders.ProtoCoder(
             statistics_pb2.DatasetFeatureStatisticsList)))


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)

    # Build and run the pipeline.
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', required=True)
    parser.add_argument('--output_path', required=True)
    flags, pipeline_args = parser.parse_known_args()

    pipeline_option = PipelineOptions(flags=pipeline_args)
    pipeline_option.view_as(SetupOptions).save_main_session = True

    run_pipeline(flags, pipeline_option)
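After the pipeline runs, the statistics file it wrote can be loaded back for inspection. A minimal sketch, assuming the same path that was passed as --output_path and a tensorflow_data_validation version that exposes load_statistics/visualize_statistics:

import tensorflow_data_validation as tfdv

# Load the DatasetFeatureStatisticsList written by WriteStatsOutput above.
# The path is a placeholder for whatever --output_path was set to.
stats = tfdv.load_statistics('gs://my-bucket/stats/train_stats')
tfdv.visualize_statistics(stats)  # renders a summary when run in a notebook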
def test_no_staging_location(self):
    with self.assertRaises(RuntimeError) as cm:
        self.stager.stage_job_resources(PipelineOptions(),
                                        staging_location=None)
    self.assertEqual('The staging_location must be specified.',
                     cm.exception.args[0])
def test_with_extra_packages(self):
    staging_dir = self.make_temp_dir()
    source_dir = self.make_temp_dir()
    self.create_temp_file(os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(os.path.join(source_dir, 'xyz2.tar'), 'nothing')
    self.create_temp_file(os.path.join(source_dir, 'whl.whl'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, stager.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz'),
        os.path.join(source_dir, 'xyz2.tar'),
        os.path.join(source_dir, 'whl.whl'),
        '/tmp/remote/remote_file.tar.gz'
    ]

    remote_copied_files = []

    # We cannot rely on actual remote file system paths, hence we treat
    # '/tmp/remote/' as a remote path.
    def is_remote_path(path):
        return path.startswith('/tmp/remote/')

    def file_copy(from_path, to_path):
        if is_remote_path(from_path):
            remote_copied_files.append(from_path)
            _, from_name = os.path.split(from_path)
            if os.path.isdir(to_path):
                to_path = os.path.join(to_path, from_name)
            self.create_temp_file(to_path, 'nothing')
            logging.info('Fake copied remote file: %s to %s', from_path,
                         to_path)
        elif is_remote_path(to_path):
            logging.info('Faking upload_file(%s, %s)', from_path, to_path)
        else:
            shutil.copyfile(from_path, to_path)

    with mock.patch(
            'apache_beam.runners.portability.stager_test'
            '.stager.Stager._download_file', staticmethod(file_copy)):
        with mock.patch(
                'apache_beam.runners.portability.stager_test'
                '.stager.Stager._is_remote_path', staticmethod(is_remote_path)):
            self.assertEqual([
                'abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl',
                'remote_file.tar.gz', stager.EXTRA_PACKAGES_FILE
            ],
                             self.stager.stage_job_resources(
                                 options, staging_location=staging_dir))

    with open(os.path.join(staging_dir, stager.EXTRA_PACKAGES_FILE)) as f:
        self.assertEqual([
            'abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n', 'whl.whl\n',
            'remote_file.tar.gz\n'
        ], f.readlines())
    self.assertEqual(['/tmp/remote/remote_file.tar.gz'], remote_copied_files)
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the hourly_team_score pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from') parser.add_argument('--subscription', type=str, help='Pub/Sub subscription to read from') parser.add_argument('--dataset', type=str, required=True, help='BigQuery Dataset to write tables to. ' 'Must already exist.') parser.add_argument( '--table_name', type=str, default='game_stats', help='The BigQuery table name. Should not already exist.') parser.add_argument('--fixed_window_duration', type=int, default=60, help='Numeric value of fixed window duration for user ' 'analysis, in minutes') parser.add_argument('--session_gap', type=int, default=5, help='Numeric value of gap between user sessions, ' 'in minutes') parser.add_argument( '--user_activity_window_duration', type=int, default=30, help='Numeric value of fixed window for finding mean of ' 'user session duration, in minutes') args, pipeline_args = parser.parse_known_args(argv) if args.topic is None and args.subscription is None: parser.print_usage() print(sys.argv[0] + ': error: one of --topic or --subscription is required') sys.exit(1) options = PipelineOptions(pipeline_args) # We also require the --project option to access --dataset if options.view_as(GoogleCloudOptions).project is None: parser.print_usage() print(sys.argv[0] + ': error: argument --project is required') sys.exit(1) fixed_window_duration = args.fixed_window_duration * 60 session_gap = args.session_gap * 60 user_activity_window_duration = args.user_activity_window_duration * 60 # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). options.view_as(SetupOptions).save_main_session = save_main_session # Enforce that this pipeline is always run in streaming mode options.view_as(StandardOptions).streaming = True with beam.Pipeline(options=options) as p: # Read game events from Pub/Sub using custom timestamps, which # are extracted from the data elements, and parse the data. if args.subscription: scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub( subscription=args.subscription) else: scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub( topic=args.topic) raw_events = (scores | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8')) | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn()) | 'AddEventTimestamps' >> beam.Map(lambda elem: beam.window.TimestampedValue( elem, elem['timestamp']))) # Extract username/score pairs from the event stream user_events = (raw_events | 'ExtractUserScores' >> beam.Map(lambda elem: (elem['user'], elem['score']))) # Calculate the total score per user over fixed windows, and cumulative # updates for late data spammers_view = ( user_events | 'UserFixedWindows' >> beam.WindowInto( beam.window.FixedWindows(fixed_window_duration)) # Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate. # These might be robots/spammers. | 'CalculateSpammyUsers' >> CalculateSpammyUsers() # Derive a view from the collection of spammer users. It will be used as # a side input in calculating the team score sums, below | 'CreateSpammersView' >> beam.CombineGlobally( beam.combiners.ToDictCombineFn()).as_singleton_view()) # [START filter_and_calc] # Calculate the total score per team over fixed windows, and emit cumulative # updates for late data. 
        # Uses the side input derived above (the set of suspected robots) to
        # filter out scores from those users from the sum.
        # Write the results to BigQuery.
        (  # pylint: disable=expression-not-assigned
            raw_events
            | 'WindowIntoFixedWindows' >> beam.WindowInto(
                beam.window.FixedWindows(fixed_window_duration))

            # Filter out the detected spammer users, using the side input
            # derived above
            | 'FilterOutSpammers' >> beam.Filter(
                lambda elem, spammers: elem['user'] not in spammers,
                spammers_view)

            # Extract and sum teamname/score pairs from the event data.
            | 'ExtractAndSumScore' >> ExtractAndSumScore('team')
            # [END filter_and_calc]
            | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
            | 'WriteTeamScoreSums' >> WriteToBigQuery(
                args.table_name + '_teams',
                args.dataset,
                {
                    'team': 'STRING',
                    'total_score': 'INTEGER',
                    'window_start': 'STRING',
                    'processing_time': 'STRING',
                },
                options.view_as(GoogleCloudOptions).project))

        # [START session_calc]
        # Detect user sessions -- that is, a burst of activity separated by a
        # gap from further activity. Find and record the mean session lengths.
        # This information could help the game designers track the changing
        # user engagement as their set of games changes.
        (  # pylint: disable=expression-not-assigned
            user_events
            | 'WindowIntoSessions' >> beam.WindowInto(
                beam.window.Sessions(session_gap),
                timestamp_combiner=beam.window.TimestampCombiner.OUTPUT_AT_EOW)

            # For this use, we care only about the existence of the session,
            # not any particular information aggregated over it, so we can
            # just group by key and assign a "dummy value" of None.
            | beam.CombinePerKey(lambda _: None)

            # Get the duration of the session
            | 'UserSessionActivity' >> beam.ParDo(UserSessionActivity())
            # [END session_calc]

            # [START rewindow]
            # Re-window to process groups of session sums according to when the
            # sessions complete
            | 'WindowToExtractSessionMean' >> beam.WindowInto(
                beam.window.FixedWindows(user_activity_window_duration))

            # Find the mean session duration in each window
            | beam.CombineGlobally(
                beam.combiners.MeanCombineFn()).without_defaults()
            | 'FormatAvgSessionLength' >> beam.Map(
                lambda elem: {'mean_duration': float(elem)})
            | 'WriteAvgSessionLength' >> WriteToBigQuery(
                args.table_name + '_sessions',
                args.dataset,
                {
                    'mean_duration': 'FLOAT',
                },
                options.view_as(GoogleCloudOptions).project))
def run(argv=None, save_main_session=True): """Build and run the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--output_topic', required=True, help=('Output PubSub topic of the form ' '"projects/<PROJECT>/topic/<TOPIC>".')) group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--input_topic', help=('Input PubSub topic of the form ' '"projects/<PROJECT>/topics/<TOPIC>".')) group.add_argument( '--input_subscription', help=('Input PubSub subscription of the form ' '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."')) known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as( SetupOptions).save_main_session = save_main_session pipeline_options.view_as(StandardOptions).streaming = True with beam.Pipeline(options=pipeline_options) as p: # Read from PubSub into a PCollection. if known_args.input_subscription: messages = p | beam.io.ReadFromPubSub( subscription=known_args.input_subscription) else: messages = p | beam.io.ReadFromPubSub(topic=known_args.input_topic) lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8')) # Count the occurrences of each word. def count_ones(word_ones): (word, ones) = word_ones return (word, sum(ones)) counts = ( lines | 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str)) | 'AddTimestampFn' >> beam.ParDo(AddTimestampFn()) | 'After AddTimestampFn' >> ParDo(PrintFn('After AddTimestampFn')) | 'PairWithOne' >> beam.Map(lambda x: (x, 1)) | beam.WindowInto(window.FixedWindows(5, 0)) | 'GroupByKey' >> beam.GroupByKey() | 'CountOnes' >> beam.Map(count_ones)) # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '%s: %d' % (word, count) output = ( counts | 'format' >> beam.Map(format_result) | 'encode' >> beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes)) # Write to PubSub. # pylint: disable=expression-not-assigned output | beam.io.WriteToPubSub(known_args.output_topic) def check_gbk_format(): # A matcher that checks that the output of GBK is of the form word: count. def matcher(elements): # pylint: disable=unused-variable actual_elements_in_window, window = elements for elm in actual_elements_in_window: assert re.match(r'\S+:\s+\d+', elm.decode('utf-8')) is not None return matcher # Check that the format of the output is correct. assert_that(output, check_gbk_format(), use_global_window=False, label='Assert word:count format.') # Check also that elements are ouput in the right window. # This expects exactly 1 occurrence of any subset of the elements # 150, 151, 152, 153, 154 in the window [150, 155) # or exactly 1 occurrence of any subset of the elements # 210, 211, 212, 213, 214 in the window [210, 215). first_window_val = [ '150: 1', '151: 1', '152: 1', '153: 1', '154: 1', ] second_window_val = [ '210: 1', '211: 1', '212: 1', '213: 1', '214: 1', ] expected_window_to_elements = { window.IntervalWindow(150, 155): [x.encode('utf-8') for x in first_window_val], window.IntervalWindow(210, 215): [x.encode('utf-8') for x in second_window_val], } # To pass, publish numbers in [150-155) or [210-215) with no repeats. # To fail, publish a repeated number in the range above range. 
        # For example: '210 213 151 213'
        assert_that(output,
                    equal_to_per_window(expected_window_to_elements),
                    use_global_window=False,
                    label='Assert correct streaming windowing.')
def run(argv=None, save_main_session=True): """Build and run the pipeline.""" parser = argparse.ArgumentParser() group = parser.add_mutually_exclusive_group(required=True) group.add_argument( '--input_topic', help=('Input PubSub topic of the form ' '"projects/<PROJECT>/topics/<TOPIC>".')) group.add_argument( '--input_subscription', help=('Input PubSub subscription of the form ' '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."')) parser.add_argument('--output_bigquery', required=True, help='Output BQ table to write results to ' '"PROJECT_ID:DATASET.TABLE"') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session pipeline_options.view_as(StandardOptions).streaming = True p = beam.Pipeline(options=pipeline_options) # Read from PubSub into a PCollection. if known_args.input_subscription: messages = (p | beam.io.ReadFromPubSub( subscription=known_args.input_subscription) .with_output_types(bytes)) else: messages = (p | beam.io.ReadFromPubSub(topic=known_args.input_topic) .with_output_types(bytes)) decode_messages = messages | 'DecodePubSubMessages' >> beam.Map(lambda x: x.decode('utf-8')) # Get STT data from function for long audio file using asynchronous speech recognition stt_output = decode_messages | 'SpeechToTextOutput' >> beam.Map(stt_output_response) # Parse and enrich stt_output response parse_stt_output = stt_output | 'ParseSpeechToText' >> beam.Map(stt_parse_response) # Get NLP Sentiment and Entity response nlp_output = parse_stt_output | 'NaturalLanguageOutput' >> beam.Map(get_nlp_output) # Write to BigQuery bigquery_table_schema = { "fields": [ { "mode": "NULLABLE", "name": "fileid", "type": "STRING" }, { "mode": "NULLABLE", "name": "filename", "type": "STRING" }, { "mode": "NULLABLE", "name": "callid", "type": "STRING" }, { "mode": "NULLABLE", "name": "date", "type": "TIMESTAMP" }, { "mode": "NULLABLE", "name": "year", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "month", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "day", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "starttime", "type": "STRING" }, { "mode": "NULLABLE", "name": "duration", "type": "FLOAT" }, { "mode": "NULLABLE", "name": "silencesecs", "type": "FLOAT" }, { "mode": "NULLABLE", "name": "sentimentscore", "type": "FLOAT" }, { "mode": "NULLABLE", "name": "magnitude", "type": "FLOAT" }, { "mode": "NULLABLE", "name": "silencepercentage", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "speakeronespeaking", "type": "FLOAT" }, { "mode": "NULLABLE", "name": "speakertwospeaking", "type": "FLOAT" }, { "mode": "NULLABLE", "name": "nlcategory", "type": "STRING" }, { "mode": "NULLABLE", "name": "transcript", "type": "STRING" }, { "fields": [ { "mode": "NULLABLE", "name": "name", "type": "STRING" }, { "mode": "NULLABLE", "name": "type", "type": "STRING" }, { "mode": "NULLABLE", "name": "sentiment", "type": "FLOAT" } ], "mode": "REPEATED", "name": "entities", "type": "RECORD" }, { "fields": [ { "mode": "NULLABLE", "name": "word", "type": "STRING" }, { "mode": "NULLABLE", "name": "startSecs", "type": "FLOAT" }, { "mode": "NULLABLE", "name": "endSecs", "type": "FLOAT" }, { "mode": "NULLABLE", "name": "speakertag", "type": "INTEGER" }, { "mode": "NULLABLE", "name": "confidence", "type": "FLOAT" } ], "mode": "REPEATED", "name": "words", "type": "RECORD" }, { "fields": [ { "mode": "NULLABLE", "name": "sentence", "type": "STRING" }, { "mode": "NULLABLE", "name": "sentiment", "type": 
"FLOAT" }, { "mode": "NULLABLE", "name": "magnitude", "type": "FLOAT" } ], "mode": "REPEATED", "name": "sentences", "type": "RECORD" } ] } nlp_output | 'WriteToBigQuery' >> beam.io.WriteToBigQuery( known_args.output_bigquery, schema=bigquery_table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND) p.run()
def run(argv=None): """Main entry point; defines and runs the hourly_team_score pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--subdomain', type=str, default=config.SUBDOMAIN, help='Sub-domain for Stat API.') parser.add_argument('--api_key', type=str, default=config.API_KEY, help='API key for Stat API.') parser.add_argument( '--date', type=str, default=(dt.date.today() - dt.timedelta(days=config.PRIOR_DAYS)).strftime("%Y-%m-%d"), help='Run date in YYYY-MM-DD format.') parser.add_argument( '--dataset', type=str, default=config.DATASET, help='BigQuery Dataset to write tables to. Must already exist.') parser.add_argument( '--table_name', type=str, default=config.TABLE_NAME, help='The BigQuery table name. Should not already exist.') parser.add_argument('--project', type=str, default=config.PROJECT, help='Your GCS project.') parser.add_argument('--runner', type=str, default="DataflowRunner", help='Type of DataFlow runner.') args, pipeline_args = parser.parse_known_args(argv) # Create and set your PipelineOptions. options = PipelineOptions(pipeline_args) # For Cloud execution, set the Cloud Platform project, job_name, # staging location, temp_location and specify DataflowRunner. google_cloud_options = options.view_as(GoogleCloudOptions) google_cloud_options.project = args.project google_cloud_options.job_name = ("{0}-{1}".format( args.project, str(dt.datetime.today().strftime("%m%dt%H%M")))) google_cloud_options.staging_location = "gs://{0}/binaries".format( args.project) google_cloud_options.temp_location = "gs://{0}/temp".format(args.project) options.view_as(StandardOptions).runner = args.runner ## Comment this if wanting to run this file directly. options.view_as(SetupOptions).setup_file = "./setup.py" pipeline = beam.Pipeline(options=options) # Read projects from Stat API api = (pipeline | 'create' >> beam.Create(StatAPI(data=args).get_job()) | 'IterProjects' >> beam.ParDo(IterProjects(data=args))) # Iteates Sites in Projects keywords = (api | 'IterSites' >> beam.ParDo(IterSites()) | 'IterKeywords' >> beam.ParDo(IterKeywords())) # Write to bigquery based on specified schema BQ = (keywords | "WriteToBigQuery" >> WriteToBigQuery( args.table_name, args.dataset, STAT_API_SCHEMA)) pipeline.run()
logger = logging.getLogger(__name__)

table_spec = bigquery.TableReference(projectId='query-11',
                                     datasetId='rpm',
                                     tableId='account_id_schema_new')

output_spec = bigquery.TableReference(projectId='query-11',
                                      datasetId='rpm',
                                      tableId='yesyes')

dataflow_options = [
    '--project=query-11',
    '--job_name=amaz',
    '--temp_location=gs://dataflow_s/tmp',
    '--region=us-central1'
]
dataflow_options.append('--staging_location=gs://dataflow_s/stage')
options = PipelineOptions(dataflow_options)
gcloud_options = options.view_as(GoogleCloudOptions)

options.view_as(StandardOptions).runner = "DataflowRunner"

table_schema = {
    'fields': [
        {
            'name': 'ACNO',
            'type': 'STRING',
            'mode': 'NULLABLE'
        },
        {
            'name': 'FIELD_1',
            'type': 'FLOAT',
            'mode': 'NULLABLE'
        },
        # ... remaining fields truncated in the original snippet
    ]
}
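The snippet defines the table references, options, and (truncated) schema but stops before building a pipeline. A minimal sketch of how they might be wired together; the pass-through copy, transform labels, and dispositions are assumptions, not the original logic:

import apache_beam as beam

# Hypothetical wiring of table_spec -> output_spec; shown only to illustrate
# the BigQuery read/write calls, not the original transformation.
with beam.Pipeline(options=options) as p:
    (p
     | 'ReadAccounts' >> beam.io.ReadFromBigQuery(table=table_spec)
     | 'WriteResults' >> beam.io.WriteToBigQuery(
         output_spec,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))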
def run():
    t0 = now()

    # parse command line options
    known_args, beam_args = runtime_args()

    # BigQuery utility
    bq_utils = BigQueryUtils()

    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        rows = (p
                | beam.io.ReadFromText(known_args.input, skip_header_lines=1)
                | beam.ParDo(BeamReadCSV(header_cols=FLIGHTS_CSV_COLUMNS))
                | beam.ParDo(BeamTransformRecords(),
                             date_fmt='%Y-%m-%d',
                             time_fmt='%H%M'))

        # load the routes table into a lookup dict
        sql = f"""select airline, src, dest from {known_args.routes_table}"""
        routes = bq_utils.execute_as_dict(sql,
                                          keycols=['airline', 'src', 'dest'])

        # lookup routes
        rows, routes_rejects, missing_routes = (
            rows
            | beam.ParDo(BeamLookupRoute(), routes=routes).with_outputs(
                'rejects', 'missing_routes', main='main'))

        # write parquet output files
        output = (rows
                  | beam.io.WriteToParquet(
                      os.path.join(known_args.output, 'flights'),
                      schema=datamodel_flights_parquet_schema(),
                      file_name_suffix='.parquet'))

        # write missing routes to another output as CSV
        output_routes = (
            missing_routes
            | "gbr" >> beam.GroupByKey()  # calculate distinct missing routes
            # csv output the key (e[0] of the key-value tuple), which is
            # (airline, src, dest)
            | "missing_routes_csv" >> beam.Map(lambda e: ','.join(list(e[0])))
            | "missing_routes_out" >> beam.io.WriteToText(
                os.path.join(known_args.output, 'rejects/missing-routes'),
                file_name_suffix='.csv',
                header='airline,src,dest'))

        # alternative: write (simple) newline delimited json output files
        # a very flexible output file format for bigquery and other big data tools
        # much slower to write and larger in size than binary formats such as
        # Parquet, ORC, or Avro, but provides flexibility over schema for smaller
        # data files; larger file sizes should use Avro, Parquet, or ORC.
        # Avro provides the fastest write speeds, whereas Parquet and ORC
        # provide faster read performance for analytical queries.
        # output = (rows
        #           | beam.Map(lambda e: {k: v if k != 'flight_date' else v.strftime('%Y-%m-%d')
        #                                 for k, v in e.items()})  # convert flight_date back to string type for json conversion
        #           | beam.Map(lambda e: json.dumps(e))  # json dump row
        #           | beam.io.WriteToText(os.path.join(known_args.output, 'flights'),
        #                                 file_name_suffix='.json')
        #           )

    logger.info("beam pipeline completed.")

    # create bigquery external table and insert into bq flights table
    bq_utils.create_external_table(known_args.flights_ext_table,
                                   source_uris=os.path.join(
                                       known_args.output, "flights*.parquet"),
                                   source_format='PARQUET',
                                   delete_if_exists=True)

    # create and replace existing bigquery flights table
    bq_utils.create_table(known_args.flights_table,
                          schema=datamodel_flights_bigquery_schema(),
                          delete_if_exists=True)

    # insert into table as select (ITAS) statement
    sql = f"""
    INSERT INTO `{known_args.flights_table}`
    SELECT
        a.day_of_week,
        a.flight_date,
        a.airline,
        a.tailnumber,
        a.flight_number,
        a.src,
        a.src_city,
        a.src_state,
        a.dest,
        a.dest_city,
        a.dest_state,
        PARSE_TIME('%H:%M:%S', a.departure_time) as departure_time,
        PARSE_TIME('%H:%M:%S', a.actual_departure_time) as actual_departure_time,
        a.departure_delay,
        a.taxi_out,
        PARSE_TIME('%H:%M:%S', a.wheels_off) as wheels_off,
        PARSE_TIME('%H:%M:%S', a.wheels_on) as wheels_on,
        a.taxi_in,
        PARSE_TIME('%H:%M:%S', a.arrival_time) as arrival_time,
        PARSE_TIME('%H:%M:%S', a.actual_arrival_time) as actual_arrival_time,
        a.arrival_delay,
        a.cancelled,
        a.cancellation_code,
        a.flight_time,
        a.actual_flight_time,
        a.air_time,
        a.flights,
        a.distance,
        a.airline_delay,
        a.weather_delay,
        a.nas_delay,
        a.security_delay,
        a.late_aircraft_delay,
        -- CONCAT(a.flight_date, '_', a.airline, '_', a.flight_number) AS flightDate_airline_flightNumber
    FROM `{known_args.flights_ext_table}` a
    """

    # insert records from the parquet external table into the final bq managed
    # flights table
    r = bq_utils.execute(sql)

    logger.info(f"total time: {(now() - t0):,.6f} secs")
def pipeline_monitoring(renames):
    """Using monitoring interface snippets."""

    import re
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions

    class WordCountOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument(
                '--input',
                help='Input for the pipeline',
                default='gs://my-bucket/input')
            parser.add_argument(
                '--output',
                help='output for the pipeline',
                default='gs://my-bucket/output')

    class ExtractWordsFn(beam.DoFn):
        def process(self, element):
            words = re.findall(r'[A-Za-z\']+', element)
            for word in words:
                yield word

    class FormatCountsFn(beam.DoFn):
        def process(self, element):
            word, count = element
            yield '%s: %s' % (word, count)

    # [START pipeline_monitoring_composite]
    # The CountWords Composite Transform inside the WordCount pipeline.
    class CountWords(beam.PTransform):
        def expand(self, pcoll):
            return (
                pcoll
                # Convert lines of text into individual words.
                | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
                # Count the number of times each word occurs.
                | beam.combiners.Count.PerElement()
                # Format each word and count into a printable string.
                | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))

    # [END pipeline_monitoring_composite]

    pipeline_options = PipelineOptions()
    options = pipeline_options.view_as(WordCountOptions)

    with TestPipeline() as p:  # Use TestPipeline for testing.
        # [START pipeline_monitoring_execution]
        (
            p
            # Read the lines of the input text.
            | 'ReadLines' >> beam.io.ReadFromText(options.input)
            # Count the words.
            | CountWords()
            # Write the formatted word counts to output.
            | 'WriteCounts' >> beam.io.WriteToText(options.output))
        # [END pipeline_monitoring_execution]

        p.visit(SnippetUtils.RenameFiles(renames))
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" pubsubTopicName = "projects/data-qe-da7e1252/topics/cfw-data-topic" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument( '--output', dest='output', # CHANGE 1/5: The Google Cloud Storage path is required # for outputting the results. #default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX', #default="/Users/skanabargi/python/stream/output", default='gs://data-qe-da7e1252/tmp/sk_out', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_args.extend([ # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to # run your pipeline on the Google Cloud Dataflow Service. '--runner=DataflowRunner', # CHANGE 3/5: Your project ID is required in order to run your pipeline on # the Google Cloud Dataflow Service. '--project=data-qe-da7e1252', # CHANGE 4/5: Your Google Cloud Storage path is required for staging local # files. #'--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY', '--staging_location=gs://data-qe-da7e1252/tmp/stage/', # CHANGE 5/5: Your Google Cloud Storage path is required for temporary # files. #'--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY', '--temp_location=gs://data-qe-da7e1252/tmp/local', '--experiments=allow_non_updatable_job', '--job_name=sk-wordcount-job', ]) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as(StandardOptions).streaming = True with beam.Pipeline(options=pipeline_options) as p: # Read the text file[pattern] into a PCollection. #lines = p | ReadFromText(known_args.input) lines = p | beam.io.ReadFromPubSub(topic=pubsubTopicName) print "SK_logs : " + str(lines) # Count the occurrences of each word. counts = (lines | 'Split' >> (beam.FlatMap(lambda x: re.findall( r'[A-Za-z\']+', x)).with_output_types(unicode)) | 'PairWithOne' >> beam.Map(lambda x: (x, 1)) | 'GroupAndSum' >> beam.CombinePerKey(sum)) # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '%s: %s' % (word, count) output = counts | 'Format' >> beam.Map(format_result) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned #output | WriteToText(known_args.output) print "SK_logs : output" + str(output)
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as( SetupOptions).save_main_session = save_main_session p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | 'read' >> ReadFromText(known_args.input) # Count the occurrences of each word. def count_ones(word_ones): (word, ones) = word_ones return (word, sum(ones)) counts = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode)) | 'lowercase' >> beam.Map(unicode.lower) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(count_ones)) # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '%s: %d' % (word, count) output = counts | 'format' >> beam.Map(format_result) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(known_args.output) result = p.run() result.wait_until_finish() # Do not query metrics when creating a template which doesn't run if (not hasattr(result, 'has_job') # direct runner or result.has_job): # not just a template creation empty_lines_filter = MetricsFilter().with_name('empty_lines') query_result = result.metrics().query(empty_lines_filter) if query_result['counters']: empty_lines_counter = query_result['counters'][0] logging.info('number of empty lines: %d', empty_lines_counter.result) word_lengths_filter = MetricsFilter().with_name('word_len_dist') query_result = result.metrics().query(word_lengths_filter) if query_result['distributions']: word_lengths_dist = query_result['distributions'][0] logging.info('average word length: %d', word_lengths_dist.result.mean)
def test_create_application_client(self):
    pipeline_options = PipelineOptions()
    apiclient.DataflowApplicationClient(
        pipeline_options, DataflowRunner.BATCH_ENVIRONMENT_MAJOR_VERSION)
def run(flags, pipeline_args):
    """Run Apache Beam pipeline to generate TFRecords for Survival Analysis"""
    options = PipelineOptions(flags=[], **pipeline_args)
    options.view_as(WorkerOptions).machine_type = flags.machine_type

    temp_dir = os.path.join(flags.output_dir, 'tmp')
    runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

    files = tf.gfile.Glob(flags.input_dir + "*")
    if not flags.cloud:
        # if running locally for testing, process fewer files
        files = files[0:20]
    logging.warning("Number of files: " + str(len(files)))

    labels = get_labels_array(
        "gs://columbia-dl-storage-bucket/ADNI_t1_list_with_fsstatus_20190111.csv")

    with beam.Pipeline(runner, options=options) as p:
        with tft_beam.Context(temp_dir=temp_dir):
            input_metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.from_feature_spec(features.RAW_FEATURE_SPEC))

            filenames = (p | 'Create filenames' >> beam.Create(files))
            nii = (filenames | 'Read NII' >> beam.Map(read_nii))
            nii_with_labels = (
                nii
                | 'Get Label' >> beam.FlatMap(lambda x: read_label(x, labels)))

            raw_train, raw_eval, raw_test = (
                nii_with_labels
                | 'RandomlySplitData' >> randomly_split(
                    train_size=.7, validation_size=.15, test_size=.15))

            raw_train = raw_train | 'FlattenTrain' >> beam.FlatMap(
                lambda x: x[1])
            raw_eval = (raw_eval
                        | 'FlattenEval' >> beam.FlatMap(lambda x: x[1]))
            raw_test = (raw_test
                        | 'FlattenTest' >> beam.FlatMap(lambda x: x[1]))

            raw_train | 'CountLabelFreq' >> extractAndCount(flags.output_dir)

            dataset_and_metadata, transform_fn = (
                (raw_train, input_metadata)
                | 'TransformData' >> tft_beam.AnalyzeAndTransformDataset(
                    features.preprocess))

            transform_fn = (
                (raw_train, input_metadata)
                | 'AnalyzeTrain' >> tft_beam.AnalyzeDataset(features.preprocess))

            _ = (transform_fn
                 | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
                     flags.output_dir))

            for dataset_type, dataset in [('Train', raw_train),
                                          ('Eval', raw_eval),
                                          ('Predict', raw_test)]:
                transform_label = 'Transform{}'.format(dataset_type)
                t, metadata = (((dataset, input_metadata), transform_fn)
                               | transform_label >> tft_beam.TransformDataset())

                if dataset_type == 'Train':
                    _ = (metadata
                         | 'WriteMetadata' >> tft_beam_io.WriteMetadata(
                             os.path.join(flags.output_dir,
                                          'transformed_metadata'),
                             pipeline=p))

                write_label = 'Write{}TFRecord'.format(dataset_type)
                _ = t | write_label >> WriteTFRecord(
                    dataset_type, flags.output_dir, metadata)
parser = argparse.ArgumentParser()
parser.add_argument("--input", dest="input", required=True)
parser.add_argument(
    "--output",
    required=True,
    help=("Output BigQuery table for results specified as: "
          "PROJECT:DATASET.TABLE or DATASET.TABLE."))
app_args, pipeline_args = parser.parse_known_args()

input_files = app_args.input
output_filename = 'output.txt'

options = PipelineOptions()
gcloud_options = options.view_as(GoogleCloudOptions)
# gcloud_options.project = project_id
gcloud_options.job_name = 'import-citybikes'

# Dataflow runner
runner = os.environ['DATAFLOW_RUNNER']
options.view_as(StandardOptions).runner = runner

with apache_beam.Pipeline(options=options) as p:
    inputs = []
    for match in FileSystems.match([input_files]):
        for file in match.metadata_list:
            inputs.append(file.path)
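The snippet ends after collecting the matched file paths into inputs. One way the list might feed the rest of the pipeline is sketched below; beam.Create plus ReadAllFromText and the labels are assumptions, not the original continuation:

    # Hypothetical continuation inside the same `with` block: turn the matched
    # paths into a PCollection and read every file's lines.
    lines = (p
             | 'CreateFileList' >> apache_beam.Create(inputs)
             | 'ReadMatchedFiles' >> apache_beam.io.ReadAllFromText())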
def test_display_data(self):
    for case in PipelineOptionsTest.TEST_CASES:
        options = PipelineOptions(flags=case['flags'])
        dd = DisplayData.create_from(options)
        hc.assert_that(dd.items,
                       hc.contains_inanyorder(*case['display_data']))
def run(argv=None): """Test Avro IO (backed by fastavro or Apache Avro) on a simple pipeline that transforms bitcoin transactions""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://beam-avro-test/bitcoin/txns/*', help='Input file(s) to process.') parser.add_argument( '--output', dest='output', required=True, help='Output file to write results to.') parser.add_argument( '--compress', dest='compress', required=False, action='store_true', help='When set, compress the output data') parser.add_argument( '--fastavro', dest='use_fastavro', required=False, action='store_true', help='When set, use fastavro for Avro I/O') opts, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the avro file[pattern] into a PCollection. records = \ p | 'read' >> ReadFromAvro(opts.input, use_fastavro=opts.use_fastavro) measured = records | 'scan' >> beam.ParDo(BitcoinTxnCountDoFn()) # pylint: disable=expression-not-assigned measured | 'write' >> \ WriteToAvro( opts.output, schema=SCHEMA, codec=('deflate' if opts.compress else 'null'), use_fastavro=opts.use_fastavro ) result = p.run() result.wait_until_finish() # Do not query metrics when creating a template which doesn't run if (not hasattr(result, 'has_job') # direct runner or result.has_job): # not just a template creation metrics = result.metrics().query() for counter in metrics['counters']: logging.info("Counter: %s", counter) for dist in metrics['distributions']: logging.info("Distribution: %s", dist)
def test_dataflow_job_file(self):
    options = PipelineOptions(['--dataflow_job_file', 'abc'])
    self.assertEqual(options.get_all_options()['dataflow_job_file'], 'abc')

    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['dataflow_job_file'], None)
def test_default_ip_configuration(self):
    pipeline_options = PipelineOptions(
        ['--temp_location', 'gs://any-location/temp'])
    env = apiclient.Environment([], pipeline_options, '2.0.0',
                                FAKE_PIPELINE_URL)
    self.assertEqual(env.proto.workerPools[0].ipConfiguration, None)
def test_template_location(self):
    options = PipelineOptions(['--template_location', 'abc'])
    self.assertEqual(options.get_all_options()['template_location'], 'abc')

    options = PipelineOptions(flags=[''])
    self.assertEqual(options.get_all_options()['template_location'], None)
def test_interpreter_version_check_fails_py27(self):
    pipeline_options = PipelineOptions([])
    self.assertRaises(Exception,
                      apiclient._verify_interpreter_version_is_supported,
                      pipeline_options)
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='/tmp/logs/2020-10-02-11-34-19-EA6C5E314B70B157', #default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument( '--output', dest='output', # CHANGE 1/6: The Google Cloud Storage path is required # for outputting the results. default='/tmp/logs/output.txt', #default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_args.extend([ # CHANGE 2/6: (OPTIONAL) Change this to DataflowRunner to # run your pipeline on the Google Cloud Dataflow Service. '--runner=DirectRunner', # CHANGE 3/6: (OPTIONAL) Your project ID is required in order to # run your pipeline on the Google Cloud Dataflow Service. '--project=SET_YOUR_PROJECT_ID_HERE', # CHANGE 4/6: (OPTIONAL) The Google Cloud region (e.g. us-central1) # is required in order to run your pipeline on the Google Cloud # Dataflow Service. '--region=SET_REGION_HERE', # CHANGE 5/6: Your Google Cloud Storage path is required for staging local # files. '--staging_location=/tmp/logs/2020-10-02-11-34-19-EA6C5E314B70B157', #gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY', # CHANGE 6/6: Your Google Cloud Storage path is required for temporary # files. '--temp_location=/tmp', #gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY', '--job_name=your-wordcount-job', ]) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as( SetupOptions).save_main_session = save_main_session with beam.Pipeline(options=pipeline_options) as p: # Read the mtext filettern] into a PCollection. lines = p | ReadFromText(known_args.input) # Count the occurrences of each word. def split_me(x): print("--->", x) x = x.replace('"', '') data = x.split(' ') print("--->", data) print("***", data[2]) date = data[2].split('[')[-1] offset = data[3].split(']')[0] valid_data = [ data[1], # bucket_name f"{date} {offset}", data[7], # operation data[8], # Key data[9], # request_uri data[10], # http status data[11], # error_code data[12], # bytes_sent data[13], # object_size data[14], # total_time data[15], # turn_aroundtime data[16], # referrer data[17], # user_agent data[26], # request_header ] return valid_data #x.split(' ') splits = (lines | 'Split' >> beam.Map(split_me)) # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '%s: %s' % (word, count) #output = counts | 'Format' >> beam.Map(format_result) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned splits | WriteToText(known_args.output)
def test_interpreter_version_check_passes_py38(self):
    pipeline_options = PipelineOptions([])
    apiclient._verify_interpreter_version_is_supported(pipeline_options)
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() #1 Replace your hackathon-edem with your project id parser.add_argument('--input_topic', dest='input_topic', #1 Add your project Id and topic name you created # Example projects/versatile-gist-251107/topics/iexCloud', default='projects/hackathon2-luis1201/topics/valenbisi', help='Input file to process.') #2 Replace your hackathon-edem with your project id parser.add_argument('--input_subscription', dest='input_subscription', #3 Add your project Id and Subscription you created you created # Example projects/versatile-gist-251107/subscriptions/quotesConsumer', default='projects/hackathon2-luis1201/subscriptions/streaming', help='Input Subscription') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) google_cloud_options = pipeline_options.view_as(GoogleCloudOptions) #3 Replace your hackathon-edem with your project id google_cloud_options.project = 'hackathon2-luis1201' google_cloud_options.job_name = 'myjob' # Uncomment below and add your bucket if you want to execute on Dataflow #google_cloud_options.staging_location = 'gs://edem-bucket-roberto/binaries' #google_cloud_options.temp_location = 'gs://edem-bucket-roberto/temp' pipeline_options.view_as(StandardOptions).runner = 'DirectRunner' #pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner' pipeline_options.view_as(StandardOptions).streaming = True pipeline_options.view_as(SetupOptions).save_main_session = save_main_session p = beam.Pipeline(options=pipeline_options) # Read the pubsub messages into a PCollection. biciStations = p | beam.io.ReadFromPubSub(subscription=known_args.input_subscription) # Print messages received biciStations = ( biciStations | beam.ParDo(LocationConcat())) biciStations | 'Print Quote' >> beam.Map(print) # Store messages on elastic biciStations | 'Bici Stations Stored' >> beam.ParDo(IndexDocument()) result = p.run() result.wait_until_finish()