def run():
    import time

    gcs_path = "gs://marcin-playground/dataflow"
    pipeline = beam.Pipeline(runner="DataflowRunner", argv=[
        "--project", "project-name",
        "--staging_location", ("%s/staging_location" % gcs_path),
        "--temp_location", ("%s/temp" % gcs_path),
        "--output", ("%s/output" % gcs_path),
        "--setup_file", "./setup.py"
    ])
    (pipeline
     | "Load" >> ReadFromText("gs://marcin-playground/books/*.txt")
     | "Count Words" >> CountWordsTransform()
     # Unpack each (word, count) pair and format it as a line of text.
     | "FormatOutput" >> beam.MapTuple(
         lambda word, count: "{0}: {1}".format(word, count))
     | "Save" >> WriteToText("{0}/output/wordcount{1}".format(
         gcs_path, int(time.time()))))
    pipeline.run()

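# `CountWordsTransform` is not defined in this snippet. A minimal sketch of what
# such a composite transform could look like, assuming it turns lines into
# (word, count) pairs (hypothetical helper, not the original implementation):
import re

import apache_beam as beam


class CountWordsTransform(beam.PTransform):
    """Splits lines into words and counts occurrences of each word (sketch)."""

    def expand(self, lines):
        return (lines
                | "ExtractWords" >> beam.FlatMap(
                    lambda line: re.findall(r"[A-Za-z']+", line))
                | "PairWithOne" >> beam.Map(lambda word: (word, 1))
                | "SumPerWord" >> beam.CombinePerKey(sum))
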
def run(p, input_file, output_file):
    # pylint: disable=expression-not-assigned
    (
        p
        # Read the lines from a text file.
        | 'Read' >> ReadFromText(input_file)
        # Split the line into individual words.
        | 'Split' >> beam.FlatMap(lambda line: re.split(r'\W+', line))
        # Map each word to an instance of MyRow.
        | 'ToRow' >> beam.Map(MyRow).with_output_types(MyRow)
        # SqlTransform yields a PCollection containing elements with attributes
        # based on the output of the query.
        | 'Sql!!' >> SqlTransform(
            """
            SELECT word as key, COUNT(*) as `count`
            FROM PCOLLECTION
            GROUP BY word""")
        | 'Format' >> beam.Map(lambda row: '{}: {}'.format(row.key, row.count))
        | 'Write' >> WriteToText(output_file))

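# `MyRow` is not shown in this snippet. SqlTransform requires a schema-aware
# input, so a definition along these lines is assumed: a typed NamedTuple with a
# single `word` field, registered with RowCoder (sketch, field name assumed from
# the GROUP BY clause above).
import typing

import apache_beam as beam


class MyRow(typing.NamedTuple):
    word: str


beam.coders.registry.register_coder(MyRow, beam.coders.RowCoder)
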
def run(argv=None): """Main entry point; defines and runs the tfidf pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--uris', required=True, help='URIs to process.') parser.add_argument('--output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: # Read documents specified by the uris command line option. pcoll = read_documents(p, glob.glob(known_args.uris)) # Compute TF-IDF information for each word. output = pcoll | TfIdf() # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(known_args.output)
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=True,
                        help='Input file to process.')
    parser.add_argument('--output', dest='output', required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | 'ReadFromText' >> ReadFromText(known_args.input)
        results = (lines
                   | 'Split' >> beam.Map(split)
                   | 'Group' >> beam.GroupByKey()
                   | 'FormatResult' >> beam.Map(format_result))
        results | 'WriteToText' >> WriteToText(known_args.output)

def run():
    # set up location
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # use DataflowRunner instead of DirectRunner
    options = PipelineOptions(
        flags=None,
        runner='DataflowRunner',
        project=PROJECT_ID,
        job_name='kagglegenre',
        temp_location=BUCKET + '/temp',
        region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # retrieve the data from the kaggle_refined dataset and save this information (location)
    sql = 'SELECT * FROM kaggle_refined.Title_genre'
    bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True,
                                 gcs_location=BUCKET)

    # use the previously saved information (location) and read from BigQuery
    # query results is now the input PCollection
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    # Use ParDo to call function on query results
    out_pcoll = query_results | 'Split genres' >> beam.ParDo(SplitGenre())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output.txt')

    dataset_id = 'kaggle_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Title_genre_Dataflow'
    schema_id = 'Title:STRING,Genre:STRING'

    # write to BigQuery using the location set above
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    # run and display results after everything is finished
    result = p.run()
    result.wait_until_finish()

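# `SplitGenre` is not defined in this snippet. A sketch of a DoFn with the
# behavior the pipeline appears to assume: each BigQuery row arrives as a dict,
# and a delimited genres value is fanned out into one {Title, Genre} record per
# genre. The input field names and the comma delimiter are assumptions.
import apache_beam as beam


class SplitGenre(beam.DoFn):
    def process(self, element):
        title = element.get('title')
        genres = element.get('genres') or ''
        for genre in genres.split(','):
            # Emit one output row per individual genre.
            yield {'Title': title, 'Genre': genre.strip()}
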
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='../data/spikey_sales_weekly.txt',
        # required=True,
        help='Input File to Process')
    parser.add_argument(
        '--output',
        dest='output',
        default='../output/products',
        # required=True,
        help='output processed file')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    logging.getLogger().setLevel(logging.INFO)

    with beam.Pipeline(options=pipeline_options) as p:
        # read input file
        items = p | ReadFromText(known_args.input)
        logging.info("Print the items %s", items)

        split_item_result = (
            items
            | 'Split Result' >> beam.ParDo(
                SplitItemBasedOnSalesFn()).with_outputs(
                    SplitItemBasedOnSalesFn.OUTPUT_TAG_TOP_SELLER,
                    SplitItemBasedOnSalesFn.OUTPUT_TAG_POOR_SELLER))

        top_seller = split_item_result[
            SplitItemBasedOnSalesFn.OUTPUT_TAG_TOP_SELLER]
        poor_seller = split_item_result[
            SplitItemBasedOnSalesFn.OUTPUT_TAG_POOR_SELLER]

        # (top_seller | WriteToText(known_args.output + '_top_seller'))
        (poor_seller | WriteToText(known_args.output + '_poor_seller'))

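# `SplitItemBasedOnSalesFn` is not shown here. A sketch of a multi-output DoFn
# that would satisfy the .with_outputs(...) call above; the tab-separated input
# format and the 100-unit threshold are assumptions for illustration only.
import apache_beam as beam
from apache_beam import pvalue


class SplitItemBasedOnSalesFn(beam.DoFn):
    OUTPUT_TAG_TOP_SELLER = 'top_seller'
    OUTPUT_TAG_POOR_SELLER = 'poor_seller'

    def process(self, element):
        # Route each line to one of two tagged outputs based on units sold.
        item, units_sold = element.split('\t')
        if int(units_sold) >= 100:
            yield pvalue.TaggedOutput(self.OUTPUT_TAG_TOP_SELLER, element)
        else:
            yield pvalue.TaggedOutput(self.OUTPUT_TAG_POOR_SELLER, element)
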
def run(argv=None): """Run the workflow.""" parser = argparse.ArgumentParser() parser.add_argument('--output') parser.add_argument('--ignore_corpus', default='') parser.add_argument('--ignore_word', default='') parser.add_argument('--num_groups') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: group_ids = [] for i in range(0, int(known_args.num_groups)): group_ids.append('id' + str(i)) query_corpus = 'select UNIQUE(corpus) from publicdata:samples.shakespeare' query_word = 'select UNIQUE(word) from publicdata:samples.shakespeare' ignore_corpus = known_args.ignore_corpus ignore_word = known_args.ignore_word pcoll_corpus = p | 'read corpus' >> beam.io.ReadFromBigQuery( query=query_corpus) pcoll_word = p | 'read_words' >> beam.io.ReadFromBigQuery( query=query_word) pcoll_ignore_corpus = p | 'create_ignore_corpus' >> beam.Create( [ignore_corpus]) pcoll_ignore_word = p | 'create_ignore_word' >> beam.Create( [ignore_word]) pcoll_group_ids = p | 'create groups' >> beam.Create(group_ids) pcoll_groups = create_groups(pcoll_group_ids, pcoll_corpus, pcoll_word, pcoll_ignore_corpus, pcoll_ignore_word) # pylint:disable=expression-not-assigned pcoll_groups | WriteToText(known_args.output)
def process_entity(self, entity_name, pk):
    with beam.Pipeline(options=self.pipeline_options) as p:
        # First set up a stream for the data
        data = read_file(
            p,
            entity_name,
            self.get_staging('public.{0}'.format(entity_name)) + '*',
            pk)

        index = None
        try:
            # Also set up a stream for the index
            index = read_file(
                p,
                '{0}index'.format(entity_name),
                self.get_source_index('entity_{0}*'.format(entity_name)),
                pk)
        except IOError:
            logging.info("Could not open index, maybe doesn't exist")
            # create an empty pcollection, so we can at least run
            index = p | beam.Create([])

        # Generate business keys, checksum, dv_source, load_dtm
        preproc_data = data | 'preprocess_' + entity_name >> \
            beam.Map(add_cksum)

        # Group with index to be able to identify new, updated, deleted
        merge = ({'data': preproc_data, 'index': index}) | \
            'grouped_by_' + pk >> beam.CoGroupByKey()

        # Extract the modified data out of the records
        extract = merge \
            | 'filter_' + entity_name >> beam.Filter(unchanged_rows) \
            | 'extract_' + entity_name >> beam.Map(extract_data)

        # Write them out to disk in staging
        extract | 'Write_' + entity_name >> \
            WriteToText(
                self.get_psa_location('public.{0}'.format(entity_name)),
                coder=JsonCoder())

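# `JsonCoder` (used here and in a later snippet) is not defined above. A common
# minimal implementation, offered only as a sketch, serializes each element as a
# JSON line:
import json

import apache_beam as beam


class JsonCoder(beam.coders.Coder):
    """Encodes and decodes each element as a JSON string (sketch)."""

    def encode(self, value):
        return json.dumps(value).encode('utf-8')

    def decode(self, encoded):
        return json.loads(encoded.decode('utf-8'))
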
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='./kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as( SetupOptions).save_main_session = save_main_session # The pipeline will be run on exiting the with block. with beam.Pipeline(options=pipeline_options) as p: # Read the text file[pattern] into a PCollection. lines = p | 'Read' >> ReadFromText(known_args.input) counts = (lines | 'Split' >> (beam.FlatMap(lambda x: re.findall( r'[A-Za-z\]+')).with_output_types(unicode)) | 'PairWIthOne' >> beam.Map(lambda x: (x, 1)) | 'GroupAndSum' >> beam.CombinePerKey(sum)) # Format the counts into a PCollection of strings. def format_result(word, count): return '%s: %d' % (word, count) output = counts | 'Format' >> beam.MapTuple(format_result) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'Write' >> WriteToText(known_args.output)
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/data/kinglear.txt', help='Input file to process.') parser.add_argument( '--output', dest='output', # CHANGE 1/6: The Google Cloud Storage path is required # for outputting the results. default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX', help='Output file to write results to.') parser.add_argument( '--pid', dest='pid', help='project id') parser.add_argument( '--mbucket', dest='mbucket', help='model bucket name') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session # The pipeline will be run on exiting the with block. with beam.Pipeline(options=pipeline_options) as p: # Read the text file[pattern] into a PCollection. prediction_data = (p | 'CreatePCollection' >> beam.Create([known_args.input]) | 'ReadCSVFle' >> beam.FlatMap(get_csv_reader)) output = (prediction_data | 'Predict' >> beam.ParDo(MyPredictDoFn(project_id=known_args.pid, bucket_name=known_args.mbucket))) output | 'WritePR' >> WriteToText(known_args.output)
def run():
    PROJECT_ID = 'starry-center-266501'  # change to your project id

    # creating pipeline through direct runner
    # Project ID is required when using the BQ source
    options = {
        'project': PROJECT_ID
    }
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    # Create beam pipeline using local runner
    p = beam.Pipeline('DirectRunner', options=opts)

    # passing a query. Shouldn't process more than 1000 records w DR
    # writers is an array of strings, tConst is a string
    sql = 'SELECT writers, tConst FROM imdb_modeled.Writes limit 100'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    # direct runner is not running in parallel on several workers. DR is local
    # read results and assign them to a new PCollection
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # apply ParDo to split the writers per title
    # call pardo, pipe query results to pardo
    split_writers_pcoll = query_results | 'Return title: writer dictionaries' >> beam.ParDo(
        SplitWritersFn())

    # write PCollection to log file
    split_writers_pcoll | 'Write log 1' >> WriteToText('formatted_titles_pcoll.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Writes_Beam'
    schema_id = 'writer:STRING, tConst:STRING'

    # write PCollection to new BQ table
    split_writers_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()

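# `SplitWritersFn` is not included here. Based on the comments above (writers is
# an array of strings, tConst is a string), a sketch might fan each query row
# out into one {writer, tConst} dict per writer; the exact cleanup logic is an
# assumption.
import apache_beam as beam


class SplitWritersFn(beam.DoFn):
    def process(self, element):
        tconst = element['tConst']
        for writer in element['writers'] or []:
            yield {'writer': writer, 'tConst': tconst}
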
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='application',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************

    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS case_id, * FROM (SELECT emp.employer_id, CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE AS DECISION_DATE, VISA_CLASS FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Application GROUP BY CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE, VISA_CLASS, employer_name, employer_city HAVING count = 1) AS app JOIN H_1B_refined.Employer_Dataflow AS emp ON emp.employer_name = app.employer_name AND emp.employer_city = app.employer_city) AS t"
    bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Application Transformation' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_application.txt')

    # ***************************************** INSERT INTO BQ ****************************************************

    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Application_Dataflow'
    schema_id = 'case_id:INTEGER, employer_id:INTEGER, CASE_NUMBER:STRING, CASE_STATUS:STRING, CASE_SUBMITTED:DATE, DECISION_DATE:DATE, VISA_CLASS:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()

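# `NoDuplicates` (used in this and the following BigQuery snippets) is not
# defined here. Since the SQL already filters duplicates with HAVING count = 1,
# a plausible sketch is a pass-through DoFn that simply re-emits each query row;
# the real class may do additional per-row cleanup.
import apache_beam as beam


class NoDuplicates(beam.DoFn):
    def process(self, element):
        # Each element is a dict produced by ReadFromBigQuery.
        yield element
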
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: # Read the text file[pattern] into a PCollection. lines = p | ReadFromText(known_args.input) # Count the occurrences of each word. counts = (lines | 'Split' >> (beam.FlatMap(lambda x: re.findall( r'[A-Za-z\']+', x)).with_output_types(unicode)) | 'PairWithOne' >> beam.Map(lambda x: (x, 1)) | 'GroupAndSum' >> beam.CombinePerKey(sum)) # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '%s: %s' % (word, count) output = counts | 'Format' >> beam.Map(format_result) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | WriteToText(known_args.output, shard_name_template='')
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | 'read' >> ReadFromText(known_args.input) # Count the occurrences of each word. counts = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()) .with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(known_args.output) # Actually run the pipeline (all operations above are deferred). result = p.run() result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://loganalysis/error_log.txt',
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        default='gs://loganalysis/output',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DirectRunner',
        '--project=springmldemoproject',
        '--staging_location=gs://loganalysis/staging',
        '--temp_location=gs://loganalysis/temp',
        '--job_name=log-job',
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(known_args.input)

        counts = (
            lines
            | 'window' >> beam.WindowInto(window.GlobalWindows())
            # Extract timestamps such as "10:35", "10:35:59" or "9:05 PM".
            | 'Split' >> (beam.FlatMap(lambda x: re.findall(
                r'((?:(?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):(?:[0-5][0-9])(?::[0-5][0-9])?(?:\s?(?:am|AM|pm|PM))?)',
                x)).with_output_types(str))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | 'GroupAndSum' >> beam.CombinePerKey(sum))

        def format_result(time_count):
            (time, count) = time_count
            return '%s: %s' % (time, count)

        output = counts | 'Format' >> beam.Map(format_result)
        output | WriteToText(known_args.output)

def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='ownership',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************

    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) as ownership_id, * FROM (SELECT * FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Ownership GROUP BY occ_code, occ_title, ownership, naics_title, grp, tot_emp, emp_prse, h_mean, a_mean, mean_prse, a_pct10, a_pct25, a_median, a_pct75, a_pct90 HAVING count = 1)) as t"
    bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Ownership' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_ownership.txt')

    # ***************************************** INSERT INTO BQ ****************************************************

    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Ownership_Dataflow'
    schema_id = 'ownership_id:INTEGER, occ_code:STRING, occ_title:STRING, ownership:STRING, naics_title:STRING, grp:STRING, tot_emp:INTEGER, emp_prse:FLOAT, h_mean:FLOAT, a_mean:INTEGER, mean_prse:FLOAT, a_pct10:INTEGER, a_pct25:INTEGER, a_median:INTEGER, a_pct75:INTEGER, a_pct90:INTEGER'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()

def run_pipeline(source, target):
    header = get_header(source)
    fields = header.split(CSV_DELIMITER)

    (bq_schema, schema) = get_schema(target)

    input_path = 'gs://dotz-hiring-datalake/raw/{}.csv'.format(source)
    output_path = 'gs://dotz-hiring-datalake/processed/{}.json/part'.format(
        target)

    pipeline_args = [
        '--job_name={}-{}'.format(target, str(time.time()).replace('.', '-')),
        '--input={}'.format(input_path),
        '--output={}'.format(output_path)
    ]
    pipeline_args.extend(BASE_PIPELINE_ARGS)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        lines = pipeline | ReadFromText(input_path)

        # not so bright way to remove a CSV header
        lines = lines | 'RemoveHeader' >> beam.Filter(
            lambda line: line != header)

        objs = lines | 'CSV2JSON' >> beam.Map(csv2json(fields))
        proc_objs = objs | 'ProcessJSONs' >> beam.Map(process(schema))
        filtered_proc_objs = proc_objs | 'FilterEmpties' >> beam.Filter(
            lambda x: x)
        dumped_objs = filtered_proc_objs | 'DumpJSONs' >> beam.Map(json.dumps)

        dumped_objs | WriteToText(output_path)
        filtered_proc_objs | WriteToBigQuery(
            'dotz-hiring:tubulation.{}'.format(target),
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=BigQueryDisposition.CREATE_NEVER)

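# `csv2json(fields)` is applied above as a curried function but is not defined
# in this snippet. A hypothetical implementation returns a per-line mapper that
# zips the header fields with the parsed CSV values; the delimiter default here
# is an assumption.
import csv


def csv2json(fields, delimiter=','):
    """Returns a function mapping one CSV line to a field->value dict (sketch)."""

    def convert(line):
        values = next(csv.reader([line], delimiter=delimiter))
        return dict(zip(fields, values))

    return convert
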
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='employer',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************

    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS employer_id, * FROM (SELECT employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Employer GROUP BY employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator HAVING count = 1)) AS t"
    bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Employer' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_employer.txt')

    # ***************************************** INSERT INTO BQ ****************************************************

    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Employer_Dataflow'
    schema_id = 'employer_id:INTEGER, employer_name:STRING, employer_address:STRING, employer_city:STRING, employer_state:STRING, employer_postal_code:STRING, employer_country:STRING, employer_province:STRING, h_1b_dependent:BOOLEAN, willful_violator:BOOLEAN'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()

def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        required=True,
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file
        lines = p | 'Read' >> ReadFromText(known_args.input)

        counts = (
            lines
            | 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | 'GroupAndSum' >> beam.CombinePerKey(sum))

        def format_result(word, count):
            return '%s: %d' % (word, count)

        output = counts | 'Format' >> beam.MapTuple(format_result)

        # Write the word counts to the output file.
        output | 'Write' >> WriteToText(known_args.output)

def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True
    opt = options.view_as(_Options)
    inputs = opt.inputs
    output_prefix = opt.output_prefix or os.path.join(
        options.view_as(GoogleCloudOptions).temp_location, 'output')
    shards = opt.shards

    p = Pipeline(options=options)

    def generate(n):
        yield from range(n * _ELEMENTS_PER_INPUT, (n + 1) * _ELEMENTS_PER_INPUT)

    (p
     | Create(range(inputs))
     | ParDo(generate).with_output_types(int)
     | WriteToText(output_prefix, num_shards=shards))

    p.run()

def examples_wordcount_templated(renames):
    """Templated WordCount example snippet."""
    import re

    import apache_beam as beam
    from apache_beam.io import ReadFromText
    from apache_beam.io import WriteToText
    from apache_beam.options.pipeline_options import PipelineOptions

    # [START example_wordcount_templated]
    class WordcountTemplatedOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            # Use add_value_provider_argument for arguments to be templatable
            # Use add_argument as usual for non-templatable arguments
            parser.add_value_provider_argument(
                '--input', help='Path of the file to read from')
            parser.add_argument('--output',
                                required=True,
                                help='Output file to write results to.')

    pipeline_options = PipelineOptions(['--output', 'some/output_path'])
    p = beam.Pipeline(options=pipeline_options)

    wordcount_options = pipeline_options.view_as(WordcountTemplatedOptions)
    lines = p | 'Read' >> ReadFromText(wordcount_options.input)
    # [END example_wordcount_templated]

    (
        lines
        | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r"[A-Za-z']+", x))
        | 'PairWithOnes' >> beam.Map(lambda x: (x, 1))
        | 'Group' >> beam.GroupByKey()
        | 'Sum' >> beam.MapTuple(lambda word, ones: (word, sum(ones)))
        | 'Format' >> beam.MapTuple(lambda word, c: '%s: %s' % (word, c))
        | 'Write' >> WriteToText(wordcount_options.output))

    p.visit(SnippetUtils.RenameFiles(renames))
    result = p.run()
    result.wait_until_finish()

def run(args=None, save_main_session=True):
    """Runs the workflow computing total points from a collection of matches."""
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True, help='Input file to process.')
    parser.add_argument('--output', required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(args)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:
        # Register the custom coder for the Player class, so that it will be
        # used in the computation.
        coders.registry.register_coder(Player, PlayerCoder)

        (p  # pylint: disable=expression-not-assigned
         | ReadFromText(known_args.input)
         # The get_players function is annotated with a type hint above, so the
         # type system knows the output type of the following operation is a
         # key-value pair of a Player and an int. Please see the documentation
         # for details on types that are inferred automatically as well as
         # other ways to specify type hints.
         | beam.Map(get_players)
         # The output type hint of the previous step is used to infer that the
         # key type of the following operation is the Player type. Since a
         # custom coder is registered for the Player class above, a PlayerCoder
         # will be used to encode Player objects as keys for this combine
         # operation.
         | beam.CombinePerKey(sum)
         | beam.Map(lambda k_v: '%s,%d' % (k_v[0].name, k_v[1]))
         | WriteToText(known_args.output))

def examples_ptransforms_templated(renames):
    # [START examples_ptransforms_templated]
    import apache_beam as beam
    from apache_beam.io import WriteToText
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.options.value_provider import StaticValueProvider

    class TemplatedUserOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_value_provider_argument('--templated_int', type=int)

    class MySumFn(beam.DoFn):
        def __init__(self, templated_int):
            self.templated_int = templated_int

        def process(self, an_int):
            yield self.templated_int.get() + an_int

    pipeline_options = PipelineOptions()
    p = beam.Pipeline(options=pipeline_options)

    user_options = pipeline_options.view_as(TemplatedUserOptions)
    my_sum_fn = MySumFn(user_options.templated_int)
    sum = (p
           | 'ReadCollection' >> beam.io.ReadFromText(
               'gs://some/integer_collection')
           | 'StringToInt' >> beam.Map(lambda w: int(w))
           | 'AddGivenInt' >> beam.ParDo(my_sum_fn)
           | 'WriteResultingCollection' >> WriteToText('some/output_path'))
    # [END examples_ptransforms_templated]

    # Templates are not supported by DirectRunner (only by DataflowRunner)
    # so a value must be provided at graph-construction time
    my_sum_fn.templated_int = StaticValueProvider(int, 10)

    p.visit(SnippetUtils.RenameFiles(renames))
    result = p.run()
    result.wait_until_finish()

def run(argv=None): """Runs the workflow computing total points from a collection of matches.""" parser = argparse.ArgumentParser() parser.add_argument('--input', required=True, help='Input file to process.') parser.add_argument('--output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: (p # pylint: disable=expression-not-assigned | 'read' >> ReadFromText(known_args.input, coder=JsonCoder()) | 'points' >> beam.FlatMap(compute_points) | beam.CombinePerKey(sum) | 'write' >> WriteToText(known_args.output, coder=JsonCoder()))
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** REMOVE DUPLICATES ****************************************************

    sql = "SELECT job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Occupation WHERE prevailing_wage_YR > 5000 AND length(soc_code) > 5 AND soc_code NOT LIKE '%-%' GROUP BY job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code HAVING count = 1) LIMIT 50"
    bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll_no_dup = query_results | 'Format prevailing_wage_YR and Remove dups' >> beam.ParDo(
        NoDuplicates())
    out_pcoll_fix_date = out_pcoll_no_dup | 'Format Date' >> beam.ParDo(
        FormatDate())
    out_pcoll = out_pcoll_fix_date | 'Format Soc' >> beam.ParDo(
        FormatSocCode())

    out_pcoll | 'Log output' >> WriteToText('output_occ_codeTest.txt')

    # ***************************************** INSERT INTO BQ ****************************************************

    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Occ_CodeTest'
    schema_id = 'job_title:STRING, employer_name:STRING, employer_city:STRING, employment_start_date:DATE, employment_end_date:DATE, soc_code:STRING, soc_title:STRING, prevailing_wage_YR:FLOAT, pw_wage_level:STRING, pw_wage_source:STRING, pw_wage_source_year:INTEGER, pw_wage_source_other:STRING, worksite_city:STRING, worksite_country:STRING, worksite_state:STRING, worksite_postal_code:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()

def run(argv=None):
    import os
    d = os.path.dirname(os.path.realpath(__file__))

    parser = argparse.ArgumentParser()
    parser.add_argument("--input",
                        dest="input",
                        default=os.path.join(d, "data", "auctions.txt"))
    parser.add_argument("--output", dest="output", required=True)
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        "--runner=DirectRunner",
        "--temp_location=/tmp/beam_tmp",
        "--job_name=test-job",
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(known_args.input)

        data = (
            lines
            | "Split" >> (beam.Map(lambda x: tuple(x.split("\t")))
                          .with_output_types(beam.typehints.Tuple[str, str]))
            | "Clean" >> beam.Map(
                lambda x: (AvgPrice.format_artis(x[0]), int(x[1].strip()))))

        counts = AvgPrice.computeAvgPerKey(data)

        def format_result(word_count):
            (word, count) = word_count
            return "%s: %s" % (word, count)

        output = counts | "Format" >> beam.Map(format_result)
        output | WriteToText(known_args.output)

def run():
    # set up location
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    # executed with DirectRunner
    p = beam.Pipeline('DirectRunner', options=opts)

    # retrieve the data from the imdb_refined dataset and save this information (location)
    sql = 'SELECT * FROM imdb_refined.Primary_Professions limit 250'
    bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True,
                                 gcs_location=BUCKET)

    # use the previously saved information (location) and read from BigQuery
    # query results is now the input PCollection
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    # Use ParDo to call function on query results
    out_pcoll = query_results | 'Split Primary Professions' >> beam.ParDo(
        SplitPrimaryProfessions())

    out_pcoll | 'Log output' >> WriteToText('output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Primary_Professions_Beam'
    schema_id = 'nconst:STRING,primaryProfession:STRING'

    # write to BigQuery using the location set above
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    # run and display results after everything is finished
    result = p.run()
    result.wait_until_finish()

def run(argv=None):
    # pylint: disable=missing-docstring
    parser = argparse.ArgumentParser()
    parser.add_argument('--grid_size',
                        dest='grid_size',
                        default=1000,
                        help='Size of the NxN matrix')
    parser.add_argument(
        '--coordinate_output',
        dest='coordinate_output',
        required=True,
        help='Output file to write the color coordinates of the image to.')
    parser.add_argument('--image_output',
                        dest='image_output',
                        default=None,
                        help='Output file to write the resulting image to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        n = int(known_args.grid_size)

        coordinates = generate_julia_set_colors(
            p, complex(-.62772, .42193), n, 100)

        def x_coord_key(x_y_i):
            (x, y, i) = x_y_i
            return (x, (x, y, i))

        # Group each coordinate triplet by its x value, then write the
        # coordinates to the output file with an x-coordinate grouping per line.
        # pylint: disable=expression-not-assigned
        (coordinates
         | 'x coord key' >> beam.Map(x_coord_key)
         | 'x coord' >> beam.GroupByKey()
         | 'format' >> beam.Map(
             lambda k_coords: ' '.join('(%s, %s, %s)' % c for c in k_coords[1]))
         | WriteToText(known_args.coordinate_output))

def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id

    # Project ID is required when using the BQ source
    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    # Create beam pipeline using local runner
    p = beam.Pipeline('DirectRunner', options=opts)

    sql = 'SELECT * FROM covid_19_modeled.Event'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # format timestamp
    ts_pcoll = query_results | 'Format Timestamp' >> beam.ParDo(
        FormatTimestampFn())

    # write new PCollection to log file
    ts_pcoll | 'Write log' >> WriteToText('ts_pcoll.txt')

    dataset_id = 'covid_19_modeled'
    table_id = 'Event_Beam'
    schema_id = '''location_id:INTEGER,last_update:DATETIME,
                   confirmed:INTEGER,deaths:INTEGER,recovered:INTEGER'''

    # write new PCollection to BQ table
    ts_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()

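# `FormatTimestampFn` is not shown here. A sketch consistent with the target
# schema (last_update:DATETIME): reformat the source timestamp string into
# BigQuery's 'YYYY-MM-DDTHH:MM:SS' form. The source format string below is an
# assumption and would need to match the actual data.
from datetime import datetime

import apache_beam as beam


class FormatTimestampFn(beam.DoFn):
    def process(self, element):
        last_update = element.get('last_update')
        if isinstance(last_update, str):
            # Assumed source format, e.g. '03/22/2020 23:45'.
            parsed = datetime.strptime(last_update, '%m/%d/%Y %H:%M')
            element['last_update'] = parsed.strftime('%Y-%m-%dT%H:%M:%S')
        yield element
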
def run(p, input_file, output_file):
    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(input_file)

    counts = (lines
              | 'split' >> (beam.ParDo(WordExtractingDoFn())
                            .with_output_types(bytes))
              | 'count' >> beam.ExternalTransform(
                  'beam:transforms:xlang:count', None, EXPANSION_SERVICE_ADDR))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(output_file)

    result = p.run()
    result.wait_until_finish()