def main(gcs_path, out, start=None, end=None, pipeline_args=None):
    # Ordered processing steps; a run can resume at `start` and stop at `end`.
    steps = [
        apache_beam.FlatMap('Parse XML and filter', parse_xml),
        apache_beam.Map(
            'Coerce "wikitext" key to string type',
            force_string_function('wikitext')),
        apache_beam.FlatMap('Parse markdown into plaintext', parse_wikitext),
        apache_beam.Map(
            'Coerce "text" key to string type',
            force_string_function('text')),
        apache_beam.Map('Filter out any vestigial HTML', html_to_text),
        core.ParDo('batch', BatchFn(10)),
        apache_beam.FlatMap('Entities (batch)', analyze_entities_batch),
    ]

    p = apache_beam.Pipeline(argv=pipeline_args)

    if start:
        # Resume from intermediate JSON dumped to GCS by an earlier partial run.
        value = p | apache_beam.Read(
            'Pick up at step {}'.format(start),
            apache_beam.io.TextFileSource(gcs_path)) | \
            apache_beam.Map('Parse JSON', json.loads)
    else:
        value = p | apache_beam.Read(
            'Read XML', custom_sources.XmlFileSource('page', gcs_path))

    for step in steps[start:end]:
        value = value | step

    if end:
        # Partial run: dump intermediate JSON to GCS so a later run can resume.
        if not out.startswith('gs://'):
            raise ValueError('Output must be GCS path if an end is specified.')
        value = value | apache_beam.Map('to JSON', json.dumps) | \
            apache_beam.Write('Dump to GCS', apache_beam.io.TextFileSink(out))
    else:
        value = value | apache_beam.Write(
            'Dump metadata to BigQuery',
            apache_beam.io.BigQuerySink(
                out,
                schema=', '.join([
                    'article_id:STRING',
                    'article_title:STRING',
                    'article_sentiment_polarity:FLOAT',
                    'article_sentiment_magnitude:FLOAT',
                    'entity_name:STRING',
                    'entity_type:STRING',
                    'entity_wikipedia_url:STRING',
                    'entity_salience:FLOAT',
                    'entity_num_mentions:INTEGER',
                ]),
                create_disposition=(
                    apache_beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                write_disposition=(
                    apache_beam.io.BigQueryDisposition.WRITE_APPEND)))

    p.run()
def __run_ingestion(self, storage_input_path, columns, output_table):
    (self.pipeline
     | output_table + ': read table' >> ReadFromAvro(storage_input_path)
     | output_table + ': filter columns' >> beam.Map(
         self.__filter_columns, columns=columns)
     | output_table + ': write to BigQuery' >> beam.Write(
         beam.io.BigQuerySink(
             output_table,
             create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))
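# Hypothetical call site (the bucket, columns, and table below are illustrative,
# not from the source): from another method of the same class, each destination
# table gets its own labeled branch of the shared pipeline:
#
#     self.__run_ingestion('gs://my-bucket/export/accounts/*.avro',
#                          ['id', 'name', 'created_at'],
#                          'my_dataset.accounts')
#
# CREATE_NEVER assumes the destination BigQuery table already exists with a
# schema matching the filtered Avro columns.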
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=True,
                        help='Input file to process.')
    parser.add_argument('--output', dest='output', required=True,
                        help='Output file to write results to.')
    parser.add_argument('--model', dest='model', required=True,
                        help='Checkpoint file of the model.')
    parser.add_argument('--source', dest='source', required=True,
                        help='Data source location (cs|bq).')
    known_args, pipeline_args = parser.parse_known_args(argv)

    if known_args.source == 'cs':
        # Cloud Storage source: each input line is "<key>:<image>".
        def _to_dictionary(line):
            result = {}
            result['key'], result['image'] = line.split(':')
            return result

        p = beam.Pipeline(argv=pipeline_args)
        images = (p
                  | 'ReadFromText' >> beam.io.ReadFromText(known_args.input)
                  | 'ConvertToDict' >> beam.Map(_to_dictionary))
        predictions = images | 'Prediction' >> beam.ParDo(
            PredictDoFn(), known_args.model)
        predictions | 'WriteToText' >> beam.io.WriteToText(known_args.output)
    else:
        # BigQuery source: output schema has one FLOAT column per class.
        schema = 'key:INTEGER'
        for i in range(10):
            schema += (', pred%d:FLOAT' % i)
        p = beam.Pipeline(argv=pipeline_args)
        images = p | 'ReadFromBQ' >> beam.Read(
            beam.io.BigQuerySource(known_args.input))
        predictions = images | 'Prediction' >> beam.ParDo(
            PredictDoFn(), known_args.model)
        predictions | 'WriteToBQ' >> beam.Write(
            beam.io.BigQuerySink(
                known_args.output,
                schema=schema,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    logging.getLogger().setLevel(logging.INFO)
    p.run()
def main(src_path, dest_table, pipeline_args):
    p = apache_beam.Pipeline(argv=pipeline_args)

    value = p | 'Read JSON' >> apache_beam.Read(JsonFileSource(src_path))
    value |= (
        'Remove records that lack location or year data' >>
        apache_beam.FlatMap(discard_incomplete))
    value |= (
        'Convert string values to their types' >>
        apache_beam.Map(convert_types))
    value |= (
        'Filter bad data' >> apache_beam.FlatMap(filter_suspicious))
    value |= (
        'Massage fields with "rec" prefix' >> apache_beam.Map(massage_rec))
    value |= (
        'Dump data to BigQuery' >> apache_beam.Write(
            apache_beam.io.BigQuerySink(
                dest_table,
                schema=', '.join([
                    'fall:STRING',
                    'year:INTEGER',
                    'nametype:STRING',
                    'mass:FLOAT',
                    'name:STRING',
                    'class:STRING',
                    'latitude:FLOAT',
                    'longitude:FLOAT',
                    'id:STRING',
                ]),
                create_disposition=(
                    apache_beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                write_disposition=(
                    apache_beam.io.BigQueryDisposition.WRITE_TRUNCATE))))

    p.run()
    (SELECT __key__.id as accnt_id
     FROM [lead-pages:leadpages.Account_cleansed]
     LIMIT 100)"""

options = PipelineOptions(flags=sys.argv)

# For Cloud execution, set the Cloud Platform project, job_name,
# staging location, temp_location and specify DataflowRunner.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = project
google_cloud_options.job_name = 'lp-analysis'
google_cloud_options.staging_location = 'gs://lp_activity_transform/staging'
google_cloud_options.temp_location = 'gs://lp_activity_transform/temp'
options.view_as(StandardOptions).runner = 'DataflowRunner'

p = beam.Pipeline(options=options)

(p
 | 'read' >> beam.Read(beam.io.BigQuerySource(query=input_query))
 | 'cast ints' >> beam.Map(lambda row: (row['account_id'], int(row['views'])))
 | beam.CombinePerKey(sum)
 | 'format for gbq' >> beam.Map(lambda (k, v): {
     'account_id': k,
     'total_views': v
 })
 | 'save' >> beam.Write(
     beam.io.BigQuerySink(
         output_table,
         schema='account_id:INTEGER, total_views:INTEGER',
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

p.run()
        'gamma': 1.2
    }
    model = xgb.train(best_params, dtrain, num_boost_round=1000,
                      evals=watchlist, evals_result=evals_result,
                      verbose_eval=True)
    test.loc[:, "predict"] = model.predict(dtest)
    return test[["shop_id", "date", "predict", "sales"]].to_dict(orient='records')


(pipeline
 | "Query data" >> beam.Read(beam.io.BigQuerySource(query=query))
 | "Assign time" >> beam.Map(assign_timevalue)
 | "Set window" >> beam.WindowInto(window.SlidingWindows(size=3, period=1))
 # The constant key means every element in a window lands in one group,
 # so learn_predict receives a whole window's rows at once.
 | "Set group key" >> beam.Map(lambda v: ('shop_id', v))
 | beam.GroupByKey()
 | "Learn and predict" >> beam.FlatMap(learn_predict)
 | "Write data" >> beam.Write(
     beam.io.BigQuerySink(
         'dataset.table',
         schema="shop_id:STRING, date:STRING, predict:FLOAT, sales:INTEGER",
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)))

pipeline.run()
import apache_beam as beam

project = 'teak-proton-148317'
input_table = 'clouddataflow-readonly:samples.weather_stations'
output_table = 'mydataset.weather_copy_from_dataflow1'

p = beam.Pipeline(argv=['--project', project])

read = beam.Read(beam.io.BigQuerySource(input_table))
tornadoesMonths = beam.FlatMap(
    lambda row: [(int(row['month']), 1)] if row['tornado'] else [])
monthlyCount = beam.CombinePerKey(sum)
frmat = beam.Map(lambda (k, v): {'month': k, 'tornado_count': v})
sve = beam.Write(
    beam.io.BigQuerySink(
        output_table,
        schema='month:INTEGER, tornado_count:INTEGER',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

(p | read | tornadoesMonths | monthlyCount | frmat | sve)
p.run()
import apache_beam as beam

p = beam.Pipeline('DirectPipelineRunner')

# Read a file containing names, add a greeting to each name, and write to a file.
"""(p
 | 'load names' >> beam.Read(beam.io.TextFileSource(
     'gs://dataflow-samples/shakespeare/kinglear.txt'))
 | 'add greeting' >> beam.Map(lambda name, msg: '%s, %s!' % (msg, name), 'Hello')
 | 'save' >> beam.Write(beam.io.TextFileSink('./greetings')))
"""

read = beam.Read(beam.io.TextFileSource('./names'))
maps = beam.Map(lambda a, b: "Hii:" + b, "Last Params")
wr = beam.Write(beam.io.TextFileSink('./namesoutput4'))

p | read | maps | wr
p.run()
def write_text_file(pcollection, label, output_name,
                    coder=beam.coders.ToStringCoder()):
    return pcollection | label >> beam.Write(beam.io.TextFileSink(
        os.path.join(args.output_dir, output_name),
        shard_name_template='',
        coder=coder))
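# A minimal usage sketch (hypothetical names, not from the source): with
# `args.output_dir` pointing at a writable directory and `lines` being a
# PCollection of strings, the helper is called as a plain function rather
# than applied with `|`:
#
#     write_text_file(lines, 'write results', 'results.txt')
#
# Because shard_name_template='' the sink writes a single unsharded file,
# <args.output_dir>/results.txt.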
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = 'teak-proton-148317'
google_cloud_options.job_name = 'myjob'
google_cloud_options.staging_location = 'gs://sanjay-mybucket/binaries'
google_cloud_options.temp_location = 'gs://sanjay-mybucket/temp'
options.view_as(StandardOptions).runner = 'DataflowPipelineRunner'

p = beam.Pipeline('DataflowPipelineRunner', options)

(p
 | 'read' >> beam.Read(
     beam.io.TextFileSource('gs://dataflow-samples/shakespeare/kinglear.txt'))
 | 'split' >> beam.FlatMap(lambda x: re.findall(r'\w+', x))
 | 'count words' >> beam.combiners.Count.PerElement()
 | 'save' >> beam.Write(beam.io.TextFileSink('./word_count')))

p.run()