def testTFlitePredictExtractorWithKerasModel(self, multi_model, multi_output): input1 = tf.keras.layers.Input(shape=(1, ), name='input1') input2 = tf.keras.layers.Input(shape=(1, ), name='input2') inputs = [input1, input2] input_layer = tf.keras.layers.concatenate(inputs) output_layers = {} output_layers['output1'] = (tf.keras.layers.Dense( 1, activation=tf.nn.sigmoid, name='output1')(input_layer)) if multi_output: output_layers['output2'] = (tf.keras.layers.Dense( 1, activation=tf.nn.sigmoid, name='output2')(input_layer)) model = tf.keras.models.Model(inputs, output_layers) model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001), loss=tf.keras.losses.binary_crossentropy, metrics=['accuracy']) train_features = {'input1': [[0.0], [1.0]], 'input2': [[1.0], [0.0]]} labels = {'output1': [[1], [0]]} if multi_output: labels['output2'] = [[1], [0]] example_weights = {'output1': [1.0, 0.5]} if multi_output: example_weights['output2'] = [1.0, 0.5] dataset = tf.data.Dataset.from_tensor_slices( (train_features, labels, example_weights)) dataset = dataset.shuffle(buffer_size=1).repeat().batch(2) model.fit(dataset, steps_per_epoch=1) converter = tf.compat.v2.lite.TFLiteConverter.from_keras_model(model) tflite_model = converter.convert() tflite_model_dir = tempfile.mkdtemp() with tf.io.gfile.GFile(os.path.join(tflite_model_dir, 'tflite'), 'wb') as f: f.write(tflite_model) model_specs = [ config_pb2.ModelSpec(name='model1', model_type='tf_lite') ] if multi_model: model_specs.append( config_pb2.ModelSpec(name='model2', model_type='tf_lite')) eval_config = config_pb2.EvalConfig(model_specs=model_specs) eval_shared_models = [ self.createTestEvalSharedModel( model_name='model1', eval_saved_model_path=tflite_model_dir, model_type='tf_lite') ] if multi_model: eval_shared_models.append( self.createTestEvalSharedModel( model_name='model2', eval_saved_model_path=tflite_model_dir, model_type='tf_lite')) schema = text_format.Parse( """ feature { name: "input1" type: FLOAT } feature { name: "non_model_feature" type: INT } """, schema_pb2.Schema()) tfx_io = test_util.InMemoryTFExampleRecord( schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN) feature_extractor = features_extractor.FeaturesExtractor(eval_config) predictor = tflite_predict_extractor.TFLitePredictExtractor( eval_config=eval_config, eval_shared_model=eval_shared_models) examples = [ self._makeExample(input1=0.0, non_model_feature=0), self._makeExample(input1=1.0, non_model_feature=1), ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples], reshuffle=False) | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2) | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts() | feature_extractor.stage_name >> feature_extractor.ptransform | predictor.stage_name >> predictor.ptransform) # pylint: enable=no-value-for-parameter def check_result(got): try: self.assertLen(got, 1) got = got[0] self.assertIn(constants.PREDICTIONS_KEY, got) self.assertLen(got[constants.PREDICTIONS_KEY], 2) for item in got[constants.PREDICTIONS_KEY]: if multi_model: self.assertIn('model1', item) self.assertIn('model2', item) if multi_output: self.assertIn('Identity', item['model1']) self.assertIn('Identity_1', item['model1']) elif multi_output: self.assertIn('Identity', item) self.assertIn('Identity_1', item) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(result, check_result, label='result')
def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self): temp_eval_export_dir = self._getEvalExportDir() _, eval_export_dir = linear_classifier.simple_linear_classifier( None, temp_eval_export_dir) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=eval_export_dir, add_metrics_callbacks=[ _addExampleCountMetricCallback, # Note that since everything runs in-process this doesn't # actually test that the py_func can be correctly recreated # on workers in a distributed context. _addPyFuncMetricCallback, post_export_metrics.example_count(), post_export_metrics.example_weight(example_weight_key='age') ]) extractors = [ predict_extractor.PredictExtractor(eval_shared_model), slice_key_extractor.SliceKeyExtractor() ] with beam.Pipeline() as pipeline: example1 = self._makeExample(age=3.0, language='english', label=1.0) example2 = self._makeExample(age=3.0, language='chinese', label=0.0) example3 = self._makeExample(age=4.0, language='english', label=1.0) example4 = self._makeExample(age=5.0, language='chinese', label=0.0) (metrics, plots), _ = ( pipeline | 'Create' >> beam.Create([ example1.SerializeToString(), example2.SerializeToString(), example3.SerializeToString(), example4.SerializeToString() ]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'Extract' >> tfma_unit.Extract(extractors=extractors) # pylint: disable=no-value-for-parameter | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model)) def check_result(got): try: self.assertEqual(1, len(got), 'got: %s' % got) (slice_key, value) = got[0] self.assertEqual((), slice_key) self.assertDictElementsAlmostEqual( got_values_dict=value, expected_values_dict={ 'accuracy': 1.0, 'label/mean': 0.5, 'my_mean_age': 3.75, 'my_mean_age_times_label': 1.75, 'added_example_count': 4.0, 'py_func_label_sum': 2.0, metric_keys.EXAMPLE_COUNT: 4.0, metric_keys.EXAMPLE_WEIGHT: 15.0 }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics, check_result, label='metrics') util.assert_that(plots, util.is_empty(), label='plots')
def run(argv=None, save_main_session=True):
  parser = argparse.ArgumentParser()
  parser.add_argument('--input', dest='input', help='Input file to process.')
  parser.add_argument(
      '--output-file',
      dest='output_file',
      help='Output file to write results to.')
  parser.add_argument(
      '--output-topic',
      dest='output_topic',
      help='Output topic to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_args.extend([
      '--runner=DirectRunner',
      '--project=global-datacenter',
      # '--staging_location=/tmp/beam/staging',
      # '--temp_location=/tmp/beam/tmp',
      '--job_name=parse-twitter-job',
  ])

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  with beam.Pipeline(options=pipeline_options) as p:

    def as_feed(top):
      return json.dumps({
          "version": "https://jsonfeed.org/version/1",
          "title": "Trending Twitter Keywords",
          "home_page_url": "https://example.org/",
          "feed_url": "https://example.org/feed.json",
          "items": [
              {
                  "id": row[0],
                  "content_text": f"Keyword '{row[0]}' counted {row[1]}",
                  # TODO security: urlencode keyword
                  "url": f"https://twitter.com/search?q={row[0]}"
              } for row in top
          ]
      })

    texts = (p
             | 'Read' >> ReadFromText(known_args.input)
             | 'FromJSON' >> beam.Map(json.loads)
             | 'GetTexts' >> beam.Map(lambda x: x['data']['text']))

    feed = (
        texts
        | 'Split' >> (beam.FlatMap(
            lambda x: re.findall(r'[@#\w\']{6,}', x, re.UNICODE))
                      .with_output_types(str))
        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
        | 'GroupAndSum' >> beam.CombinePerKey(sum)
        | 'Top10' >> beam.transforms.combiners.Top.Of(10, key=lambda x: x[1])
        | 'AsFeed' >> beam.Map(as_feed))

    if known_args.output_file:
      unused = (feed | WriteToText(known_args.output_file))

    if known_args.output_topic:
      unused = (
          feed
          | 'Encode' >> beam.Map(
              lambda x: x.encode('utf-8')).with_output_types(bytes)
          | 'Publish' >> beam.io.WriteToPubSub(known_args.output_topic))
def test_test_transform(self):
  with beam.Pipeline() as p:
    assert_that(
        p | beam.Create(['a', 'b', 'c']) | _TestTransform('x', 'y'),
        equal_to(['xay', 'xby', 'xcy']))
def test_iobase_source(self):
  with beam.Pipeline(argv=self.args) as p:
    result = (
        p | 'read' >> beam.io.ReadFromBigQuery(
            query=self.query, use_standard_sql=True, project=self.project))
    assert_that(result, equal_to(self.get_expected_data()))
# coding=utf-8
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


class MyOptions(PipelineOptions):

  @classmethod
  def _add_argparse_args(cls, parser):
    parser.add_argument(
        '--output', default='./output.txt', help='Output for the pipeline')


if __name__ == '__main__':
  options = MyOptions()
  options.view_as(
      beam.options.pipeline_options.StandardOptions).runner = 'DirectRunner'

  p = beam.Pipeline(options=options)
  (p
   | 'create numbers' >> beam.Create([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
   | 'combine numbers' >> beam.CombineGlobally(sum)
   | 'write to text' >> beam.io.WriteToText(
       options.output, shard_name_template=""))
  p.run().wait_until_finish()
def run(argv=None): """The main function which creates the pipeline and runs it.""" parser = argparse.ArgumentParser() parser.add_argument('--model_identifier', required=True, help='Used to identify LTV model and append to table and file names', default='by_sample_id') #filter-campaign parser.add_argument('--stage-bucket', dest='stage_bucket', required=False, help='Staging bucket to use', default='ltv-dataflow') parser.add_argument('--data-bucket', dest='data_bucket', required=False, help='Data bucket to use', default='telemetry-to-gcp') parser.add_argument('--load_bq', required=False, help='True/False to load summary/details to bq', default=False) parser.add_argument('--estimate_model', required=False, help='True/False to estimate model params', default=False) parser.add_argument('--calculate_model', required=False, help='True/False to calculate ltv', default=False) parser.add_argument('--calculate_stats', required=False, help='True/False to calculate stats', default=False) parser.add_argument('--upload_stats', required=False, help='True/False to calculate stats', default=False) # to be merged in calc stats once i figure out wild card stuff parser.add_argument('--send_output', required=False, help='True/False to send ltv and aggr files to Marketing GCP', default=False) parser.add_argument('--delete_data', required=False, help='True/False to delete input data and BQ data', default=False) start = time.clock() # Parse arguments from the command line. known_args, pipeline_args = parser.parse_known_args(argv) logging.info('running beam_calc for model: ' + known_args.model_identifier) # read in the output bg table schema bg_out_schema= '' #schema_file = "gs://ltv-dataflow-dev/templates/input/calc_output_schema.json" schema_file = 'gs://{}/templates/input/calc_output_schema.json'.format(known_args.stage_bucket) with gcs.open(schema_file) as f: data = f.read() # Wrapping the schema in fields is required for the BigQuery API. bg_out_schema = '{"fields": ' + data + '}' schema = parse_table_schema_from_json(bg_out_schema) #schema = bigquery_tools.parse_table_schema_from_json(bg_out_schema) #logging.info(schema) #ltv_beam.py:306: BeamDeprecationWarning: parse_table_schema_from_json is deprecated since 2.11.0. Use bigquery_tools.parse_table_schema_from_json instead. # estimate LTV model min_sample_size = calculate_min_sample_size() #estimate_model(min_sample_size) pipeline_options = PipelineOptions(pipeline_args) #pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=PipelineOptions(pipeline_args)) # to be replaced with direct telem bq table to dataflow input? input_file_dir = 'gs://' + known_args.data_bucket + '/clv/' + known_args.model_identifier.replace('_','-') + '/{0}/*.parquet' # load parquet files into bq, overwrite (may not need if we can join datasets in Telemetry? 
hmmm, no, we need to get random selection) # use dummy singleton DoFn if no Parquet reader # much faster to directly load summary and details to bq - need in bq for quick join (not sure how fast join is as a pcollection) if known_args.load_bq: logging.info('loading bq') load_data_bq('ltv', 'summary_' + known_args.model_identifier, input_file_dir.format('summary')) load_data_bq('ltv', 'details_' + known_args.model_identifier, input_file_dir.format('details')) # run this to estimate LTV model parameters if known_args.estimate_model: logging.info('estimate model parameters'+known_args.estimate_model) serial_dummy = p | 'Read' >> beam.Create( ['serial_dummy'] ) | 'Estimate Lifetimes Model' >> beam.ParDo(ltv_calculate.EstimateLTVFn(min_sample_size,output_bucket='gs://ltv-dataflow-dev/tmp/',model_tag=known_args.model_identifier)) p.run().wait_until_finish() # fun estimation first before moving on to calculation # this has to be run on dataflow or will not upload to bq if known_args.calculate_model: logging.info('calculate ltv') data_query = ("SELECT * FROM ltv.summary_" + known_args.model_identifier) # + " ORDER BY RAND() LIMIT {}").format(100) ##data_query = ("SELECT * FROM ltv.summary_" + known_args.model_identifier + " ORDER BY RAND() LIMIT {}").format(100) (p | 'Read Orders from BigQuery ' >> beam.io.Read(beam.io.BigQuerySource(query=data_query, use_standard_sql=True)) | 'Apply Lifetimes Model' >> beam.ParDo( ltv_calculate.CalcLTVFn( dill.load(gcs.open("gs://" + known_args.stage_bucket + "/tmp/bgf_" + known_args.model_identifier + ".pkl", 'rb')), dill.load(gcs.open("gs://" + known_args.stage_bucket + "/tmp/ggf_" + known_args.model_identifier + ".pkl", 'rb')) ) ) | 'Write Data to BigQuery' >> beam.io.WriteToBigQuery("ltv.calc_" + known_args.model_identifier, schema=schema,create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE) ) p.run().wait_until_finish() # calc aggregate statistics if known_args.calculate_stats: logging.info('aggregate ltv') qry_outlier_stats = ("""SELECT count(distinct(client_id)) ct, avg(historical_searches) avg, stddev(historical_searches) std FROM ltv.summary_{0} WHERE client_id not in (SELECT client_id FROM ltv.details_{0} WHERE default_search_engine='google-nocodes')""").format(known_args.model_identifier) query_job = bq_client.query(qry_outlier_stats) outlier_stats = query_job.to_dataframe() # no need to go through query_job.result() #logging.info(outlier_stats.head(1)) ct = outlier_stats['ct'][0] if ct > 0: mu = outlier_stats['avg'][0] sigma = outlier_stats['std'][0] outliers_upper = str(mu + 2.5 * sigma) outliers_lower = str(mu - 2.5 * sigma) # anyway to specify computation/mem instensive machine here? 
if known_args.model_identifier=='filter_campaign': sid_list = [''] else: sid_list = ['10','33','53','89'] for sid in sid_list: if known_args.model_identifier=='filter_campaign': sid_qry = '' assert sid == '' else: sid_qry = 'AND l.sample_id='+sid (p | 'Read_Age'+sid >> beam.Create( ['another_serial_dummy_age'+sid] ) | 'AggrCustomerAgeFn'+sid >> beam.ParDo(ltv_aggregate.AggrCustomerAgeFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_E10'+sid >> beam.Create( ['another_serial_dummy_e10'+sid] ) | 'AggrE10ActivityGroupLocaleFn'+sid >> beam.ParDo(ltv_aggregate.AggrE10ActivityGroupLocaleFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_Geo'+sid >> beam.Create( ['another_serial_dummy_geo'+sid] ) | 'AggrGlobalGeoUserStatusFn'+sid >> beam.ParDo(ltv_aggregate.AggrGlobalGeoUserStatusFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_City'+sid >> beam.Create( ['another_serial_dummy_city'+sid] ) | 'AggrCityFn'+sid >> beam.ParDo(ltv_aggregate.AggrCityFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_ECB'+sid >> beam.Create( ['another_serial_dummy_ecb'+sid] ) | 'AggrEngineChannelBrowserFn'+sid >> beam.ParDo(ltv_aggregate.AggrEngineChannelBrowserFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_Mem'+sid >> beam.Create( ['another_serial_dummy_mem'+sid] ) | 'AggrMemoryFn'+sid >> beam.ParDo(ltv_aggregate.AggrMemoryFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_Attr1'+sid >> beam.Create( ['another_serial_dummy_attr1'+sid] ) | 'AggrAttributes1Fn'+sid >> beam.ParDo(ltv_aggregate.AggrAttributes1Fn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_Attr2'+sid >> beam.Create( ['another_serial_dummy_attr2'+sid] ) | 'AggrAttributes2Fn'+sid >> beam.ParDo(ltv_aggregate.AggrAttributes2Fn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_Attr3'+sid >> beam.Create( ['another_serial_dummy_attr3'+sid] ) | 'AggrAttributes3Fn'+sid >> beam.ParDo(ltv_aggregate.AggrAttributes3Fn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_OS1'+sid >> beam.Create( ['another_serial_dummy_os1'+sid] ) | 'AggrOS1Fn'+sid >> beam.ParDo(ltv_aggregate.AggrOS1Fn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_OS2'+sid >> beam.Create( ['another_serial_dummy_os2'+sid] ) | 'AggrOS2Fn'+sid >> 
beam.ParDo(ltv_aggregate.AggrOS2Fn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_SCD'+sid >> beam.Create( ['another_serial_dummy_scd'+sid] ) | 'AggrSyncConfiguredDesktopFn'+sid >> beam.ParDo(ltv_aggregate.AggrSyncConfiguredDesktopFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_SCM'+sid >> beam.Create( ['another_serial_dummy_scm'+sid] ) | 'AggrSyncConfiguredMobileFn'+sid >> beam.ParDo(ltv_aggregate.AggrSyncConfiguredMobileFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) (p | 'Read_BC'+sid >> beam.Create( ['another_serial_dummy_bc'+sid] ) | 'AggrBookmarksCountFn'+sid >> beam.ParDo(ltv_aggregate.AggrBookmarksCountFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket))) # compute ltv statistics # quantile compute https://stackoverflow.com/questions/46827512/efficient-algorithm-for-computing-quantiles-in-terabytes-dataset p.run().wait_until_finish() if known_args.upload_stats: # delete anything from bq aggr table before running aggr stats qry_truncate = ("DELETE from ltv.aggr_{} WHERE True").format(known_args.model_identifier) bq_client.query(qry_truncate) # load any files in ltv-dataflow/tmp containing self.model_tag; and delete them dataset_ref = bq_client.dataset('ltv') job_config = bigquery.LoadJobConfig() job_config.autodetect = True job_config.skip_leading_rows = 1 # The source format defaults to CSV, so the line below is optional. job_config.source_format = bigquery.SourceFormat.CSV uri = "gs://" + known_args.stage_bucket + "/tmp/" + known_args.model_identifier + '' + "/aggr_*.csv" try: load_job = bq_client.load_table_from_uri(uri, dataset_ref.table('aggr_'+known_args.model_identifier), job_config=job_config) # API request print("Starting job {}".format(load_job.job_id)) load_job.result() # Waits for table load to complete. print("Job finished.") destination_table = bq_client.get_table(dataset_ref.table('aggr_'+known_args.model_identifier)) print("Loaded {} rows.".format(destination_table.num_rows)) except ImportError: logging.info('uri: ' + uri) # push files to Marketing GCP in a GCF? # output parquet files if known_args.send_output: # test dataflow write parquet files --- SUCCESSFUL! 
we can output files in parquet format calc_query = ("SELECT client_id, sample_id FROM ltv.summary_{} LIMIT 10").format(known_args.model_identifier) #data_query = ("SELECT summ.*, det.* EXCEPT (client_id) FROM ltv.summary summ LEFT JOIN ltv.details det ON summ.client_id=det.client_id WHERE summ.client_id in ('3691929b0e07e22c86c1167c83ded58f481caf89','64ba414c3820805b1f64021cf3e082b091dec4f4')") # ugh need to differentiate by sample id sid = '' calc_fn = 'gs://' + known_args.stage_bucket + '/output/ltv_calc_' + known_args.model_identifier + sid (p | 'Read Orders from BigQuery ' >> beam.io.Read(beam.io.BigQuerySource(query=calc_query, use_standard_sql=True)) | 'Write Data to Parquet' >> beam.io.WriteToParquet(calc_fn, pa.schema([('client_id', pa.string()), ('sample_id', pa.int32())])) ) # https://arrow.apache.org/docs/python/data.html#type-metadata p.run().wait_until_finish() # if file exists, push(move) to marketing, else throw error # if all files are in output dir, then clean up bq and put _SUCCESS in dir which should trigger gcf # delete intermediary data from bq (and gcs?) if known_args.send_output: # delete anything in bq ltv tables qry_truncate = ("DELETE from ltv.summary_{} WHERE True").format(known_args.model_identifier) bq_client.query(qry_truncate) qry_truncate = ("DELETE from ltv.details_{} WHERE True").format(known_args.model_identifier) bq_client.query(qry_truncate) qry_truncate = ("DELETE from ltv.calc_{} WHERE True").format(known_args.model_identifier) bq_client.query(qry_truncate) qry_truncate = ("DELETE from ltv.aggr_{} WHERE True").format(known_args.model_identifier) bq_client.query(qry_truncate) # delete gcs data files? logging.info('ltv.run() runtime: ' + str(time.clock() - start))
def run(argv=None, save_main_session=True): """Main entry point to pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--corpus_home', dest='corpus_home', help='The directory or bucke of the corpus home') parser.add_argument('--input', dest='input', help='A single input file') parser.add_argument('--corpus_prefix', dest='corpus_prefix', help='Prefix after corpus home where the files are') parser.add_argument('--ignorelines', dest='ignorelines', help='Ignore lines containing these words') parser.add_argument('--output', dest='output', required=True, help='Output file') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session p = beam.Pipeline(options=pipeline_options) ignorepatterns = [] if known_args.ignorelines: ignorepatterns = load_ignore(known_args.ignorelines) if known_args.corpus_home: logging.info('corpus_home: %s', known_args.corpus_home) corpus_data_dir = '{}/data/corpus'.format(known_args.corpus_home) corpus_index = '{}/collections.csv'.format(corpus_data_dir) corpus_dir = known_args.corpus_home if known_args.corpus_prefix: corpus_dir = '{}/{}'.format(known_args.corpus_home, known_args.corpus_prefix) lines = (p | 'read_top_index' >> ReadFromText(corpus_index) | 'split_top_index' >> beam.ParDo(ExtractIndexEntry()) | 'add_prefix_corpus_data' >> beam.FlatMap(add_prefix, corpus_data_dir) | 'read_secondary_index' >> ReadAllFromText() | 'split_secondary_index' >> beam.ParDo(ExtractIndexEntry()) | 'add_prefix_corpus_dir' >> beam.FlatMap(add_prefix, corpus_dir) | 'read_files' >> ReadAllFromText()) else: lines = p | 'read' >> ReadFromText(known_args.input) # Count the occurrences of each character. def count_ones(char_ones): (c, ones) = char_ones return (c, sum(ones)) # Ignore counts for lines that are boilerplate (copyright notices, etc) re_patterns = [] for val in ignorepatterns: pat = '.*{}.*'.format(val) re_patterns.append(re.compile(pat, re.IGNORECASE)) def not_boilerplate(line): """true if the line does not match a boilerplate pattern """ for re_pattern in re_patterns: if re_pattern.match(line) != None: return False return True counts = (lines | 'filter' >> beam.Filter(not_boilerplate) | 'split' >> (beam.ParDo(CharBigramExtractingDoFn()) .with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(count_ones)) # Format the result def format_result(char_bigram_count): (char_bigram, count) = char_bigram_count return '%s\t%d' % (char_bigram, count) output = counts | 'format' >> beam.Map(format_result) output | 'write' >> WriteToText(known_args.output) result = p.run() result.wait_until_finish() if (not hasattr(result, 'has_job') or result.has_job): char_bigram_filter = MetricsFilter().with_name('char_bigrams') query_result = result.metrics().query(char_bigram_filter) if query_result['counters']: char_bigram_counter = query_result['counters'][0] logging.info('Total char bigrams: %d', char_bigram_counter.result)
def test_source_transform(self):
  path = self._write_data()
  with beam.Pipeline('DirectRunner') as p:
    assert_that(p | avroio.ReadFromAvro(path), equal_to(self.RECORDS))
def model_custom_source(count):
  """Demonstrates creating a new custom source and using it in a pipeline.

  Defines a new source ``CountingSource`` that produces integers starting from
  0 up to a given size. Uses the new source in an example pipeline.

  Additionally demonstrates how a source should be implemented using a
  ``PTransform``. This is the recommended way to develop sources that are to be
  distributed to a large number of end users.

  This method runs two pipelines.

  (1) A pipeline that uses ``CountingSource`` directly using the ``df.Read``
      transform.
  (2) A pipeline that uses a custom ``PTransform`` that wraps
      ``CountingSource``.

  Args:
    count: the size of the counting source to be used in the pipeline
           demonstrated in this method.
  """
  # Using the source in an example pipeline.
  # [START model_custom_source_use_new_source]
  with beam.Pipeline(options=PipelineOptions()) as p:
    numbers = p | 'ProduceNumbers' >> beam.io.Read(CountingSource(count))
    # [END model_custom_source_use_new_source]

    lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
    assert_that(
        lines, equal_to(['line ' + str(number) for number in range(0, count)]))

  # We recommend users to start Source classes with an underscore to discourage
  # using the Source class directly when a PTransform for the source is
  # available. We simulate that here by simply extending the previous Source
  # class.
  class _CountingSource(CountingSource):
    pass

  # [START model_custom_source_new_ptransform]
  class ReadFromCountingSource(PTransform):

    def __init__(self, count, **kwargs):
      super(ReadFromCountingSource, self).__init__(**kwargs)
      self._count = count

    def expand(self, pcoll):
      return pcoll | iobase.Read(_CountingSource(count))
  # [END model_custom_source_new_ptransform]

  # [START model_custom_source_use_ptransform]
  p = beam.Pipeline(options=PipelineOptions())
  numbers = p | 'ProduceNumbers' >> ReadFromCountingSource(count)
  # [END model_custom_source_use_ptransform]

  lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
  assert_that(
      lines, equal_to(['line ' + str(number) for number in range(0, count)]))

  # Don't test runner api due to pickling errors.
  p.run(test_runner_api=False).wait_until_finish()
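# A minimal sketch (assumption): ``CountingSource`` itself is not shown in the
# snippet above. A BoundedSource that emits the integers 0..count-1, along the
# lines of the example in the Beam documentation, could look roughly like this.
from apache_beam.io import iobase
from apache_beam.io.range_trackers import OffsetRangeTracker


class CountingSource(iobase.BoundedSource):

  def __init__(self, count):
    self._count = count

  def estimate_size(self):
    return self._count

  def get_range_tracker(self, start_position, stop_position):
    if start_position is None:
      start_position = 0
    if stop_position is None:
      stop_position = self._count
    return OffsetRangeTracker(start_position, stop_position)

  def read(self, range_tracker):
    # Claim each offset before emitting it so dynamic splitting stays correct.
    for i in range(range_tracker.start_position(),
                   range_tracker.stop_position()):
      if not range_tracker.try_claim(i):
        return
      yield i

  def split(self, desired_bundle_size, start_position=None, stop_position=None):
    if start_position is None:
      start_position = 0
    if stop_position is None:
      stop_position = self._count
    # Divide the [start, stop) range into bundles of the desired size.
    bundle_start = start_position
    while bundle_start < stop_position:
      bundle_stop = min(stop_position, bundle_start + desired_bundle_size)
      yield iobase.SourceBundle(
          weight=(bundle_stop - bundle_start),
          source=self,
          start_position=bundle_start,
          stop_position=bundle_stop)
      bundle_start = bundle_stop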
def model_custom_sink(simplekv, KVs, final_table_name_no_ptransform, final_table_name_with_ptransform): """Demonstrates creating a new custom sink and using it in a pipeline. Defines a new sink ``SimpleKVSink`` that demonstrates writing to a simple key-value based storage system which has following API. simplekv.connect(url) - connects to the storage system and returns an access token which can be used to perform further operations simplekv.open_table(access_token, table_name) - creates a table named 'table_name'. Returns a table object. simplekv.write_to_table(access_token, table, key, value) - writes a key-value pair to the given table. simplekv.rename_table(access_token, old_name, new_name) - renames the table named 'old_name' to 'new_name'. Uses the new sink in an example pipeline. Additionally demonstrates how a sink should be implemented using a ``PTransform``. This is the recommended way to develop sinks that are to be distributed to a large number of end users. This method runs two pipelines. (1) A pipeline that uses ``SimpleKVSink`` directly using the ``df.Write`` transform. (2) A pipeline that uses a custom ``PTransform`` that wraps ``SimpleKVSink``. Args: simplekv: an object that mocks the key-value storage. KVs: the set of key-value pairs to be written in the example pipeline. final_table_name_no_ptransform: the prefix of final set of tables to be created by the example pipeline that uses ``SimpleKVSink`` directly. final_table_name_with_ptransform: the prefix of final set of tables to be created by the example pipeline that uses a ``PTransform`` that wraps ``SimpleKVSink``. """ import apache_beam as beam from apache_beam.io import iobase from apache_beam.transforms.core import PTransform from apache_beam.options.pipeline_options import PipelineOptions # Defining the new sink. # [START model_custom_sink_new_sink] class SimpleKVSink(iobase.Sink): def __init__(self, url, final_table_name): self._url = url self._final_table_name = final_table_name def initialize_write(self): access_token = simplekv.connect(self._url) return access_token def open_writer(self, access_token, uid): table_name = 'table' + uid return SimpleKVWriter(access_token, table_name) def finalize_write(self, access_token, table_names): for i, table_name in enumerate(table_names): simplekv.rename_table(access_token, table_name, self._final_table_name + str(i)) # [END model_custom_sink_new_sink] # Defining a writer for the new sink. # [START model_custom_sink_new_writer] class SimpleKVWriter(iobase.Writer): def __init__(self, access_token, table_name): self._access_token = access_token self._table_name = table_name self._table = simplekv.open_table(access_token, table_name) def write(self, record): key, value = record simplekv.write_to_table(self._access_token, self._table, key, value) def close(self): return self._table_name # [END model_custom_sink_new_writer] final_table_name = final_table_name_no_ptransform # Using the new sink in an example pipeline. # [START model_custom_sink_use_new_sink] with beam.Pipeline(options=PipelineOptions()) as p: kvs = p | 'CreateKVs' >> beam.Create(KVs) kvs | 'WriteToSimpleKV' >> beam.io.Write( SimpleKVSink('http://url_to_simple_kv/', final_table_name)) # [END model_custom_sink_use_new_sink] # We recommend users to start Sink class names with an underscore to # discourage using the Sink class directly when a PTransform for the sink is # available. We simulate that here by simply extending the previous Sink # class. 
class _SimpleKVSink(SimpleKVSink): pass # [START model_custom_sink_new_ptransform] class WriteToKVSink(PTransform): def __init__(self, url, final_table_name, **kwargs): super(WriteToKVSink, self).__init__(**kwargs) self._url = url self._final_table_name = final_table_name def expand(self, pcoll): return pcoll | iobase.Write( _SimpleKVSink(self._url, self._final_table_name)) # [END model_custom_sink_new_ptransform] final_table_name = final_table_name_with_ptransform # [START model_custom_sink_use_ptransform] with beam.Pipeline(options=PipelineOptions()) as p: kvs = p | 'CreateKVs' >> beam.core.Create(KVs) kvs | 'WriteToSimpleKV' >> WriteToKVSink('http://url_to_simple_kv/', final_table_name)
  for f in [18, 20, 21]:  # wheelson, crsarrtime, arrtime
    fields[f], arrtz = as_utc(fields[0], fields[f], arr_timezone)

  for f in [17, 18, 20, 21]:
    fields[f] = add_24h_if_before(fields[f], fields[14])

  fields.extend(airport_timezones[dep_airport_id])
  fields[-1] = str(deptz)
  fields.extend(airport_timezones[arr_airport_id])
  fields[-1] = str(arrtz)

  yield ','.join(fields)


if __name__ == '__main__':
  with beam.Pipeline('DirectRunner') as pipeline:
    airports = (pipeline
                | 'airports:read' >> beam.io.ReadFromText('airports.csv.gz')
                | 'airports:fields' >> beam.Map(
                    lambda line: next(csv.reader([line])))
                | 'airports:tz' >> beam.Map(
                    lambda fields: (fields[0],
                                    addtimezone(fields[21], fields[26]))))

    flights = (pipeline
               | 'flights:read' >> beam.io.ReadFromText('201501_part.csv')
               | 'flights:tzcorr' >> beam.FlatMap(
                   tz_correct, beam.pvalue.AsDict(airports)))
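# A minimal sketch (assumption): ``addtimezone``, ``as_utc``, and
# ``add_24h_if_before`` are defined elsewhere in the original script. The
# timezone lookup typically maps an airport's latitude/longitude to an IANA
# timezone name; one hedged way to do that is with the ``timezonefinder``
# package.
def addtimezone(lat, lon):
  try:
    import timezonefinder  # imported in the function so it ships with the closure
    tf = timezonefinder.TimezoneFinder()
    return (lat, lon, tf.timezone_at(lng=float(lon), lat=float(lat)))
  except ValueError:
    # Header rows or malformed coordinates fall through with a placeholder.
    return (lat, lon, 'TIMEZONE')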
def run(argv=None): """Main entry point; defines and runs the hourly_team_score pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from') parser.add_argument('--subscription', type=str, help='Pub/Sub subscription to read from') parser.add_argument('--dataset', type=str, required=True, help='BigQuery Dataset to write tables to. ' 'Must already exist.') parser.add_argument( '--table_name', type=str, default='game_stats', help='The BigQuery table name. Should not already exist.') parser.add_argument('--fixed_window_duration', type=int, default=60, help='Numeric value of fixed window duration for user ' 'analysis, in minutes') parser.add_argument('--session_gap', type=int, default=5, help='Numeric value of gap between user sessions, ' 'in minutes') parser.add_argument( '--user_activity_window_duration', type=int, default=30, help='Numeric value of fixed window for finding mean of ' 'user session duration, in minutes') args, pipeline_args = parser.parse_known_args(argv) if args.topic is None and args.subscription is None: parser.print_usage() print(sys.argv[0] + ': error: one of --topic or --subscription is required') sys.exit(1) options = PipelineOptions(pipeline_args) # We also require the --project option to access --dataset if options.view_as(GoogleCloudOptions).project is None: parser.print_usage() print(sys.argv[0] + ': error: argument --project is required') sys.exit(1) fixed_window_duration = args.fixed_window_duration * 60 session_gap = args.session_gap * 60 user_activity_window_duration = args.user_activity_window_duration * 60 # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). options.view_as(SetupOptions).save_main_session = True # Enforce that this pipeline is always run in streaming mode options.view_as(StandardOptions).streaming = True with beam.Pipeline(options=options) as p: # Read game events from Pub/Sub using custom timestamps, which # are extracted from the data elements, and parse the data. if args.subscription: scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub( subscription=args.subscription) else: scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub( topic=args.topic) raw_events = (scores | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8')) | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn()) | 'AddEventTimestamps' >> beam.Map(lambda elem: beam.window.TimestampedValue( elem, elem['timestamp']))) # Extract username/score pairs from the event stream user_events = (raw_events | 'ExtractUserScores' >> beam.Map(lambda elem: (elem['user'], elem['score']))) # Calculate the total score per user over fixed windows, and cumulative # updates for late data spammers_view = ( user_events | 'UserFixedWindows' >> beam.WindowInto( beam.window.FixedWindows(fixed_window_duration)) # Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate. # These might be robots/spammers. | 'CalculateSpammyUsers' >> CalculateSpammyUsers() # Derive a view from the collection of spammer users. It will be used as # a side input in calculating the team score sums, below | 'CreateSpammersView' >> beam.CombineGlobally( beam.combiners.ToDictCombineFn()).as_singleton_view()) # [START filter_and_calc] # Calculate the total score per team over fixed windows, and emit cumulative # updates for late data. Uses the side input derived above --the set of # suspected robots-- to filter out scores from those users from the sum. 
# Write the results to BigQuery. (raw_events # pylint: disable=expression-not-assigned | 'WindowIntoFixedWindows' >> beam.WindowInto( beam.window.FixedWindows(fixed_window_duration)) # Filter out the detected spammer users, using the side input derived above | 'FilterOutSpammers' >> beam.Filter( lambda elem, spammers: elem['user'] not in spammers, spammers_view) # Extract and sum teamname/score pairs from the event data. | 'ExtractAndSumScore' >> ExtractAndSumScore('team') # [END filter_and_calc] | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict()) | 'WriteTeamScoreSums' >> WriteToBigQuery( args.table_name + '_teams', args.dataset, { 'team': 'STRING', 'total_score': 'INTEGER', 'window_start': 'STRING', 'processing_time': 'STRING', }, options.view_as(GoogleCloudOptions).project)) # [START session_calc] # Detect user sessions-- that is, a burst of activity separated by a gap # from further activity. Find and record the mean session lengths. # This information could help the game designers track the changing user # engagement as their set of game changes. (user_events # pylint: disable=expression-not-assigned | 'WindowIntoSessions' >> beam.WindowInto( beam.window.Sessions(session_gap), timestamp_combiner=beam.window.TimestampCombiner.OUTPUT_AT_EOW) # For this use, we care only about the existence of the session, not any # particular information aggregated over it, so we can just group by key # and assign a "dummy value" of None. | beam.CombinePerKey(lambda _: None) # Get the duration of the session | 'UserSessionActivity' >> beam.ParDo(UserSessionActivity()) # [END session_calc] # [START rewindow] # Re-window to process groups of session sums according to when the # sessions complete | 'WindowToExtractSessionMean' >> beam.WindowInto( beam.window.FixedWindows(user_activity_window_duration)) # Find the mean session duration in each window | beam.CombineGlobally( beam.combiners.MeanCombineFn()).without_defaults() | 'FormatAvgSessionLength' >> beam.Map(lambda elem: {'mean_duration': float(elem)}) | 'WriteAvgSessionLength' >> WriteToBigQuery( args.table_name + '_sessions', args.dataset, { 'mean_duration': 'FLOAT', }, options.view_as(GoogleCloudOptions).project))
        record = element
        name = record.get('name')
        newname = name.strip('""')
        record['newname'] = newname
        return [record]


PROJECT_ID = os.environ['PROJECT_ID']

# Project ID is needed for BigQuery data source, even for local execution.
options = {'project': PROJECT_ID}
opts = beam.pipeline.PipelineOptions(flags=[], **options)

# Create a Pipeline using a local runner for execution.
with beam.Pipeline('DirectRunner', options=opts) as p:

    # Select data from the Business table in BigQuery.
    query_results = p | 'Read from BigQuery' >> beam.io.Read(
        beam.io.BigQuerySource(query='SELECT * FROM dataset2.Business limit 100'))

    # Write the PCollection to a log file.
    query_results | 'Write to log 1' >> WriteToText('query_results.txt')

    # Apply a ParDo to the PCollection.
    Bis_pcoll = query_results | 'Normalize Business' >> beam.ParDo(Oops())

    # Write the PCollection to a file.
    Bis_pcoll | 'Write File' >> WriteToText('Bus_output.txt')
def testWriteIgnoresMissingKeys(self):
  with beam.Pipeline() as pipeline:
    test = pipeline | beam.Create(['test'])

    # PTransform is None so this will throw exception if it tries to run
    _ = {'test': test} | writer.Write('key-does-not-exist', None)
def run(): client_bq = bigquery.Client.from_service_account_json(args.local_sa_key, location=args.location) bigquery_asset_list = [ ('logs', 'events_logs_function_native', 'event_ds'), ('logs', 'events_debug_function_native', 'event_ds'), ('logs', 'events_logs_dataflow_backfill', 'event_ds'), ('events', 'events_function_native', 'event_timestamp')] try: source_bigquery_assets(client_bq, bigquery_asset_list) except Exception: generate_bigquery_assets(client_bq, bigquery_asset_list) # https://github.com/apache/beam/blob/master/sdks/python/apache_beam/options/pipeline_options.py po = PipelineOptions() job_name = 'p1-gcs-to-bq-{method}-backfill-{environment_name}-{event_category}-{event_ds_start}-to-{event_ds_stop}-{event_time}-{ts}'.format( method=method, environment_name=environment_name, event_category=args.event_category.replace('_', '-'), event_ds_start=args.event_ds_start, event_ds_stop=args.event_ds_stop, event_time=time_part_name, ts=str(int(time.time()))) # https://cloud.google.com/dataflow/docs/guides/specifying-exec-params pipeline_options = po.from_dictionary({ 'project': args.gcp, 'staging_location': 'gs://{bucket_name}/data_type=dataflow/batch/staging/{job_name}/'.format(bucket_name=args.bucket_name, job_name=job_name), 'temp_location': 'gs://{bucket_name}/data_type=dataflow/batch/temp/{job_name}/'.format(bucket_name=args.bucket_name, job_name=job_name), 'runner': args.execution_environment, # {DirectRunner, DataflowRunner} 'setup_file': args.setup_file, 'service_account_email': 'dataflow-batch@{gcp_project_id}.iam.gserviceaccount.com'.format(gcp_project_id=args.gcp), 'job_name': job_name, 'region': args.gcp_region }) pipeline_options.view_as(SetupOptions).save_main_session = True p1 = beam.Pipeline(options=pipeline_options) fileListGcs = (p1 | 'CreateGcsIterators' >> beam.Create(list(generate_gcs_file_list(args.bucket_name, environment_list, category_list, args.event_ds_start, args.event_ds_stop, time_part_list, args.scale_test_name))) | 'GetGcsFileList' >> beam.ParDo(GetGcsFileList()) | 'GcsListPairWithOne' >> beam.Map(lambda x: (x, 1))) fileListBq = (p1 | 'ParseBqFileList' >> beam.io.Read(beam.io.BigQuerySource( # "What is already in BQ?" 
query=generate_backfill_query( args.gcp, method, (safe_convert_list_to_sql_tuple(environment_list), environment_name), (safe_convert_list_to_sql_tuple(category_list), category_name), args.event_ds_start, args.event_ds_stop, (safe_convert_list_to_sql_tuple(time_part_list), time_part_name), args.scale_test_name), use_standard_sql=True)) | 'BqListPairWithOne' >> beam.Map(lambda x: (x['gspath'], 1))) parseList = ({'fileListGcs': fileListGcs, 'fileListBq': fileListBq} | 'CoGroupByKey' >> beam.CoGroupByKey() | 'UnionMinusIntersect' >> beam.Filter(lambda x: (len(x[1]['fileListGcs']) == 1 and len(x[1]['fileListBq']) == 0)) | 'ExtractKeysParseList' >> beam.Map(lambda x: x[0])) # Write to BigQuery: logsList = (parseList | 'AddParseInitiatedInfo' >> beam.Map(lambda gspath: {'job_name': job_name, 'processed_timestamp': time.time(), 'batch_id': hashlib.md5(gspath.encode('utf-8')).hexdigest(), 'analytics_environment': parse_gspath(gspath, 'analytics_environment='), 'event_category': parse_gspath(gspath, 'event_category='), 'event_ds': parse_gspath(gspath, 'event_ds='), 'event_time': parse_gspath(gspath, 'event_time='), 'event': 'parse_initiated', 'gspath': gspath}) | 'WriteParseInitiated' >> beam.io.WriteToBigQuery(table='events_logs_dataflow_backfill', dataset='logs', project=args.gcp, method='FILE_LOADS', create_disposition=beam.io.gcp.bigquery.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.gcp.bigquery.BigQueryDisposition.WRITE_APPEND, insert_retry_strategy=beam.io.gcp.bigquery_tools.RetryStrategy.RETRY_ON_TRANSIENT_ERROR, schema='job_name:STRING,processed_timestamp:TIMESTAMP,batch_id:STRING,analytics_environment:STRING,event_category:STRING,event_ds:DATE,event_time:STRING,event:STRING,gspath:STRING')) # Write to Pub/Sub: PDone = (parseList | 'DumpParseListPubSub' >> beam.io.WriteToText('gs://{bucket_name}/data_type=dataflow/batch/output/{job_name}/parselist'.format(bucket_name=args.bucket_name, job_name=job_name)) | 'WriteToPubSub' >> beam.ParDo(WriteToPubSub(), job_name, args.topic, args.gcp, args.bucket_name)) p1.run().wait_until_finish() return job_name
def testPreprocessingFn(self): schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt') schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema()) feature_spec = taxi_utils_bqml._get_raw_feature_spec(schema) working_dir = self.get_temp_dir() transform_output_path = os.path.join(working_dir, 'transform_output') transformed_examples_path = os.path.join(working_dir, 'transformed_examples') # Run very simplified version of executor logic. # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults. # Generate legacy `DatasetMetadata` object. Future version of Transform # will accept the `Schema` proto directly. legacy_metadata = dataset_metadata.DatasetMetadata( dataset_schema.from_feature_spec(feature_spec)) decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema) with beam.Pipeline() as p: with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')): examples = ( p | 'ReadTrainData' >> beam.io.ReadFromTFRecord( os.path.join(self._testdata_path, 'csv_example_gen/train/*'), coder=beam.coders.BytesCoder(), # TODO(b/114938612): Eventually remove this override. validate=False) | 'DecodeTrainData' >> beam.Map(decoder.decode)) (transformed_examples, transformed_metadata), transform_fn = ( (examples, legacy_metadata) | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset( taxi_utils_bqml.preprocessing_fn)) # WriteTransformFn writes transform_fn and metadata to subdirectories # tensorflow_transform.SAVED_MODEL_DIR and # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively. # pylint: disable=expression-not-assigned (transform_fn | 'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_output_path)) encoder = tft.coders.ExampleProtoCoder( transformed_metadata.schema) (transformed_examples | 'EncodeTrainData' >> beam.Map(encoder.encode) | 'WriteTrainData' >> beam.io.WriteToTFRecord( os.path.join(transformed_examples_path, 'train/transformed_examples.gz'), coder=beam.coders.BytesCoder())) # pylint: enable=expression-not-assigned # Verify the output matches golden output. # NOTE: we don't verify that transformed examples match golden output. expected_transformed_schema = io_utils.parse_pbtxt_file( os.path.join( self._testdata_path, 'transform/transform_output/transformed_metadata/schema.pbtxt' ), schema_pb2.Schema()) transformed_schema = io_utils.parse_pbtxt_file( os.path.join(transform_output_path, 'transformed_metadata/schema.pbtxt'), schema_pb2.Schema()) # Clear annotations so we only have to test main schema. for feature in transformed_schema.feature: feature.ClearField('annotation') transformed_schema.ClearField('annotation') self.assertEqual(transformed_schema, expected_transformed_schema)
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Ensure that the experiment flag is set explicitly by the user.
  debug_options = pipeline_options.view_as(DebugOptions)
  use_fn_api = (
      debug_options.experiments and 'beam_fn_api' in debug_options.experiments)
  assert use_fn_api, 'Enable the beam_fn_api experiment in order to run this example.'

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  counts = (
      lines
      | 'split' >> (beam.ParDo(WordExtractingDoFn())
                    .with_output_types(six.text_type))
      | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'group_and_sum' >> beam.CombinePerKey(sum))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %s' % (word, count)

  # pylint: disable=unused-variable
  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  # TODO(BEAM-2887): Enable after the issue is fixed.
  # output | 'write' >> WriteToText(known_args.output)

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.committed)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.committed.mean)
    header_str = ','.join(bq_table_columns)
    if header_str == text:
        return False
    else:
        return True


# Convert the csv string into a python dictionary
def str_to_dict(text):
    vals_list = text.split(',')
    vals_dict = {}
    for val, col in zip(vals_list, bq_table_columns):
        vals_dict[col] = val
    return vals_dict


# Define pipeline steps in pipeline object (p)
with beam.Pipeline(options=options) as p:
    pipe = (
        p
        | "Input" >> beam.io.ReadFromText(
            'gs://fw-etl-tmp-prod/FWrates_tender_zip3_forecast_mu.csv')
        | "Remove Header" >> beam.Filter(is_data)
        | "Convert To Dict" >> beam.Map(str_to_dict)
        | "Load BQ table" >> beam.io.WriteToBigQuery(
            forecast_table_spec,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER)
        # | beam.Map(print)
    )
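# A minimal sketch (assumption): ``bq_table_columns``, ``forecast_table_spec``,
# and ``options`` are defined earlier in the original script and are not part
# of this excerpt. Hypothetical definitions could look like this; the column
# names and table path are illustrative only.
from apache_beam.options.pipeline_options import PipelineOptions

bq_table_columns = ['origin_zip3', 'dest_zip3', 'tender_date', 'forecast_mu']

# WriteToBigQuery accepts a "project:dataset.table" string as the table spec.
forecast_table_spec = 'my-project:fw_etl.rates_tender_zip3_forecast'

options = PipelineOptions()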
def generate_examples(input_transform, output_dir, problem_name, splits, min_hop_size_seconds, max_hop_size_seconds, num_replications, min_pitch, max_pitch, encode_performance_fn, encode_score_fns=None, augment_fns=None, absolute_timing=False, random_crop_length=None): """Generate data for a Score2Perf problem. Args: input_transform: The input PTransform object that reads input NoteSequence protos, or dictionary mapping split names to such PTransform objects. Should produce `(id, NoteSequence)` tuples. output_dir: The directory to write the resulting TFRecord file containing examples. problem_name: Name of the Tensor2Tensor problem, used as a base filename for generated data. splits: A dictionary of split names and their probabilities. Probabilites should add up to 1. If `input_filename` is a dictionary, this argument will be ignored. min_hop_size_seconds: Minimum hop size in seconds at which input NoteSequence protos can be split. Can also be a dictionary mapping split name to minimum hop size. max_hop_size_seconds: Maximum hop size in seconds at which input NoteSequence protos can be split. If zero or None, will not split at all. Can also be a dictionary mapping split name to maximum hop size. num_replications: Number of times input NoteSequence protos will be replicated prior to splitting. min_pitch: Minimum MIDI pitch value; notes with lower pitch will be dropped. max_pitch: Maximum MIDI pitch value; notes with greater pitch will be dropped. encode_performance_fn: Required performance encoding function. encode_score_fns: Optional dictionary of named score encoding functions. augment_fns: Optional list of data augmentation functions. Only applied in the 'train' split. absolute_timing: If True, each score will use absolute instead of tempo- relative timing. Since chord inference depends on having beats, the score will only contain melody. random_crop_length: If specified, crop each encoded performance to this length. Cannot be specified if using scores. Raises: ValueError: If split probabilities do not add up to 1, or if splits are not provided but `input_filename` is not a dictionary. """ # Make sure Beam's log messages are not filtered. logging.getLogger().setLevel(logging.INFO) if isinstance(input_transform, dict): split_names = input_transform.keys() else: if not splits: raise ValueError( 'Split probabilities must be provided if input is not presplit.' ) split_names, split_probabilities = zip(*splits.items()) cumulative_splits = list( zip(split_names, np.cumsum(split_probabilities))) if cumulative_splits[-1][1] != 1.0: raise ValueError('Split probabilities must sum to 1; got %f' % cumulative_splits[-1][1]) # Check for existence of prior outputs. Since the number of shards may be # different, the prior outputs will not necessarily be overwritten and must # be deleted explicitly. output_filenames = [ os.path.join(output_dir, '%s-%s.tfrecord' % (problem_name, split_name)) for split_name in split_names ] for split_name, output_filename in zip(split_names, output_filenames): existing_output_filenames = tf.gfile.Glob(output_filename + '*') if existing_output_filenames: tf.logging.info( 'Data files already exist for split %s in problem %s, deleting.', split_name, problem_name) for filename in existing_output_filenames: tf.gfile.Remove(filename) pipeline_options = beam.options.pipeline_options.PipelineOptions( FLAGS.pipeline_options.split(',')) with beam.Pipeline(options=pipeline_options) as p: if isinstance(input_transform, dict): # Input data is already partitioned into splits. 
split_partitions = [ p | 'input_transform_%s' % split_name >> input_transform[split_name] for split_name in split_names ] else: # Read using a single PTransform. p |= 'input_transform' >> input_transform split_partitions = p | 'partition' >> beam.Partition( functools.partial(select_split, cumulative_splits), len(cumulative_splits)) for split_name, output_filename, s in zip(split_names, output_filenames, split_partitions): if isinstance(min_hop_size_seconds, dict): min_hop = min_hop_size_seconds[split_name] else: min_hop = min_hop_size_seconds if isinstance(max_hop_size_seconds, dict): max_hop = max_hop_size_seconds[split_name] else: max_hop = max_hop_size_seconds s |= 'preshuffle_%s' % split_name >> beam.Reshuffle() s |= 'filter_invalid_notes_%s' % split_name >> beam.Map( functools.partial(filter_invalid_notes, min_pitch, max_pitch)) s |= 'extract_examples_%s' % split_name >> beam.ParDo( ExtractExamplesDoFn( min_hop, max_hop, num_replications if split_name == 'train' else 1, encode_performance_fn, encode_score_fns, augment_fns if split_name == 'train' else None, absolute_timing, random_crop_length)) s |= 'shuffle_%s' % split_name >> beam.Reshuffle() s |= 'write_%s' % split_name >> beam.io.WriteToTFRecord( output_filename, coder=beam.coders.ProtoCoder(tf.train.Example))
import apache_beam as beam
import re

inputs_pattern = 'SalesJan2009.csv'
outputs_prefix = 'outputs/part'


class SplitWords(beam.DoFn):

    def __init__(self, header):
        self.header = header

    def process(self, text):
        # Skip the header row and emit each comma-separated field.
        if text != self.header:
            for field in text.split(','):
                yield field


# Running locally in the DirectRunner.
header = "Date,Product,Price,Card,Country"

with beam.Pipeline() as pipeline:
    (pipeline
     | 'Read lines' >> beam.io.ReadFromText(inputs_pattern)
     | 'Par Do' >> beam.ParDo(SplitWords(header))
     | 'Sum the stuff' >> beam.combiners.Count.PerElement()
     # | 'Find words' >> beam.FlatMap(lambda line: re.split(",", line))
     | 'Format results' >> beam.Map(print)
     # | 'Write results' >> beam.io.WriteToText(outputs_prefix)
     )
def create_pipeline(self):
  return beam.Pipeline(runner=fn_api_runner.FnApiRunner())
def test_native_source(self):
  with beam.Pipeline(argv=self.args) as p:
    result = (
        p | 'read' >> beam.io.Read(
            beam.io.BigQuerySource(query=self.query, use_standard_sql=True)))
    assert_that(result, equal_to(self.get_expected_data()))
def create_pipeline(self):
  return beam.Pipeline(
      runner=fn_api_runner.FnApiRunner(
          default_environment=beam_runner_api_pb2.Environment(
              urn=python_urns.EMBEDDED_PYTHON_GRPC, payload=b'2')))
def testEvaluateWithSlicingAndUncertainty(self): temp_eval_export_dir = self._getEvalExportDir() _, eval_export_dir = linear_classifier.simple_linear_classifier( None, temp_eval_export_dir) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=eval_export_dir, add_metrics_callbacks=[_addExampleCountMetricCallback]) extractors = [ predict_extractor.PredictExtractor(eval_shared_model), slice_key_extractor.SliceKeyExtractor([ slicer.SingleSliceSpec(), slicer.SingleSliceSpec(columns=['slice_key']) ]) ] for batch_size in [1, 2, 4, 8]: with beam.Pipeline() as pipeline: example1 = self._makeExample( age=3.0, language='english', label=1.0, slice_key='first_slice') example2 = self._makeExample( age=3.0, language='chinese', label=0.0, slice_key='first_slice') example3 = self._makeExample( age=4.0, language='english', label=0.0, slice_key='second_slice') example4 = self._makeExample( age=5.0, language='chinese', label=1.0, slice_key='second_slice') example5 = self._makeExample( age=5.0, language='chinese', label=1.0, slice_key='second_slice') (metrics, _), _ = ( pipeline | 'Create' >> beam.Create([ example1.SerializeToString(), example2.SerializeToString(), example3.SerializeToString(), example4.SerializeToString(), example5.SerializeToString(), ]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'Extract' >> tfma_unit.Extract(extractors=extractors) # pylint: disable=no-value-for-parameter | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator.ComputeMetricsAndPlots( eval_shared_model=eval_shared_model, desired_batch_size=batch_size, compute_confidence_intervals=True)) def check_result(got): try: self.assertEqual(3, len(got), 'got: %s' % got) slices = {} for slice_key, value in got: slices[slice_key] = value overall_slice = () first_slice = (('slice_key', b'first_slice'),) second_slice = (('slice_key', b'second_slice'),) self.assertCountEqual( list(slices.keys()), [overall_slice, first_slice, second_slice]) self.assertDictElementsWithTDistributionAlmostEqual( slices[overall_slice], { 'accuracy': 0.4, 'label/mean': 0.6, 'my_mean_age': 4.0, 'my_mean_age_times_label': 2.6, 'added_example_count': 5.0 }) self.assertDictElementsWithTDistributionAlmostEqual( slices[first_slice], { 'accuracy': 1.0, 'label/mean': 0.5, 'my_mean_age': 3.0, 'my_mean_age_times_label': 1.5, 'added_example_count': 2.0 }) self.assertDictElementsWithTDistributionAlmostEqual( slices[second_slice], { 'accuracy': 0.0, 'label/mean': 2.0 / 3.0, 'my_mean_age': 14.0 / 3.0, 'my_mean_age_times_label': 10.0 / 3.0, 'added_example_count': 3.0 }) except AssertionError as err: # This function is redefined every iteration, so it will have the # right value of batch_size. raise util.BeamAssertException('batch_size = %d, error: %s' % (batch_size, err)) # pylint: disable=cell-var-from-loop util.assert_that(metrics, check_result, label='metrics')
def create_pipeline(self): return beam.Pipeline( runner=fn_api_runner.FnApiRunner(bundle_repeat=3))
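# A minimal sketch (hypothetical test, not from the source) of how such a
# create_pipeline factory is typically exercised: build a small pipeline with
# it and assert on the resulting PCollection. Assumes the module already
# imports apache_beam as beam.
from apache_beam.testing.util import assert_that, equal_to

def test_count_per_element(self):
  with self.create_pipeline() as p:
    result = (
        p
        | beam.Create(['a', 'b', 'a'])
        | beam.combiners.Count.PerElement())
    assert_that(result, equal_to([('a', 2), ('b', 1)]))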
def testEvaluateWithPlots(self): temp_eval_export_dir = self._getEvalExportDir() _, eval_export_dir = ( fixed_prediction_estimator.simple_fixed_prediction_estimator( None, temp_eval_export_dir)) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=eval_export_dir, add_metrics_callbacks=[ post_export_metrics.example_count(), post_export_metrics.auc_plots() ]) extractors = [ predict_extractor.PredictExtractor(eval_shared_model), slice_key_extractor.SliceKeyExtractor() ] with beam.Pipeline() as pipeline: example1 = self._makeExample(prediction=0.0, label=1.0) example2 = self._makeExample(prediction=0.7, label=0.0) example3 = self._makeExample(prediction=0.8, label=1.0) example4 = self._makeExample(prediction=1.0, label=1.0) (metrics, plots), _ = ( pipeline | 'Create' >> beam.Create([ example1.SerializeToString(), example2.SerializeToString(), example3.SerializeToString(), example4.SerializeToString() ]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'Extract' >> tfma_unit.Extract(extractors=extractors) # pylint: disable=no-value-for-parameter | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model)) def check_metrics(got): try: self.assertEqual(1, len(got), 'got: %s' % got) (slice_key, value) = got[0] self.assertEqual((), slice_key) self.assertDictElementsAlmostEqual( got_values_dict=value, expected_values_dict={ metric_keys.EXAMPLE_COUNT: 4.0, }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics, check_metrics, label='metrics') def check_plots(got): try: self.assertEqual(1, len(got), 'got: %s' % got) (slice_key, value) = got[0] self.assertEqual((), slice_key) self.assertDictMatrixRowsAlmostEqual( got_values_dict=value, expected_values_dict={ metric_keys.AUC_PLOTS_MATRICES: [ (8001, [2, 1, 0, 1, 1.0 / 1.0, 1.0 / 3.0]) ], }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(plots, check_plots, label='plots')
import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText


class AppendDoFn(beam.DoFn):

  def process(self, element):
    # Yield the whole line (rather than returning the string directly) so
    # Beam emits it as one element instead of iterating over its characters.
    yield element + " - Hello World!"


parser = argparse.ArgumentParser()
parser.add_argument(
    '--input',
    dest='input',
    default='gs://dataflow-samples/shakespeare/kinglear.txt')
parser.add_argument(
    '--output',
    dest='output',
    default='gs://dsp_model_store_famenor/shakespeare/kinglear.txt')
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)

p = beam.Pipeline(options=pipeline_options)
lines = p | 'read' >> ReadFromText(known_args.input)
appended = lines | 'append' >> beam.ParDo(AppendDoFn())
appended | 'write' >> WriteToText(known_args.output)
result = p.run()
result.wait_until_finish()
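# Equivalent sketch (not from the source): the same pipeline expressed with
# the pipeline object as a context manager, which calls run() and
# wait_until_finish() automatically on exit.
with beam.Pipeline(options=pipeline_options) as p:
  (p
   | 'read' >> ReadFromText(known_args.input)
   | 'append' >> beam.ParDo(AppendDoFn())
   | 'write' >> WriteToText(known_args.output))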
# run pipeline on Dataflow options = { 'runner': 'DataflowRunner', 'job_name': 'nomination-count-10', 'project': PROJECT_ID, 'temp_location': BUCKET + '/temp', 'staging_location': BUCKET + '/staging', 'machine_type': 'n1-standard-1', # machine types listed here: https://cloud.google.com/compute/docs/machine-types 'num_workers': 1 } opts = PipelineOptions(flags=[], **options) with beam.Pipeline('DataflowRunner', options=opts) as p: # create PCollection from the file contents in_pcoll = p | 'Read File' >> ReadFromText(DIR_PATH_IN + 'oscars_data.tsv') # apply a ParDo to the PCollection out_pcoll = in_pcoll | 'Extract Actor and Actress' >> beam.ParDo( ActorActressCountFn()).with_outputs( ActorActressCountFn.OUTPUT_TAG_ACTOR_COUNT, ActorActressCountFn.OUTPUT_TAG_ACTRESS_COUNT) actor_pcoll = out_pcoll[ActorActressCountFn.OUTPUT_TAG_ACTOR_COUNT] actress_pcoll = out_pcoll[ActorActressCountFn.OUTPUT_TAG_ACTRESS_COUNT] # write PCollections to files actor_pcoll | 'Write Actor File 1' >> WriteToText(DIR_PATH_OUT +
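# The ActorActressCountFn class is not shown in this snippet; the following is
# a hypothetical sketch (assumed TSV layout and field indices) of how a DoFn
# with the two tagged outputs referenced by .with_outputs() could look.
import apache_beam as beam
from apache_beam import pvalue


class ActorActressCountFn(beam.DoFn):
  OUTPUT_TAG_ACTOR_COUNT = 'actor_count'
  OUTPUT_TAG_ACTRESS_COUNT = 'actress_count'

  def process(self, element):
    fields = element.split('\t')
    category, name = fields[1].lower(), fields[2]  # hypothetical indices
    if 'actress' in category:
      yield pvalue.TaggedOutput(self.OUTPUT_TAG_ACTRESS_COUNT, (name, 1))
    elif 'actor' in category:
      yield pvalue.TaggedOutput(self.OUTPUT_TAG_ACTOR_COUNT, (name, 1))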
def pipeline(config_map, dataset_config_map, preprocess_example_fn,
             input_tensors_to_example_fn):
  """Pipeline for dataset creation."""
  tf.flags.mark_flags_as_required(['output_directory'])

  pipeline_options = beam.options.pipeline_options.PipelineOptions(
      FLAGS.pipeline_options.split(','))

  config = config_map[FLAGS.config]
  hparams = config.hparams
  hparams.parse(FLAGS.hparams)

  datasets = dataset_config_map[FLAGS.dataset_config]

  if tf.gfile.Exists(FLAGS.output_directory):
    raise ValueError(
        'Output directory %s already exists!' % FLAGS.output_directory)
  tf.gfile.MakeDirs(FLAGS.output_directory)
  with tf.gfile.Open(
      os.path.join(FLAGS.output_directory, 'config.txt'), 'w') as f:
    f.write('\n\n'.join([
        'min_length: {}'.format(FLAGS.min_length),
        'max_length: {}'.format(FLAGS.max_length),
        'sample_rate: {}'.format(FLAGS.sample_rate),
        'preprocess_examples: {}'.format(FLAGS.preprocess_examples),
        'preprocess_train_example_multiplier: {}'.format(
            FLAGS.preprocess_train_example_multiplier),
        'config: {}'.format(FLAGS.config),
        'hparams: {}'.format(hparams.to_json(sort_keys=True)),
        'dataset_config: {}'.format(FLAGS.dataset_config),
        'datasets: {}'.format(datasets),
    ]))

  with beam.Pipeline(options=pipeline_options) as p:
    for dataset in datasets:
      if isinstance(dataset.path, (list, tuple)):
        # If dataset.path is a list, then it's a list of sources to mix
        # together to form new examples. First, do the mixing, then pass the
        # results to the rest of the pipeline.
        id_exs = []
        sourceid_to_exids = []
        for source_id, stem_path in enumerate(dataset.path):
          if dataset.num_mixes is None:
            raise ValueError(
                'If path is a list, num_mixes must not be None: {}'.format(
                    dataset))
          stem_p = p | 'tfrecord_list_%s_%d' % (dataset.name, source_id) >> (
              beam.Create(data.generate_sharded_filenames(stem_path)))

          # Note that we do not specify a coder when reading here.
          # This is so that the hashing in key_example below can work directly
          # on the serialized version instead of having to re-serialize it.
          # Also, deserializing with a coder and then re-serializing does not
          # always generate the same hash for the same example (likely due to
          # the map fields in tf.train.Example). This is important when
          # reading the same dataset multiple times to mix it with itself.
          stem_p |= 'read_tfrecord_%s_%d' % (dataset.name, source_id) >> (
              beam.io.tfrecordio.ReadAllFromTFRecord())
          stem_p |= 'shuffle_stems_%s_%d' % (dataset.name, source_id) >> (
              beam.Reshuffle())

          # Key all examples with a hash.
          def key_example(ex):
            return (hashlib.sha256(ex).hexdigest(), ex)

          stem_p |= 'add_id_key_%s_%d' % (dataset.name, source_id) >> (
              beam.Map(key_example))
          id_exs.append(stem_p)

          # Create a list of source_id to example id.
          def sourceid_to_exid(id_ex, source_id):
            return (source_id, id_ex[0])

          sourceid_to_exids.append(
              stem_p | 'key_%s_%d' % (dataset.name, source_id) >> (
                  beam.Map(sourceid_to_exid, source_id=source_id)))

        # ('example_hash', serialized_example)
        id_exs = (
            id_exs
            | 'id_exs_flatten_%s' % dataset.name >> beam.Flatten()
            | 'id_exs_distinct_%s' % dataset.name >> beam.Distinct())

        # ('source_id', 'example_hash')
        sourceid_to_exids = (
            sourceid_to_exids
            | 'sourceid_to_exids_flatten_%s' % dataset.name >> beam.Flatten())

        # Pass the list of source id to example IDs to generate_mixes,
        # which will create mixes by selecting random IDs from each source
        # (with replacement). This is represented as a list of example IDs
        # to Mix IDs.
        # Note: beam.Create([0]) is just a single dummy value to allow the
        # sourceid_to_exids to be passed in as a python list so we can do the
        # sampling with numpy.
        exid_to_mixids = (
            p
            | 'create_dummy_%s' % dataset.name >> beam.Create([0])
            | 'generate_mixes_%s' % dataset.name >> beam.Map(
                create_dataset_lib.generate_mixes,
                num_mixes=dataset.num_mixes,
                sourceid_to_exids=beam.pvalue.AsList(sourceid_to_exids)))

        # Create a list of (Mix ID, Full Example proto). Note: Examples may be
        # present in more than one mix. Then, group by Mix ID.
        def mixid_to_exs(id_ex, exid_to_mixids):
          exid, ex = id_ex
          for mixid in exid_to_mixids[exid]:
            yield mixid, ex

        mixid_exs = (
            id_exs
            | 'mixid_to_exs_%s' % dataset.name >> beam.FlatMap(
                mixid_to_exs,
                exid_to_mixids=beam.pvalue.AsSingleton(exid_to_mixids))
            | 'group_by_key_%s' % dataset.name >> beam.GroupByKey())

        # Take these groups of Examples, mix their audio and sequences to
        # return a single new Example. Then, carry on with the rest of the
        # pipeline like normal.
        split_p = (
            mixid_exs
            | 'mix_examples_%s' % dataset.name >> beam.Map(
                mix_examples, FLAGS.sample_rate,
                FLAGS.load_audio_with_librosa))
      else:
        if dataset.num_mixes is not None:
          raise ValueError(
              'If path is not a list, num_mixes must be None: {}'.format(
                  dataset))
        split_p = p | 'tfrecord_list_%s' % dataset.name >> beam.Create(
            data.generate_sharded_filenames(dataset.path))
        split_p |= 'read_tfrecord_%s' % dataset.name >> (
            beam.io.tfrecordio.ReadAllFromTFRecord(
                coder=beam.coders.ProtoCoder(tf.train.Example)))
        split_p |= 'shuffle_input_%s' % dataset.name >> beam.Reshuffle()
        split_p |= 'split_wav_%s' % dataset.name >> beam.FlatMap(
            split_wav,
            min_length=FLAGS.min_length,
            max_length=FLAGS.max_length,
            sample_rate=FLAGS.sample_rate,
            debug_output_directory=FLAGS.output_directory,
            split_example=dataset.process_for_training,
            load_audio_with_librosa=FLAGS.load_audio_with_librosa)

      if FLAGS.preprocess_examples:
        if dataset.process_for_training:
          mul_name = 'preprocess_multiply_%dx_%s' % (
              FLAGS.preprocess_train_example_multiplier, dataset.name)
          split_p |= mul_name >> beam.FlatMap(
              multiply_example, FLAGS.preprocess_train_example_multiplier)
        split_p |= 'preprocess_%s' % dataset.name >> beam.Map(
            preprocess_data, preprocess_example_fn,
            input_tensors_to_example_fn, hparams,
            dataset.process_for_training)

      split_p |= 'shuffle_output_%s' % dataset.name >> beam.Reshuffle()
      split_p |= 'write_%s' % dataset.name >> beam.io.WriteToTFRecord(
          os.path.join(FLAGS.output_directory,
                       '%s.tfrecord' % dataset.name),
          coder=beam.coders.ProtoCoder(tf.train.Example))
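# A small self-contained sketch (not from the source pipeline) isolating the
# side-input pattern used above: a helper PCollection is materialized with
# beam.pvalue.AsSingleton / AsList and passed as an extra argument to a Map.
import apache_beam as beam

with beam.Pipeline() as side_input_pipeline:
  main = side_input_pipeline | 'main' >> beam.Create([('a', 1), ('b', 2)])
  lookup = side_input_pipeline | 'side' >> beam.Create(
      [{'a': 'apple', 'b': 'banana'}])
  (main
   | 'join' >> beam.Map(
       lambda kv, names: (names[kv[0]], kv[1]),
       names=beam.pvalue.AsSingleton(lookup))
   | 'print' >> beam.Map(print))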