def testTFlitePredictExtractorWithKerasModel(self, multi_model,
                                                 multi_output):
        input1 = tf.keras.layers.Input(shape=(1, ), name='input1')
        input2 = tf.keras.layers.Input(shape=(1, ), name='input2')
        inputs = [input1, input2]
        input_layer = tf.keras.layers.concatenate(inputs)
        output_layers = {}
        output_layers['output1'] = (tf.keras.layers.Dense(
            1, activation=tf.nn.sigmoid, name='output1')(input_layer))
        if multi_output:
            output_layers['output2'] = (tf.keras.layers.Dense(
                1, activation=tf.nn.sigmoid, name='output2')(input_layer))

        model = tf.keras.models.Model(inputs, output_layers)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss=tf.keras.losses.binary_crossentropy,
            metrics=['accuracy'])

        train_features = {'input1': [[0.0], [1.0]], 'input2': [[1.0], [0.0]]}
        labels = {'output1': [[1], [0]]}
        if multi_output:
            labels['output2'] = [[1], [0]]

        example_weights = {'output1': [1.0, 0.5]}
        if multi_output:
            example_weights['output2'] = [1.0, 0.5]
        dataset = tf.data.Dataset.from_tensor_slices(
            (train_features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
        model.fit(dataset, steps_per_epoch=1)

        converter = tf.compat.v2.lite.TFLiteConverter.from_keras_model(model)
        tflite_model = converter.convert()

        tflite_model_dir = tempfile.mkdtemp()
        with tf.io.gfile.GFile(os.path.join(tflite_model_dir, 'tflite'),
                               'wb') as f:
            f.write(tflite_model)

        model_specs = [
            config_pb2.ModelSpec(name='model1', model_type='tf_lite')
        ]
        if multi_model:
            model_specs.append(
                config_pb2.ModelSpec(name='model2', model_type='tf_lite'))

        eval_config = config_pb2.EvalConfig(model_specs=model_specs)
        eval_shared_models = [
            self.createTestEvalSharedModel(
                model_name='model1',
                eval_saved_model_path=tflite_model_dir,
                model_type='tf_lite')
        ]
        if multi_model:
            eval_shared_models.append(
                self.createTestEvalSharedModel(
                    model_name='model2',
                    eval_saved_model_path=tflite_model_dir,
                    model_type='tf_lite'))

        schema = text_format.Parse(
            """
        feature {
          name: "input1"
          type: FLOAT
        }
        feature {
          name: "non_model_feature"
          type: INT
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        predictor = tflite_predict_extractor.TFLitePredictExtractor(
            eval_config=eval_config, eval_shared_model=eval_shared_models)

        examples = [
            self._makeExample(input1=0.0, non_model_feature=0),
            self._makeExample(input1=1.0, non_model_feature=1),
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
                | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform
                | predictor.stage_name >> predictor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got = got[0]
                    self.assertIn(constants.PREDICTIONS_KEY, got)
                    self.assertLen(got[constants.PREDICTIONS_KEY], 2)

                    for item in got[constants.PREDICTIONS_KEY]:
                        if multi_model:
                            self.assertIn('model1', item)
                            self.assertIn('model2', item)
                            if multi_output:
                                self.assertIn('Identity', item['model1'])
                                self.assertIn('Identity_1', item['model1'])

                        elif multi_output:
                            self.assertIn('Identity', item)
                            self.assertIn('Identity_1', item)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
  def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self):
    temp_eval_export_dir = self._getEvalExportDir()
    _, eval_export_dir = linear_classifier.simple_linear_classifier(
        None, temp_eval_export_dir)
    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=eval_export_dir,
        add_metrics_callbacks=[
            _addExampleCountMetricCallback,
            # Note that since everything runs in-process this doesn't
            # actually test that the py_func can be correctly recreated
            # on workers in a distributed context.
            _addPyFuncMetricCallback,
            post_export_metrics.example_count(),
            post_export_metrics.example_weight(example_weight_key='age')
        ])
    extractors = [
        predict_extractor.PredictExtractor(eval_shared_model),
        slice_key_extractor.SliceKeyExtractor()
    ]

    with beam.Pipeline() as pipeline:
      example1 = self._makeExample(age=3.0, language='english', label=1.0)
      example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
      example3 = self._makeExample(age=4.0, language='english', label=1.0)
      example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

      (metrics, plots), _ = (
          pipeline
          | 'Create' >> beam.Create([
              example1.SerializeToString(),
              example2.SerializeToString(),
              example3.SerializeToString(),
              example4.SerializeToString()
          ])
          | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
          | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
          | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator
          .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

      def check_result(got):
        try:
          self.assertEqual(1, len(got), 'got: %s' % got)
          (slice_key, value) = got[0]
          self.assertEqual((), slice_key)
          self.assertDictElementsAlmostEqual(
              got_values_dict=value,
              expected_values_dict={
                  'accuracy': 1.0,
                  'label/mean': 0.5,
                  'my_mean_age': 3.75,
                  'my_mean_age_times_label': 1.75,
                  'added_example_count': 4.0,
                  'py_func_label_sum': 2.0,
                  metric_keys.EXAMPLE_COUNT: 4.0,
                  metric_keys.EXAMPLE_WEIGHT: 15.0
              })
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(metrics, check_result, label='metrics')
      util.assert_that(plots, util.is_empty(), label='plots')
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', help='Input file to process.')
    parser.add_argument('--output-file',
                        dest='output_file',
                        help='Output file to write results to.')
    parser.add_argument('--output-topic',
                        dest='output_topic',
                        help='Output topic to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DirectRunner',
        '--project=global-datacenter',
        #      '--staging_location=/tmp/beam/staging',
        #      '--temp_location=/tmp/beam/tmp',
        '--job_name=parse-twitter-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=pipeline_options) as p:

        def as_feed(top):
            return json.dumps({
                "version":
                "https://jsonfeed.org/version/1",
                "title":
                "Trending Twitter Keywords",
                "home_page_url":
                "https://example.org/",
                "feed_url":
                "https://example.org/feed.json",
                "items": [
                    {
                        "id": row[0],
                        "content_text": f"Keyword '{row[0]}' counted {row[1]}",
                        "url":
                        f"https://twitter.com/search?q={row[0]}"  # TODO security: urlencode keyword
                    } for row in top
                ]
            })

        texts = (p
                 | 'Read' >> ReadFromText(known_args.input)
                 | 'FromJSON' >> beam.Map(json.loads)
                 | 'GetTexts' >> beam.Map(lambda x: x['data']['text']))

        feed = (
            texts
            | 'Split' >> (beam.FlatMap(lambda x: re.findall(
                r'[@#\w\']{6,}', x, re.UNICODE)).with_output_types(str))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | 'GroupAndSum' >> beam.CombinePerKey(sum)
            | 'Top10' >> beam.transforms.combiners.Top.Of(
                10, key=lambda x: x[1])
            | 'AsFeed' >> beam.Map(as_feed))

        if known_args.output_file:
            unused = (feed | WriteToText(known_args.output_file))

        if known_args.output_topic:
            unused = (
                feed
                | 'Encode' >>
                beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes)
                | 'Publish' >> beam.io.WriteToPubSub(known_args.output_topic))
Example 4
def test_test_transform(self):
    with beam.Pipeline() as p:
        assert_that(
            p | beam.Create(['a', 'b', 'c']) | _TestTransform('x', 'y'),
            equal_to(['xay', 'xby', 'xcy']))
def test_iobase_source(self):
    with beam.Pipeline(argv=self.args) as p:
        result = (p | 'read' >> beam.io.ReadFromBigQuery(
            query=self.query, use_standard_sql=True, project=self.project))
        assert_that(result, equal_to(self.get_expected_data()))
# coding=utf-8
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument(
            '--output',
            default='./output.txt',
            help='Output for the pipeline')


if __name__ == '__main__':
    options = MyOptions()
    options.view_as(beam.options.pipeline_options.StandardOptions).runner = 'DirectRunner'

    p = beam.Pipeline(options=options)

    (p
     | 'create numbers' >> beam.Create([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
     | 'combine numbers' >> beam.CombineGlobally(sum)
     | 'write to text' >> beam.io.WriteToText(options.output, shard_name_template=""))

    p.run().wait_until_finish()
Example 7
File: ltv.py Project: mozafrank/ltv
def run(argv=None):
  
  """The main function which creates the pipeline and runs it."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--model_identifier', required=True,
                        help='Used to identify LTV model and append to table and file names',
                        default='by_sample_id')		#filter-campaign	
  parser.add_argument('--stage-bucket', dest='stage_bucket', required=False, help='Staging bucket to use', default='ltv-dataflow')
  parser.add_argument('--data-bucket', dest='data_bucket', required=False, help='Data bucket to use', default='telemetry-to-gcp')

  parser.add_argument('--load_bq', required=False,
                        help='True/False to load summary/details to bq',
                        default=False)			
  parser.add_argument('--estimate_model', required=False,
                        help='True/False to estimate model params',
                        default=False)
  parser.add_argument('--calculate_model', required=False,
                        help='True/False to calculate ltv',
                        default=False)
  parser.add_argument('--calculate_stats', required=False,
                        help='True/False to calculate stats',
                        default=False)
  parser.add_argument('--upload_stats', required=False,
                        help='True/False to upload stats',
                        default=False) # to be merged in calc stats once i figure out wild card stuff
  parser.add_argument('--send_output', required=False,
                        help='True/False to send ltv and aggr files to Marketing GCP',
                        default=False)
  parser.add_argument('--delete_data', required=False,
                        help='True/False to delete input data and BQ data',
                        default=False)
  
  start = time.perf_counter()
  
  # Parse arguments from the command line.
  known_args, pipeline_args = parser.parse_known_args(argv)
  logging.info('running beam_calc for model: ' + known_args.model_identifier)

  # read in the output bq table schema
  bg_out_schema = ''
  #schema_file =  "gs://ltv-dataflow-dev/templates/input/calc_output_schema.json"
  schema_file = 'gs://{}/templates/input/calc_output_schema.json'.format(known_args.stage_bucket)
  with gcs.open(schema_file) as f:
    data = f.read()
    # Wrapping the schema in fields is required for the BigQuery API.
    bg_out_schema = '{"fields": ' + data + '}'
  schema = parse_table_schema_from_json(bg_out_schema)
  #schema = bigquery_tools.parse_table_schema_from_json(bg_out_schema)
  #logging.info(schema)
  #ltv_beam.py:306: BeamDeprecationWarning: parse_table_schema_from_json is deprecated since 2.11.0. Use bigquery_tools.parse_table_schema_from_json instead.
  
  # estimate LTV model
  min_sample_size = calculate_min_sample_size()
  #estimate_model(min_sample_size)
  
  pipeline_options = PipelineOptions(pipeline_args)
  #pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)
  
  # to be replaced with direct telem bq table to dataflow input?
  input_file_dir = 'gs://' + known_args.data_bucket + '/clv/' + known_args.model_identifier.replace('_','-') + '/{0}/*.parquet'

  # load parquet files into bq, overwrite (may not need if we can join datasets in Telemetry? hmmm, no, we need to get random selection)
  # use dummy singleton DoFn if no Parquet reader
  # much faster to directly load summary and details to bq - need in bq for quick join (not sure how fast join is as a pcollection)
  if known_args.load_bq:
    logging.info('loading bq')
    load_data_bq('ltv', 'summary_' + known_args.model_identifier, input_file_dir.format('summary'))
    load_data_bq('ltv', 'details_' + known_args.model_identifier, input_file_dir.format('details'))

  # run this to estimate LTV model parameters
  if known_args.estimate_model:
    logging.info('estimate model parameters: ' + known_args.estimate_model)
    serial_dummy = (
        p
        | 'Read' >> beam.Create(['serial_dummy'])
        | 'Estimate Lifetimes Model' >> beam.ParDo(
            ltv_calculate.EstimateLTVFn(
                min_sample_size,
                output_bucket='gs://ltv-dataflow-dev/tmp/',
                model_tag=known_args.model_identifier)))
    p.run().wait_until_finish()  # run estimation first before moving on to calculation

  # this has to be run on dataflow or will not upload to bq
  if known_args.calculate_model:
    logging.info('calculate ltv')
    data_query = ("SELECT * FROM ltv.summary_" + known_args.model_identifier) # + " ORDER BY RAND() LIMIT {}").format(100)
    ##data_query = ("SELECT * FROM ltv.summary_" + known_args.model_identifier + " ORDER BY RAND() LIMIT {}").format(100)
    (p
     | 'Read Orders from BigQuery ' >> beam.io.Read(
         beam.io.BigQuerySource(query=data_query, use_standard_sql=True))
     | 'Apply Lifetimes Model' >> beam.ParDo(
         ltv_calculate.CalcLTVFn(
             dill.load(gcs.open("gs://" + known_args.stage_bucket + "/tmp/bgf_"
                                + known_args.model_identifier + ".pkl", 'rb')),
             dill.load(gcs.open("gs://" + known_args.stage_bucket + "/tmp/ggf_"
                                + known_args.model_identifier + ".pkl", 'rb'))))
     | 'Write Data to BigQuery' >> beam.io.WriteToBigQuery(
         "ltv.calc_" + known_args.model_identifier,
         schema=schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    )
    p.run().wait_until_finish()
    
  # calc aggregate statistics
  if known_args.calculate_stats:
    logging.info('aggregate ltv')
    qry_outlier_stats = ("""SELECT count(distinct(client_id)) ct, avg(historical_searches) avg, stddev(historical_searches) std 
  						    FROM ltv.summary_{0} 
  						    WHERE client_id not in (SELECT client_id FROM ltv.details_{0} WHERE default_search_engine='google-nocodes')""").format(known_args.model_identifier)
    query_job = bq_client.query(qry_outlier_stats)
    outlier_stats = query_job.to_dataframe() # no need to go through query_job.result()
    #logging.info(outlier_stats.head(1))

    ct = outlier_stats['ct'][0]

    if ct > 0: 
      mu = outlier_stats['avg'][0]
      sigma = outlier_stats['std'][0]
      outliers_upper = str(mu + 2.5 * sigma)
      outliers_lower = str(mu - 2.5 * sigma)
    
    # anyway to specify computation/mem instensive machine here?    
    if known_args.model_identifier=='filter_campaign':
      sid_list = ['']
    else:
      sid_list = ['10','33','53','89']      
      
    for sid in sid_list:
      if known_args.model_identifier=='filter_campaign':
        sid_qry = ''
        assert sid == '' 
      else:
        sid_qry = 'AND l.sample_id='+sid
      
      # Run each aggregation DoFn off its own dummy Create, keyed by sample id.
      aggr_fns = [
          ('Age', ltv_aggregate.AggrCustomerAgeFn),
          ('E10', ltv_aggregate.AggrE10ActivityGroupLocaleFn),
          ('Geo', ltv_aggregate.AggrGlobalGeoUserStatusFn),
          ('City', ltv_aggregate.AggrCityFn),
          ('ECB', ltv_aggregate.AggrEngineChannelBrowserFn),
          ('Mem', ltv_aggregate.AggrMemoryFn),
          ('Attr1', ltv_aggregate.AggrAttributes1Fn),
          ('Attr2', ltv_aggregate.AggrAttributes2Fn),
          ('Attr3', ltv_aggregate.AggrAttributes3Fn),
          ('OS1', ltv_aggregate.AggrOS1Fn),
          ('OS2', ltv_aggregate.AggrOS2Fn),
          ('SCD', ltv_aggregate.AggrSyncConfiguredDesktopFn),
          ('SCM', ltv_aggregate.AggrSyncConfiguredMobileFn),
          ('BC', ltv_aggregate.AggrBookmarksCountFn),
      ]
      for label, aggr_fn in aggr_fns:
        (p
         | 'Read_' + label + sid >> beam.Create(
             ['another_serial_dummy_' + label.lower() + sid])
         | aggr_fn.__name__ + sid >> beam.ParDo(
             aggr_fn(model_tag=known_args.model_identifier,
                     outliers_lower=outliers_lower,
                     outliers_upper=outliers_upper,
                     sample_id_qry=sid_qry,
                     sid=sid,
                     output_folder=known_args.stage_bucket)))

    # compute ltv statistics
    # quantile compute https://stackoverflow.com/questions/46827512/efficient-algorithm-for-computing-quantiles-in-terabytes-dataset
    p.run().wait_until_finish()

  if known_args.upload_stats:
    # delete anything from bq aggr table before running aggr stats
    qry_truncate = ("DELETE from ltv.aggr_{} WHERE True").format(known_args.model_identifier)
    bq_client.query(qry_truncate)
    # load any files in ltv-dataflow/tmp containing self.model_tag; and delete them
    dataset_ref = bq_client.dataset('ltv')
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.skip_leading_rows = 1
    # The source format defaults to CSV, so the line below is optional.
    job_config.source_format = bigquery.SourceFormat.CSV
    uri = "gs://" + known_args.stage_bucket + "/tmp/" + known_args.model_identifier + '' + "/aggr_*.csv"
    try:
      load_job = bq_client.load_table_from_uri(uri, dataset_ref.table('aggr_'+known_args.model_identifier), job_config=job_config)  # API request
      print("Starting job {}".format(load_job.job_id))
      load_job.result()  # Waits for table load to complete.
      print("Job finished.")
      destination_table = bq_client.get_table(dataset_ref.table('aggr_'+known_args.model_identifier))
      print("Loaded {} rows.".format(destination_table.num_rows))
    except Exception:
      logging.exception('failed to load aggr files from uri: ' + uri)
    
  # push files to Marketing GCP in a GCF?
  # output parquet files
  if known_args.send_output:
    # test dataflow write parquet files --- SUCCESSFUL! we can output files in parquet format
    calc_query = ("SELECT client_id, sample_id FROM ltv.summary_{} LIMIT 10").format(known_args.model_identifier)
    #data_query = ("SELECT summ.*, det.* EXCEPT (client_id) FROM ltv.summary summ LEFT JOIN ltv.details det ON summ.client_id=det.client_id WHERE summ.client_id in ('3691929b0e07e22c86c1167c83ded58f481caf89','64ba414c3820805b1f64021cf3e082b091dec4f4')")
    # ugh need to differentiate by sample id
    sid = ''
    calc_fn = 'gs://' + known_args.stage_bucket + '/output/ltv_calc_' + known_args.model_identifier + sid
    (p
     | 'Read Orders from BigQuery ' >> beam.io.Read(beam.io.BigQuerySource(query=calc_query, use_standard_sql=True))  
     | 'Write Data to Parquet' >> beam.io.WriteToParquet(calc_fn, pa.schema([('client_id', pa.string()), ('sample_id', pa.int32())]))
    )
    
    # https://arrow.apache.org/docs/python/data.html#type-metadata
    p.run().wait_until_finish()
    
    # if file exists, push(move) to marketing, else throw error
    
  
  # if all files are in output dir, then clean up bq and put _SUCCESS in dir which should trigger gcf
  # delete intermediary data from bq (and gcs?)
  if known_args.delete_data:
    # delete anything in bq ltv tables
    qry_truncate = ("DELETE from ltv.summary_{} WHERE True").format(known_args.model_identifier)
    bq_client.query(qry_truncate)
    qry_truncate = ("DELETE from ltv.details_{} WHERE True").format(known_args.model_identifier)
    bq_client.query(qry_truncate)
    qry_truncate = ("DELETE from ltv.calc_{} WHERE True").format(known_args.model_identifier)
    bq_client.query(qry_truncate)
    qry_truncate = ("DELETE from ltv.aggr_{} WHERE True").format(known_args.model_identifier)
    bq_client.query(qry_truncate)
    # delete gcs data files?
  
  logging.info('ltv.run() runtime: ' + str(time.perf_counter() - start))
def run(argv=None, save_main_session=True):
  """Main entry point to pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--corpus_home',
                      dest='corpus_home',
                      help='The directory or bucket of the corpus home')
  parser.add_argument('--input',
                      dest='input',
                      help='A single input file')
  parser.add_argument('--corpus_prefix',
                      dest='corpus_prefix',
                      help='Prefix after corpus home where the files are')
  parser.add_argument('--ignorelines',
                      dest='ignorelines',
                      help='Ignore lines containing these words')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  p = beam.Pipeline(options=pipeline_options)
  ignorepatterns = []
  if known_args.ignorelines:
    ignorepatterns = load_ignore(known_args.ignorelines)
  if known_args.corpus_home:
    logging.info('corpus_home: %s', known_args.corpus_home)
    corpus_data_dir = '{}/data/corpus'.format(known_args.corpus_home)
    corpus_index = '{}/collections.csv'.format(corpus_data_dir)
    corpus_dir = known_args.corpus_home
    if known_args.corpus_prefix:
      corpus_dir = '{}/{}'.format(known_args.corpus_home,
                                  known_args.corpus_prefix)
    lines = (p | 'read_top_index' >> ReadFromText(corpus_index)
              | 'split_top_index' >> beam.ParDo(ExtractIndexEntry())
              | 'add_prefix_corpus_data' >> beam.FlatMap(add_prefix,
                                                         corpus_data_dir)
              | 'read_secondary_index' >> ReadAllFromText()
              | 'split_secondary_index' >> beam.ParDo(ExtractIndexEntry())
              | 'add_prefix_corpus_dir' >> beam.FlatMap(add_prefix, corpus_dir)
              | 'read_files' >> ReadAllFromText())
  else:
    lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each character.
  def count_ones(char_ones):
    (c, ones) = char_ones
    return (c, sum(ones))

  # Ignore counts for lines that are boilerplate (copyright notices, etc)
  re_patterns = []
  for val in ignorepatterns:
    pat = '.*{}.*'.format(val)
    re_patterns.append(re.compile(pat, re.IGNORECASE))

  def not_boilerplate(line):
    """true if the line does not match a boilerplate pattern """
    for re_pattern in re_patterns:
      if re_pattern.match(line) is not None:
        return False
    return True

  counts = (lines
            | 'filter' >> beam.Filter(not_boilerplate)
            | 'split' >> (beam.ParDo(CharBigramExtractingDoFn())
                          .with_output_types(str))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(count_ones))

  # Format the result
  def format_result(char_bigram_count):
    (char_bigram, count) = char_bigram_count
    return '%s\t%d' % (char_bigram, count)

  output = counts | 'format' >> beam.Map(format_result)

  output | 'write' >> WriteToText(known_args.output)
  result = p.run()
  result.wait_until_finish()
  if (not hasattr(result, 'has_job') or result.has_job):
    char_bigram_filter = MetricsFilter().with_name('char_bigrams')
    query_result = result.metrics().query(char_bigram_filter)
    if query_result['counters']:
      char_bigram_counter = query_result['counters'][0]
      logging.info('Total char bigrams: %d', char_bigram_counter.result)
Example 9
def test_source_transform(self):
    path = self._write_data()
    with beam.Pipeline('DirectRunner') as p:
        assert_that(p | avroio.ReadFromAvro(path), equal_to(self.RECORDS))
Example 10
def model_custom_source(count):
    """Demonstrates creating a new custom source and using it in a pipeline.

  Defines a new source ``CountingSource`` that produces integers starting from 0
  up to a given size.

  Uses the new source in an example pipeline.

  Additionally demonstrates how a source should be implemented using a
  ``PTransform``. This is the recommended way to develop sources that are to
  be distributed to a large number of end users.

  This method runs two pipelines.

  (1) A pipeline that uses ``CountingSource`` directly using the ``df.Read``
      transform.
  (2) A pipeline that uses a custom ``PTransform`` that wraps
      ``CountingSource``.

  Args:
    count: the size of the counting source to be used in the pipeline
           demonstrated in this method.

  """

    # Using the source in an example pipeline.
    # [START model_custom_source_use_new_source]
    with beam.Pipeline(options=PipelineOptions()) as p:
        numbers = p | 'ProduceNumbers' >> beam.io.Read(CountingSource(count))
        # [END model_custom_source_use_new_source]

        lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
        assert_that(
            lines,
            equal_to(['line ' + str(number) for number in range(0, count)]))

    # We recommend users to start Source classes with an underscore to discourage
    # using the Source class directly when a PTransform for the source is
    # available. We simulate that here by simply extending the previous Source
    # class.
    class _CountingSource(CountingSource):
        pass

    # [START model_custom_source_new_ptransform]
    class ReadFromCountingSource(PTransform):
        def __init__(self, count, **kwargs):
            super(ReadFromCountingSource, self).__init__(**kwargs)
            self._count = count

        def expand(self, pcoll):
            return pcoll | iobase.Read(_CountingSource(self._count))

    # [END model_custom_source_new_ptransform]

    # [START model_custom_source_use_ptransform]
    p = beam.Pipeline(options=PipelineOptions())
    numbers = p | 'ProduceNumbers' >> ReadFromCountingSource(count)
    # [END model_custom_source_use_ptransform]

    lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
    assert_that(
        lines, equal_to(['line ' + str(number) for number in range(0, count)]))

    # Don't test runner api due to pickling errors.
    p.run(test_runner_api=False).wait_until_finish()
Example 11
def model_custom_sink(simplekv, KVs, final_table_name_no_ptransform,
                      final_table_name_with_ptransform):
    """Demonstrates creating a new custom sink and using it in a pipeline.

  Defines a new sink ``SimpleKVSink`` that demonstrates writing to a simple
  key-value based storage system which has following API.

    simplekv.connect(url) -
        connects to the storage system and returns an access token which can be
        used to perform further operations
    simplekv.open_table(access_token, table_name) -
        creates a table named 'table_name'. Returns a table object.
    simplekv.write_to_table(access_token, table, key, value) -
        writes a key-value pair to the given table.
    simplekv.rename_table(access_token, old_name, new_name) -
        renames the table named 'old_name' to 'new_name'.

  Uses the new sink in an example pipeline.

  Additionally demonstrates how a sink should be implemented using a
  ``PTransform``. This is the recommended way to develop sinks that are to be
  distributed to a large number of end users.

  This method runs two pipelines.

  (1) A pipeline that uses ``SimpleKVSink`` directly using the ``df.Write``
      transform.
  (2) A pipeline that uses a custom ``PTransform`` that wraps
      ``SimpleKVSink``.

  Args:
    simplekv: an object that mocks the key-value storage.

    KVs: the set of key-value pairs to be written in the example pipeline.

    final_table_name_no_ptransform: the prefix of final set of tables to be
                                    created by the example pipeline that uses
                                    ``SimpleKVSink`` directly.

    final_table_name_with_ptransform: the prefix of final set of tables to be
                                      created by the example pipeline that uses
                                      a ``PTransform`` that wraps
                                      ``SimpleKVSink``.
  """

    import apache_beam as beam
    from apache_beam.io import iobase
    from apache_beam.transforms.core import PTransform
    from apache_beam.options.pipeline_options import PipelineOptions
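
    # The ``simplekv`` argument is only required to provide the four functions
    # described in the docstring above. A minimal in-memory stand-in could look
    # like the sketch below (an illustrative assumption; the object the caller
    # actually passes in may be implemented differently, and this class is not
    # used by the pipelines that follow):
    class _InMemorySimpleKV(object):
        """Toy key-value 'service' keyed by access token, then table name."""

        def __init__(self):
            # access_token -> {table_name: {key: value}}
            self._stores = {}

        def connect(self, url):
            access_token = 'token-for-' + url
            self._stores.setdefault(access_token, {})
            return access_token

        def open_table(self, access_token, table_name):
            return self._stores[access_token].setdefault(table_name, {})

        def write_to_table(self, access_token, table, key, value):
            table[key] = value

        def rename_table(self, access_token, old_name, new_name):
            tables = self._stores[access_token]
            tables[new_name] = tables.pop(old_name)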

    # Defining the new sink.
    # [START model_custom_sink_new_sink]
    class SimpleKVSink(iobase.Sink):
        def __init__(self, url, final_table_name):
            self._url = url
            self._final_table_name = final_table_name

        def initialize_write(self):
            access_token = simplekv.connect(self._url)
            return access_token

        def open_writer(self, access_token, uid):
            table_name = 'table' + uid
            return SimpleKVWriter(access_token, table_name)

        def finalize_write(self, access_token, table_names):
            for i, table_name in enumerate(table_names):
                simplekv.rename_table(access_token, table_name,
                                      self._final_table_name + str(i))

    # [END model_custom_sink_new_sink]

    # Defining a writer for the new sink.
    # [START model_custom_sink_new_writer]
    class SimpleKVWriter(iobase.Writer):
        def __init__(self, access_token, table_name):
            self._access_token = access_token
            self._table_name = table_name
            self._table = simplekv.open_table(access_token, table_name)

        def write(self, record):
            key, value = record

            simplekv.write_to_table(self._access_token, self._table, key,
                                    value)

        def close(self):
            return self._table_name

    # [END model_custom_sink_new_writer]

    final_table_name = final_table_name_no_ptransform

    # Using the new sink in an example pipeline.
    # [START model_custom_sink_use_new_sink]
    with beam.Pipeline(options=PipelineOptions()) as p:
        kvs = p | 'CreateKVs' >> beam.Create(KVs)

        kvs | 'WriteToSimpleKV' >> beam.io.Write(
            SimpleKVSink('http://url_to_simple_kv/', final_table_name))
        # [END model_custom_sink_use_new_sink]

    # We recommend users to start Sink class names with an underscore to
    # discourage using the Sink class directly when a PTransform for the sink is
    # available. We simulate that here by simply extending the previous Sink
    # class.
    class _SimpleKVSink(SimpleKVSink):
        pass

    # [START model_custom_sink_new_ptransform]
    class WriteToKVSink(PTransform):
        def __init__(self, url, final_table_name, **kwargs):
            super(WriteToKVSink, self).__init__(**kwargs)
            self._url = url
            self._final_table_name = final_table_name

        def expand(self, pcoll):
            return pcoll | iobase.Write(
                _SimpleKVSink(self._url, self._final_table_name))

    # [END model_custom_sink_new_ptransform]

    final_table_name = final_table_name_with_ptransform

    # [START model_custom_sink_use_ptransform]
    with beam.Pipeline(options=PipelineOptions()) as p:
        kvs = p | 'CreateKVs' >> beam.core.Create(KVs)
        kvs | 'WriteToSimpleKV' >> WriteToKVSink('http://url_to_simple_kv/',
                                                 final_table_name)
Example 12
        for f in [18, 20, 21]:  #wheelson, crsarrtime, arrtime
            fields[f], arrtz = as_utc(fields[0], fields[f], arr_timezone)

        for f in [17, 18, 20, 21]:
            fields[f] = add_24h_if_before(fields[f], fields[14])

        fields.extend(airport_timezones[dep_airport_id])
        fields[-1] = str(deptz)
        fields.extend(airport_timezones[arr_airport_id])
        fields[-1] = str(arrtz)

        yield ','.join(fields)


if __name__ == '__main__':
    with beam.Pipeline('DirectRunner') as pipeline:

        airports = (pipeline
                    | 'airports:read' >> beam.io.ReadFromText('airports.csv.gz')
                    | 'airports:fields' >>
                    beam.Map(lambda line: next(csv.reader([line])))
                    | 'airports:tz' >>
                    beam.Map(lambda fields:
                             (fields[0], addtimezone(fields[21], fields[26]))))

        flights = (pipeline
                   | 'flights:read' >> beam.io.ReadFromText('201501_part.csv')
                   | 'flights:tzcorr' >> beam.FlatMap(
                       tz_correct, beam.pvalue.AsDict(airports)))
Example 13
def run(argv=None):
    """Main entry point; defines and runs the hourly_team_score pipeline."""
    parser = argparse.ArgumentParser()

    parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
    parser.add_argument('--subscription',
                        type=str,
                        help='Pub/Sub subscription to read from')
    parser.add_argument('--dataset',
                        type=str,
                        required=True,
                        help='BigQuery Dataset to write tables to. '
                        'Must already exist.')
    parser.add_argument(
        '--table_name',
        type=str,
        default='game_stats',
        help='The BigQuery table name. Should not already exist.')
    parser.add_argument('--fixed_window_duration',
                        type=int,
                        default=60,
                        help='Numeric value of fixed window duration for user '
                        'analysis, in minutes')
    parser.add_argument('--session_gap',
                        type=int,
                        default=5,
                        help='Numeric value of gap between user sessions, '
                        'in minutes')
    parser.add_argument(
        '--user_activity_window_duration',
        type=int,
        default=30,
        help='Numeric value of fixed window for finding mean of '
        'user session duration, in minutes')

    args, pipeline_args = parser.parse_known_args(argv)

    if args.topic is None and args.subscription is None:
        parser.print_usage()
        print(sys.argv[0] +
              ': error: one of --topic or --subscription is required')
        sys.exit(1)

    options = PipelineOptions(pipeline_args)

    # We also require the --project option to access --dataset
    if options.view_as(GoogleCloudOptions).project is None:
        parser.print_usage()
        print(sys.argv[0] + ': error: argument --project is required')
        sys.exit(1)

    fixed_window_duration = args.fixed_window_duration * 60
    session_gap = args.session_gap * 60
    user_activity_window_duration = args.user_activity_window_duration * 60

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    options.view_as(SetupOptions).save_main_session = True

    # Enforce that this pipeline is always run in streaming mode
    options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=options) as p:
        # Read game events from Pub/Sub using custom timestamps, which
        # are extracted from the data elements, and parse the data.
        if args.subscription:
            scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                subscription=args.subscription)
        else:
            scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                topic=args.topic)
        raw_events = (scores
                      | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
                      | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
                      | 'AddEventTimestamps' >>
                      beam.Map(lambda elem: beam.window.TimestampedValue(
                          elem, elem['timestamp'])))

        # Extract username/score pairs from the event stream
        user_events = (raw_events
                       | 'ExtractUserScores' >>
                       beam.Map(lambda elem: (elem['user'], elem['score'])))

        # Calculate the total score per user over fixed windows, and cumulative
        # updates for late data
        spammers_view = (
            user_events
            | 'UserFixedWindows' >> beam.WindowInto(
                beam.window.FixedWindows(fixed_window_duration))

            # Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
            # These might be robots/spammers.
            | 'CalculateSpammyUsers' >> CalculateSpammyUsers()

            # Derive a view from the collection of spammer users. It will be used as
            # a side input in calculating the team score sums, below
            | 'CreateSpammersView' >> beam.CombineGlobally(
                beam.combiners.ToDictCombineFn()).as_singleton_view())

        # [START filter_and_calc]
        # Calculate the total score per team over fixed windows, and emit cumulative
        # updates for late data. Uses the side input derived above --the set of
        # suspected robots-- to filter out scores from those users from the sum.
        # Write the results to BigQuery.
        (raw_events  # pylint: disable=expression-not-assigned
         | 'WindowIntoFixedWindows' >> beam.WindowInto(
             beam.window.FixedWindows(fixed_window_duration))

         # Filter out the detected spammer users, using the side input derived above
         | 'FilterOutSpammers' >> beam.Filter(
             lambda elem, spammers: elem['user'] not in spammers,
             spammers_view)
         # Extract and sum teamname/score pairs from the event data.
         | 'ExtractAndSumScore' >> ExtractAndSumScore('team')
         # [END filter_and_calc]
         | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
         | 'WriteTeamScoreSums' >> WriteToBigQuery(
             args.table_name + '_teams', args.dataset, {
                 'team': 'STRING',
                 'total_score': 'INTEGER',
                 'window_start': 'STRING',
                 'processing_time': 'STRING',
             },
             options.view_as(GoogleCloudOptions).project))

        # [START session_calc]
        # Detect user sessions-- that is, a burst of activity separated by a gap
        # from further activity. Find and record the mean session lengths.
        # This information could help the game designers track the changing user
        # engagement as their set of game changes.
        (user_events  # pylint: disable=expression-not-assigned
         | 'WindowIntoSessions' >> beam.WindowInto(
             beam.window.Sessions(session_gap),
             timestamp_combiner=beam.window.TimestampCombiner.OUTPUT_AT_EOW)

         # For this use, we care only about the existence of the session, not any
         # particular information aggregated over it, so we can just group by key
         # and assign a "dummy value" of None.
         | beam.CombinePerKey(lambda _: None)

         # Get the duration of the session
         | 'UserSessionActivity' >> beam.ParDo(UserSessionActivity())
         # [END session_calc]

         # [START rewindow]
         # Re-window to process groups of session sums according to when the
         # sessions complete
         | 'WindowToExtractSessionMean' >> beam.WindowInto(
             beam.window.FixedWindows(user_activity_window_duration))

         # Find the mean session duration in each window
         | beam.CombineGlobally(
             beam.combiners.MeanCombineFn()).without_defaults()
         | 'FormatAvgSessionLength' >>
         beam.Map(lambda elem: {'mean_duration': float(elem)})
         | 'WriteAvgSessionLength' >> WriteToBigQuery(
             args.table_name + '_sessions', args.dataset, {
                 'mean_duration': 'FLOAT',
             },
             options.view_as(GoogleCloudOptions).project))
class Oops(beam.DoFn):
    """Strips surrounding quotes from the 'name' field of a BigQuery row."""
    def process(self, element):
        record = element
        name = record.get('name')
        newname = name.strip('"')
        record['newname'] = newname
        return [record]


PROJECT_ID = os.environ['PROJECT_ID']

# Project ID is needed for BigQuery data source, even for local execution.
options = {'project': PROJECT_ID}

opts = beam.pipeline.PipelineOptions(flags=[], **options)

# Create a Pipeline using a local runner for execution.
with beam.Pipeline('DirectRunner', options=opts) as p:

    # Select data from the Business table in BigQuery
    query_results = p | 'Read from BigQuery' >> beam.io.Read(
        beam.io.BigQuerySource(
            query='SELECT * FROM dataset2.Business limit 100'))

    # write PCollection to log file
    query_results | 'Write to log 1' >> WriteToText('query_results.txt')

    # apply a ParDo to the PCollection
    Bis_pcoll = query_results | 'Normalize Business' >> beam.ParDo(Oops())

    # write PCollection to a file
    Bis_pcoll | 'Write File' >> WriteToText('Bus_output.txt')
Example 15
def testWriteIgnoresMissingKeys(self):
    with beam.Pipeline() as pipeline:
        test = pipeline | beam.Create(['test'])
        # PTransform is None so this will throw an exception if it tries to run
        _ = {'test': test} | writer.Write('key-does-not-exist', None)
def run():

    client_bq = bigquery.Client.from_service_account_json(args.local_sa_key, location=args.location)
    bigquery_asset_list = [
      ('logs', 'events_logs_function_native', 'event_ds'),
      ('logs', 'events_debug_function_native', 'event_ds'),
      ('logs', 'events_logs_dataflow_backfill', 'event_ds'),
      ('events', 'events_function_native', 'event_timestamp')]
    try:
        source_bigquery_assets(client_bq, bigquery_asset_list)
    except Exception:
        generate_bigquery_assets(client_bq, bigquery_asset_list)

    # https://github.com/apache/beam/blob/master/sdks/python/apache_beam/options/pipeline_options.py
    po = PipelineOptions()
    job_name = 'p1-gcs-to-bq-{method}-backfill-{environment_name}-{event_category}-{event_ds_start}-to-{event_ds_stop}-{event_time}-{ts}'.format(
      method=method, environment_name=environment_name, event_category=args.event_category.replace('_', '-'), event_ds_start=args.event_ds_start, event_ds_stop=args.event_ds_stop, event_time=time_part_name, ts=str(int(time.time())))
    # https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
    pipeline_options = po.from_dictionary({
      'project': args.gcp,
      'staging_location': 'gs://{bucket_name}/data_type=dataflow/batch/staging/{job_name}/'.format(bucket_name=args.bucket_name, job_name=job_name),
      'temp_location': 'gs://{bucket_name}/data_type=dataflow/batch/temp/{job_name}/'.format(bucket_name=args.bucket_name, job_name=job_name),
      'runner': args.execution_environment,  # {DirectRunner, DataflowRunner}
      'setup_file': args.setup_file,
      'service_account_email': 'dataflow-batch@{gcp_project_id}.iam.gserviceaccount.com'.format(gcp_project_id=args.gcp),
      'job_name': job_name,
      'region': args.gcp_region
      })
    pipeline_options.view_as(SetupOptions).save_main_session = True

    p1 = beam.Pipeline(options=pipeline_options)
    fileListGcs = (p1 | 'CreateGcsIterators' >> beam.Create(list(generate_gcs_file_list(args.bucket_name, environment_list, category_list, args.event_ds_start, args.event_ds_stop, time_part_list, args.scale_test_name)))
                      | 'GetGcsFileList' >> beam.ParDo(GetGcsFileList())
                      | 'GcsListPairWithOne' >> beam.Map(lambda x: (x, 1)))

    fileListBq = (p1 | 'ParseBqFileList' >> beam.io.Read(beam.io.BigQuerySource(
                        # "What is already in BQ?"
                        query=generate_backfill_query(
                          args.gcp,
                          method,
                          (safe_convert_list_to_sql_tuple(environment_list), environment_name),
                          (safe_convert_list_to_sql_tuple(category_list), category_name),
                          args.event_ds_start,
                          args.event_ds_stop,
                          (safe_convert_list_to_sql_tuple(time_part_list), time_part_name),
                          args.scale_test_name),
                        use_standard_sql=True))
                     | 'BqListPairWithOne' >> beam.Map(lambda x: (x['gspath'], 1)))


    parseList = ({'fileListGcs': fileListGcs, 'fileListBq': fileListBq}
                 | 'CoGroupByKey' >> beam.CoGroupByKey()
                 | 'UnionMinusIntersect' >> beam.Filter(lambda x: (len(x[1]['fileListGcs']) == 1 and len(x[1]['fileListBq']) == 0))
                 | 'ExtractKeysParseList' >> beam.Map(lambda x: x[0]))

    # Write to BigQuery:
    logsList = (parseList | 'AddParseInitiatedInfo' >> beam.Map(lambda gspath: {'job_name': job_name,
                                                                                'processed_timestamp': time.time(),
                                                                                'batch_id': hashlib.md5(gspath.encode('utf-8')).hexdigest(),
                                                                                'analytics_environment': parse_gspath(gspath, 'analytics_environment='),
                                                                                'event_category': parse_gspath(gspath, 'event_category='),
                                                                                'event_ds': parse_gspath(gspath, 'event_ds='),
                                                                                'event_time': parse_gspath(gspath, 'event_time='),
                                                                                'event': 'parse_initiated',
                                                                                'gspath': gspath})
                          | 'WriteParseInitiated' >> beam.io.WriteToBigQuery(table='events_logs_dataflow_backfill',
                                                                             dataset='logs',
                                                                             project=args.gcp,
                                                                             method='FILE_LOADS',
                                                                             create_disposition=beam.io.gcp.bigquery.BigQueryDisposition.CREATE_IF_NEEDED,
                                                                             write_disposition=beam.io.gcp.bigquery.BigQueryDisposition.WRITE_APPEND,
                                                                             insert_retry_strategy=beam.io.gcp.bigquery_tools.RetryStrategy.RETRY_ON_TRANSIENT_ERROR,
                                                                             schema='job_name:STRING,processed_timestamp:TIMESTAMP,batch_id:STRING,analytics_environment:STRING,event_category:STRING,event_ds:DATE,event_time:STRING,event:STRING,gspath:STRING'))

    # Write to Pub/Sub:
    PDone = (parseList | 'DumpParseListPubSub' >> beam.io.WriteToText('gs://{bucket_name}/data_type=dataflow/batch/output/{job_name}/parselist'.format(bucket_name=args.bucket_name, job_name=job_name))
                       | 'WriteToPubSub' >> beam.ParDo(WriteToPubSub(), job_name, args.topic, args.gcp, args.bucket_name))


    p1.run().wait_until_finish()
    return job_name
Example 17
    def testPreprocessingFn(self):
        schema_file = os.path.join(self._testdata_path,
                                   'schema_gen/schema.pbtxt')
        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
        feature_spec = taxi_utils_bqml._get_raw_feature_spec(schema)
        working_dir = self.get_temp_dir()
        transform_output_path = os.path.join(working_dir, 'transform_output')
        transformed_examples_path = os.path.join(working_dir,
                                                 'transformed_examples')

        # Run very simplified version of executor logic.
        # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
        # Generate legacy `DatasetMetadata` object.  Future version of Transform
        # will accept the `Schema` proto directly.
        legacy_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(feature_spec))
        decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema)
        with beam.Pipeline() as p:
            with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
                examples = (
                    p
                    | 'ReadTrainData' >> beam.io.ReadFromTFRecord(
                        os.path.join(self._testdata_path,
                                     'csv_example_gen/train/*'),
                        coder=beam.coders.BytesCoder(),
                        # TODO(b/114938612): Eventually remove this override.
                        validate=False)
                    | 'DecodeTrainData' >> beam.Map(decoder.decode))
                (transformed_examples, transformed_metadata), transform_fn = (
                    (examples, legacy_metadata)
                    | 'AnalyzeAndTransform' >>
                    tft_beam.AnalyzeAndTransformDataset(
                        taxi_utils_bqml.preprocessing_fn))

                # WriteTransformFn writes transform_fn and metadata to subdirectories
                # tensorflow_transform.SAVED_MODEL_DIR and
                # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
                # pylint: disable=expression-not-assigned
                (transform_fn
                 | 'WriteTransformFn' >>
                 tft_beam.WriteTransformFn(transform_output_path))

                encoder = tft.coders.ExampleProtoCoder(
                    transformed_metadata.schema)
                (transformed_examples
                 | 'EncodeTrainData' >> beam.Map(encoder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(transformed_examples_path,
                                  'train/transformed_examples.gz'),
                     coder=beam.coders.BytesCoder()))
                # pylint: enable=expression-not-assigned

        # Verify the output matches golden output.
        # NOTE: we don't verify that transformed examples match golden output.
        expected_transformed_schema = io_utils.parse_pbtxt_file(
            os.path.join(
                self._testdata_path,
                'transform/transform_output/transformed_metadata/schema.pbtxt'
            ), schema_pb2.Schema())
        transformed_schema = io_utils.parse_pbtxt_file(
            os.path.join(transform_output_path,
                         'transformed_metadata/schema.pbtxt'),
            schema_pb2.Schema())
        # Clear annotations so we only have to test main schema.
        for feature in transformed_schema.feature:
            feature.ClearField('annotation')
        transformed_schema.ClearField('annotation')
        self.assertEqual(transformed_schema, expected_transformed_schema)
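
The preprocessing_fn under test lives in taxi_utils_bqml and is not reproduced here. As a rough, hypothetical sketch of the shape such a tf.Transform function takes (the feature names and transforms below are illustrative assumptions, not the module's actual logic):

import tensorflow_transform as tft


def preprocessing_fn(inputs):
    """Hypothetical preprocessing_fn: scales one numeric feature and builds a
    vocabulary for one categorical feature."""
    outputs = {}
    outputs['trip_miles_scaled'] = tft.scale_to_z_score(inputs['trip_miles'])
    outputs['payment_type_id'] = tft.compute_and_apply_vocabulary(
        inputs['payment_type'])
    return outputs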
Example n. 18
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Ensure that the experiment flag is set explicitly by the user.
    debug_options = pipeline_options.view_as(DebugOptions)
    use_fn_api = (debug_options.experiments
                  and 'beam_fn_api' in debug_options.experiments)
    assert use_fn_api, (
        'Enable the beam_fn_api experiment in order to run this example.')

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    counts = (
        lines
        | 'split' >>
        (beam.ParDo(WordExtractingDoFn()).with_output_types(six.text_type))
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group_and_sum' >> beam.CombinePerKey(sum))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %s' % (word, count)

    # pylint: disable=unused-variable
    output = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned

    # TODO(BEAM-2887): Enable after the issue is fixed.
    # output | 'write' >> WriteToText(known_args.output)

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.committed)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.committed.mean)
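
WordExtractingDoFn is referenced above but defined elsewhere. A minimal sketch of such a DoFn, assuming it maintains the 'empty_lines' counter and 'word_len_dist' distribution that the metrics queries above look for; the original implementation may differ:

import re

import apache_beam as beam
from apache_beam.metrics import Metrics


class WordExtractingDoFn(beam.DoFn):
    """Hypothetical DoFn that splits lines into words and records metrics."""

    def __init__(self):
        self.empty_line_counter = Metrics.counter(self.__class__, 'empty_lines')
        self.word_length_dist = Metrics.distribution(self.__class__,
                                                     'word_len_dist')

    def process(self, element):
        if not element.strip():
            self.empty_line_counter.inc()
        for word in re.findall(r"[A-Za-z']+", element):
            self.word_length_dist.update(len(word))
            yield word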
Example n. 19
# Return True for data rows; the header row (which matches the column list) is dropped.
def is_data(text):
    header_str = ','.join(bq_table_columns)
    return text != header_str


# Convert the csv string into a python dictionary
def str_to_dict(text):
    vals_list = text.split(',')
    vals_dict = {}
    for val, col in zip(vals_list, bq_table_columns):
        vals_dict[col] = val
    return vals_dict


# Define pipeline steps in pipeline object (p)
with beam.Pipeline(options=options) as p:
    pipe = (
        p
        | "Input" >> beam.io.ReadFromText(
            'gs://fw-etl-tmp-prod/FWrates_tender_zip3_forecast_mu.csv')
        | "Remove Header" >> beam.Filter(is_data)
        | "Convert To Dict" >> beam.Map(str_to_dict)
        | "Load BQ table" >> beam.io.WriteToBigQuery(
            forecast_table_spec,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER)
        #| beam.Map(print)
    )
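
This snippet depends on bq_table_columns, forecast_table_spec, and options being defined earlier in the file. A hypothetical setup, only to make the shape of those objects concrete; column names, table IDs, and runner settings here are placeholders:

from apache_beam.options.pipeline_options import PipelineOptions

# Placeholder column list; the real job defines the actual BigQuery columns.
bq_table_columns = ['rate_date', 'origin_zip3', 'dest_zip3', 'forecast_mu']

# Placeholder table spec in 'project:dataset.table' form.
forecast_table_spec = 'my-project:freight.rates_forecast'

# Placeholder options; the real job presumably runs on Dataflow.
options = PipelineOptions(runner='DirectRunner')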
Example n. 20
def generate_examples(input_transform,
                      output_dir,
                      problem_name,
                      splits,
                      min_hop_size_seconds,
                      max_hop_size_seconds,
                      num_replications,
                      min_pitch,
                      max_pitch,
                      encode_performance_fn,
                      encode_score_fns=None,
                      augment_fns=None,
                      absolute_timing=False,
                      random_crop_length=None):
    """Generate data for a Score2Perf problem.

  Args:
    input_transform: The input PTransform object that reads input NoteSequence
        protos, or dictionary mapping split names to such PTransform objects.
        Should produce `(id, NoteSequence)` tuples.
    output_dir: The directory to write the resulting TFRecord file containing
        examples.
    problem_name: Name of the Tensor2Tensor problem, used as a base filename
        for generated data.
    splits: A dictionary of split names and their probabilities. Probabilities
        should add up to 1. If `input_transform` is a dictionary, this argument
        will be ignored.
    min_hop_size_seconds: Minimum hop size in seconds at which input
        NoteSequence protos can be split. Can also be a dictionary mapping split
        name to minimum hop size.
    max_hop_size_seconds: Maximum hop size in seconds at which input
        NoteSequence protos can be split. If zero or None, will not split at
        all. Can also be a dictionary mapping split name to maximum hop size.
    num_replications: Number of times input NoteSequence protos will be
        replicated prior to splitting.
    min_pitch: Minimum MIDI pitch value; notes with lower pitch will be dropped.
    max_pitch: Maximum MIDI pitch value; notes with greater pitch will be
        dropped.
    encode_performance_fn: Required performance encoding function.
    encode_score_fns: Optional dictionary of named score encoding functions.
    augment_fns: Optional list of data augmentation functions. Only applied in
        the 'train' split.
    absolute_timing: If True, each score will use absolute instead of tempo-
        relative timing. Since chord inference depends on having beats, the
        score will only contain melody.
    random_crop_length: If specified, crop each encoded performance to this
        length. Cannot be specified if using scores.

  Raises:
    ValueError: If split probabilities do not add up to 1, or if splits are not
        provided but `input_transform` is not a dictionary.
  """
    # Make sure Beam's log messages are not filtered.
    logging.getLogger().setLevel(logging.INFO)

    if isinstance(input_transform, dict):
        split_names = input_transform.keys()
    else:
        if not splits:
            raise ValueError(
                'Split probabilities must be provided if input is not presplit.'
            )
        split_names, split_probabilities = zip(*splits.items())
        cumulative_splits = list(
            zip(split_names, np.cumsum(split_probabilities)))
        if cumulative_splits[-1][1] != 1.0:
            raise ValueError('Split probabilities must sum to 1; got %f' %
                             cumulative_splits[-1][1])

    # Check for existence of prior outputs. Since the number of shards may be
    # different, the prior outputs will not necessarily be overwritten and must
    # be deleted explicitly.
    output_filenames = [
        os.path.join(output_dir, '%s-%s.tfrecord' % (problem_name, split_name))
        for split_name in split_names
    ]
    for split_name, output_filename in zip(split_names, output_filenames):
        existing_output_filenames = tf.gfile.Glob(output_filename + '*')
        if existing_output_filenames:
            tf.logging.info(
                'Data files already exist for split %s in problem %s, deleting.',
                split_name, problem_name)
            for filename in existing_output_filenames:
                tf.gfile.Remove(filename)

    pipeline_options = beam.options.pipeline_options.PipelineOptions(
        FLAGS.pipeline_options.split(','))

    with beam.Pipeline(options=pipeline_options) as p:
        if isinstance(input_transform, dict):
            # Input data is already partitioned into splits.
            split_partitions = [
                p | 'input_transform_%s' % split_name >>
                input_transform[split_name] for split_name in split_names
            ]
        else:
            # Read using a single PTransform.
            p |= 'input_transform' >> input_transform
            split_partitions = p | 'partition' >> beam.Partition(
                functools.partial(select_split, cumulative_splits),
                len(cumulative_splits))

        for split_name, output_filename, s in zip(split_names,
                                                  output_filenames,
                                                  split_partitions):
            if isinstance(min_hop_size_seconds, dict):
                min_hop = min_hop_size_seconds[split_name]
            else:
                min_hop = min_hop_size_seconds
            if isinstance(max_hop_size_seconds, dict):
                max_hop = max_hop_size_seconds[split_name]
            else:
                max_hop = max_hop_size_seconds
            s |= 'preshuffle_%s' % split_name >> beam.Reshuffle()
            s |= 'filter_invalid_notes_%s' % split_name >> beam.Map(
                functools.partial(filter_invalid_notes, min_pitch, max_pitch))
            s |= 'extract_examples_%s' % split_name >> beam.ParDo(
                ExtractExamplesDoFn(
                    min_hop, max_hop, num_replications if split_name == 'train'
                    else 1, encode_performance_fn, encode_score_fns,
                    augment_fns if split_name == 'train' else None,
                    absolute_timing, random_crop_length))
            s |= 'shuffle_%s' % split_name >> beam.Reshuffle()
            s |= 'write_%s' % split_name >> beam.io.WriteToTFRecord(
                output_filename,
                coder=beam.coders.ProtoCoder(tf.train.Example))
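
The split handling above turns the splits dict into cumulative probabilities and partitions keyed examples with select_split, which is defined elsewhere in the module. A minimal self-contained sketch of the idea; the hashing scheme here is an assumption, not necessarily the real implementation:

import hashlib

import numpy as np

splits = {'train': 0.8, 'eval': 0.1, 'test': 0.1}
split_names, split_probabilities = zip(*splits.items())
cumulative_splits = list(zip(split_names, np.cumsum(split_probabilities)))


def select_split(cumulative_splits, kv, unused_num_partitions):
    """Maps a (key, value) pair to a partition index by hashing the key."""
    key, _ = kv
    bucket = int(hashlib.sha256(key.encode('utf-8')).hexdigest(), 16) % 10**8
    position = bucket / float(10**8)
    for i, (_, cumulative_probability) in enumerate(cumulative_splits):
        if position < cumulative_probability:
            return i
    return len(cumulative_splits) - 1
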
import apache_beam as beam
import re

inputs_pattern = 'SalesJan2009.csv'
outputs_prefix = 'outputs/part'


class SplitWords(beam.DoFn):
    def __init__(self, header):
        self.header = header

    def process(self, text):
        yield text


# Running locally in the DirectRunner.
header = "Date,Product,Price,Card,Country"
with beam.Pipeline() as pipeline:
    (pipeline
     | 'Read lines' >> beam.io.ReadFromText(inputs_pattern)
     | 'Par Do' >> beam.ParDo(SplitWords(header))
     | 'Count elements' >> beam.combiners.Count.PerElement()
     #| 'Find words' >> beam.FlatMap(lambda line: re.split(",", line))
     | 'Format results' >> beam.Map(print)
     #| 'Write results' >> beam.io.WriteToText(outputs_prefix)
     )
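
As written, SplitWords passes each line through unchanged, so Count.PerElement counts whole lines rather than individual fields, and the header line is never removed. A sketch of a variant that does both; this is an alternative, not the original author's code:

import apache_beam as beam


class SplitFieldsAndDropHeader(beam.DoFn):
    """Hypothetical variant: drops the header line and emits one CSV field per output."""

    def __init__(self, header):
        self.header = header

    def process(self, text):
        if text.strip() == self.header:
            return  # skip the header row
        for field in text.split(','):
            yield field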
Example n. 22
  def create_pipeline(self):
    return beam.Pipeline(runner=fn_api_runner.FnApiRunner())

  def test_native_source(self):
    with beam.Pipeline(argv=self.args) as p:
      result = (p | 'read' >> beam.io.Read(
          beam.io.BigQuerySource(query=self.query,
                                 use_standard_sql=True)))
      assert_that(result, equal_to(self.get_expected_data()))
Example n. 24
  def create_pipeline(self):
    return beam.Pipeline(
        runner=fn_api_runner.FnApiRunner(
            default_environment=beam_runner_api_pb2.Environment(
                urn=python_urns.EMBEDDED_PYTHON_GRPC,
                payload=b'2')))
  def testEvaluateWithSlicingAndUncertainty(self):
    temp_eval_export_dir = self._getEvalExportDir()
    _, eval_export_dir = linear_classifier.simple_linear_classifier(
        None, temp_eval_export_dir)
    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=eval_export_dir,
        add_metrics_callbacks=[_addExampleCountMetricCallback])
    extractors = [
        predict_extractor.PredictExtractor(eval_shared_model),
        slice_key_extractor.SliceKeyExtractor([
            slicer.SingleSliceSpec(),
            slicer.SingleSliceSpec(columns=['slice_key'])
        ])
    ]

    for batch_size in [1, 2, 4, 8]:

      with beam.Pipeline() as pipeline:
        example1 = self._makeExample(
            age=3.0, language='english', label=1.0, slice_key='first_slice')
        example2 = self._makeExample(
            age=3.0, language='chinese', label=0.0, slice_key='first_slice')
        example3 = self._makeExample(
            age=4.0, language='english', label=0.0, slice_key='second_slice')
        example4 = self._makeExample(
            age=5.0, language='chinese', label=1.0, slice_key='second_slice')
        example5 = self._makeExample(
            age=5.0, language='chinese', label=1.0, slice_key='second_slice')

        (metrics, _), _ = (
            pipeline
            | 'Create' >> beam.Create([
                example1.SerializeToString(),
                example2.SerializeToString(),
                example3.SerializeToString(),
                example4.SerializeToString(),
                example5.SerializeToString(),
            ])
            | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
            | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
            | 'ComputeMetricsAndPlots' >>
            metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                eval_shared_model=eval_shared_model,
                desired_batch_size=batch_size,
                compute_confidence_intervals=True))

        def check_result(got):
          try:
            self.assertEqual(3, len(got), 'got: %s' % got)
            slices = {}
            for slice_key, value in got:
              slices[slice_key] = value
            overall_slice = ()
            first_slice = (('slice_key', b'first_slice'),)
            second_slice = (('slice_key', b'second_slice'),)
            self.assertCountEqual(
                list(slices.keys()), [overall_slice, first_slice, second_slice])
            self.assertDictElementsWithTDistributionAlmostEqual(
                slices[overall_slice], {
                    'accuracy': 0.4,
                    'label/mean': 0.6,
                    'my_mean_age': 4.0,
                    'my_mean_age_times_label': 2.6,
                    'added_example_count': 5.0
                })
            self.assertDictElementsWithTDistributionAlmostEqual(
                slices[first_slice], {
                    'accuracy': 1.0,
                    'label/mean': 0.5,
                    'my_mean_age': 3.0,
                    'my_mean_age_times_label': 1.5,
                    'added_example_count': 2.0
                })
            self.assertDictElementsWithTDistributionAlmostEqual(
                slices[second_slice], {
                    'accuracy': 0.0,
                    'label/mean': 2.0 / 3.0,
                    'my_mean_age': 14.0 / 3.0,
                    'my_mean_age_times_label': 10.0 / 3.0,
                    'added_example_count': 3.0
                })

          except AssertionError as err:
            # This function is redefined every iteration, so it will have the
            # right value of batch_size.
            raise util.BeamAssertException('batch_size = %d, error: %s' %
                                           (batch_size, err))  # pylint: disable=cell-var-from-loop

        util.assert_that(metrics, check_result, label='metrics')
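
The check_result / util.assert_that pairing above is a general pattern for asserting on a PCollection's materialized contents. A minimal self-contained sketch of the same pattern outside of TFMA:

import apache_beam as beam
from apache_beam.testing import util


def check_contents(got):
    # `got` is the full list of elements in the asserted PCollection.
    if sorted(got) != [1, 2, 3]:
        raise util.BeamAssertException('unexpected contents: %s' % got)


with beam.Pipeline() as pipeline:
    numbers = pipeline | beam.Create([3, 1, 2])
    util.assert_that(numbers, check_contents, label='check_numbers')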
Example n. 26
  def create_pipeline(self):
    return beam.Pipeline(
        runner=fn_api_runner.FnApiRunner(bundle_repeat=3))
  def testEvaluateWithPlots(self):
    temp_eval_export_dir = self._getEvalExportDir()
    _, eval_export_dir = (
        fixed_prediction_estimator.simple_fixed_prediction_estimator(
            None, temp_eval_export_dir))
    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=eval_export_dir,
        add_metrics_callbacks=[
            post_export_metrics.example_count(),
            post_export_metrics.auc_plots()
        ])
    extractors = [
        predict_extractor.PredictExtractor(eval_shared_model),
        slice_key_extractor.SliceKeyExtractor()
    ]

    with beam.Pipeline() as pipeline:
      example1 = self._makeExample(prediction=0.0, label=1.0)
      example2 = self._makeExample(prediction=0.7, label=0.0)
      example3 = self._makeExample(prediction=0.8, label=1.0)
      example4 = self._makeExample(prediction=1.0, label=1.0)

      (metrics, plots), _ = (
          pipeline
          | 'Create' >> beam.Create([
              example1.SerializeToString(),
              example2.SerializeToString(),
              example3.SerializeToString(),
              example4.SerializeToString()
          ])
          | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
          | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
          | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator
          .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

      def check_metrics(got):
        try:
          self.assertEqual(1, len(got), 'got: %s' % got)
          (slice_key, value) = got[0]
          self.assertEqual((), slice_key)
          self.assertDictElementsAlmostEqual(
              got_values_dict=value,
              expected_values_dict={
                  metric_keys.EXAMPLE_COUNT: 4.0,
              })
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(metrics, check_metrics, label='metrics')

      def check_plots(got):
        try:
          self.assertEqual(1, len(got), 'got: %s' % got)
          (slice_key, value) = got[0]
          self.assertEqual((), slice_key)
          self.assertDictMatrixRowsAlmostEqual(
              got_values_dict=value,
              expected_values_dict={
                  metric_keys.AUC_PLOTS_MATRICES: [
                      (8001, [2, 1, 0, 1, 1.0 / 1.0, 1.0 / 3.0])
                  ],
              })
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(plots, check_plots, label='plots')
Example n. 28
import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText


class AppendDoFn(beam.DoFn):
    def process(self, element):
        # process() must return an iterable of outputs; yield the single result
        # instead of returning a bare string (which Beam would iterate char by char).
        yield element + " - Hello World!"


parser = argparse.ArgumentParser()
parser.add_argument('--input',
                    dest='input',
                    default='gs://dataflow-samples/shakespeare/kinglear.txt')
parser.add_argument(
    '--output',
    dest='output',
    default='gs://dsp_model_store_famenor/shakespeare/kinglear.txt')

known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)

p = beam.Pipeline(options=pipeline_options)
lines = p | 'read' >> ReadFromText(known_args.input)
appended = lines | 'append' >> beam.ParDo(AppendDoFn())
appended | 'write' >> WriteToText(known_args.output)

result = p.run()
result.wait_until_finish()
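
For a one-to-one transform like this, beam.Map is usually the more direct choice and sidesteps the requirement that DoFn.process return an iterable. A small self-contained sketch of the same append expressed as a Map:

import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | 'create' >> beam.Create(['to be', 'or not to be'])
     | 'append' >> beam.Map(lambda line: line + ' - Hello World!')
     | 'print' >> beam.Map(print))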
Example n. 29
# run pipeline on Dataflow
options = {
    'runner': 'DataflowRunner',
    'job_name': 'nomination-count-10',
    'project': PROJECT_ID,
    'temp_location': BUCKET + '/temp',
    'staging_location': BUCKET + '/staging',
    # Machine types are listed at https://cloud.google.com/compute/docs/machine-types
    'machine_type': 'n1-standard-1',
    'num_workers': 1
}

opts = PipelineOptions(flags=[], **options)

with beam.Pipeline('DataflowRunner', options=opts) as p:

    # create PCollection from the file contents
    in_pcoll = p | 'Read File' >> ReadFromText(DIR_PATH_IN + 'oscars_data.tsv')

    # apply a ParDo to the PCollection
    out_pcoll = in_pcoll | 'Extract Actor and Actress' >> beam.ParDo(
        ActorActressCountFn()).with_outputs(
            ActorActressCountFn.OUTPUT_TAG_ACTOR_COUNT,
            ActorActressCountFn.OUTPUT_TAG_ACTRESS_COUNT)

    actor_pcoll = out_pcoll[ActorActressCountFn.OUTPUT_TAG_ACTOR_COUNT]
    actress_pcoll = out_pcoll[ActorActressCountFn.OUTPUT_TAG_ACTRESS_COUNT]

    # write PCollections to files
    actor_pcoll | 'Write Actor File 1' >> WriteToText(DIR_PATH_OUT +
Example n. 30
def pipeline(config_map, dataset_config_map, preprocess_example_fn,
             input_tensors_to_example_fn):
    """Pipeline for dataset creation."""
    tf.flags.mark_flags_as_required(['output_directory'])

    pipeline_options = beam.options.pipeline_options.PipelineOptions(
        FLAGS.pipeline_options.split(','))

    config = config_map[FLAGS.config]
    hparams = config.hparams
    hparams.parse(FLAGS.hparams)

    datasets = dataset_config_map[FLAGS.dataset_config]

    if tf.gfile.Exists(FLAGS.output_directory):
        raise ValueError('Output directory %s already exists!' %
                         FLAGS.output_directory)
    tf.gfile.MakeDirs(FLAGS.output_directory)
    with tf.gfile.Open(os.path.join(FLAGS.output_directory, 'config.txt'),
                       'w') as f:
        f.write('\n\n'.join([
            'min_length: {}'.format(FLAGS.min_length),
            'max_length: {}'.format(FLAGS.max_length),
            'sample_rate: {}'.format(FLAGS.sample_rate),
            'preprocess_examples: {}'.format(FLAGS.preprocess_examples),
            'preprocess_train_example_multiplier: {}'.format(
                FLAGS.preprocess_train_example_multiplier),
            'config: {}'.format(FLAGS.config),
            'hparams: {}'.format(hparams.to_json(sort_keys=True)),
            'dataset_config: {}'.format(FLAGS.dataset_config),
            'datasets: {}'.format(datasets),
        ]))

    with beam.Pipeline(options=pipeline_options) as p:
        for dataset in datasets:
            if isinstance(dataset.path, (list, tuple)):
                # If dataset.path is a list, then it's a list of sources to mix together
                # to form new examples. First, do the mixing, then pass the results to
                # the rest of the pipeline.
                id_exs = []
                sourceid_to_exids = []
                for source_id, stem_path in enumerate(dataset.path):
                    if dataset.num_mixes is None:
                        raise ValueError(
                            'If path is a list, num_mixes must not be None: {}'
                            .format(dataset))
                    stem_p = p | 'tfrecord_list_%s_%d' % (
                        dataset.name, source_id) >> (beam.Create(
                            data.generate_sharded_filenames(stem_path)))

                    # Note that we do not specify a coder when reading here.
                    # This is so that the hashing in key_example below can work directly
                    # on the serialized version instead of having to re-serialize it.
                    # Also, deserializing with a coder and then re-serializing does not
                    # always generate the same hash for the same example (likely due to
                    # the map fields in tf.train.Example). This is important when reading
                    # the same dataset multiple times to mix it with itself.
                    stem_p |= 'read_tfrecord_%s_%d' % (
                        dataset.name, source_id) >> (
                            beam.io.tfrecordio.ReadAllFromTFRecord())
                    stem_p |= 'shuffle_stems_%s_%d' % (
                        dataset.name, source_id) >> (beam.Reshuffle())

                    # Key all examples with a hash.
                    def key_example(ex):
                        return (hashlib.sha256(ex).hexdigest(), ex)

                    stem_p |= 'add_id_key_%s_%d' % (
                        dataset.name, source_id) >> (beam.Map(key_example))
                    id_exs.append(stem_p)

                    # Create a list of source_id to example id.
                    def sourceid_to_exid(id_ex, source_id):
                        return (source_id, id_ex[0])

                    sourceid_to_exids.append(
                        stem_p | 'key_%s_%d' % (dataset.name, source_id) >>
                        (beam.Map(sourceid_to_exid, source_id=source_id)))

                # ('example_hash', serialized_example)
                id_exs = (
                    id_exs
                    | 'id_exs_flatten_%s' % dataset.name >> beam.Flatten()
                    | 'id_exs_distinct_%s' % dataset.name >> beam.Distinct())

                # ('source_id, 'example_hash')
                sourceid_to_exids = (sourceid_to_exids
                                     | 'sourceid_to_exids_flatten_%s' %
                                     dataset.name >> beam.Flatten())

                # Pass the list of source id to example IDs to generate_mixes,
                # which will create mixes by selecting random IDs from each source
                # (with replacement). This is represented as a list of example IDs
                # to Mix IDs.
                # Note: beam.Create([0]) is just a single dummy value to allow the
                # sourceid_to_exids to be passed in as a python list so we can do the
                # sampling with numpy.
                exid_to_mixids = (
                    p
                    | 'create_dummy_%s' % dataset.name >> beam.Create([0])
                    | 'generate_mixes_%s' % dataset.name >> beam.Map(
                        create_dataset_lib.generate_mixes,
                        num_mixes=dataset.num_mixes,
                        sourceid_to_exids=beam.pvalue.AsList(
                            sourceid_to_exids)))

                # Create a list of (Mix ID, Full Example proto). Note: Examples may be
                # present in more than one mix. Then, group by Mix ID.
                def mixid_to_exs(id_ex, exid_to_mixids):
                    exid, ex = id_ex
                    for mixid in exid_to_mixids[exid]:
                        yield mixid, ex

                mixid_exs = (
                    id_exs
                    | 'mixid_to_exs_%s' % dataset.name >> beam.FlatMap(
                        mixid_to_exs,
                        exid_to_mixids=beam.pvalue.AsSingleton(exid_to_mixids))
                    | 'group_by_key_%s' % dataset.name >> beam.GroupByKey())
                # Take these groups of Examples, mix their audio and sequences to return
                # a single new Example. Then, carry on with the rest of the pipeline
                # like normal.
                split_p = (mixid_exs
                           | 'mix_examples_%s' % dataset.name >> beam.Map(
                               mix_examples, FLAGS.sample_rate,
                               FLAGS.load_audio_with_librosa))
            else:
                if dataset.num_mixes is not None:
                    raise ValueError(
                        'If path is not a list, num_mixes must be None: {}'.
                        format(dataset))
                split_p = p | 'tfrecord_list_%s' % dataset.name >> beam.Create(
                    data.generate_sharded_filenames(dataset.path))
                split_p |= 'read_tfrecord_%s' % dataset.name >> (
                    beam.io.tfrecordio.ReadAllFromTFRecord(
                        coder=beam.coders.ProtoCoder(tf.train.Example)))
            split_p |= 'shuffle_input_%s' % dataset.name >> beam.Reshuffle()
            split_p |= 'split_wav_%s' % dataset.name >> beam.FlatMap(
                split_wav,
                min_length=FLAGS.min_length,
                max_length=FLAGS.max_length,
                sample_rate=FLAGS.sample_rate,
                debug_output_directory=FLAGS.output_directory,
                split_example=dataset.process_for_training,
                load_audio_with_librosa=FLAGS.load_audio_with_librosa)
            if FLAGS.preprocess_examples:
                if dataset.process_for_training:
                    mul_name = 'preprocess_multiply_%dx_%s' % (
                        FLAGS.preprocess_train_example_multiplier,
                        dataset.name)
                    split_p |= mul_name >> beam.FlatMap(
                        multiply_example,
                        FLAGS.preprocess_train_example_multiplier)
                split_p |= 'preprocess_%s' % dataset.name >> beam.Map(
                    preprocess_data, preprocess_example_fn,
                    input_tensors_to_example_fn, hparams,
                    dataset.process_for_training)
            split_p |= 'shuffle_output_%s' % dataset.name >> beam.Reshuffle()
            split_p |= 'write_%s' % dataset.name >> beam.io.WriteToTFRecord(
                os.path.join(FLAGS.output_directory,
                             '%s.tfrecord' % dataset.name),
                coder=beam.coders.ProtoCoder(tf.train.Example))
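
The beam.Create([0]) dummy element plus beam.pvalue.AsList trick used for generate_mixes materializes an entire PCollection as a Python list inside a single Map call. A minimal self-contained sketch of that pattern:

import apache_beam as beam


def summarize(_, all_pairs):
    # `all_pairs` is the whole side-input PCollection, materialized as a list.
    return dict(all_pairs)


with beam.Pipeline() as p:
    pairs = p | 'pairs' >> beam.Create([('a', 1), ('b', 2)])
    (p
     | 'dummy' >> beam.Create([0])  # one dummy element drives exactly one call
     | 'summarize' >> beam.Map(summarize,
                               all_pairs=beam.pvalue.AsList(pairs))
     | 'print' >> beam.Map(print))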