def run_metrics():
  """Builds a Beam pipeline that computes wordpiece-vocab metrics for a corpus.

  Reads `tf.train.Example` records from the module-level `data_file`,
  wordpiece-tokenizes the text via TF Transform, aggregates per-language
  statistics, and writes the resulting metrics as a single CSV to
  `metrics_file`.

  Returns:
    The constructed (not yet run) `beam.Pipeline`.
  """
  pipeline = beam.Pipeline()
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Decode the TFRecord input into TF Transform's in-memory dict format.
    decoded_examples = (
        pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(example_converter.decode))

    # Analyze-and-transform wordpiece-tokenizes the text column; only the
    # transformed data is needed here, so the metadata and transform_fn
    # outputs are discarded.
    (tokenized, _), _ = (
        (decoded_examples, raw_metadata)
        | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
            utils.metrics_preprocessing_fn(FLAGS.vocab_file, FLAGS.text_key,
                                           FLAGS.language_code_key)))

    # Aggregate per-language stats, derive the metrics, and emit one
    # header-prefixed, unsharded CSV file.
    csv_coder = tft.coders.CsvCoder(columns, csv_schema)
    _ = (
        tokenized
        | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
        | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
        | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
        | 'EncodeMetrics' >> beam.Map(csv_coder.encode)
        | 'WriteMetrics' >> beam.io.WriteToText(
            metrics_file, shard_name_template='', header=','.join(columns)))
  return pipeline
def setUp(self):
  """Builds the aggregator under test and two English sample records."""
  super(AggregateLangTest, self).setUp()
  self.aggregator = utils.AggregateLang()
  # Two pre-tokenized English examples with per-example token stats.
  first_record = {
      'lang': 'en',
      'count': 1,
      'num_preserved_chars': 13,
      'num_dropped_chars': 2,
      'num_non_unk_wordpieces': 4,
      'preserved_ratio': [13 / 4],
      'dropped_ratio': [2 / 15],
      'wordpieces': collections.Counter(['the', 'app', '##le', 'sauce']),
  }
  second_record = {
      'lang': 'en',
      'count': 1,
      'num_preserved_chars': 11,
      'num_dropped_chars': 0,
      'num_non_unk_wordpieces': 4,
      'preserved_ratio': [11 / 4],
      'dropped_ratio': [0],
      'wordpieces': collections.Counter(['the', 'app', 'st', '##ore']),
  }
  self.sample_input = [first_record, second_record]
def calculate_metrics():
  """Builds a Beam pipeline computing wordpiece stats for a vocab and corpus.

  Reads `tf.train.Example` records from `FLAGS.input_file`, tokenizes the
  text with the wordpiece vocab in `FLAGS.vocab_file` via TF Transform,
  aggregates statistics per language, and writes the metrics as a single
  CSV to `FLAGS.output_file`.

  Returns:
    The constructed (not yet run) `beam.Pipeline`.
  """
  # Feature spec of the raw input dataset.
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))

  # Feature spec used to encode the computed metrics as CSV rows.
  metrics_csv_schema = dataset_schema.from_feature_spec({
      'lang': tf.FixedLenFeature([], tf.string),
      'sample_count': tf.FixedLenFeature([], tf.int64),
      'micro_drop_char_percent': tf.FixedLenFeature([], tf.string),
      'macro_drop_char_percent': tf.FixedLenFeature([], tf.string),
      'micro_compress_ratio': tf.FixedLenFeature([], tf.string),
      'macro_compress_ratio': tf.FixedLenFeature([], tf.string),
      'unweighted_en_wp_overlap_percent': tf.FixedLenFeature([], tf.string),
      'weighted_en_wp_overlap_percent': tf.FixedLenFeature([], tf.string),
  })
  # Output column order for the CSV header and rows.
  csv_columns = [
      'lang', 'sample_count', 'micro_drop_char_percent',
      'macro_drop_char_percent', 'micro_compress_ratio',
      'macro_compress_ratio', 'unweighted_en_wp_overlap_percent',
      'weighted_en_wp_overlap_percent'
  ]

  pipeline = beam.Pipeline()
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    proto_coder = tft.coders.ExampleProtoCoder(input_metadata.schema,
                                               serialized=False)
    csv_coder = tft.coders.CsvCoder(csv_columns, metrics_csv_schema)

    # Decode the TFRecord input into TF Transform's in-memory dict format.
    decoded_examples = (
        pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            FLAGS.input_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(proto_coder.decode))

    # Analyze-and-transform wordpiece-tokenizes the text column; metadata
    # and transform_fn outputs are not needed downstream.
    (tokenized, _), _ = (
        (decoded_examples, input_metadata)
        | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
            utils.metrics_preprocessing_fn(
                FLAGS.vocab_file, FLAGS.text_key, FLAGS.language_code_key)))

    # Aggregate per-language stats, derive the metrics, and emit one
    # header-prefixed, unsharded CSV file.
    _ = (
        tokenized
        | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
        | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
        | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
        | 'EncodeMetrics' >> beam.Map(csv_coder.encode)
        | 'WriteMetrics' >> beam.io.WriteToText(
            FLAGS.output_file, shard_name_template='',
            header=','.join(csv_columns)))
  return pipeline