def run_metrics():
  """Creates a pipeline to measure wordpiece vocab metrics over a corpus."""
  # data_file, metrics_file, example_converter, raw_metadata, columns, and
  # csv_schema are assumed to be defined at module level; calculate_metrics
  # below shows how they can be constructed.
  metrics_pipeline = beam.Pipeline()
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        metrics_pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(example_converter.decode))

    # Apply transform to wordpiece-tokenize input.
    (metrics_transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
            utils.metrics_preprocessing_fn(FLAGS.vocab_file, FLAGS.text_key,
                                           FLAGS.language_code_key)))

    # Initialize the CSV coder, aggregate values for each lang, calculate
    # metrics, and write the output to a CSV file.
    csv_converter = tft.coders.CsvCoder(columns, csv_schema)
    _ = (
        metrics_transformed_data
        | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
        | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
        | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
        | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
        | 'WriteMetrics' >> beam.io.WriteToText(
            metrics_file, shard_name_template='', header=','.join(columns)))
  return metrics_pipeline
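# A minimal usage sketch (an assumption, not in the original source):
# run_metrics() only builds the pipeline, so the caller must execute it.
# Running with the default (direct) runner is also an assumption.
pipeline_result = run_metrics().run()
pipeline_result.wait_until_finish()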
def setUp(self):
  super(CalculateMetricsTest, self).setUp()
  self.info_dict = {
      'en': {
          'count': 2,
          'num_preserved_chars': 24,
          'num_dropped_chars': 2,
          'num_non_unk_wordpieces': 8,
          'preserved_ratio': [2, 3],
          'dropped_ratio': [0.5, 0],
          'wordpieces':
              collections.Counter({
                  'the': 2,
                  'le': 1,
                  '##sson': 1,
                  'plan': 1,
                  '##s': 1
              })
      },
      'fr': {
          'count': 2,
          'num_preserved_chars': 24,
          'num_dropped_chars': 2,
          'num_non_unk_wordpieces': 8,
          'preserved_ratio': [5, 7],
          'dropped_ratio': [0.4, 0.6],
          'wordpieces':
              collections.Counter({
                  'bon': 2,
                  'le': 2,
                  'jour': 1,
                  'soir': 1,
                  'homme': 1
              })
      }
  }
  self.metrics = utils.CalculateMetrics()
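# A hypothetical companion test, not from the original file. It assumes
# utils.CalculateMetrics is a beam.DoFn whose process() takes the aggregated
# per-language dict (the shape built in setUp above) and yields one metrics
# row per language, each carrying a 'lang' key matching the CSV schema.
def test_process_yields_one_row_per_language(self):
  rows = list(self.metrics.process(self.info_dict))
  self.assertEqual({row['lang'] for row in rows}, {'en', 'fr'})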
import tempfile

from absl import flags
import apache_beam as beam
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

import utils  # Local helpers: metrics_preprocessing_fn and the DoFns below.

FLAGS = flags.FLAGS


def calculate_metrics():
  """Returns a pipeline to compute wordpiece model stats given a vocab and corpus."""
  # Schema of input dataset.
  raw_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))
  # Schema to format metrics as CSV.
  csv_schema = dataset_schema.from_feature_spec({
      'lang': tf.FixedLenFeature([], tf.string),
      'sample_count': tf.FixedLenFeature([], tf.int64),
      'micro_drop_char_percent': tf.FixedLenFeature([], tf.string),
      'macro_drop_char_percent': tf.FixedLenFeature([], tf.string),
      'micro_compress_ratio': tf.FixedLenFeature([], tf.string),
      'macro_compress_ratio': tf.FixedLenFeature([], tf.string),
      'unweighted_en_wp_overlap_percent': tf.FixedLenFeature([], tf.string),
      'weighted_en_wp_overlap_percent': tf.FixedLenFeature([], tf.string),
  })
  columns = [
      'lang', 'sample_count', 'micro_drop_char_percent',
      'macro_drop_char_percent', 'micro_compress_ratio',
      'macro_compress_ratio', 'unweighted_en_wp_overlap_percent',
      'weighted_en_wp_overlap_percent'
  ]

  # Create the pipeline.
  pipeline = beam.Pipeline()
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    example_converter = tft.coders.ExampleProtoCoder(
        raw_metadata.schema, serialized=False)
    csv_converter = tft.coders.CsvCoder(columns, csv_schema)

    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            FLAGS.input_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(example_converter.decode))

    # Apply transform to wordpiece-tokenize input.
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
            utils.metrics_preprocessing_fn(FLAGS.vocab_file, FLAGS.text_key,
                                           FLAGS.language_code_key)))

    # Aggregate values for each lang, calculate metrics, and write to output.
    _ = (
        transformed_data
        | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
        | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
        | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
        | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
        | 'WriteMetrics' >> beam.io.WriteToText(
            FLAGS.output_file,
            shard_name_template='',
            header=','.join(columns)))
  return pipeline
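# A sketch of the script scaffolding this function assumes, with hypothetical
# flag help strings and defaults; only the flag names (input_file,
# output_file, vocab_file, text_key, language_code_key) appear in the
# original code.
from absl import app

flags.DEFINE_string('input_file', None,
                    'TFRecord file of tf.train.Example protos to analyze.')
flags.DEFINE_string('output_file', None, 'Path for the CSV metrics output.')
flags.DEFINE_string('vocab_file', None, 'Wordpiece vocabulary file.')
flags.DEFINE_string('text_key', 'text', 'Feature key holding the text.')
flags.DEFINE_string('language_code_key', 'language_code',
                    'Feature key holding the language code.')


def main(_):
  pipeline = calculate_metrics()
  pipeline.run().wait_until_finish()


if __name__ == '__main__':
  app.run(main)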