Exemple #1
0
  def run_metrics():
    """Builds (without running) the Beam pipeline that scores a wordpiece vocab.

    The pipeline reads tf.train.Example records from `data_file`, decodes them
    with `example_converter`, runs them through the transform produced by
    `utils.metrics_preprocessing_fn`, pushes the result through the
    `utils.CompileTokenizationInfo`, `utils.AggregateLang` and
    `utils.CalculateMetrics` stages, and writes CSV-encoded rows to
    `metrics_file`. `data_file`, `raw_metadata`, `example_converter`,
    `columns`, `csv_schema` and `metrics_file` are taken from the enclosing
    scope.

    Returns:
      The constructed `beam.Pipeline`; the caller is responsible for running it.
    """
    pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      # Source stage: TFRecord file of tf.train.Example protos, decoded into
      # the dict representation expected by TF Transform.
      read_examples = beam.io.tfrecordio.ReadFromTFRecord(
          data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
      decoded_examples = (
          pipeline
          | 'ReadInputData' >> read_examples
          | 'DecodeInputData' >> beam.Map(example_converter.decode))

      # Tokenization stage: analyze + transform the decoded dataset with the
      # metrics preprocessing function configured from the command-line flags.
      preprocessing_fn = utils.metrics_preprocessing_fn(
          FLAGS.vocab_file, FLAGS.text_key, FLAGS.language_code_key)
      tokenize = tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
      transformed_dataset, _ = (
          (decoded_examples, raw_metadata)
          | 'WordpieceTokenizeInput' >> tokenize)
      # Only the transformed data is needed; its metadata and the transform
      # function are discarded.
      tokenized_examples, _ = transformed_dataset

      # Metrics stages: per-example info -> global combine -> final metrics.
      csv_coder = tft.coders.CsvCoder(columns, csv_schema)
      token_info = (
          tokenized_examples
          | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo()))
      combined_stats = (
          token_info
          | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang()))
      metric_rows = (
          combined_stats
          | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics()))

      # Sink stage: encode each row as CSV and write it under a header line
      # made of the column names. shard_name_template is deliberately empty.
      write_csv = beam.io.WriteToText(
          metrics_file, shard_name_template='', header=','.join(columns))
      _ = (
          metric_rows
          | 'EncodeMetrics' >> beam.Map(csv_coder.encode)
          | 'WriteMetrics' >> write_csv)

    return pipeline
Exemple #2
0
 def testTwoLangs(self):
   # Feeds self.sample_input through utils.CompileTokenizationInfo and expects
   # exactly one compiled record per language ('en' and 'fr').
   expected_en = {
       'lang': 'en',
       'count': 1,
       'num_preserved_chars': 13,
       'num_dropped_chars': 2,
       'num_non_unk_wordpieces': 4,
       'preserved_ratio': [13/4],
       'dropped_ratio': [2/15],
       'wordpieces': collections.Counter(['the', 'app', '##le', 'sauce']),
   }
   expected_fr = {
       'lang': 'fr',
       'count': 1,
       'num_preserved_chars': 14,
       'num_dropped_chars': 0,
       'num_non_unk_wordpieces': 5,
       'preserved_ratio': [14/5],
       'dropped_ratio': [0],
       'wordpieces': collections.Counter(['bon', '##jour', 'bon', '##soir']),
   }

   with TestPipeline() as p:
     tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
     compiled = tokens | beam.ParDo(utils.CompileTokenizationInfo())
     # The assertion is evaluated when the pipeline runs on leaving the block.
     assert_that(compiled, equal_to([expected_en, expected_fr]))
def calculate_metrics():
    """Build the Beam pipeline that computes wordpiece stats for a corpus.

    The pipeline reads tf.train.Example records from ``FLAGS.input_file``,
    tokenizes them with the transform built by
    ``utils.metrics_preprocessing_fn`` (configured from ``FLAGS.vocab_file``,
    ``FLAGS.text_key`` and ``FLAGS.language_code_key``), runs the
    ``utils.CompileTokenizationInfo`` / ``utils.AggregateLang`` /
    ``utils.CalculateMetrics`` stages, and writes CSV-encoded rows to
    ``FLAGS.output_file``.

    Returns:
        The constructed ``beam.Pipeline``. It is not run here.
    """
    # Output columns, in the order they appear in the CSV header and rows.
    columns = [
        'lang',
        'sample_count',
        'micro_drop_char_percent',
        'macro_drop_char_percent',
        'micro_compress_ratio',
        'macro_compress_ratio',
        'unweighted_en_wp_overlap_percent',
        'weighted_en_wp_overlap_percent',
    ]

    # Schema of the input dataset: two scalar string features.
    # NOTE(review): these feature names are hard-coded, while tokenization
    # below is keyed by FLAGS.text_key / FLAGS.language_code_key -- confirm
    # the flag values are expected to match 'text' / 'language_code'.
    input_feature_spec = {
        'text': tf.FixedLenFeature([], tf.string),
        'language_code': tf.FixedLenFeature([], tf.string),
    }
    raw_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(input_feature_spec))

    # Schema used to format metrics as CSV: every column is a scalar string
    # except 'sample_count', which is a scalar int64.
    csv_feature_spec = {
        name: tf.FixedLenFeature(
            [], tf.int64 if name == 'sample_count' else tf.string)
        for name in columns
    }
    csv_schema = dataset_schema.from_feature_spec(csv_feature_spec)

    pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        example_converter = tft.coders.ExampleProtoCoder(
            raw_metadata.schema, serialized=False)
        csv_converter = tft.coders.CsvCoder(columns, csv_schema)

        # Read raw tf.train.Example protos and decode them into the dict
        # form expected by TF Transform.
        read_examples = beam.io.tfrecordio.ReadFromTFRecord(
            FLAGS.input_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        decoded_examples = (
            pipeline
            | 'ReadInputData' >> read_examples
            | 'DecodeInputData' >> beam.Map(example_converter.decode))

        # Wordpiece-tokenize the input; keep only the transformed data and
        # drop its metadata and the transform function.
        preprocessing_fn = utils.metrics_preprocessing_fn(
            FLAGS.vocab_file, FLAGS.text_key, FLAGS.language_code_key)
        tokenize = tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
        transformed_dataset, _ = (
            (decoded_examples, raw_metadata)
            | 'WordpieceTokenizeInput' >> tokenize)
        tokenized_examples, _ = transformed_dataset

        # Per-example info -> global combine -> final metrics.
        token_info = (
            tokenized_examples
            | 'CompileTokenInfo' >> beam.ParDo(
                utils.CompileTokenizationInfo()))
        combined_stats = (
            token_info
            | 'CombineStatsForLang' >> beam.CombineGlobally(
                utils.AggregateLang()))
        metric_rows = (
            combined_stats
            | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics()))

        # Encode each row as CSV and write it beneath a header line of the
        # column names. shard_name_template is deliberately empty.
        write_csv = beam.io.WriteToText(
            FLAGS.output_file,
            shard_name_template='',
            header=','.join(columns))
        _ = (
            metric_rows
            | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
            | 'WriteMetrics' >> write_csv)

    return pipeline