Example No. 1
0
  def testLargerBatchSize(self):
    """Checks the metrics preprocessing fn over a two-example batch."""
    with tf.Session() as sess:
      with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as vocab_file:
        batch = {
            'label': ['1', '2'],
            'text_a': ['The boy jumped into the air.', 'The cat sat on a hat.'],
            'lang': ['en', 'en'],
        }
        # Wordpieces expected for both sentences, concatenated in batch order.
        expected_wordpieces = [
            'The', '[UNK]', 'jumped', 'in', '##to', 'the', 'air', '.',
            'The', 'cat', 'sat', 'on', 'a', 'h', '##at', '.'
        ]
        # Materialize the test vocab, one token per line, for the fn to load.
        vocab_file.writelines('%s\n' % word for word in self.vocab)
        vocab_file.flush()
        preprocess = utils.metrics_preprocessing_fn(
            vocab_file.name, 'text_a', 'lang')
        fetched = preprocess(batch)
        tf.tables_initializer().run()
        fetched = sess.run(fetched)

        self.assertSequenceAlmostEqual(fetched['lang'], ['en', 'en'])
        self.assertSequenceAlmostEqual(fetched['num_preserved_chars'], [20, 16])
        self.assertSequenceAlmostEqual(fetched['num_dropped_chars'], [3, 0])
        self.assertSequenceAlmostEqual(fetched['wordpieces'].values,
                                       expected_wordpieces)
        self.assertSequenceAlmostEqual(fetched['num_non_unk_wordpieces'],
                                       [7, 8])
Example No. 2
0
  def run_metrics():
    """Builds a Beam pipeline measuring wordpiece vocab metrics over a corpus.

    Returns:
      The constructed (not yet run) beam.Pipeline.
    """
    p = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      # Decode TFRecord input into TF Transform encoded dicts.
      examples = (
          p
          | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
              data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
          | 'DecodeInputData' >> beam.Map(example_converter.decode))

      # Wordpiece-tokenize the text via the metrics preprocessing fn.
      (tokenized, _), _ = (
          (examples, raw_metadata)
          | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
              utils.metrics_preprocessing_fn(FLAGS.vocab_file, FLAGS.text_key,
                                             FLAGS.language_code_key)))

      # Aggregate stats for each lang, compute the metrics, and write them
      # out as a single headered CSV file.
      to_csv = tft.coders.CsvCoder(columns, csv_schema)
      _ = (
          tokenized
          | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
          | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
          | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
          | 'EncodeMetrics' >> beam.Map(to_csv.encode)
          | 'WriteMetrics' >> beam.io.WriteToText(
              metrics_file, shard_name_template='', header=','.join(columns)))
    return p
Example No. 3
0
  def testSingleElement(self):
    """Checks the metrics preprocessing fn on a single raw-data example."""
    with tf.Session() as sess:
      with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as vocab_file:
        # Materialize the test vocab, one token per line, for the fn to load.
        vocab_file.writelines('%s\n' % word for word in self.vocab)
        vocab_file.flush()
        preprocess = utils.metrics_preprocessing_fn(
            vocab_file.name, 'text_a', 'lang')
        fetched = preprocess(self.raw_data)
        tf.tables_initializer().run()
        fetched = sess.run(fetched)

        self.assertEqual(fetched['lang'], 'en')
        self.assertEqual(fetched['num_non_unk_wordpieces'], 7)
        self.assertEqual(fetched['num_preserved_chars'], 20)
        self.assertEqual(fetched['num_dropped_chars'], 3)
        self.assertSequenceAlmostEqual(fetched['wordpieces'].values,
                                       self.expected_wordpieces)
Example No. 4
0
def calculate_metrics():
    """Builds a pipeline computing wordpiece model stats for a vocab + corpus.

    Reads tf.train.Examples from FLAGS.input_file, wordpiece-tokenizes the
    text using FLAGS.vocab_file, aggregates statistics for each language,
    and writes the resulting metrics to FLAGS.output_file as headered CSV.

    Returns:
      The constructed (not yet run) beam.Pipeline.
    """
    # Feature spec describing the raw input dataset.
    raw_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'text': tf.FixedLenFeature([], tf.string),
            'language_code': tf.FixedLenFeature([], tf.string),
        }))

    # Output columns, in the order they appear in the CSV.
    columns = [
        'lang', 'sample_count', 'micro_drop_char_percent',
        'macro_drop_char_percent', 'micro_compress_ratio',
        'macro_compress_ratio', 'unweighted_en_wp_overlap_percent',
        'weighted_en_wp_overlap_percent'
    ]

    # Feature spec used to format the metrics rows as CSV.
    csv_schema = dataset_schema.from_feature_spec({
        'lang': tf.FixedLenFeature([], tf.string),
        'sample_count': tf.FixedLenFeature([], tf.int64),
        'micro_drop_char_percent': tf.FixedLenFeature([], tf.string),
        'macro_drop_char_percent': tf.FixedLenFeature([], tf.string),
        'micro_compress_ratio': tf.FixedLenFeature([], tf.string),
        'macro_compress_ratio': tf.FixedLenFeature([], tf.string),
        'unweighted_en_wp_overlap_percent': tf.FixedLenFeature([], tf.string),
        'weighted_en_wp_overlap_percent': tf.FixedLenFeature([], tf.string),
    })

    pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        example_converter = tft.coders.ExampleProtoCoder(raw_metadata.schema,
                                                         serialized=False)
        csv_converter = tft.coders.CsvCoder(columns, csv_schema)

        # Decode TFRecord input into TF Transform encoded dicts.
        raw_data = (
            pipeline
            | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                FLAGS.input_file,
                coder=beam.coders.ProtoCoder(tf.train.Example))
            | 'DecodeInputData' >> beam.Map(example_converter.decode))

        # Wordpiece-tokenize the text via the metrics preprocessing fn.
        (transformed_data, _), _ = (
            (raw_data, raw_metadata)
            | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
                utils.metrics_preprocessing_fn(
                    FLAGS.vocab_file, FLAGS.text_key,
                    FLAGS.language_code_key)))

        # Aggregate stats for each lang, compute the metrics, and write them
        # out as a single headered CSV file.
        _ = (
            transformed_data
            | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
            | 'CombineStatsForLang' >> beam.CombineGlobally(
                utils.AggregateLang())
            | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
            | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
            | 'WriteMetrics' >> beam.io.WriteToText(
                FLAGS.output_file, shard_name_template='',
                header=','.join(columns)))

    return pipeline