def run_metrics():
  """Builds a Beam pipeline that computes wordpiece-vocab metrics for a corpus.

  Reads `tf.train.Example` records from the module-level `data_file`,
  wordpiece-tokenizes the text via TF Transform, aggregates per-language
  statistics, and writes the resulting metrics as a single CSV to
  `metrics_file`.

  Returns:
    The constructed (not yet run) `beam.Pipeline`.
  """
  pipeline = beam.Pipeline()
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Decode the TFRecord input into TF Transform's in-memory dict format.
    decoded_examples = (
        pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(example_converter.decode))

    # Analyze-and-transform wordpiece-tokenizes the text column; only the
    # transformed data is needed here, so the metadata and transform_fn
    # outputs are discarded.
    (tokenized, _), _ = (
        (decoded_examples, raw_metadata)
        | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
            utils.metrics_preprocessing_fn(FLAGS.vocab_file, FLAGS.text_key,
                                           FLAGS.language_code_key)))

    # Aggregate per-language stats, derive the metrics, and emit one
    # header-prefixed, unsharded CSV file.
    csv_coder = tft.coders.CsvCoder(columns, csv_schema)
    _ = (
        tokenized
        | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
        | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
        | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
        | 'EncodeMetrics' >> beam.Map(csv_coder.encode)
        | 'WriteMetrics' >> beam.io.WriteToText(
            metrics_file, shard_name_template='', header=','.join(columns)))
  return pipeline
def setUp(self):
  """Builds the aggregator under test and two English sample records."""
  super(AggregateLangTest, self).setUp()
  self.aggregator = utils.AggregateLang()
  # Two pre-tokenized English examples with per-example token stats.
  first_record = {
      'lang': 'en',
      'count': 1,
      'num_preserved_chars': 13,
      'num_dropped_chars': 2,
      'num_non_unk_wordpieces': 4,
      'preserved_ratio': [13 / 4],
      'dropped_ratio': [2 / 15],
      'wordpieces': collections.Counter(['the', 'app', '##le', 'sauce']),
  }
  second_record = {
      'lang': 'en',
      'count': 1,
      'num_preserved_chars': 11,
      'num_dropped_chars': 0,
      'num_non_unk_wordpieces': 4,
      'preserved_ratio': [11 / 4],
      'dropped_ratio': [0],
      'wordpieces': collections.Counter(['the', 'app', 'st', '##ore']),
  }
  self.sample_input = [first_record, second_record]
def calculate_metrics():
  """Builds a Beam pipeline computing wordpiece stats for a vocab and corpus.

  Reads `tf.train.Example` records from `FLAGS.input_file`, tokenizes the
  text with the wordpiece vocab in `FLAGS.vocab_file` via TF Transform,
  aggregates statistics per language, and writes the metrics as a single
  CSV to `FLAGS.output_file`.

  Returns:
    The constructed (not yet run) `beam.Pipeline`.
  """
  # Feature spec of the raw input dataset.
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))

  # Feature spec used to encode the computed metrics as CSV rows.
  metrics_csv_schema = dataset_schema.from_feature_spec({
      'lang': tf.FixedLenFeature([], tf.string),
      'sample_count': tf.FixedLenFeature([], tf.int64),
      'micro_drop_char_percent': tf.FixedLenFeature([], tf.string),
      'macro_drop_char_percent': tf.FixedLenFeature([], tf.string),
      'micro_compress_ratio': tf.FixedLenFeature([], tf.string),
      'macro_compress_ratio': tf.FixedLenFeature([], tf.string),
      'unweighted_en_wp_overlap_percent': tf.FixedLenFeature([], tf.string),
      'weighted_en_wp_overlap_percent': tf.FixedLenFeature([], tf.string),
  })
  # Output column order for the CSV header and rows.
  csv_columns = [
      'lang', 'sample_count', 'micro_drop_char_percent',
      'macro_drop_char_percent', 'micro_compress_ratio',
      'macro_compress_ratio', 'unweighted_en_wp_overlap_percent',
      'weighted_en_wp_overlap_percent'
  ]

  pipeline = beam.Pipeline()
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    proto_coder = tft.coders.ExampleProtoCoder(input_metadata.schema,
                                               serialized=False)
    csv_coder = tft.coders.CsvCoder(csv_columns, metrics_csv_schema)

    # Decode the TFRecord input into TF Transform's in-memory dict format.
    decoded_examples = (
        pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            FLAGS.input_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(proto_coder.decode))

    # Analyze-and-transform wordpiece-tokenizes the text column; metadata
    # and transform_fn outputs are not needed downstream.
    (tokenized, _), _ = (
        (decoded_examples, input_metadata)
        | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
            utils.metrics_preprocessing_fn(
                FLAGS.vocab_file, FLAGS.text_key, FLAGS.language_code_key)))

    # Aggregate per-language stats, derive the metrics, and emit one
    # header-prefixed, unsharded CSV file.
    _ = (
        tokenized
        | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
        | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
        | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
        | 'EncodeMetrics' >> beam.Map(csv_coder.encode)
        | 'WriteMetrics' >> beam.io.WriteToText(
            FLAGS.output_file, shard_name_template='',
            header=','.join(csv_columns)))
  return pipeline