def word_count(input_path, output_path, raw_metadata, min_token_frequency=2):
    """Builds a Beam pipeline that counts words and writes sorted counts.

    Args:
      input_path: recordio file to read.
      output_path: path in which to write the output.
      raw_metadata: metadata of input tf.Examples.
      min_token_frequency: the min frequency for a token to be included.

    Returns:
      The constructed (not yet run) beam.Pipeline.
    """
    allowed_langs = set(FLAGS.lang_set.split(','))

    pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        example_decoder = tft.coders.ExampleProtoCoder(
            raw_metadata.schema, serialized=False)

        # Read serialized tf.Examples and decode them into feature dicts.
        decoded_examples = (
            pipeline
            | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                input_path, coder=beam.coders.ProtoCoder(tf.train.Example))
            | 'DecodeInputData' >> beam.Map(example_decoder.decode))

        # Run the tf.Transform analyze-and-transform step that extracts
        # (token, language) records from the raw examples.
        (transformed, _), _ = (
            (decoded_examples, raw_metadata)
            |
            'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
                utils.count_preprocessing_fn(FLAGS.text_key,
                                             FLAGS.language_code_key)))

        # Keep only tokens belonging to the configured language set.
        lang_filtered_tokens = (
            transformed
            | 'FilterByLang' >> beam.ParDo(
                utils.FilterTokensByLang(allowed_langs)))

        # Compute the smoothing coefficients, consumed below as a side input.
        smoothing_coeffs = (
            lang_filtered_tokens
            | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
                utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

        # Smooth, aggregate, filter, sort, then write one unsharded text file
        # of tab-separated (token, count) lines.
        _ = (lang_filtered_tokens
             | 'ApplyExponentialSmoothing' >> beam.ParDo(
                 utils.ExponentialSmoothing(),
                 beam.pvalue.AsSingleton(smoothing_coeffs))
             | 'SumCounts' >> beam.CombinePerKey(sum)
             | 'FilterLowCounts' >> beam.ParDo(
                 utils.FilterByCount(FLAGS.max_word_length,
                                     min_token_frequency))
             | 'MergeAndSortCounts' >> beam.CombineGlobally(
                 utils.SortByCount())
             | 'Flatten' >> beam.FlatMap(lambda x: x)
             | 'FormatCounts' >> beam.Map(
                 lambda tc: '%s\t%s' % (tc[0], tc[1]))
             | 'WriteSortedCount' >> beam.io.WriteToText(
                 output_path, shard_name_template=''))

    return pipeline
# --- Example #2 (separator from the original paste; kept as a comment) ---
    def run_vocab():
        """Builds a pipeline that learns a wordpiece vocab over a corpus.

        Returns:
          The constructed (not yet run) beam.Pipeline.
        """
        vocab_pipeline = beam.Pipeline()

        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Read serialized tf.Examples and decode into feature dicts.
            decoded = (
                vocab_pipeline
                | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                    data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
                | 'DecodeInputData' >> beam.Map(example_converter.decode))

            # tf.Transform analyze-and-transform extracts token records.
            (transformed, _), _ = (
                (decoded, raw_metadata)
                | 'FilterLangAndExtractToken' >>
                tft_beam.AnalyzeAndTransformDataset(
                    utils.count_preprocessing_fn(
                        FLAGS.text_key, FLAGS.language_code_key)))

            # Restrict to the configured languages.
            lang_tokens = (
                transformed
                | 'FilterByLang' >> beam.ParDo(
                    utils.FilterTokensByLang(lang_set)))

            # Smoothing coefficients, consumed below as a singleton side input.
            smoothing = (
                lang_tokens
                | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
                    utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

            # Smooth, count, filter, sort, learn the vocab, and write it as a
            # single newline-delimited text file.
            _ = (lang_tokens
                 | 'ApplyExponentialSmoothing' >> beam.ParDo(
                     utils.ExponentialSmoothing(),
                     beam.pvalue.AsSingleton(smoothing))
                 | 'SumCounts' >> beam.CombinePerKey(sum)
                 | 'FilterLowCounts' >> beam.ParDo(
                     utils.FilterByCount(FLAGS.max_word_length,
                                         min_token_frequency))
                 | 'MergeAndSortCounts' >> beam.CombineGlobally(
                     utils.SortByCount())
                 | 'LearnVocab' >> beam.ParDo(utils.LearnVocab(params))
                 | 'Flatten' >> beam.FlatMap(lambda x: x + '\n')
                 | 'WriteVocab' >> beam.io.WriteToText(
                     vocab_file,
                     shard_name_template='',
                     append_trailing_newlines=False))
        return vocab_pipeline
    def testUseGivenLang(self):
        """Verifies the explicit language code wins over the text's language."""
        preprocessing_fn = utils.count_preprocessing_fn(
            'text', 'language_code')
        expected_tokens = [
            'Let', '\'', 's', 'make', 'this', 'Chinese', 'even', 'though',
            'it', '\'', 's', 'English', '.'
        ]
        with tf.Session() as sess:
            # Build the graph outputs, then materialize them in one run call.
            fetched = sess.run(preprocessing_fn(self.raw_data))
            self.assertEqual(fetched['lang'], 'zh')
            self.assertSequenceAlmostEqual(fetched['tokens'].values,
                                           expected_tokens)