import tempfile

from absl import flags
import apache_beam as beam
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam

# `utils` is the companion module defining the DoFns/CombineFns used below
# (FilterTokensByLang, CalculateCoefficients, etc.).
from tensorflow_text.tools.wordpiece_vocab import utils

FLAGS = flags.FLAGS


def word_count(input_path, output_path, raw_metadata, min_token_frequency=2):
  """Returns a pipeline counting words and writing the output.

  Args:
    input_path: TFRecord file of tf.Examples to read
    output_path: path in which to write the output
    raw_metadata: metadata of input tf.Examples
    min_token_frequency: the min frequency for a token to be included
  """
  lang_set = set(FLAGS.lang_set.split(','))

  # Create pipeline.
  pipeline = beam.Pipeline()

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    converter = tft.coders.ExampleProtoCoder(
        raw_metadata.schema, serialized=False)

    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            input_path, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(converter.decode))

    # Apply TF Transform.
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
            utils.count_preprocessing_fn(FLAGS.text_key,
                                         FLAGS.language_code_key)))

    # Filter by languages.
    tokens = (
        transformed_data
        | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))

    # Calculate smoothing coefficients.
    coeffs = (
        tokens
        | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
            utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

    # Apply smoothing, aggregate counts, and sort words by count.
    _ = (
        tokens
        | 'ApplyExponentialSmoothing' >> beam.ParDo(
            utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
        | 'SumCounts' >> beam.CombinePerKey(sum)
        | 'FilterLowCounts' >> beam.ParDo(
            utils.FilterByCount(FLAGS.max_word_length, min_token_frequency))
        | 'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
        | 'Flatten' >> beam.FlatMap(lambda x: x)
        | 'FormatCounts' >> beam.Map(lambda tc: '%s\t%s' % (tc[0], tc[1]))
        | 'WriteSortedCount' >> beam.io.WriteToText(
            output_path, shard_name_template=''))

  return pipeline
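

# A minimal usage sketch, not part of the original module: it assumes the
# input schema was written with tensorflow_transform's metadata_io, and the
# paths below are illustrative placeholders.
def run_word_count_example():
  from tensorflow_transform.tf_metadata import metadata_io
  raw_metadata = metadata_io.read_metadata('/tmp/input_metadata')
  pipeline = word_count('/tmp/examples.tfrecord', '/tmp/word_counts.txt',
                        raw_metadata)
  pipeline.run().wait_until_finish()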


def testNotEqual(self):
  with TestPipeline() as p:
    sample_input = [('I', 'en'), ('kind', 'en'), ('of', 'en'), ('like', 'en'),
                    ('to', 'en'), ('eat', 'en'), ('pie', 'en'), ('!', 'en'),
                    ('Je', 'fr'), ('suis', 'fr'), ('une', 'fr'),
                    ('fille', 'fr'), ('.', 'fr')]
    tokens = p | beam.Create(sample_input)
    result = (tokens
              | beam.CombineGlobally(utils.CalculateCoefficients(0.5))
              | beam.ParDo(CompareValues()))
    assert_that(result, equal_to([True]))
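

# `CompareValues` is referenced above but not defined in this excerpt; the
# test also relies on Beam's testing helpers, imported here so the sketch is
# self-contained. This is a minimal sketch consistent with the assertion: the
# unequal 'en'/'fr' token counts above should yield unequal smoothing
# coefficients, so the pipeline emits [True].
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


class CompareValues(beam.DoFn):
  """Yields True when the per-language coefficients differ."""

  def process(self, coeffs):
    yield coeffs['en'] != coeffs['fr']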


def run_vocab():
  """Creates a pipeline to generate wordpiece vocab over a corpus."""
  vocab_pipeline = beam.Pipeline()

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        vocab_pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(example_converter.decode))

    # Apply TF Transform.
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
            utils.count_preprocessing_fn(FLAGS.text_key,
                                         FLAGS.language_code_key)))

    # Filter by languages.
    tokens = (
        transformed_data
        | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))

    # Calculate smoothing coefficients.
    coeffs = (
        tokens
        | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
            utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

    # Apply smoothing, aggregate counts, sort words by count, and learn the
    # wordpiece vocab from the sorted counts.
    _ = (
        tokens
        | 'ApplyExponentialSmoothing' >> beam.ParDo(
            utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
        | 'SumCounts' >> beam.CombinePerKey(sum)
        | 'FilterLowCounts' >> beam.ParDo(
            utils.FilterByCount(FLAGS.max_word_length, min_token_frequency))
        | 'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
        | 'LearnVocab' >> beam.ParDo(utils.LearnVocab(params))
        | 'Flatten' >> beam.FlatMap(lambda x: x + '\n')
        | 'WriteVocab' >> beam.io.WriteToText(
            vocab_file, shard_name_template='',
            append_trailing_newlines=False))

  return vocab_pipeline
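

# A minimal driver sketch, assuming the module-level names run_vocab reads
# (data_file, raw_metadata, example_converter, lang_set, min_token_frequency,
# params, vocab_file) are initialized elsewhere, e.g. from flags in main().
def run_vocab_example():
  # example_converter would typically be built from the same raw_metadata,
  # mirroring word_count above:
  #   example_converter = tft.coders.ExampleProtoCoder(
  #       raw_metadata.schema, serialized=False)
  vocab_pipeline = run_vocab()
  vocab_pipeline.run().wait_until_finish()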


def testEqual(self):
  with TestPipeline() as p:
    tokens = p | beam.Create(self.sample_input)
    result = tokens | beam.CombineGlobally(utils.CalculateCoefficients(0.5))
    assert_that(result, equal_to([{'en': 1.0, 'fr': 1.0}]))
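

# `self.sample_input` is not defined in this excerpt. For the expected output
# {'en': 1.0, 'fr': 1.0}, the two languages would need equal token counts; a
# hypothetical setUp consistent with that:
def setUp(self):
  super().setUp()
  self.sample_input = [('Hello', 'en'), ('world', 'en'),
                       ('Bonjour', 'fr'), ('monde', 'fr')]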