def testNGramsBagOfWordsEmpty(self):
    """Checks `ngrams` and `bag_of_words` on an empty string tensor.

    Both mappers are applied to the tokenization of an empty 1-D string
    tensor with an n-gram range of (1, 2) and an empty separator. Each result
    is expected to have no values and a dense shape of [0, 0].
    """
    # Build the ops in a dedicated graph: `.eval()` needs graph-mode tensors
    # and a default session, so the test must not rely on the ambient
    # execution mode. This also keeps the ops out of any shared default graph.
    with tf.compat.v1.Graph().as_default():
        string_tensor = tf.constant([], dtype=tf.string)
        tokenized_tensor = tf.compat.v1.string_split(
            string_tensor, delimiter='')
        ngrams = mappers.ngrams(tokenized_tensor, (1, 2), separator='')
        bow = mappers.bag_of_words(tokenized_tensor, (1, 2), separator='')
        with tf.compat.v1.Session():
            ngrams_output = ngrams.eval()
            bow_output = bow.eval()
            self.assertAllEqual(ngrams_output.values, [])
            self.assertAllEqual(bow_output.values, [])
            self.assertAllEqual(ngrams_output.dense_shape, [0, 0])
            self.assertAllEqual(bow_output.dense_shape, [0, 0])
def testBagOfWords(self, strings, expected_output_indices,
                   expected_output_values, ngram_range=(1, 1), separator=' '):
    """Checks the indices and values produced by `mappers.bag_of_words`.

    Args:
        strings: Input strings; converted to a `tf.string` constant.
        expected_output_indices: Expected `indices` of the evaluated output.
        expected_output_values: Expected `values` of the evaluated output.
        ngram_range: N-gram range forwarded to `mappers.bag_of_words`.
        separator: Delimiter used to split `strings` into tokens; also
            forwarded to `mappers.bag_of_words` as its separator.
    """
    # Build the ops in a dedicated graph: `.eval()` needs graph-mode tensors
    # and a default session, so the test must not rely on the ambient
    # execution mode. This also keeps the ops out of any shared default graph.
    with tf.compat.v1.Graph().as_default():
        string_tensor = tf.constant(strings, dtype=tf.string)
        tokenized_tensor = tf.compat.v1.string_split(
            string_tensor, delimiter=separator)
        output_tensor = mappers.bag_of_words(
            tokens=tokenized_tensor,
            ngram_range=ngram_range,
            separator=separator)
        with tf.compat.v1.Session():
            output = output_tensor.eval()
            self.assertAllEqual(output.indices, expected_output_indices)
            self.assertAllEqual(output.values, expected_output_values)
def testBagOfWords(self, strings, expected_output_indices,
                   expected_output_values, ngram_range=(1, 1), separator=' '):
    """Verifies `mappers.bag_of_words` against expected sparse output.

    The test is skipped when `sys.platform` is 'darwin'.

    Args:
        strings: Input strings, turned into a `tf.string` constant.
        expected_output_indices: Indices the evaluated result must match.
        expected_output_values: Values the evaluated result must match.
        ngram_range: N-gram range handed to `mappers.bag_of_words`.
        separator: Token delimiter for splitting, also passed through to
            `mappers.bag_of_words`.
    """
    # TODO(b/141750093): Re-enable this test for MacOS.
    running_on_mac = sys.platform == 'darwin'
    if running_on_mac:
        skip_reason = (
            'bag_of_words can produce unexpected results on macOS when there are '
            'empty rows, such as certain words overwritten with an empty string.')
        self.skipTest(skip_reason)

    with tf.compat.v1.Graph().as_default():
        tokens = tf.compat.v1.string_split(
            tf.constant(strings, dtype=tf.string), delimiter=separator)
        bag = mappers.bag_of_words(
            tokens=tokens, ngram_range=ngram_range, separator=separator)
        with tf.compat.v1.Session() as sess:
            # Evaluate against the session opened just above.
            result = bag.eval(session=sess)
            self.assertAllEqual(result.indices, expected_output_indices)
            self.assertAllEqual(result.values, expected_output_values)