# Beispiel #1
# 0
def test_ngram():
    """Exercise a full fit / serialize / restore / transform cycle of
    TextToNgramVector and check the mapped output is still a tf Dataset."""
    corpus = [
        'The cat sat on the mat.', 'The dog sat on the log.',
        'Dogs and cats living together.'
    ]
    vectorizer = preprocessor.TextToNgramVector()
    dataset = tf.data.Dataset.from_tensor_slices(corpus)
    vectorizer.set_hp(kerastuner.HyperParameters())

    # Fit the vectorizer one sample at a time, then finalize the vocabulary.
    for sample in dataset:
        vectorizer.update(sample)
    vectorizer.finalize()

    # Round-trip the config and weights to verify serialization keeps state.
    vectorizer.set_config(vectorizer.get_config())
    saved_weights = vectorizer.get_weights()
    vectorizer.clear_weights()
    vectorizer.set_weights(saved_weights)

    # Transform should run cleanly on every element after the restore.
    for sample in dataset:
        vectorizer.transform(sample)

    def map_func(x):
        return tf.py_function(vectorizer.transform, inp=[x], Tout=(tf.float64, ))

    new_dataset = dataset.map(map_func)
    for _ in new_dataset:
        pass
    assert isinstance(new_dataset, tf.data.Dataset)
def test_ngram():
    """run_preprocessor with TextToNgramVector should yield a tf Dataset."""
    corpus = ['The cat sat on the mat.',
              'The dog sat on the log.',
              'Dogs and cats living together.']
    source = tf.data.Dataset.from_tensor_slices(corpus)
    result = run_preprocessor(preprocessor.TextToNgramVector(),
                              source,
                              common.generate_data(dtype='dataset'),
                              tf.float32)
    assert isinstance(result, tf.data.Dataset)
# Beispiel #3
# 0
 def build(self, hp, inputs=None):
     """Build the text sub-graph, dispatching on the chosen vectorizer.

     Either uses the fixed `self.vectorizer` or lets the tuner pick via
     `hp.Choice`. The 'ngram' path goes bag-of-ngrams -> dense; the
     'sequence' path goes int-sequence -> embedding -> separable conv.
     Raises ValueError when the input is not a TextNode.
     """
     input_node = nest.flatten(inputs)[0]
     output_node = input_node
     # NOTE: hp.Choice is evaluated before the type guard, matching the
     # original registration order of the hyperparameter.
     chosen = self.vectorizer or hp.Choice(
         'vectorizer', ['sequence', 'ngram'], default='sequence')
     if not isinstance(input_node, node.TextNode):
         raise ValueError('The input_node should be a TextNode.')
     if chosen == 'ngram':
         # Bag-of-ngrams representation feeds dense layers directly.
         output_node = block.DenseBlock()(
             preprocessor.TextToNgramVector()(output_node))
     else:
         # Token-sequence representation: embed, then convolve.
         output_node = preprocessor.TextToIntSequence()(output_node)
         output_node = block.EmbeddingBlock(
             pretraining=self.pretraining)(output_node)
         output_node = block.ConvBlock(separable=True)(output_node)
     return output_node
def test_ngram_result():
    """TextToNgramVector should match sklearn's TfidfVectorizer: same
    vocabulary and (near-)equal tf-idf values for the first document."""
    corpus = ['The cat sat on the mat.',
              'The dog sat on the log.',
              'Dogs and cats living together.']
    reference = TfidfVectorizer(ngram_range=(1, 2))
    reference.fit(corpus)
    expected = reference.transform([corpus[0]]).toarray()[0]

    dataset = tf.data.Dataset.from_tensor_slices(corpus)
    vectorizer = preprocessor_module.TextToNgramVector(ngram_range=(1, 2))
    vectorizer.build(kerastuner.HyperParameters())
    for text in dataset:
        vectorizer.update([text])
    vectorizer.finalize()
    actual = [vectorizer.transform([text]) for text in dataset]

    # Vocabularies must align term-for-term with sklearn's.
    assert len(vectorizer.vocabulary) == len(reference.vocabulary_)
    for term in vectorizer.vocabulary:
        assert term in reference.vocabulary_
        # Tolerate small floating-point drift between implementations.
        assert (expected[reference.vocabulary_[term]] -
                actual[0][vectorizer.vocabulary[term]] <= 0.001)