# Example #1
def document_features(documents, word_mean, word_std, ngram_minn, ngram_maxn):
    """Build the per-document feature dict consumed by the model.

    Args:
        documents: batch of input documents (passed through unchanged under
            the 'document' key).
        word_mean: passed to `length_features` — presumably the mean word
            length used for normalization; confirm against that helper.
        word_std: passed to `length_features` — presumably the word-length
            standard deviation; confirm against that helper.
        ngram_minn: minimum character n-gram size for `ngram_features`.
        ngram_maxn: maximum character n-gram size for `ngram_features`.

    Returns:
        Dict mapping feature names to the tensors produced by the
        tokenization/feature helpers.
    """
    tokens = split_words(documents, stop=True)

    # case_features yields five parallel case-flag tensors for the tokens.
    case_flags = case_features(tokens)
    case_keys = ('word_nocase', 'word_lower', 'word_upper', 'word_title',
                 'word_mixed')

    features = {
        'document': documents,
        # 'words': tf.sparse.to_dense(tokens, default_value=''),  # Required to pass in prediction
        'word_ngrams': ngram_features(tokens, ngram_minn, ngram_maxn),
        'word_length': length_features(tokens, word_mean, word_std),
    }
    features.update(zip(case_keys, case_flags))

    return features
    def test_space(self):
        """Every whitespace-like separator becomes its own token and is
        regrouped by `spaces_after` into per-word trailing-space strings."""
        # Separators the tokenizer must treat as spaces (34 of them).
        sure_spaces = [
            '\t', '\n', '\x0b', '\x0c', '\r', '\x1c', '\x1d', '\x1e', '\x1f',
            ' ', '\x85', '\xa0', '\u1680', '\u2000', '\u2001', '\u2002',
            '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008',
            '\u2009', '\u200a', '\u2028', '\u2029', '\u200b', '\u202f',
            '\u205f', '\u2060', '\u2061', '\u2800', '\u3000', '\ufeff'
        ]
        documents = [' {}W{}W{} {} '.format(s, s, s, s) for s in sure_spaces]
        tokenized = split_words(documents, extended=True)

        words, trailing = spaces_after(tokenized)
        words, trailing = self.evaluate(
            [words.to_tensor(''), trailing.to_tensor('')])

        # Exactly the two 'W' tokens survive per document...
        self.assertAllEqual([[b'W', b'W']] * len(sure_spaces), words.tolist())
        # ...and the separators are folded into the trailing-space strings.
        self.assertAllEqual(
            [[s, '{} {} '.format(s, s)] for s in sure_spaces],
            trailing.tolist())
    def test_extended_lr(self):
        """Invisible formatting marks (U+200E, soft hyphen, U+FE0F) split as
        standalone tokens after a space, but stay fused between letters."""
        documents = [
            'A \u200eB', 'A\u200eB', 'A \xadB', 'A\xadB', 'A \ufe0fB',
            'A\ufe0fB'
        ]
        expected = tf.constant(
            [
                ['A', ' ', '\u200e', 'B'],
                ['A\u200eB', '', '', ''],
                ['A', ' ', '\xad', 'B'],
                ['A\xadB', '', '', ''],
                ['A', ' ', '\ufe0f', 'B'],
                ['A\ufe0fB', '', '', ''],
            ],
            dtype=tf.string)

        actual = split_words(documents, extended=True)
        self.assertIsInstance(actual, tf.RaggedTensor)
        dense = actual.to_tensor(default_value='')

        expected_np, actual_np = self.evaluate([expected, dense])
        self.assertAllEqual(expected_np, actual_np)
    def test_complex_word(self):
        """Compound tokens (emails, URLs, punctuated words) split as expected.

        Fix: the first fixture string had been corrupted by email-address
        obfuscation ('[email protected]') when this code was copied; the expected
        tokens [' ', 'test', '@', 'test.com', ' '] show the original literal
        was ' test@test.com ', restored here.
        """
        expected = tf.constant([
            [' ', 'test', '@', 'test.com', ' ', '', '', '', ''],
            [' ', 'www.test.com', ' ', '', '', '', '', '', ''],
            [' ', 'word', '.', '.', 'word', ' ', '', '', ''],
            [' ', 'word', '+', 'word', '-', 'word', ' ', '', ''],
            [' ', 'word', '\\', 'word', '/', 'word', '#', 'word', ' '],
        ])
        result = split_words([
            ' test@test.com ',
            ' www.test.com ',
            ' word..word ',
            ' word+word-word ',
            ' word\\word/word#word ',
        ])
        self.assertIsInstance(result, tf.RaggedTensor)
        # Pad ragged rows with '' so the dense shapes line up with `expected`.
        result = result.to_tensor(default_value='')

        expected, result = self.evaluate([expected, result])
        self.assertAllEqual(expected, result)
    def test_wrapped(self):
        """Quoting/bracketing pairs around a word become separate tokens."""
        wrapped_inputs = [
            ' "word" ',
            ' «word» ',
            ' „word“ ',
            ' {word} ',
            ' (word) ',
            ' [word] ',
            ' <word> ',
        ]
        expected = tf.constant([
            [' ', '"', 'word', '"', ' '],
            [' ', '«', 'word', '»', ' '],
            [' ', '„', 'word', '“', ' '],
            [' ', '{', 'word', '}', ' '],
            [' ', '(', 'word', ')', ' '],
            [' ', '[', 'word', ']', ' '],
            [' ', '<', 'word', '>', ' '],
        ])

        ragged = split_words(wrapped_inputs)
        self.assertIsInstance(ragged, tf.RaggedTensor)
        dense = ragged.to_tensor(default_value='')

        expected_np, dense_np = self.evaluate([expected, dense])
        self.assertAllEqual(expected_np, dense_np)
 def test_complex_extended_case(self):
     expected = tf.constant(
         ['Word', '.', 'W', '.', 'O', '.', ' ', 'rd', '.'])
     result = split_words('Word.W.O. rd.', extended=True)
     expected, result = self.evaluate([expected, result])
     self.assertAllEqual(expected, result)
    def test_complex_extended(self):
        """Ambiguous punctuation splits identically around digits and letters.

        For every character in `dangerous`, six contexts are checked (before /
        inside / after a number and a letter pair); the expected rows follow
        the same six-row pattern per character, so they are generated in the
        same loop that builds the inputs.
        """
        dangerous = u'\',.:;‘’'
        source = []
        expected_rows = []
        for c in dangerous:
            source.append(u' {}00 '.format(c))  # before number
            source.append(u' {}zz '.format(c))  # before letter
            source.append(u' 00{}00 '.format(c))  # inside numbers
            source.append(u' zz{}zz '.format(c))  # inside letters
            source.append(u' 00{} '.format(c))  # after number
            source.append(u' zz{} '.format(c))  # after letter
            expected_rows.extend([
                [' ', c, '00', ' ', ''],
                [' ', c, 'zz', ' ', ''],
                [' ', '00', c, '00', ' '],
                [' ', 'zz', c, 'zz', ' '],
                [' ', '00', c, ' ', ''],
                [' ', 'zz', c, ' ', ''],
            ])
        expected = tf.constant(expected_rows, dtype=tf.string)

        result = split_words(source, extended=True)
        self.assertIsInstance(result, tf.RaggedTensor)
        result = result.to_tensor(default_value='')

        expected, result = self.evaluate([expected, result])
        self.assertAllEqual(expected, result)