def _split(self, text, removed_words=()):
        """Break unicode text into a tuple of unicode terms.

        The words are obtained from ``text2words(text)``. Words shorter
        than two characters and words contained in ``removed_words`` are
        left out; the remaining words keep the order in which
        ``text2words`` produced them.

        @param text: unicode text to split
        @param removed_words: words to exclude from the split result
        @return: tuple of the words that were kept
        """
        terms = []
        for word in text2words(text):
            # Length is checked first, so too-short words never reach the
            # membership test against removed_words.
            if len(word) <= 1 or word in removed_words:
                continue
            terms.append(word)
        return tuple(terms)
 def test_text2words(self):
     """Check text2words results for plain and tag-wrapped inputs."""
     # Each entry is (input text, expected tuple of words); entries are
     # checked in order and the first mismatch fails the test.
     cases = (
         ('x', ('x',)),
         ('x Y', ('x', 'Y')),
         ('foo bar', ('foo', 'bar')),
         ('<p>foo bar</p>', ('foo', 'bar')),
         ('foo<sub>bar</sub>', ('foo', 'bar')),
         ('<p class="shiny">text</p>', ('text',)),
         ('<p \n>text</\np>', ('text',)),
         ('<br/>  <br />', ()),
     )
     for source, expected in cases:
         self.assertEqual(text2words(source), expected)
Example #3
0
 def test_text2words(self):
     """Verify the word tuples text2words returns for sample inputs."""
     check = self.assertEqual

     # Plain text: single word, mixed case, two words.
     check(text2words('x'), ('x', ))
     check(text2words('x Y'), ('x', 'Y'))
     check(text2words('foo bar'), ('foo', 'bar'))

     # Tags around or between words are expected not to show up as words.
     check(text2words('<p>foo bar</p>'), ('foo', 'bar'))
     check(text2words('foo<sub>bar</sub>'), ('foo', 'bar'))

     # Tags carrying attributes or containing line breaks.
     check(text2words('<p class="shiny">text</p>'), ('text', ))
     check(text2words('<p \n>text</\np>'), ('text', ))

     # Input consisting only of tags and whitespace yields no words.
     check(text2words('<br/>  <br />'), ())
Example #4
0
    def _split(self, text, removed_words=()):
        """Turn unicode text into a tuple of unicode terms.

        Candidate words come from ``text2words(text)``; a word is kept
        only when it has at least two characters and does not occur in
        ``removed_words``.

        @param text: unicode text to split
        @param removed_words: words to drop from the split result
        @return: tuple of the kept words, in the order text2words gave them
        """
        candidates = text2words(text)
        kept = (
            term for term in candidates
            if len(term) > 1 and term not in removed_words
        )
        return tuple(kept)