Esempio n. 1
0
    def test_tokenize(self):
        # Training corpus: every text fragment of the first ten speeches,
        # passed through strip_tags and joined one fragment per line.
        training_speeches = self.speeches[0:10]
        fragment_lists = [speech['text'] for speech in training_speeches]
        all_fragments = itertools.chain(*fragment_lists)
        train = "\n".join(strip_tags(fragment) for fragment in all_fragments)

        # Dump the corpus so it can be inspected in the test output.
        print(train)
        tokenizer = PunktSentenceTokenizer(train)

        # Run the freshly trained tokenizer over the first fragment of the
        # first speech.
        # NOTE(review): this result is never asserted on; only the
        # tokenize_sents() output below is checked -- confirm whether the
        # trained tokenizer was meant to be verified as well.
        trained_sents = tokenizer.tokenize(
            strip_tags(self.speeches[0]['text'][0])
        )

        # The same fragment, split by the project's tokenize_sents helper.
        sents = tokenize_sents(strip_tags(self.speeches[0]['text'][0]))

        sentence_count = len(sents)
        self.assertEqual(sentence_count, 3)
Esempio n. 2
0
 def xtest_strip_tags(self):
     # Checks that strip_tags turns the first text fragment of the first
     # speech into the exact plain-text string below.
     # NOTE(review): the "x" prefix presumably keeps this out of default
     # test discovery -- confirm it is disabled on purpose.
     first_fragment = self.speeches[0]['text'][0]
     actual = strip_tags(first_fragment)
     expected = "I do not have that figure to hand, but I am happy to let the hon. Gentleman have it after the debate. Of course, we have a structured system that ensures that the commission has the overall supervision of complaints, which I will come to, and that it deals directly with the most serious complaints. That is as it should be."
     self.assertEqual(actual, expected)