def test_tokenize(self):
    """Check sentence splitting of the first text entry of the first speech.

    A Punkt tokenizer is trained on the tag-stripped text of the first ten
    speeches and run once as a smoke check; the assertion itself is made
    on the output of ``tokenize_sents``, which must yield three sentences.
    """
    # Each speech['text'] is iterated as a sequence of markup strings
    # (presumably one per paragraph -- confirm against the fixture loader).
    training_entries = itertools.chain.from_iterable(
        speech['text'] for speech in self.speeches[0:10]
    )
    train = "\n".join(strip_tags(entry) for entry in training_entries)

    first_entry = strip_tags(self.speeches[0]['text'][0])

    # Smoke check only: the Punkt result was never asserted on (it used to
    # be assigned to `sents` and immediately overwritten), so just make
    # sure training and tokenizing run without raising.
    tokenizer = PunktSentenceTokenizer(train)
    tokenizer.tokenize(first_entry)

    sents = tokenize_sents(first_entry)
    self.assertEqual(len(sents), 3)
def xtest_strip_tags(self):
    """Compare strip_tags output for the first speech's first text entry
    against the expected plain-text string.

    NOTE(review): the name does not start with ``test``, so default
    unittest discovery will skip it -- presumably disabled on purpose.
    """
    raw_entry = self.speeches[0]['text'][0]
    expected = "I do not have that figure to hand, but I am happy to let the hon. Gentleman have it after the debate. Of course, we have a structured system that ensures that the commission has the overall supervision of complaints, which I will come to, and that it deals directly with the most serious complaints. That is as it should be."
    stripped = strip_tags(raw_entry)
    self.assertEqual(stripped, expected)