class DocumentParser(object):
    """Parse raw text into sentences and words via a language-specific tokenizer."""

    def __init__(self, language):
        # Keep the language for introspection; the tokenizer does the actual work.
        self.language = language
        self._tokenizer = Tokenizer(language)

    def tokenize_sentences(self, text):
        """Split *text* into a sequence of sentences."""
        return self._tokenizer.to_sentences(text)

    def tokenize_words(self, sentence):
        """Split a single *sentence* string into words."""
        return self._tokenizer.to_words(sentence)
def test_magnitude(self):
    """A document with four distinct unit-weight terms has magnitude sqrt(4) = 2."""
    samples = (
        ("english", "i am very happy"),
        ("chinese", "我来到北京清华大学"),
    )
    for language, text in samples:
        model = TfDocumentModel(text, Tokenizer(language))
        self.assertAlmostEqual(model.magnitude, 2.0)
def build_document_from_string(string="", language="english"):
    """Build a Document from text; each non-empty line becomes one paragraph.

    Blank lines are skipped entirely — they do not create empty paragraphs.
    """
    tokenizer = Tokenizer(language)
    paragraphs = []
    for raw_line in string.strip().splitlines():
        stripped = raw_line.lstrip()
        if not stripped:
            continue  # blank separator line, nothing to emit
        sentences = [
            build_sentence(sentence_text, language)
            for sentence_text in tokenizer.to_sentences(stripped)
        ]
        paragraphs.append(Paragraph(sentences))
    return Document(paragraphs)
def test_terms(self):
    """Terms are lower-cased and deduplicated by the model."""
    model = TfDocumentModel("wA wB wC wD wB wD wE", Tokenizer("english"))
    expected = ("wa", "wb", "wc", "wd", "we")
    self.assertEqual(tuple(sorted(model.terms)), expected)
def test_ensure_tokenizer_available(self):
    """English and Chinese tokenizers report their language and split words."""
    cases = (
        ("english", "You are very beautiful.",
         ("You", "are", "very", "beautiful")),
        ("chinese", "我来到北京清华大学",
         ("我", "来到", "北京", "清华大学")),
    )
    for language, sentence, expected_words in cases:
        tokenizer = Tokenizer(language)
        self.assertEqual(language, tokenizer.language)
        self.assertEqual(expected_words, tokenizer.to_words(sentence))
def test_most_frequent_terms(self):
    """most_frequent_terms returns terms ranked by descending frequency."""
    text = "wE wD wC wB wA wE WD wC wB wE wD WE wC wD wE"
    model = TfDocumentModel(text, Tokenizer("english"))
    ranking = ("we", "wd", "wc", "wb", "wa")
    # Every prefix length of the ranking must be returned verbatim.
    for count in range(1, len(ranking) + 1):
        self.assertEqual(model.most_frequent_terms(count), ranking[:count])
    # Without an explicit count, all terms come back, still ranked.
    self.assertEqual(model.most_frequent_terms(), ranking)
def test_ensure_segment_available(self):
    """Sentence segmentation finds exactly two sentences in each language."""
    samples = (
        ("english", "There are two sentences here. Am I right?"),
        ("chinese", "这里貌似有两句话。不知道我说的对不对。"),
    )
    for language, text in samples:
        self.assertEqual(len(Tokenizer(language).to_sentences(text)), 2)
def test_term_frequency(self):
    """Raw per-term counts; unknown terms are zero; both normalization modes."""
    model = TfDocumentModel("wA wB wC wA wA wC wD wCwB", Tokenizer("english"))
    expected_counts = {
        "wa": 3,
        "wb": 1,
        "wc": 2,
        "wd": 1,
        "wcwb": 1,
        "we": 0,       # never appears
        "missing": 0,  # never appears
    }
    for term, count in expected_counts.items():
        self.assertEqual(model.term_frequency(term), count)
    self.assertAlmostEqual(model.term_frequency_normalized("missing"), 0)
    # "max" normalization divides by the top term's count (wa: 3/3).
    self.assertAlmostEqual(model.term_frequency_normalized("wa", "max"), 1.0)
    # Default normalization divides by the total token count (8).
    self.assertAlmostEqual(model.term_frequency_normalized("wa"), 3 / 8)
def test_sentence_equal(self):
    """Sentence equality is determined by the sentence text."""
    def make(text):
        # Fresh tokenizer per sentence, mirroring real construction.
        return Sentence(text, Tokenizer("english"))

    self.assertEqual(make(""), make(""))
    self.assertEqual(make("another example"), make("another example"))
    self.assertNotEqual(make("example"), make("another"))
def __init__(self, language):
    """Remember the language and build its tokenizer."""
    self.language = language
    self._tokenizer = Tokenizer(language)
def build_sentence(sentence_as_string="", language="english"):
    """Create a Sentence from raw text using the tokenizer for *language*."""
    return Sentence(sentence_as_string, Tokenizer(language))
def test_most_frequent_terms_negative_count(self):
    """A negative requested count is rejected with ValueError."""
    model = TfDocumentModel("text", Tokenizer("english"))
    with self.assertRaises(ValueError):
        model.most_frequent_terms(-1)
def test_most_frequent_terms_empty(self):
    """An empty document yields an empty tuple for any requested count."""
    model = TfDocumentModel("", Tokenizer("english"))
    self.assertEqual((), model.most_frequent_terms())
    self.assertEqual((), model.most_frequent_terms(10))