def test_remove(self):
    """Check that removing vocabulary entries leaves the others intact.

    Two entries are removed: the surface "食材" and one randomly chosen id.
    The entry stored at id ``len(doc.vocab) - 1`` before the removals is
    then looked up again by its surface; its surface, frequency and number
    of archive occurrences must be unchanged. Every removed surface must
    resolve to -1 through ``doc.get_id``.
    """
    import random

    d = self.get_documents()
    doc = Document.load_docs(d)
    # Entry that is tracked across the removals.
    confirmation_id = len(doc.vocab) - 1

    def get_info(target_id):
        """Return (vocab entry, frequency, archive occurrence count)."""
        v = doc.vocab[target_id]
        f = doc.frequency[target_id]
        # Archives are sequences of entries whose first element is a vocab
        # id, so they must be flattened before ids can be compared.
        # Comparing an archive's first entry with the id never matches and
        # would make the count assertion below vacuous.
        c = sum(
            1
            for archive in doc.archives
            for entry in archive
            if entry[0] == target_id
        )
        return (v, f, c)

    removed = ["食材"]
    old_v = get_info(confirmation_id)

    # First removal is addressed by surface.
    doc.remove_vocab(removed[0])

    # Second removal is addressed by id. The last two keys are left out of
    # the candidates -- presumably so the tracked entry is never picked;
    # NOTE(review): confirm against Document.remove_vocab's id handling.
    candidates = list(doc.vocab.keys())[:-2]
    target = random.choice(candidates)
    removed.append(doc.vocab[target].surface)
    doc.remove_vocab(target)

    # The tracked entry may have a different id now, so find it by surface.
    new_id = doc.get_id(old_v[0].surface)
    new_v = get_info(new_id)
    self.assertEqual(new_v[0].surface, old_v[0].surface)
    self.assertEqual(new_v[1], old_v[1])
    self.assertEqual(new_v[2], old_v[2])

    for r in removed:
        self.assertEqual(-1, doc.get_id(r))
def test_cut_tail(self):
    """Check the vocab and frequency sizes after ``cut_under``.

    The number of entries whose frequency is above the threshold is counted
    before the cut; afterwards both ``doc.vocab`` and ``doc.frequency``
    must hold exactly that many keys.
    """
    documents = self.get_documents()
    doc = Document.load_docs(documents)
    threshold = 1

    # Count the survivors before the document is modified.
    expected_size = 0
    for item in doc.frequency.items():
        if item[1] > threshold:
            expected_size += 1

    doc.cut_under(threshold)

    self.assertEqual(expected_size, len(doc.vocab.keys()))
    self.assertEqual(expected_size, len(doc.frequency.keys()))
def get_doc_en(self):
    """Return a Document loaded from six fixed English sentences."""
    sentences = [
        "I read the news today oh boy About a lucky man who made the grade",
        "I saw a film today oh boy The English Army had just won the war",
        "It’s been a hard days night, and I been working like a dog",
        "It’s been a hard days night, I should be sleeping like a dog",
        "You say you want a revolution",
        "You tell me that it's evolution",
    ]
    return Document.load_docs(sentences, lang="en")
def test_split(self):
    """Check archive and vocabulary sizes of the two halves of a split.

    With a right-hand size of 1, the left part must hold all remaining
    archives, the right part exactly one, and the vocabulary sizes must be
    ordered right < left < original.
    """
    documents = self.get_documents()
    doc = Document.load_docs(documents)
    right_size = 1

    left, right = doc.split(right_rate_or_size=right_size)

    # Archive counts of both parts.
    expected_left = len(documents) - right_size
    self.assertEqual(expected_left, len(left.archives))
    self.assertEqual(right_size, len(right.archives))

    # Vocabulary sizes must shrink strictly with each smaller part.
    self.assertLess(len(left.vocab), len(doc.vocab))
    self.assertLess(len(right.vocab), len(left.vocab))
def test_document(self):
    """Check archive count and per-id frequencies of a loaded Document.

    The frequency of every vocab id is recomputed from ``doc.archives`` and
    compared with ``doc.frequency``. Afterwards ``cut_under(1)`` is applied
    and the remaining surfaces are printed with their frequencies, highest
    first.
    """
    d = self.get_documents()
    doc = Document.load_docs(d)
    self.assertEqual(len(d), len(doc.archives))

    # Each archive entry carries a vocab id at [0] and a count at [1].
    c = Counter()
    for a in doc.archives:
        for v in a:
            c[v[0]] += v[1]

    # Read the Counter directly instead of scanning most_common() once per
    # id. An id absent from every archive counts as 0 and fails the
    # assertion with a readable message, not an IndexError.
    for i, _ in enumerate(doc.vocab):
        self.assertEqual(doc.frequency[i], c[i])

    doc.cut_under(1)
    for k, v in sorted(doc.frequency.items(), key=lambda f: f[1], reverse=True):
        print(doc.vocab[k].surface, v)