def test_remove(self):
        """Check that removing vocabulary entries leaves a surviving entry intact.

        One entry (the one whose id is ``len(doc.vocab) - 1``) is recorded
        before any removal.  Two words are then removed: a fixed surface
        and a randomly chosen one.  The recorded entry must still be
        reachable by its surface with the same frequency and the same
        number of archive occurrences, and both removed surfaces must
        resolve to ``-1`` via ``get_id``.
        """
        import random

        d = self.get_documents()
        doc = Document.load_docs(d)
        confirmation_id = len(doc.vocab) - 1

        def get_info(target_id):
            """Return (vocab entry, frequency, archive occurrence count) for an id."""
            v = doc.vocab[target_id]
            f = doc.frequency[target_id]
            # Each archive is a sequence of (vocab id, count) pairs, so the
            # archives must be flattened one level before comparing ids.
            # Comparing a whole pair against the id never matches and makes
            # the count always 0.
            c = sum(
                1
                for archive in doc.archives
                for entry in archive
                if entry[0] == target_id
            )
            return (v, f, c)

        removed = ["食材"]
        old_v = get_info(confirmation_id)
        doc.remove_vocab(removed[0])

        # The last two keys are excluded from the random pick; presumably this
        # keeps the tracked entry from being removed -- TODO confirm.
        target = random.choice(list(doc.vocab.keys())[:-2])
        removed.append(doc.vocab[target].surface)
        doc.remove_vocab(target)

        # Ids may have changed after the removals, so look the entry up again
        # by its surface.
        new_id = doc.get_id(old_v[0].surface)
        new_v = get_info(new_id)

        self.assertEqual(new_v[0].surface, old_v[0].surface)
        self.assertEqual(new_v[1], old_v[1])
        self.assertEqual(new_v[2], old_v[2])

        # Removed surfaces are expected to be reported as unknown (-1).
        for r in removed:
            self.assertEqual(-1, doc.get_id(r))
    def test_cut_tail(self):
        """Check the sizes of vocab and frequency after ``cut_under``.

        The number of entries whose frequency is above the threshold is
        counted before pruning; afterwards both ``doc.vocab`` and
        ``doc.frequency`` must contain exactly that many keys.
        """
        documents = self.get_documents()
        doc = Document.load_docs(documents)
        threshold = 1

        # Count the expected survivors before the document is mutated.
        expected = sum(1 for entry in doc.frequency.items() if entry[1] > threshold)

        doc.cut_under(threshold)

        for table in (doc.vocab, doc.frequency):
            self.assertEqual(expected, len(table.keys()))
 def get_doc_en(self):
     """Build and return a Document loaded from six fixed English sentences."""
     sentences = [
         "I read the news today oh boy About a lucky man who made the grade",
         "I saw a film today oh boy The English Army had just won the war",
         "It’s been a hard days night, and I been working like a dog",
         "It’s been a hard days night, I should be sleeping like a dog",
         "You say you want a revolution",
         "You tell me that it's evolution",
     ]
     return Document.load_docs(sentences, lang="en")
    def test_split(self):
        """Check archive counts and vocabulary sizes after ``Document.split``.

        With ``right_rate_or_size=1`` the right part must hold exactly one
        archive and the left part the rest.  The left vocabulary must be
        smaller than the original, and the right smaller than the left.
        """
        source = self.get_documents()
        doc = Document.load_docs(source)

        held_out = 1
        left, right = doc.split(right_rate_or_size=held_out)

        # Archive counts of the two halves.
        self.assertEqual(len(source) - held_out, len(left.archives))
        self.assertEqual(held_out, len(right.archives))

        # Vocabulary sizes: original > left > right.
        self.assertLess(len(left.vocab), len(doc.vocab))
        self.assertLess(len(right.vocab), len(left.vocab))
    def test_document(self):
        """Check that a loaded Document's archives and frequencies agree.

        The number of archives must equal the number of input documents,
        and for every vocab index the value in ``doc.frequency`` must equal
        the total recomputed by summing the counts stored in the archives.
        Finally the document is pruned with ``cut_under(1)`` and the
        remaining words are printed by descending frequency.
        """
        d = self.get_documents()
        doc = Document.load_docs(d)
        self.assertEqual(len(d), len(doc.archives))

        # Recompute per-id totals from the archives, whose entries are read
        # as (vocab id, count) pairs.
        totals = Counter()
        for archive in doc.archives:
            for entry in archive:
                totals[entry[0]] += entry[1]

        # Index the Counter directly: a constant-time lookup, and an id that
        # is missing from the archives yields 0, so a mismatch shows up as a
        # normal assertion failure rather than an IndexError.
        for i, _ in enumerate(doc.vocab):
            self.assertEqual(doc.frequency[i], totals[i])

        # Diagnostic output only: nothing is asserted after pruning.
        doc.cut_under(1)
        for k, v in sorted(doc.frequency.items(), reverse=True, key=lambda f: f[1]):
            print(doc.vocab[k].surface, v)