def setUp(self):
    """Build two snapshot corpora at different refs and combine them.

    Populates ``corpus1``/``docs1``, ``corpus2``/``docs2`` and the
    combined ``corpus``/``docs`` attributes used by the tests.
    """
    super(TestCorpusCombiner, self).setUp()

    # Both snapshots are read with the same preprocessing settings.
    corpus_options = dict(
        remove_stops=False,
        lower=True,
        split=True,
        min_len=0,
    )

    # 3 documents
    first_project = self.Project(
        ref=u'2aeb2e7c78259833e1218b69f99dab3acd00970c',
        level='file',
        src_path=self.basepath,
    )
    self.corpus1 = SnapshotCorpus(
        repo=self.repo, project=first_project, **corpus_options)
    self.docs1 = list(self.corpus1)

    # 3 old documents + 2 new documents
    second_project = self.Project(
        ref=u'3587d37e7d476ddc7b673c41762dc89c8ca63a6a',
        level='file',
        src_path=self.basepath,
    )
    self.corpus2 = SnapshotCorpus(
        repo=self.repo, project=second_project, **corpus_options)
    self.docs2 = list(self.corpus2)

    combined = CorpusCombiner([self.corpus1, self.corpus2])
    self.corpus = combined
    self.docs = list(combined)
class TestCorpusCombiner(TestGitCorpus):
    """Tests for ``CorpusCombiner`` built over two ``SnapshotCorpus`` objects.

    The expected documents are kept as class-level fixtures and the
    metadata verification lives in one helper, so the individual tests
    do not each carry their own copy of the same data and loop.
    """

    # Expected (words, metadata) pairs for the two snapshot corpora.
    _SNAPSHOT_DOCUMENTS = [
        # corpus1
        ([u'graph', u'minors', u'a', u'survey'], ('dos.txt', u'test_git')),
        ([u'graph', u'minors', u'a', u'survey'], ('mac.txt', u'test_git')),
        ([u'graph', u'minors', u'a', u'survey'], ('unix.txt', u'test_git')),
        # corpus2
        ([u'graph', u'minors', u'a', u'survey'], ('dos.txt', u'test_git')),
        ([u'graph', u'minors', u'a', u'survey'], ('mac.txt', u'test_git')),
        ([u'graph', u'minors', u'a', u'survey'], ('unix.txt', u'test_git')),
        ([u'human', u'machine', u'interface', u'for', u'lab', u'abc',
          u'computer', u'applications'],
         ('a/0.txt', u'test_git')),
        ([u'a', u'survey', u'of', u'user', u'opinion', u'of', u'computer',
          u'system', u'response', u'time'],
         ('a/1.txt', u'test_git')),
    ]

    # Expected (words, metadata) pairs for the file written in
    # test_mallet_corpus.
    _MALLET_DOCUMENTS = [
        # mallet
        ([u'fred', u'flintstone'], ('abc', u'en')),
        ([u'barney', u'rubble'], ('efg', u'en')),
    ]

    def setUp(self):
        """Create both snapshot corpora and the combiner under test."""
        super(TestCorpusCombiner, self).setUp()

        # 3 documents
        p1 = self.Project(ref=u'2aeb2e7c78259833e1218b69f99dab3acd00970c',
                          level='file',
                          src_path=self.basepath)
        self.corpus1 = SnapshotCorpus(repo=self.repo,
                                      project=p1,
                                      remove_stops=False,
                                      lower=True,
                                      split=True,
                                      min_len=0)
        self.docs1 = list(self.corpus1)

        # 3 old documents + 2 new documents
        p2 = self.Project(ref=u'3587d37e7d476ddc7b673c41762dc89c8ca63a6a',
                          level='file',
                          src_path=self.basepath)
        self.corpus2 = SnapshotCorpus(repo=self.repo,
                                      project=p2,
                                      remove_stops=False,
                                      lower=True,
                                      split=True,
                                      min_len=0)
        self.docs2 = list(self.corpus2)

        self.corpus = CorpusCombiner([self.corpus1, self.corpus2])
        self.docs = list(self.corpus)

    def _assert_metadata_docs(self, expected_docs, corpora_count):
        """Enable metadata on the combiner and verify what it yields.

        :param expected_docs: list of ``(words, metadata)`` pairs; every
            yielded document must match one of them.
        :param corpora_count: number of leading entries of
            ``self.corpus.corpora`` whose ``metadata`` flag must be set.
        """
        # Word order is not compared, only the set of words.
        expected = [(set(words), meta) for words, meta in expected_docs]

        self.corpus.metadata = True
        flags = [self.corpus.metadata, self.corpus._metadata]
        for index in range(corpora_count):
            flags.append(self.corpus.corpora[index].metadata)
        self.assertTrue(all(flags))

        for doc, meta in self.corpus:
            self.assertGreater(len(doc), 0)

            # convert the document to text freq since we don't know the
            # term ids ahead of time for testing.
            textdoc = set(unicode(self.corpus.id2word[x[0]]) for x in doc)
            self.assertIn((textdoc, meta), expected)

    def test_length(self):
        """The combined length is 8 and stays the same while iterating."""
        self.assertEqual(len(self.corpus), 8)
        self.assertEqual(len(self.docs), 8)

        expected_len = len(self.corpus)
        for _ in self.corpus:
            self.assertEqual(expected_len, len(self.corpus))

    def test_metadata_docs(self):
        """With metadata on, each yielded pair is an expected document."""
        self._assert_metadata_docs(self._SNAPSHOT_DOCUMENTS, 2)

    def test_mallet_corpus(self):
        """A ``MalletCorpus`` added to the combiner is iterated as well."""
        with open(self.tempfname, 'w') as f:
            f.write('abc en fred flintstone\n')
            f.write('efg en barney rubble\n')

        corpus3 = MalletCorpus(self.tempfname)
        self.assertEqual(len(corpus3), 2)

        self.corpus.add(corpus3)
        self.assertEqual(len(self.corpus), 10)

        self._assert_metadata_docs(
            self._SNAPSHOT_DOCUMENTS + self._MALLET_DOCUMENTS, 3)
class TestCorpusCombiner(TestGitCorpus):
    """Tests for ``CorpusCombiner`` built over two ``SnapshotCorpus`` objects.

    The expected documents are kept as class-level fixtures and the
    metadata verification lives in one helper, so the individual tests
    do not each carry their own copy of the same data and loop.
    """

    # Expected (words, metadata) pairs for the two snapshot corpora.
    _SNAPSHOT_DOCUMENTS = [
        # corpus1
        ([u'graph', u'minors', u'a', u'survey'], ('dos.txt', u'test_git')),
        ([u'graph', u'minors', u'a', u'survey'], ('mac.txt', u'test_git')),
        ([u'graph', u'minors', u'a', u'survey'], ('unix.txt', u'test_git')),
        # corpus2
        ([u'graph', u'minors', u'a', u'survey'], ('dos.txt', u'test_git')),
        ([u'graph', u'minors', u'a', u'survey'], ('mac.txt', u'test_git')),
        ([u'graph', u'minors', u'a', u'survey'], ('unix.txt', u'test_git')),
        ([u'human', u'machine', u'interface', u'for', u'lab', u'abc',
          u'computer', u'applications'],
         ('a/0.txt', u'test_git')),
        ([u'a', u'survey', u'of', u'user', u'opinion', u'of', u'computer',
          u'system', u'response', u'time'],
         ('a/1.txt', u'test_git')),
    ]

    # Expected (words, metadata) pairs for the file written in
    # test_mallet_corpus.
    _MALLET_DOCUMENTS = [
        # mallet
        ([u'fred', u'flintstone'], ('abc', u'en')),
        ([u'barney', u'rubble'], ('efg', u'en')),
    ]

    def setUp(self):
        """Create both snapshot corpora and the combiner under test."""
        super(TestCorpusCombiner, self).setUp()

        # 3 documents
        p1 = self.Project(ref=u'2aeb2e7c78259833e1218b69f99dab3acd00970c',
                          level='file',
                          src_path=self.basepath)
        self.corpus1 = SnapshotCorpus(repo=self.repo,
                                      project=p1,
                                      remove_stops=False,
                                      lower=True,
                                      split=True,
                                      min_len=0)
        self.docs1 = list(self.corpus1)

        # 3 old documents + 2 new documents
        p2 = self.Project(ref=u'3587d37e7d476ddc7b673c41762dc89c8ca63a6a',
                          level='file',
                          src_path=self.basepath)
        self.corpus2 = SnapshotCorpus(repo=self.repo,
                                      project=p2,
                                      remove_stops=False,
                                      lower=True,
                                      split=True,
                                      min_len=0)
        self.docs2 = list(self.corpus2)

        self.corpus = CorpusCombiner([self.corpus1, self.corpus2])
        self.docs = list(self.corpus)

    def _assert_metadata_docs(self, expected_docs, corpora_count):
        """Enable metadata on the combiner and verify what it yields.

        :param expected_docs: list of ``(words, metadata)`` pairs; every
            yielded document must match one of them.
        :param corpora_count: number of leading entries of
            ``self.corpus.corpora`` whose ``metadata`` flag must be set.
        """
        # Word order is not compared, only the set of words.
        expected = [(set(words), meta) for words, meta in expected_docs]

        self.corpus.metadata = True
        flags = [self.corpus.metadata, self.corpus._metadata]
        for index in range(corpora_count):
            flags.append(self.corpus.corpora[index].metadata)
        self.assertTrue(all(flags))

        for doc, meta in self.corpus:
            self.assertGreater(len(doc), 0)

            # convert the document to text freq since we don't know the
            # term ids ahead of time for testing.
            textdoc = set(unicode(self.corpus.id2word[x[0]]) for x in doc)
            self.assertIn((textdoc, meta), expected)

    def test_length(self):
        """The combined length is 8 and stays the same while iterating."""
        self.assertEqual(len(self.corpus), 8)
        self.assertEqual(len(self.docs), 8)

        expected_len = len(self.corpus)
        for _ in self.corpus:
            self.assertEqual(expected_len, len(self.corpus))

    def test_metadata_docs(self):
        """With metadata on, each yielded pair is an expected document."""
        self._assert_metadata_docs(self._SNAPSHOT_DOCUMENTS, 2)

    def test_mallet_corpus(self):
        """A ``MalletCorpus`` added to the combiner is iterated as well."""
        with open(self.tempfname, 'w') as f:
            f.write('abc en fred flintstone\n')
            f.write('efg en barney rubble\n')

        corpus3 = MalletCorpus(self.tempfname)
        self.assertEqual(len(corpus3), 2)

        self.corpus.add(corpus3)
        self.assertEqual(len(self.corpus), 10)

        self._assert_metadata_docs(
            self._SNAPSHOT_DOCUMENTS + self._MALLET_DOCUMENTS, 3)