def setUp(self):
        """Build two snapshot corpora and the combiner that wraps them.

        Stores each corpus, its materialized document list, and the
        combined corpus (with its own document list) on the test instance.
        """
        super(TestCorpusCombiner, self).setUp()

        # Tokenization options shared by both snapshot corpora.
        corpus_options = dict(remove_stops=False,
                              lower=True,
                              split=True,
                              min_len=0)

        # 3 documents
        first_project = self.Project(
            ref=u'2aeb2e7c78259833e1218b69f99dab3acd00970c',
            level='file',
            src_path=self.basepath)
        self.corpus1 = SnapshotCorpus(repo=self.repo,
                                      project=first_project,
                                      **corpus_options)
        self.docs1 = list(self.corpus1)

        # 3 old documents + 2 new documents
        second_project = self.Project(
            ref=u'3587d37e7d476ddc7b673c41762dc89c8ca63a6a',
            level='file',
            src_path=self.basepath)
        self.corpus2 = SnapshotCorpus(repo=self.repo,
                                      project=second_project,
                                      **corpus_options)
        self.docs2 = list(self.corpus2)

        # Combine both snapshots and materialize the combined documents.
        combined = CorpusCombiner([self.corpus1, self.corpus2])
        self.corpus = combined
        self.docs = list(combined)
    def setUp(self):
        """Create the two snapshot corpora and the combined corpus under test.

        Each snapshot corpus is built at file level from a fixed commit ref;
        the corpora and their document lists are kept on the instance.
        """
        super(TestCorpusCombiner, self).setUp()

        def build_snapshot(ref):
            # One project per commit ref, wrapped in a corpus that uses the
            # same tokenization settings for every snapshot.
            project = self.Project(ref=ref,
                                   level='file',
                                   src_path=self.basepath)
            return SnapshotCorpus(repo=self.repo,
                                  project=project,
                                  remove_stops=False,
                                  lower=True,
                                  split=True,
                                  min_len=0)

        # 3 documents
        self.corpus1 = build_snapshot(
            u'2aeb2e7c78259833e1218b69f99dab3acd00970c')
        self.docs1 = list(self.corpus1)

        # 3 old documents + 2 new documents
        self.corpus2 = build_snapshot(
            u'3587d37e7d476ddc7b673c41762dc89c8ca63a6a')
        self.docs2 = list(self.corpus2)

        snapshots = [self.corpus1, self.corpus2]
        self.corpus = CorpusCombiner(snapshots)
        self.docs = list(self.corpus)
class TestCorpusCombiner(TestGitCorpus):
    """Tests for ``CorpusCombiner`` wrapping two ``SnapshotCorpus`` objects.

    The expected-document fixture and the metadata verification loop are
    shared by several tests, so they live in private helpers instead of
    being copied into each test method.
    """

    def setUp(self):
        """Build two snapshot corpora and combine them into ``self.corpus``."""
        super(TestCorpusCombiner, self).setUp()
        # 3 documents
        p1 = self.Project(ref=u'2aeb2e7c78259833e1218b69f99dab3acd00970c',
                          level='file',
                          src_path=self.basepath)
        self.corpus1 = SnapshotCorpus(repo=self.repo,
                                      project=p1,
                                      remove_stops=False,
                                      lower=True,
                                      split=True,
                                      min_len=0)
        self.docs1 = list(self.corpus1)

        # 3 old documents + 2 new documents
        p2 = self.Project(ref=u'3587d37e7d476ddc7b673c41762dc89c8ca63a6a',
                          level='file',
                          src_path=self.basepath)
        self.corpus2 = SnapshotCorpus(repo=self.repo,
                                      project=p2,
                                      remove_stops=False,
                                      lower=True,
                                      split=True,
                                      min_len=0)
        self.docs2 = list(self.corpus2)

        self.corpus = CorpusCombiner([self.corpus1, self.corpus2])
        self.docs = list(self.corpus)

    def _expected_snapshot_documents(self):
        """Return the expected ``(words, metadata)`` pairs for both snapshots.

        Words are listed in document order here; callers compare them as
        sets, so the order and any repeated words do not matter.
        """
        survey = [u'graph', u'minors', u'a', u'survey']
        return [
            # corpus1
            (survey, ('dos.txt', u'test_git')),
            (survey, ('mac.txt', u'test_git')),
            (survey, ('unix.txt', u'test_git')),

            # corpus2
            (survey, ('dos.txt', u'test_git')),
            (survey, ('mac.txt', u'test_git')),
            (survey, ('unix.txt', u'test_git')),
            ([
                u'human', u'machine', u'interface', u'for', u'lab', u'abc',
                u'computer', u'applications'
            ], ('a/0.txt', u'test_git')),
            ([
                u'a', u'survey', u'of', u'user', u'opinion', u'of',
                u'computer', u'system', u'response', u'time'
            ], ('a/1.txt', u'test_git')),
        ]

    def _assert_documents_with_metadata(self, expected, corpora_count):
        """Enable metadata and check every yielded document is expected.

        ``expected`` is a list of ``(words, metadata)`` pairs and
        ``corpora_count`` is the number of wrapped corpora whose
        ``metadata`` flag must be set once the combiner's flag is turned on.
        """
        expected = [(set(words), meta) for words, meta in expected]

        self.corpus.metadata = True
        flags = [self.corpus.metadata, self.corpus._metadata]
        flags.extend(self.corpus.corpora[index].metadata
                     for index in range(corpora_count))
        self.assertTrue(all(flags))

        for doc, meta in self.corpus:
            self.assertGreater(len(doc), 0)

            # convert the document to text freq since we don't know the
            # term ids ahead of time for testing.
            words = set(unicode(self.corpus.id2word[term[0]]) for term in doc)
            self.assertIn((words, meta), expected)

    def test_length(self):
        """The combined length is 8 and stays constant during iteration."""
        self.assertEqual(len(self.corpus), 8)
        self.assertEqual(len(self.docs), 8)

        expected_len = len(self.corpus)
        for _ in self.corpus:
            self.assertEqual(expected_len, len(self.corpus))

    def test_metadata_docs(self):
        """Documents from both snapshots are yielded with their metadata."""
        self._assert_documents_with_metadata(
            self._expected_snapshot_documents(), corpora_count=2)

    def test_mallet_corpus(self):
        """A ``MalletCorpus`` added to the combiner is iterated as well."""
        with open(self.tempfname, 'w') as f:
            f.write('abc en fred flintstone\n')
            f.write('efg en barney rubble\n')
        corpus3 = MalletCorpus(self.tempfname)
        self.assertEqual(len(corpus3), 2)
        self.corpus.add(corpus3)
        self.assertEqual(len(self.corpus), 10)

        expected = self._expected_snapshot_documents() + [
            # mallet
            ([u'fred', u'flintstone'], ('abc', u'en')),
            ([u'barney', u'rubble'], ('efg', u'en')),
        ]
        self._assert_documents_with_metadata(expected, corpora_count=3)
class TestCorpusCombiner(TestGitCorpus):
    """Tests for ``CorpusCombiner`` wrapping two ``SnapshotCorpus`` objects.

    The expected-document fixture and the metadata verification loop are
    shared by several tests, so they live in private helpers instead of
    being copied into each test method.
    """

    def setUp(self):
        """Build two snapshot corpora and combine them into ``self.corpus``."""
        super(TestCorpusCombiner, self).setUp()
        # 3 documents
        p1 = self.Project(ref=u'2aeb2e7c78259833e1218b69f99dab3acd00970c',
                          level='file',
                          src_path=self.basepath)
        self.corpus1 = SnapshotCorpus(repo=self.repo,
                                      project=p1,
                                      remove_stops=False,
                                      lower=True,
                                      split=True,
                                      min_len=0)
        self.docs1 = list(self.corpus1)

        # 3 old documents + 2 new documents
        p2 = self.Project(ref=u'3587d37e7d476ddc7b673c41762dc89c8ca63a6a',
                          level='file',
                          src_path=self.basepath)
        self.corpus2 = SnapshotCorpus(repo=self.repo,
                                      project=p2,
                                      remove_stops=False,
                                      lower=True,
                                      split=True,
                                      min_len=0)
        self.docs2 = list(self.corpus2)

        self.corpus = CorpusCombiner([self.corpus1, self.corpus2])
        self.docs = list(self.corpus)

    def _expected_snapshot_documents(self):
        """Return the expected ``(words, metadata)`` pairs for both snapshots.

        Words are listed in document order here; callers compare them as
        sets, so the order and any repeated words do not matter.
        """
        survey = [u'graph', u'minors', u'a', u'survey']
        return [
            # corpus1
            (survey, ('dos.txt', u'test_git')),
            (survey, ('mac.txt', u'test_git')),
            (survey, ('unix.txt', u'test_git')),

            # corpus2
            (survey, ('dos.txt', u'test_git')),
            (survey, ('mac.txt', u'test_git')),
            (survey, ('unix.txt', u'test_git')),
            ([u'human', u'machine', u'interface', u'for', u'lab', u'abc',
              u'computer', u'applications'],
             ('a/0.txt', u'test_git')),
            ([u'a', u'survey', u'of', u'user', u'opinion', u'of',
              u'computer', u'system', u'response', u'time'],
             ('a/1.txt', u'test_git')),
        ]

    def _assert_documents_with_metadata(self, expected, corpora_count):
        """Enable metadata and check every yielded document is expected.

        ``expected`` is a list of ``(words, metadata)`` pairs and
        ``corpora_count`` is the number of wrapped corpora whose
        ``metadata`` flag must be set once the combiner's flag is turned on.
        """
        expected = [(set(words), meta) for words, meta in expected]

        self.corpus.metadata = True
        flags = [self.corpus.metadata, self.corpus._metadata]
        flags.extend(self.corpus.corpora[index].metadata
                     for index in range(corpora_count))
        self.assertTrue(all(flags))

        for doc, meta in self.corpus:
            self.assertGreater(len(doc), 0)

            # convert the document to text freq since we don't know the
            # term ids ahead of time for testing.
            words = set(unicode(self.corpus.id2word[term[0]]) for term in doc)
            self.assertIn((words, meta), expected)

    def test_length(self):
        """The combined length is 8 and stays constant during iteration."""
        self.assertEqual(len(self.corpus), 8)
        self.assertEqual(len(self.docs), 8)

        expected_len = len(self.corpus)
        for _ in self.corpus:
            self.assertEqual(expected_len, len(self.corpus))

    def test_metadata_docs(self):
        """Documents from both snapshots are yielded with their metadata."""
        self._assert_documents_with_metadata(
            self._expected_snapshot_documents(), corpora_count=2)

    def test_mallet_corpus(self):
        """A ``MalletCorpus`` added to the combiner is iterated as well."""
        with open(self.tempfname, 'w') as f:
            f.write('abc en fred flintstone\n')
            f.write('efg en barney rubble\n')
        corpus3 = MalletCorpus(self.tempfname)
        self.assertEqual(len(corpus3), 2)
        self.corpus.add(corpus3)
        self.assertEqual(len(self.corpus), 10)

        expected = self._expected_snapshot_documents() + [
            # mallet
            ([u'fred', u'flintstone'], ('abc', u'en')),
            ([u'barney', u'rubble'], ('efg', u'en')),
        ]
        self._assert_documents_with_metadata(expected, corpora_count=3)