def setUp(self):
        """Build two snapshot corpora at fixed refs and combine them.

        ``corpus1``/``docs1`` come from the first ref, ``corpus2``/``docs2``
        from the second, and ``corpus``/``docs`` from a ``CorpusCombiner``
        wrapping both corpora in that order.
        """
        super(TestCorpusCombiner, self).setUp()

        # Preprocessing switches shared by both snapshots.
        options = dict(remove_stops=False, lower=True, split=True, min_len=0)

        # 3 documents
        first = self.Project(ref=u'2aeb2e7c78259833e1218b69f99dab3acd00970c',
                             level='file',
                             src_path=self.basepath)
        self.corpus1 = SnapshotCorpus(repo=self.repo, project=first, **options)
        self.docs1 = list(self.corpus1)

        # 3 old documents + 2 new documents
        second = self.Project(ref=u'3587d37e7d476ddc7b673c41762dc89c8ca63a6a',
                              level='file',
                              src_path=self.basepath)
        self.corpus2 = SnapshotCorpus(repo=self.repo, project=second, **options)
        self.docs2 = list(self.corpus2)

        # The combined corpus is iterated last, after both parts were read.
        self.corpus = CorpusCombiner([self.corpus1, self.corpus2])
        self.docs = list(self.corpus)
 def setUp(self):
     """Create the corpus under test and materialize its documents."""
     super(TestSnapshotCorpus, self).setUp()
     # Preprocessing switches handed to the corpus constructor.
     settings = {
         'remove_stops': False,
         'lower': True,
         'split': True,
         'min_len': 0,
     }
     self.corpus = SnapshotCorpus(repo=self.repo, **settings)
     self.docs = list(self.corpus)
 def setUp(self):
     """Create a corpus pinned to a fixed ref and materialize its documents."""
     super(TestSnapshotCorpusAtRef, self).setUp()
     pinned = self.Project(ref=u'f33a0fb070a34fc1b9105453b3ffb4edc49131d9',
                           level='file',
                           src_path=self.basepath)
     # Preprocessing switches handed to the corpus constructor.
     settings = {
         'remove_stops': False,
         'lower': True,
         'split': True,
         'min_len': 0,
     }
     self.corpus = SnapshotCorpus(repo=self.repo, project=pinned, **settings)
     self.docs = list(self.corpus)
 def setUp(self):
     """Create the corpus under test and materialize its documents."""
     super(TestSnapshotCorpus, self).setUp()
     # Preprocessing switches handed to the corpus constructor.
     settings = {
         'remove_stops': False,
         'lower': True,
         'split': True,
         'min_len': 0,
     }
     self.corpus = SnapshotCorpus(repo=self.repo, **settings)
     self.docs = list(self.corpus)
 def setUp(self):
     """Create a corpus pinned to a fixed ref and materialize its documents."""
     super(TestSnapshotCorpusAtRef, self).setUp()
     pinned = self.Project(ref=u'f33a0fb070a34fc1b9105453b3ffb4edc49131d9',
                           level='file',
                           src_path=self.basepath)
     # Preprocessing switches handed to the corpus constructor.
     settings = {
         'remove_stops': False,
         'lower': True,
         'split': True,
         'min_len': 0,
     }
     self.corpus = SnapshotCorpus(repo=self.repo, project=pinned, **settings)
     self.docs = list(self.corpus)
    def test_lazy(self):
        """Check that ``lazy_dict=True`` defers building ``id2word``.

        The dictionary must be empty right after construction and non-empty
        once the corpus has been iterated.
        """
        corpus = SnapshotCorpus(repo=self.repo,
                                remove_stops=False,
                                lower=True,
                                split=True,
                                min_len=0,
                                lazy_dict=True)

        self.assertEqual(len(corpus.id2word), 0)

        # if lazy, iterating over the corpus will now build the dict; the
        # documents themselves are not needed, so the result is discarded.
        list(corpus)

        self.assertGreater(len(corpus.id2word), 0)
class TestSnapshotCorpus(TestGitCorpus):
    """Exercise ``SnapshotCorpus`` built from ``self.repo`` with no project.

    ``self.repo`` is expected to be provided by the base test case.
    """

    def setUp(self):
        """Build the corpus under test and materialize its documents."""
        super(TestSnapshotCorpus, self).setUp()
        self.corpus = SnapshotCorpus(repo=self.repo,
                                     remove_stops=False,
                                     lower=True,
                                     split=True,
                                     min_len=0)
        self.docs = list(self.corpus)

    def test_lazy(self):
        """Check that ``lazy_dict=True`` defers building ``id2word``.

        The dictionary must be empty right after construction and non-empty
        once the corpus has been iterated.
        """
        corpus = SnapshotCorpus(repo=self.repo,
                                remove_stops=False,
                                lower=True,
                                split=True,
                                min_len=0,
                                lazy_dict=True)

        self.assertEqual(len(corpus.id2word), 0)

        # if lazy, iterating over the corpus will now build the dict; the
        # documents themselves are not needed, so the result is discarded.
        list(corpus)

        self.assertGreater(len(corpus.id2word), 0)

    def test_length(self):
        """The corpus reports 10 documents and its length stays stable."""
        self.assertEqual(len(self.corpus), 10)
        self.assertEqual(len(self.docs), 10)

        # The reported length must not change while the corpus is iterated.
        expected_len = len(self.corpus)
        for _ in self.corpus:
            self.assertEqual(expected_len, len(self.corpus))

    def test_get_texts(self):
        """Every text from ``get_texts()`` is one of the expected token lists.

        This is a one-directional check: it does not verify that every
        expected list is actually produced.
        """
        documents = [
            [
                u'human', u'machine', u'interface', u'for', u'lab', u'abc',
                u'computer', u'applications'
            ],
            [
                u'a', u'survey', u'of', u'user', u'opinion', u'of',
                u'computer', u'system', u'response', u'time'
            ],
            [u'the', u'eps', u'user', u'interface', u'management', u'system'],
            [
                u'system', u'and', u'human', u'system', u'engineering',
                u'testing', u'of', u'eps'
            ],
            [
                u'relation', u'of', u'user', u'perceived', u'response',
                u'time', u'to', u'error', u'measurement'
            ],
            [
                u'the', u'generation', u'of', u'random', u'binary',
                u'unordered', u'trees'
            ],
            [
                u'the', u'intersection', u'graph', u'of', u'paths', u'in',
                u'trees'
            ],
            [
                u'graph', u'minors', u'iv', u'widths', u'of', u'trees', u'and',
                u'well', u'quasi', u'ordering'
            ],
            [u'graph', u'minors', u'a', u'survey'],
            [u'graph', u'minors', u'a', u'survey'],
            [u'graph', u'minors', u'a', u'survey'],
        ]

        for doc in self.corpus.get_texts():
            doc = list(doc)  # generators, woo?
            self.assertIn(doc, documents)

    def test_metadata_get_texts(self):
        """With ``metadata`` on, ``get_texts()`` yields (tokens, meta) pairs.

        Each pair must match one of the expected entries; the metadata tuple
        is presumably (file path, repository name) -- confirm against
        ``SnapshotCorpus``.
        """
        self.corpus.metadata = True

        documents = [
            ([
                u'human', u'machine', u'interface', u'for', u'lab', u'abc',
                u'computer', u'applications'
            ], ('a/0.txt', u'test_git')),
            ([
                u'a', u'survey', u'of', u'user', u'opinion', u'of',
                u'computer', u'system', u'response', u'time'
            ], ('a/1.txt', u'test_git')),
            ([u'the', u'eps', u'user', u'interface', u'management',
              u'system'], ('b/2.txt', u'test_git')),
            ([
                u'system', u'and', u'human', u'system', u'engineering',
                u'testing', u'of', u'eps'
            ], ('b/3.txt', u'test_git')),
            ([
                u'relation', u'of', u'user', u'perceived', u'response',
                u'time', u'to', u'error', u'measurement'
            ], ('c/4.txt', u'test_git')),
            ([
                u'the', u'generation', u'of', u'random', u'binary',
                u'unordered', u'trees'
            ], ('c/e/5.txt', u'test_git')),
            ([
                u'the', u'intersection', u'graph', u'of', u'paths', u'in',
                u'trees'
            ], ('c/f/6.txt', u'test_git')),
            ([
                u'graph', u'minors', u'iv', u'widths', u'of', u'trees', u'and',
                u'well', u'quasi', u'ordering'
            ], ('7.txt', u'test_git')),
            ([u'graph', u'minors', u'a', u'survey'], ('dos.txt', u'test_git')),
            ([u'graph', u'minors', u'a', u'survey'], ('mac.txt', u'test_git')),
            ([u'graph', u'minors', u'a',
              u'survey'], ('unix.txt', u'test_git')),
        ]

        for docmeta in self.corpus.get_texts():
            doc, meta = docmeta
            doc = list(doc)  # generators, woo?
            docmeta = doc, meta  # get a non (generator, metadata) pair
            self.assertIn(docmeta, documents)

    def test_docs(self):
        """Each bag-of-words document maps to an expected (term, count) set.

        Term ids are translated back to text through ``id2word`` before the
        comparison, and order within a document is ignored.
        """
        documents = [
            [
                (u'human', 1),
                (u'machine', 1),
                (u'interface', 1),
                (u'for', 1),
                (u'lab', 1),
                (u'abc', 1),
                (u'computer', 1),
                (u'applications', 1),
            ],
            [
                (u'a', 1),
                (u'survey', 1),
                (u'of', 2),
                (u'user', 1),
                (u'opinion', 1),
                (u'computer', 1),
                (u'system', 1),
                (u'response', 1),
                (u'time', 1),
            ],
            [
                (u'the', 1),
                (u'eps', 1),
                (u'user', 1),
                (u'interface', 1),
                (u'management', 1),
                (u'system', 1),
            ],
            [
                (u'system', 2),
                (u'and', 1),
                (u'human', 1),
                (u'engineering', 1),
                (u'testing', 1),
                (u'of', 1),
                (u'eps', 1),
            ],
            [
                (u'relation', 1),
                (u'of', 1),
                (u'user', 1),
                (u'perceived', 1),
                (u'response', 1),
                (u'time', 1),
                (u'to', 1),
                (u'error', 1),
                (u'measurement', 1),
            ],
            [
                (u'the', 1),
                (u'generation', 1),
                (u'of', 1),
                (u'random', 1),
                (u'binary', 1),
                (u'unordered', 1),
                (u'trees', 1),
            ],
            [
                (u'the', 1),
                (u'intersection', 1),
                (u'graph', 1),
                (u'of', 1),
                (u'paths', 1),
                (u'in', 1),
                (u'trees', 1),
            ],
            [
                (u'graph', 1),
                (u'minors', 1),
                (u'iv', 1),
                (u'widths', 1),
                (u'of', 1),
                (u'trees', 1),
                (u'and', 1),
                (u'well', 1),
                (u'quasi', 1),
                (u'ordering', 1),
            ],
            [
                (u'graph', 1),
                (u'minors', 1),
                (u'a', 1),
                (u'survey', 1),
            ],
        ]

        # Compare as sets so the order of terms inside a document is ignored.
        documents = [set(x) for x in documents]

        for doc in self.corpus:
            self.assertGreater(len(doc), 0)

            # convert the document to text freq since we don't know the
            # term ids ahead of time for testing.
            # NOTE(review): `unicode` is a Python 2 builtin -- confirm it is
            # available if this module is run under Python 3.
            textdoc = set(
                (unicode(self.corpus.id2word[x[0]]), x[1]) for x in doc)
            self.assertIn(textdoc, documents)
class TestSnapshotCorpusAtRef(TestGitCorpus):
    """Exercise ``SnapshotCorpus`` built from a project pinned to a fixed ref.

    ``self.repo``, ``self.Project`` and ``self.basepath`` are expected to be
    provided by the base test case.
    """

    def setUp(self):
        """Build a corpus pinned to a fixed ref and materialize its documents."""
        super(TestSnapshotCorpusAtRef, self).setUp()
        p1 = self.Project(ref=u'f33a0fb070a34fc1b9105453b3ffb4edc49131d9',
                          level='file',
                          src_path=self.basepath)
        self.corpus = SnapshotCorpus(repo=self.repo,
                                     project=p1,
                                     remove_stops=False,
                                     lower=True,
                                     split=True,
                                     min_len=0)
        self.docs = list(self.corpus)

    def test_lazy(self):
        """Check that ``lazy_dict=True`` defers building ``id2word``.

        The dictionary must be empty right after construction and non-empty
        once the corpus has been iterated.
        """
        # NOTE(review): unlike setUp, this corpus is built without the
        # ref-pinned project -- confirm that is intended.
        corpus = SnapshotCorpus(repo=self.repo,
                                remove_stops=False,
                                lower=True,
                                split=True,
                                min_len=0,
                                lazy_dict=True)

        self.assertEqual(len(corpus.id2word), 0)

        # if lazy, iterating over the corpus will now build the dict; the
        # documents themselves are not needed, so the result is discarded.
        list(corpus)

        self.assertGreater(len(corpus.id2word), 0)

    def test_length(self):
        """The corpus reports 7 documents and its length stays stable."""
        self.assertEqual(len(self.corpus), 7)
        self.assertEqual(len(self.docs), 7)

        # The reported length must not change while the corpus is iterated.
        expected_len = len(self.corpus)
        for _ in self.corpus:
            self.assertEqual(expected_len, len(self.corpus))

    def test_get_texts(self):
        """Every text from ``get_texts()`` is one of the expected token lists.

        This is a one-directional check: it does not verify that every
        expected list is actually produced.
        """
        documents = [
            [
                u'human', u'machine', u'interface', u'for', u'lab', u'abc',
                u'computer', u'applications'
            ],
            [
                u'a', u'survey', u'of', u'user', u'opinion', u'of',
                u'computer', u'system', u'response', u'time'
            ],
            [u'the', u'eps', u'user', u'interface', u'management', u'system'],
            [
                u'system', u'and', u'human', u'system', u'engineering',
                u'testing', u'of', u'eps'
            ],
            [u'graph', u'minors', u'a', u'survey'],
            [u'graph', u'minors', u'a', u'survey'],
            [u'graph', u'minors', u'a', u'survey'],
        ]

        for doc in self.corpus.get_texts():
            doc = list(doc)  # generators, woo?
            self.assertIn(doc, documents)

    def test_metadata_get_texts(self):
        """With ``metadata`` on, ``get_texts()`` yields (tokens, meta) pairs.

        Each pair must match one of the expected entries; the metadata tuple
        is presumably (file path, repository name) -- confirm against
        ``SnapshotCorpus``.
        """
        self.corpus.metadata = True

        documents = [
            ([
                u'human', u'machine', u'interface', u'for', u'lab', u'abc',
                u'computer', u'applications'
            ], ('a/0.txt', u'test_git')),
            ([
                u'a', u'survey', u'of', u'user', u'opinion', u'of',
                u'computer', u'system', u'response', u'time'
            ], ('a/1.txt', u'test_git')),
            ([u'the', u'eps', u'user', u'interface', u'management',
              u'system'], ('b/2.txt', u'test_git')),
            ([
                u'system', u'and', u'human', u'system', u'engineering',
                u'testing', u'of', u'eps'
            ], ('b/3.txt', u'test_git')),
            ([u'graph', u'minors', u'a', u'survey'], ('dos.txt', u'test_git')),
            ([u'graph', u'minors', u'a', u'survey'], ('mac.txt', u'test_git')),
            ([u'graph', u'minors', u'a',
              u'survey'], ('unix.txt', u'test_git')),
        ]

        for docmeta in self.corpus.get_texts():
            doc, meta = docmeta
            doc = list(doc)  # generators, woo?
            docmeta = doc, meta  # get a non (generator, metadata) pair
            self.assertIn(docmeta, documents)

    def test_docs(self):
        """Each bag-of-words document maps to an expected (term, count) set.

        Term ids are translated back to text through ``id2word`` before the
        comparison, and order within a document is ignored.
        """
        documents = [
            [
                (u'human', 1),
                (u'machine', 1),
                (u'interface', 1),
                (u'for', 1),
                (u'lab', 1),
                (u'abc', 1),
                (u'computer', 1),
                (u'applications', 1),
            ],
            [
                (u'a', 1),
                (u'survey', 1),
                (u'of', 2),
                (u'user', 1),
                (u'opinion', 1),
                (u'computer', 1),
                (u'system', 1),
                (u'response', 1),
                (u'time', 1),
            ],
            [
                (u'the', 1),
                (u'eps', 1),
                (u'user', 1),
                (u'interface', 1),
                (u'management', 1),
                (u'system', 1),
            ],
            [
                (u'system', 2),
                (u'and', 1),
                (u'human', 1),
                (u'engineering', 1),
                (u'testing', 1),
                (u'of', 1),
                (u'eps', 1),
            ],
            [
                (u'graph', 1),
                (u'minors', 1),
                (u'a', 1),
                (u'survey', 1),
            ],
        ]

        # Compare as sets so the order of terms inside a document is ignored.
        documents = [set(x) for x in documents]

        for doc in self.corpus:
            self.assertGreater(len(doc), 0)

            # convert the document to text freq since we don't know the
            # term ids ahead of time for testing.
            # NOTE(review): `unicode` is a Python 2 builtin -- confirm it is
            # available if this module is run under Python 3.
            textdoc = set(
                (unicode(self.corpus.id2word[x[0]]), x[1]) for x in doc)
            self.assertIn(textdoc, documents)
class TestSnapshotCorpus(TestGitCorpus):
    """Exercise ``SnapshotCorpus`` built from ``self.repo`` with no project.

    ``self.repo`` is expected to be provided by the base test case.
    """

    def setUp(self):
        """Build the corpus under test and materialize its documents."""
        super(TestSnapshotCorpus, self).setUp()
        self.corpus = SnapshotCorpus(repo=self.repo,
                                     remove_stops=False,
                                     lower=True,
                                     split=True,
                                     min_len=0)
        self.docs = list(self.corpus)

    def test_lazy(self):
        """Check that ``lazy_dict=True`` defers building ``id2word``.

        The dictionary must be empty right after construction and non-empty
        once the corpus has been iterated.
        """
        corpus = SnapshotCorpus(repo=self.repo,
                                remove_stops=False,
                                lower=True,
                                split=True,
                                min_len=0,
                                lazy_dict=True)

        self.assertEqual(len(corpus.id2word), 0)

        # if lazy, iterating over the corpus will now build the dict; the
        # documents themselves are not needed, so the result is discarded.
        list(corpus)

        self.assertGreater(len(corpus.id2word), 0)

    def test_length(self):
        """The corpus reports 10 documents and its length stays stable."""
        self.assertEqual(len(self.corpus), 10)
        self.assertEqual(len(self.docs), 10)

        # The reported length must not change while the corpus is iterated.
        expected_len = len(self.corpus)
        for _ in self.corpus:
            self.assertEqual(expected_len, len(self.corpus))

    def test_get_texts(self):
        """Every text from ``get_texts()`` is one of the expected token lists.

        This is a one-directional check: it does not verify that every
        expected list is actually produced.
        """
        documents = [
            [
                u'human', u'machine', u'interface', u'for', u'lab', u'abc',
                u'computer', u'applications'
            ],
            [
                u'a', u'survey', u'of', u'user', u'opinion', u'of',
                u'computer', u'system', u'response', u'time'
            ],
            [u'the', u'eps', u'user', u'interface', u'management', u'system'],
            [
                u'system', u'and', u'human', u'system', u'engineering',
                u'testing', u'of', u'eps'
            ],
            [
                u'relation', u'of', u'user', u'perceived', u'response',
                u'time', u'to', u'error', u'measurement'
            ],
            [
                u'the', u'generation', u'of', u'random', u'binary',
                u'unordered', u'trees'
            ],
            [
                u'the', u'intersection', u'graph', u'of', u'paths', u'in',
                u'trees'
            ],
            [
                u'graph', u'minors', u'iv', u'widths', u'of', u'trees', u'and',
                u'well', u'quasi', u'ordering'
            ],
            [u'graph', u'minors', u'a', u'survey'],
            [u'graph', u'minors', u'a', u'survey'],
            [u'graph', u'minors', u'a', u'survey'],
        ]

        for doc in self.corpus.get_texts():
            doc = list(doc)  # generators, woo?
            self.assertIn(doc, documents)

    def test_metadata_get_texts(self):
        """With ``metadata`` on, ``get_texts()`` yields (tokens, meta) pairs.

        Each pair must match one of the expected entries; the metadata tuple
        is presumably (file path, repository name) -- confirm against
        ``SnapshotCorpus``.
        """
        self.corpus.metadata = True

        documents = [
            ([
                u'human', u'machine', u'interface', u'for', u'lab', u'abc',
                u'computer', u'applications'
            ], ('a/0.txt', u'test_git')),
            ([
                u'a', u'survey', u'of', u'user', u'opinion', u'of',
                u'computer', u'system', u'response', u'time'
            ], ('a/1.txt', u'test_git')),
            ([u'the', u'eps', u'user', u'interface', u'management',
              u'system'], ('b/2.txt', u'test_git')),
            ([
                u'system', u'and', u'human', u'system', u'engineering',
                u'testing', u'of', u'eps'
            ], ('b/3.txt', u'test_git')),
            ([
                u'relation', u'of', u'user', u'perceived', u'response',
                u'time', u'to', u'error', u'measurement'
            ], ('c/4.txt', u'test_git')),
            ([
                u'the', u'generation', u'of', u'random', u'binary',
                u'unordered', u'trees'
            ], ('c/e/5.txt', u'test_git')),
            ([
                u'the', u'intersection', u'graph', u'of', u'paths', u'in',
                u'trees'
            ], ('c/f/6.txt', u'test_git')),
            ([
                u'graph', u'minors', u'iv', u'widths', u'of', u'trees', u'and',
                u'well', u'quasi', u'ordering'
            ], ('7.txt', u'test_git')),
            ([u'graph', u'minors', u'a', u'survey'], ('dos.txt', u'test_git')),
            ([u'graph', u'minors', u'a', u'survey'], ('mac.txt', u'test_git')),
            ([u'graph', u'minors', u'a',
              u'survey'], ('unix.txt', u'test_git')),
        ]

        for docmeta in self.corpus.get_texts():
            doc, meta = docmeta
            doc = list(doc)  # generators, woo?
            docmeta = doc, meta  # get a non (generator, metadata) pair
            self.assertIn(docmeta, documents)

    def test_docs(self):
        """Each bag-of-words document maps to an expected (term, count) set.

        Term ids are translated back to text through ``id2word`` before the
        comparison, and order within a document is ignored.
        """
        documents = [
            [
                (u'human', 1),
                (u'machine', 1),
                (u'interface', 1),
                (u'for', 1),
                (u'lab', 1),
                (u'abc', 1),
                (u'computer', 1),
                (u'applications', 1),
            ],
            [
                (u'a', 1),
                (u'survey', 1),
                (u'of', 2),
                (u'user', 1),
                (u'opinion', 1),
                (u'computer', 1),
                (u'system', 1),
                (u'response', 1),
                (u'time', 1),
            ],
            [
                (u'the', 1),
                (u'eps', 1),
                (u'user', 1),
                (u'interface', 1),
                (u'management', 1),
                (u'system', 1),
            ],
            [
                (u'system', 2),
                (u'and', 1),
                (u'human', 1),
                (u'engineering', 1),
                (u'testing', 1),
                (u'of', 1),
                (u'eps', 1),
            ],
            [
                (u'relation', 1),
                (u'of', 1),
                (u'user', 1),
                (u'perceived', 1),
                (u'response', 1),
                (u'time', 1),
                (u'to', 1),
                (u'error', 1),
                (u'measurement', 1),
            ],
            [
                (u'the', 1),
                (u'generation', 1),
                (u'of', 1),
                (u'random', 1),
                (u'binary', 1),
                (u'unordered', 1),
                (u'trees', 1),
            ],
            [
                (u'the', 1),
                (u'intersection', 1),
                (u'graph', 1),
                (u'of', 1),
                (u'paths', 1),
                (u'in', 1),
                (u'trees', 1),
            ],
            [
                (u'graph', 1),
                (u'minors', 1),
                (u'iv', 1),
                (u'widths', 1),
                (u'of', 1),
                (u'trees', 1),
                (u'and', 1),
                (u'well', 1),
                (u'quasi', 1),
                (u'ordering', 1),
            ],
            [
                (u'graph', 1),
                (u'minors', 1),
                (u'a', 1),
                (u'survey', 1),
            ],
        ]

        # Compare as sets so the order of terms inside a document is ignored.
        documents = [set(x) for x in documents]

        for doc in self.corpus:
            self.assertGreater(len(doc), 0)

            # convert the document to text freq since we don't know the
            # term ids ahead of time for testing.
            # NOTE(review): `unicode` is a Python 2 builtin -- confirm it is
            # available if this module is run under Python 3.
            textdoc = set(
                (unicode(self.corpus.id2word[x[0]]), x[1]) for x in doc)
            self.assertIn(textdoc, documents)
class TestSnapshotCorpusAtRef(TestGitCorpus):
    """Exercise ``SnapshotCorpus`` built from a project pinned to a fixed ref.

    ``self.repo``, ``self.Project`` and ``self.basepath`` are expected to be
    provided by the base test case.
    """

    def setUp(self):
        """Build a corpus pinned to a fixed ref and materialize its documents."""
        super(TestSnapshotCorpusAtRef, self).setUp()
        p1 = self.Project(ref=u'f33a0fb070a34fc1b9105453b3ffb4edc49131d9',
                          level='file',
                          src_path=self.basepath)
        self.corpus = SnapshotCorpus(repo=self.repo,
                                     project=p1,
                                     remove_stops=False,
                                     lower=True,
                                     split=True,
                                     min_len=0)
        self.docs = list(self.corpus)

    def test_lazy(self):
        """Check that ``lazy_dict=True`` defers building ``id2word``.

        The dictionary must be empty right after construction and non-empty
        once the corpus has been iterated.
        """
        # NOTE(review): unlike setUp, this corpus is built without the
        # ref-pinned project -- confirm that is intended.
        corpus = SnapshotCorpus(repo=self.repo,
                                remove_stops=False,
                                lower=True,
                                split=True,
                                min_len=0,
                                lazy_dict=True)

        self.assertEqual(len(corpus.id2word), 0)

        # if lazy, iterating over the corpus will now build the dict; the
        # documents themselves are not needed, so the result is discarded.
        list(corpus)

        self.assertGreater(len(corpus.id2word), 0)

    def test_length(self):
        """The corpus reports 7 documents and its length stays stable."""
        self.assertEqual(len(self.corpus), 7)
        self.assertEqual(len(self.docs), 7)

        # The reported length must not change while the corpus is iterated.
        expected_len = len(self.corpus)
        for _ in self.corpus:
            self.assertEqual(expected_len, len(self.corpus))

    def test_get_texts(self):
        """Every text from ``get_texts()`` is one of the expected token lists.

        This is a one-directional check: it does not verify that every
        expected list is actually produced.
        """
        documents = [
            [
                u'human', u'machine', u'interface', u'for', u'lab', u'abc',
                u'computer', u'applications'
            ],
            [
                u'a', u'survey', u'of', u'user', u'opinion', u'of',
                u'computer', u'system', u'response', u'time'
            ],
            [u'the', u'eps', u'user', u'interface', u'management', u'system'],
            [
                u'system', u'and', u'human', u'system', u'engineering',
                u'testing', u'of', u'eps'
            ],
            [u'graph', u'minors', u'a', u'survey'],
            [u'graph', u'minors', u'a', u'survey'],
            [u'graph', u'minors', u'a', u'survey'],
        ]

        for doc in self.corpus.get_texts():
            doc = list(doc)  # generators, woo?
            self.assertIn(doc, documents)

    def test_metadata_get_texts(self):
        """With ``metadata`` on, ``get_texts()`` yields (tokens, meta) pairs.

        Each pair must match one of the expected entries; the metadata tuple
        is presumably (file path, repository name) -- confirm against
        ``SnapshotCorpus``.
        """
        self.corpus.metadata = True

        documents = [
            ([
                u'human', u'machine', u'interface', u'for', u'lab', u'abc',
                u'computer', u'applications'
            ], ('a/0.txt', u'test_git')),
            ([
                u'a', u'survey', u'of', u'user', u'opinion', u'of',
                u'computer', u'system', u'response', u'time'
            ], ('a/1.txt', u'test_git')),
            ([u'the', u'eps', u'user', u'interface', u'management',
              u'system'], ('b/2.txt', u'test_git')),
            ([
                u'system', u'and', u'human', u'system', u'engineering',
                u'testing', u'of', u'eps'
            ], ('b/3.txt', u'test_git')),
            ([u'graph', u'minors', u'a', u'survey'], ('dos.txt', u'test_git')),
            ([u'graph', u'minors', u'a', u'survey'], ('mac.txt', u'test_git')),
            ([u'graph', u'minors', u'a',
              u'survey'], ('unix.txt', u'test_git')),
        ]

        for docmeta in self.corpus.get_texts():
            doc, meta = docmeta
            doc = list(doc)  # generators, woo?
            docmeta = doc, meta  # get a non (generator, metadata) pair
            self.assertIn(docmeta, documents)

    def test_docs(self):
        """Each bag-of-words document maps to an expected (term, count) set.

        Term ids are translated back to text through ``id2word`` before the
        comparison, and order within a document is ignored.
        """
        documents = [
            [
                (u'human', 1),
                (u'machine', 1),
                (u'interface', 1),
                (u'for', 1),
                (u'lab', 1),
                (u'abc', 1),
                (u'computer', 1),
                (u'applications', 1),
            ],
            [
                (u'a', 1),
                (u'survey', 1),
                (u'of', 2),
                (u'user', 1),
                (u'opinion', 1),
                (u'computer', 1),
                (u'system', 1),
                (u'response', 1),
                (u'time', 1),
            ],
            [
                (u'the', 1),
                (u'eps', 1),
                (u'user', 1),
                (u'interface', 1),
                (u'management', 1),
                (u'system', 1),
            ],
            [
                (u'system', 2),
                (u'and', 1),
                (u'human', 1),
                (u'engineering', 1),
                (u'testing', 1),
                (u'of', 1),
                (u'eps', 1),
            ],
            [
                (u'graph', 1),
                (u'minors', 1),
                (u'a', 1),
                (u'survey', 1),
            ],
        ]

        # Compare as sets so the order of terms inside a document is ignored.
        documents = [set(x) for x in documents]

        for doc in self.corpus:
            self.assertGreater(len(doc), 0)

            # convert the document to text freq since we don't know the
            # term ids ahead of time for testing.
            # NOTE(review): `unicode` is a Python 2 builtin -- confirm it is
            # available if this module is run under Python 3.
            textdoc = set(
                (unicode(self.corpus.id2word[x[0]]), x[1]) for x in doc)
            self.assertIn(textdoc, documents)