def test_variable_attributes(self):
        """Verify that ``var_attrs`` are attached to every added feature.

        Two freshly loaded copies of the same corpus receive identical
        features; only the second call passes ``var_attrs``.  The feature
        variables of the second corpus must then carry exactly one more
        attribute than those of the first, and that attribute is ``'foo'``.
        """
        c1 = Corpus.from_file('deerwester')
        c2 = Corpus.from_file('deerwester')
        # One row per document, each row holding the values 0..3.
        X = np.array([list(range(4)) for _ in range(len(c1))])
        X = sp.csr_matrix(X)

        # Maps a column index of X to the name of the feature it becomes.
        dictionary = {
            0: 'd',
            1: 'c',
            2: 'b',
            3: 'a',
        }

        c1 = BaseVectorizer.add_features(c1,
                                         X,
                                         dictionary,
                                         compute_values=None,
                                         var_attrs=None)
        c2 = BaseVectorizer.add_features(c2,
                                         X,
                                         dictionary,
                                         compute_values=None,
                                         var_attrs={'foo': 1})

        n_attrs_before = len(c1.domain.attributes[0].attributes)
        n_attrs_after = len(c2.domain.attributes[0].attributes)
        # assertEqual, not assertTrue: assertTrue(x, 1) would treat the 1 as
        # the failure message and pass for any non-zero difference.
        self.assertEqual(n_attrs_after - n_attrs_before, 1)

        for a in c2.domain.attributes:
            self.assertIn('foo', a.attributes)
Esempio n. 2
0
 def test_corpus_from_file_abs_path(self):
     """A corpus read by dataset name equals one read by absolute path."""
     by_name = Corpus.from_file('book-excerpts')
     # Resolve the .tab file relative to the directory of this module.
     here = os.path.dirname(__file__)
     relative = os.path.join(here, '..', 'datasets', 'book-excerpts.tab')
     by_path = Corpus.from_file(os.path.abspath(relative))
     self.assertEqual(by_name, by_path)
Esempio n. 3
0
 def test_corpus_from_file_abs_path(self):
     """Compare a corpus read by name with one read by absolute path.

     The data matrix, the metas and the documents must all agree.
     """
     named = Corpus.from_file('book-excerpts')
     dataset_dir = os.path.join(os.path.dirname(__file__), '..', 'datasets')
     absolute = os.path.abspath(
         os.path.join(dataset_dir, 'book-excerpts.tab'))
     from_path = Corpus.from_file(absolute)
     np.testing.assert_array_equal(named.X, from_path.X)
     np.testing.assert_array_equal(named.metas, from_path.metas)
     self.assertEqual(named.documents, from_path.documents)
Esempio n. 4
0
    def test_infer_text_features(self):
        """Each sample corpus exposes exactly one, correctly named, text feature."""
        expectations = (
            ('friends-transcripts', 'Quote'),
            ('deerwester', 'Text'),
        )
        for dataset, feature_name in expectations:
            inferred = Corpus.from_file(dataset).text_features
            self.assertEqual(len(inferred), 1)
            self.assertEqual(inferred[0].name, feature_name)
Esempio n. 5
0
    def test_infer_text_features(self):
        """Check the text features reported for two bundled corpora."""
        # 'friends-transcripts' must report a single feature named 'Quote'.
        friends = Corpus.from_file('friends-transcripts')
        friends_features = friends.text_features
        self.assertEqual(len(friends_features), 1)
        self.assertEqual(friends_features[0].name, 'Quote')

        # 'deerwester' must report a single feature named 'Text'.
        deerwester = Corpus.from_file('deerwester')
        deerwester_features = deerwester.text_features
        self.assertEqual(len(deerwester_features), 1)
        self.assertEqual(deerwester_features[0].name, 'Text')
    def test_compute_values_to_different_domain(self):
        """A bag-of-words domain from one corpus can be applied to another."""
        src = Corpus.from_file('deerwester')
        dst = Corpus.from_file('book-excerpts')

        # Neither corpus has attributes before vectorisation.
        self.assertFalse(src.domain.attributes)
        self.assertFalse(dst.domain.attributes)

        vectorised = BowVectorizer().transform(src)
        converted = dst.transform(vectorised.domain)

        self.assertEqual(vectorised.domain.attributes,
                         converted.domain.attributes)
    def test_compute_values_to_different_domain(self):
        """Transforming a second corpus into a BoW domain keeps its attributes."""
        trained_on = Corpus.from_file('deerwester')
        applied_to = Corpus.from_file('book-excerpts')

        # Both corpora start out with an empty attribute list.
        self.assertFalse(trained_on.domain.attributes)
        self.assertFalse(applied_to.domain.attributes)

        bow_corpus = BowVectorizer().transform(trained_on)
        target_domain = bow_corpus.domain
        transformed = applied_to.transform(target_domain)

        self.assertEqual(bow_corpus.domain.attributes,
                         transformed.domain.attributes)
Esempio n. 8
0
    def test_corpus_from_file(self):
        """Loaded corpora have the expected size and domain layout."""
        for dataset, n_docs in (('book-excerpts', 140), ('deerwester', 9)):
            corpus = Corpus.from_file(dataset)
            self.assertEqual(len(corpus), n_docs)
            self.assertEqual(len(corpus.domain), 1)
            self.assertEqual(len(corpus.domain.metas), 1)
            self.assertEqual(corpus.metas.shape, (n_docs, 1))
Esempio n. 9
0
    def test_corpus_from_file(self):
        """Check document counts and domain sizes of two bundled corpora."""
        # book-excerpts: 140 documents, one domain variable, one meta column.
        books = Corpus.from_file('book-excerpts')
        self.assertEqual(len(books), 140)
        self.assertEqual(len(books.domain), 1)
        self.assertEqual(len(books.domain.metas), 1)
        self.assertEqual(books.metas.shape, (140, 1))

        # deerwester: 9 documents, one domain variable, one meta column.
        deerwester = Corpus.from_file('deerwester')
        self.assertEqual(len(deerwester), 9)
        self.assertEqual(len(deerwester.domain), 1)
        self.assertEqual(len(deerwester.domain.metas), 1)
        self.assertEqual(deerwester.metas.shape, (9, 1))
Esempio n. 10
0
 def setUp(self):
     """Load the 'deerwester' corpus and store a list of processing steps."""
     self.corpus = Corpus.from_file("deerwester")
     # Instances are created in the same order in which they are listed.
     steps = []
     steps.append(preprocess.LowercaseTransformer())
     steps.append(preprocess.WordPunctTokenizer())
     steps.append(preprocess.SnowballStemmer())
     steps.append(preprocess.NGrams())
     steps.append(tag.AveragedPerceptronTagger())
     self.pp_list = steps
Esempio n. 11
0
    def test_documents_from_sparse_features(self):
        """Documents can be built from features when ``X`` is a sparse matrix."""
        table = Table.from_file('brown-selected')
        corpus = Corpus.from_file('brown-selected')
        with corpus.unlocked():
            corpus.X = csr_matrix(corpus.X)

        # Documents assembled from an attribute, the class and a meta column.
        selected = [table.domain.attributes[0],
                    table.domain.class_var,
                    table.domain.metas[0]]
        docs = corpus.documents_from_features(selected)
        self.assertEqual(len(docs), len(table))
        rows = zip(table.X[:, 0], corpus.Y, corpus.metas[:, 0], docs)
        for x_value, y_value, meta_value, doc in rows:
            x_text = corpus.domain.attributes[0].str_val(x_value)
            y_text = corpus.domain.class_var.str_val(y_value)
            meta_text = corpus.domain.metas[0].str_val(meta_value)
            self.assertIn(y_text, doc)
            self.assertIn(x_text, doc)
            self.assertIn(meta_text, doc)

        # Documents assembled from the sparse X column alone.
        docs = corpus.documents_from_features([table.domain.attributes[0]])
        self.assertEqual(len(docs), len(table))
        for x_value, doc in zip(table.X[:, 0], docs):
            x_text = corpus.domain.attributes[0].str_val(x_value)
            self.assertIn(x_text, doc)
Esempio n. 12
0
 def test_corpus_from_init(self):
     """Direct construction emits a FutureWarning but preserves the data."""
     loaded = Corpus.from_file('book-excerpts')
     with self.assertWarns(FutureWarning):
         rebuilt = Corpus(loaded.domain, loaded.X, loaded.Y,
                          loaded.metas, loaded.W, loaded.text_features)
     np.testing.assert_array_equal(loaded.X, rebuilt.X)
     np.testing.assert_array_equal(loaded.metas, rebuilt.metas)
     self.assertEqual(loaded.documents, rebuilt.documents)
Esempio n. 13
0
    def test_corpus_from_file(self):
        """The bookexcerpts text file loads as 140 documents with two metas."""
        source = os.path.join(DATASET_PATH, 'bookexcerpts.txt')
        corpus = Corpus.from_file(source)
        self.assertEqual(len(corpus), 140)

        domain = corpus.domain
        self.assertEqual(len(domain), 0)
        self.assertEqual(len(domain.metas), 2)
        self.assertEqual(corpus.metas.shape, (140, 2))
Esempio n. 14
0
    def test_corpus_from_file_just_text(self):
        """The deerwester .tab file loads as 9 documents with a single meta."""
        source = os.path.join(DATASET_PATH, 'deerwester.tab')
        corpus = Corpus.from_file(source)

        self.assertEqual(len(corpus), 9)
        domain = corpus.domain
        self.assertEqual(len(domain), 0)
        self.assertEqual(len(domain.metas), 1)
        self.assertEqual(corpus.metas.shape, (9, 1))
Esempio n. 15
0
    def open_file(self, path=None, data=None):
        """Load a corpus from ``data`` or ``path`` and refresh the attribute models.

        ``data`` takes precedence over ``path``; when both are falsy the
        method only resets the context, errors and models and returns.
        A failure while reading ``path`` is reported via ``Error.read_file``.
        """
        self.closeContext()
        self.Error.clear()
        self.unused_attrs_model[:] = []
        self.used_attrs_model[:] = []
        if data:
            self.corpus = Corpus.from_table(data.domain, data)
        elif path:
            try:
                self.corpus = Corpus.from_file(path)
                base_name = os.path.basename(path)
                self.corpus.name = os.path.splitext(base_name)[0]
            except BaseException as err:
                # NOTE(review): execution continues below with whatever
                # ``self.corpus`` held before the failed read - confirm
                # that this fall-through is intended.
                self.Error.read_file(path, str(err))
        else:
            return

        self.update_info()
        self.used_attrs = list(self.corpus.text_features)
        if not self.corpus.text_features:
            # Nothing usable as text: report it and clear the output.
            self.Error.corpus_without_text_features()
            self.Outputs.corpus.send(None)
            return
        self.openContext(self.corpus)
        self.used_attrs_model.extend(self.used_attrs)
        # String metas that are not already used become the unused ones.
        leftover = [
            meta for meta in self.corpus.domain.metas
            if meta.is_string and meta not in self.used_attrs_model
        ]
        self.unused_attrs_model.extend(leftover)
Esempio n. 16
0
 def test_preprocess(self):
     """Running a preprocessor with a POS tagger populates ``pos_tags``."""
     pr = preprocess.Preprocessor(
         # Raw string: '\w' in a plain literal is an invalid escape sequence.
         tokenizer=preprocess.RegexpTokenizer(r'\w+'),
         pos_tagger=tag.AveragedPerceptronTagger())
     corpus = Corpus.from_file('deerwester')
     pr(corpus, inplace=True)
     self.assertIsNotNone(corpus.pos_tags)
Esempio n. 17
0
    def test_titles(self):
        """Check titles before and after a title variable is set."""
        corpus = Corpus.from_file('book-excerpts')

        # Without a title variable every title contains 'Document '.
        titles = corpus.titles
        self.assertEqual(len(titles), len(corpus))
        for title in titles:
            self.assertIn('Document ', title)

        # Use the first domain variable as the title.
        corpus.set_title_variable(corpus.domain[0])
        titles = corpus.titles
        self.assertEqual(len(titles), len(corpus))

        # Documents 0-49: children numbered 1..50.
        for number, title in enumerate(titles[:50], start=1):
            self.assertEqual(f"children ({number})", title)

        # Documents 50-99: adults numbered 1..50.
        for number, title in enumerate(titles[50:100], start=1):
            self.assertEqual(f"adult ({number})", title)

        # Documents 100-119: children numbered 51..70.
        for number, title in enumerate(titles[100:120], start=51):
            self.assertEqual(f"children ({number})", title)

        # Documents 120-139: adults numbered 51..70.
        for number, title in enumerate(titles[120:140], start=51):
            self.assertEqual(f"adult ({number})", title)
Esempio n. 18
0
    def test_corpus_not_eq(self):
        """Changing any single component of a corpus breaks equality."""
        base = Corpus.from_file('book-excerpts')
        n_rows = base.X.shape[0]

        # No text features.
        other = Corpus(base.domain, base.X, base.Y, base.metas, base.W, [])
        self.assertNotEqual(base, other)

        # Different X.
        other = Corpus(base.domain, np.ones((n_rows, 1)), base.Y,
                       base.metas, base.W, base.text_features)
        self.assertNotEqual(base, other)

        # Different Y.
        other = Corpus(base.domain, base.X, np.ones((n_rows, 1)),
                       base.metas, base.W, base.text_features)
        self.assertNotEqual(base, other)

        # One meta value blanked out.
        altered_metas = np.copy(base.metas)
        altered_metas[0, 0] = ''
        other = Corpus(base.domain, base.X, base.Y, altered_metas, base.W,
                       base.text_features)
        self.assertNotEqual(base, other)

        # Different meta variable in the domain.
        replacement_metas = [StringVariable('text2')]
        altered_domain = Domain(base.domain.attributes,
                                base.domain.class_var,
                                replacement_metas)
        other = Corpus(altered_domain, base.X, base.Y, base.metas, base.W,
                       replacement_metas)
        self.assertNotEqual(base, other)

        # Different n-gram range on an otherwise identical copy.
        other = base.copy()
        other.ngram_range = (2, 4)
        self.assertNotEqual(base, other)
Esempio n. 19
0
    def test_corpus_not_eq(self):
        """A corpus differs from variants with one altered constructor input."""
        ref = Corpus.from_file('book-excerpts')
        doc_count = ref.X.shape[0]

        # Variant 1: empty list of text features.
        variant = Corpus(ref.domain, ref.X, ref.Y, ref.metas, ref.W, [])
        self.assertNotEqual(ref, variant)

        # Variant 2: X replaced by a column of ones.
        variant = Corpus(
            ref.domain, np.ones((doc_count, 1)), ref.Y, ref.metas, ref.W,
            ref.text_features)
        self.assertNotEqual(ref, variant)

        # Variant 3: Y replaced by a column of ones.
        variant = Corpus(
            ref.domain, ref.X, np.ones((doc_count, 1)), ref.metas, ref.W,
            ref.text_features)
        self.assertNotEqual(ref, variant)

        # Variant 4: first meta cell emptied.
        metas_copy = np.copy(ref.metas)
        metas_copy[0, 0] = ''
        variant = Corpus(
            ref.domain, ref.X, ref.Y, metas_copy, ref.W, ref.text_features)
        self.assertNotEqual(ref, variant)

        # Variant 5: domain with a different meta variable.
        other_meta = [StringVariable('text2')]
        other_domain = Domain(
            ref.domain.attributes, ref.domain.class_var, other_meta)
        variant = Corpus(
            other_domain, ref.X, ref.Y, ref.metas, ref.W, other_meta)
        self.assertNotEqual(ref, variant)

        # Variant 6: copy with a changed n-gram range.
        variant = ref.copy()
        variant.ngram_range = (2, 4)
        self.assertNotEqual(ref, variant)
Esempio n. 20
0
    def open_file(self, path=None, data=None):
        """Set ``self.corpus`` from ``data`` or ``path`` and rebuild both models.

        ``data`` is preferred over ``path``.  With neither given, only the
        context, the errors and both models are reset.  Read failures are
        shown through ``Error.read_file``.
        """
        self.closeContext()
        self.Error.clear()
        self.unused_attrs_model[:] = []
        self.used_attrs_model[:] = []
        if data:
            self.corpus = Corpus.from_table(data.domain, data)
        elif path:
            try:
                self.corpus = Corpus.from_file(path)
                stem, _ = os.path.splitext(os.path.basename(path))
                self.corpus.name = stem
            except BaseException as err:
                # NOTE(review): the code below still runs after a failed
                # read, using the previous ``self.corpus`` - confirm intent.
                self.Error.read_file(path, str(err))
        else:
            return

        self.update_info()
        self.used_attrs = list(self.corpus.text_features)
        if not self.corpus.text_features:
            # No text features: raise the widget error and send no corpus.
            self.Error.corpus_without_text_features()
            self.Outputs.corpus.send(None)
            return
        self.openContext(self.corpus)
        self.used_attrs_model.extend(self.used_attrs)
        # Collect string metas that are not in the used model.
        remaining = []
        for meta in self.corpus.domain.metas:
            if meta.is_string and meta not in self.used_attrs_model:
                remaining.append(meta)
        self.unused_attrs_model.extend(remaining)
Esempio n. 21
0
    def test_extend_attributes(self):
        """
        Extending attributes must yield unique variable names and must leave
        the original corpus without attributes.
        """
        # Start from a corpus that has no features.
        original = Corpus.from_file('book-excerpts')
        n_rows = len(original)
        X = np.random.random((n_rows, 3))
        extended = original.extend_attributes(X, ['1', '2', '3'])
        self.assertEqual(extended.X.shape, (n_rows, 3))

        # Extend a corpus that already has features; clashing names get a
        # numeric suffix.
        extended = extended.extend_attributes(X, ['1', '2', '4'])
        self.assertEqual(extended.X.shape, (n_rows, 6))
        names = [var.name for var in extended.domain.attributes]
        self.assertListEqual(names,
                             ['1', '2', '3', '1 (1)', '2 (1)', '4'])
        self.assertEqual(0, len(original.domain.attributes))

        # Extend with a sparse matrix.
        extended = extended.extend_attributes(csr_matrix(X), ['1', '2', '3'])
        self.assertEqual(extended.X.shape, (n_rows, 9))
        self.assertTrue(issparse(extended.X))
        names = [var.name for var in extended.domain.attributes]
        self.assertListEqual(
            names,
            ['1', '2', '3', '1 (1)', '2 (1)', '4', '1 (2)', '2 (2)', '3 (1)'])
        self.assertEqual(0, len(original.domain.attributes))
Esempio n. 22
0
    def test_transform(self):
        """Transforming 'deerwester' returns a Corpus with 43 domain variables."""
        vectorizer = BowVectorizer()
        documents = Corpus.from_file('deerwester')

        bow = vectorizer.transform(documents)
        self.assertIsInstance(bow, Corpus)
        n_variables = len(bow.domain.variables)
        self.assertEqual(n_variables, 43)
Esempio n. 23
0
    def test_args(self):
        """Check that local/global weighting and normalisation are honoured.

        A constant global weight named ``'const'`` is registered so that
        the vectorizer output depends only on the local weighting and the
        normalisation being tested.
        """
        corpus = Corpus.from_file('deerwester')

        BowVectorizer.wglobals['const'] = lambda df, N: 1
        # wglobals lives on the class; remove the entry again so that the
        # registration does not leak into other tests.
        self.addCleanup(BowVectorizer.wglobals.pop, 'const', None)

        # Counts with a constant global weight equal plain counts.
        vect = BowVectorizer(norm=BowVectorizer.NONE,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')

        self.assertEqualCorpus(
            vect.transform(corpus),
            BowVectorizer(wlocal=BowVectorizer.COUNT).transform(corpus))

        # Binary weights with a constant global weight equal plain binary.
        vect = BowVectorizer(norm=BowVectorizer.NONE,
                             wlocal=BowVectorizer.BINARY,
                             wglobal='const')
        self.assertEqualCorpus(
            vect.transform(corpus),
            BowVectorizer(wlocal=BowVectorizer.BINARY).transform(corpus))

        # With L1 normalisation every row must sum to one.
        vect = BowVectorizer(norm=BowVectorizer.L1,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')
        x = vect.transform(corpus).X
        self.assertAlmostEqual(abs(x.sum(axis=1) - 1).sum(), 0)
Esempio n. 24
0
    def test_transform(self):
        """The transformed 'deerwester' corpus has a domain of length 43."""
        vectorizer = BowVectorizer()
        source = Corpus.from_file('deerwester')

        transformed = vectorizer.transform(source)
        self.assertIsInstance(transformed, Corpus)
        domain_size = len(transformed.domain)
        self.assertEqual(domain_size, 43)
Esempio n. 25
0
    def test_pp_documents(self):
        """``pp_documents`` changes after preprocessing; ``documents`` does not."""
        raw = Corpus.from_file('book-excerpts')
        # Before any preprocessing both views are identical.
        self.assertEqual(raw.documents, raw.pp_documents)

        processed = preprocess.BASE_TRANSFORMER(raw)
        self.assertEqual(raw.documents, processed.documents)
        self.assertNotEqual(raw.pp_documents, processed.pp_documents)
Esempio n. 26
0
    def test_corpus_from_file(self):
        """``from_file`` leaves ``dataset_dirs`` intact and loads valid corpora."""
        dirs_snapshot = dataset_dirs.copy()
        books = Corpus.from_file('book-excerpts')
        # from_file changes dataset_dirs temporarily; afterwards the list
        # must be the same as before the call.
        self.assertListEqual(dirs_snapshot, dataset_dirs)

        self.assertEqual(len(books), 140)
        self.assertEqual(len(books.domain.variables), 1)
        self.assertEqual(len(books.domain.metas), 1)
        self.assertEqual(books.metas.shape, (140, 1))

        deerwester = Corpus.from_file('deerwester')
        self.assertEqual(len(deerwester), 9)
        self.assertEqual(len(deerwester.domain.variables), 1)
        self.assertEqual(len(deerwester.domain.metas), 1)
        self.assertEqual(deerwester.metas.shape, (9, 1))
Esempio n. 27
0
    def test_documents(self):
        """There is one document per corpus row and every document is a ``str``."""
        corpus = Corpus.from_file('book-excerpts')
        documents = corpus.documents
        seen_types = {type(document) for document in documents}

        self.assertEqual(len(documents), len(corpus))
        self.assertEqual(len(seen_types), 1)
        self.assertIn(str, seen_types)
Esempio n. 28
0
 def test_reset_pos_tags(self):
     """A tagged corpus has POS tags; a freshly tokenized one has none."""
     source = Corpus.from_file('deerwester')
     pos_tagger = tag.AveragedPerceptronTagger()
     with_tags = pos_tagger(source)
     self.assertTrue(len(with_tags.pos_tags))
     regexp_tokenizer = preprocess.RegexpTokenizer(pattern=r'\w')
     retokenized = regexp_tokenizer(source)
     self.assertFalse(retokenized.pos_tags)
Esempio n. 29
0
    def test_languages(self):
        """Languages are unset until detected, then 'en' for every document."""
        documents = Corpus.from_file('deerwester')

        self.assertIsNone(documents.languages)
        documents.detect_languages()
        self.assertEqual(len(documents.languages), len(documents))
        all_english = ['en'] * len(documents)
        self.assertListEqual(documents.languages, all_english)
Esempio n. 30
0
    def test_titles_read_document(self):
        """
        A corpus read from 'election-tweets-2016' must expose one title per
        document.
        """
        tweets = Corpus.from_file('election-tweets-2016')

        n_titles = len(tweets.titles)
        self.assertEqual(len(tweets), n_titles)
Esempio n. 31
0
    def test_corpus_from_file_just_text(self):
        """The deerwester text file loads as 9 documents with a single meta."""
        text_file = os.path.join(DATASET_PATH, 'deerwester.txt')
        corpus = Corpus.from_file(text_file)

        self.assertEqual(len(corpus), 9)

        domain = corpus.domain
        self.assertEqual(len(domain), 0)
        self.assertEqual(len(domain.metas), 1)
        self.assertEqual(corpus.metas.shape, (9, 1))
Esempio n. 32
0
    def test_documents(self):
        """Every document of 'book-excerpts' is a string, one per corpus row."""
        books = Corpus.from_file('book-excerpts')
        texts = books.documents
        kinds = set()
        for text in texts:
            kinds.add(type(text))

        self.assertEqual(len(texts), len(books))
        self.assertEqual(len(kinds), 1)
        self.assertIn(str, kinds)
Esempio n. 33
0
 def test_empty_corpus(self):
     """
     Vectorizing a corpus with no documents returns an equal corpus.
     GH-247
     """
     empty = Corpus.from_file("deerwester")[:0]
     vectorizer = BowVectorizer(norm=BowVectorizer.L1)
     transformed = vectorizer.transform(empty)
     self.assertEqual(transformed, empty)
Esempio n. 34
0
    def test_titles_no_numbers(self):
        """
        The first title of 'andersen' is expected without a numeric suffix.
        """
        tales = Corpus.from_file('andersen')
        title_variable = tales.domain.metas[0]
        tales.set_title_variable(title_variable)

        # Title variable is set; compare against the plain title text.
        first_title = tales.titles[0]
        self.assertEqual("The Little Match-Seller", first_title)
Esempio n. 35
0
 def test_POSTagger(self):
     """Tagging a corpus yields one POS tag per token in every document."""
     source = Corpus.from_file('deerwester')
     pos_tagger = tag.AveragedPerceptronTagger()
     tagged = pos_tagger.tag_corpus(source)
     self.assertTrue(hasattr(tagged, 'pos_tags'))
     # Token and tag sequences must line up document by document.
     for doc_tokens, doc_tags in zip(tagged.tokens, tagged.pos_tags):
         self.assertEqual(len(doc_tokens), len(doc_tags))
Esempio n. 36
0
    def test_compute_values(self):
        """Converting the raw corpus into the BoW domain reproduces the BoW."""
        raw = Corpus.from_file('deerwester')
        vectorizer = BowVectorizer()

        vectorised = vectorizer.transform(raw)
        recomputed = Corpus.from_table(vectorised.domain, raw)

        self.assertEqual(vectorised.domain, recomputed.domain)
        # Count of entries in which the two matrices differ.
        n_different = (vectorised.X != recomputed.X).nnz
        self.assertEqual(n_different, 0)
Esempio n. 37
0
    def setUp(self):
        """Create a string-reversing transformer and load the test corpus."""
        class ReverseStringTransformer(preprocess.BaseTransformer):
            # Transformer whose _preprocess returns the input reversed.
            name = 'reverse'

            def _preprocess(self, string):
                reversed_string = string[::-1]
                return reversed_string

        self.transformer = ReverseStringTransformer()
        self.corpus = Corpus.from_file("deerwester")
Esempio n. 38
0
    def test_compute_values(self):
        """``Corpus.from_table`` with a BoW domain matches the vectorizer output."""
        documents = Corpus.from_file('deerwester')
        bow_vectorizer = BowVectorizer()

        expected = bow_vectorizer.transform(documents)
        actual = Corpus.from_table(expected.domain, documents)

        self.assertEqual(expected.domain, actual.domain)
        # The element-wise comparison must contain no non-zero entries.
        mismatches = expected.X != actual.X
        self.assertEqual(mismatches.nnz, 0)
Esempio n. 39
0
 def test_empty_corpus(self):
     """
     A zero-length slice of the corpus passes through the vectorizer
     unchanged.
     GH-247
     """
     no_documents = Corpus.from_file("deerwester")[:0]
     l1_vectorizer = BowVectorizer(norm=BowVectorizer.L1)
     result = l1_vectorizer.transform(no_documents)
     self.assertEqual(result, no_documents)
Esempio n. 40
0
 def test_POSTagger(self):
     """``tag_corpus`` adds ``pos_tags`` whose lengths match the tokens."""
     documents = Corpus.from_file('deerwester')
     perceptron = tag.AveragedPerceptronTagger()
     output = perceptron.tag_corpus(documents)
     self.assertTrue(hasattr(output, 'pos_tags'))
     # Compare the lengths pairwise, one document at a time.
     pairs = zip(output.tokens, output.pos_tags)
     for token_list, tag_list in pairs:
         self.assertEqual(len(token_list), len(tag_list))
    def test_create_bow(self):
        """Check the shape of the TF-IDF bag of words and the callback counters."""
        documents = Corpus.from_file('deerwester')
        bow = self.bow(documents, use_tfidf=True)

        self.assertIsNotNone(bow.X)
        n_rows = bow.X.shape[0]
        n_columns = bow.X.shape[1]
        self.assertEqual(9, n_rows)
        self.assertEqual(42, n_columns)
        self.assertEqual(self.progress_callbacks, 4)
        self.assertEqual(self.error_callbacks, 0)
Esempio n. 42
0
    def test_init_preserve_shape_of_empty_x(self):
        """An all-zero sparse ``X`` keeps its shape when passed to ``Corpus``."""
        source = Corpus.from_file('book-excerpts')
        source_domain = source.domain
        single_attr = (ContinuousVariable('c1'),)
        domain = Domain(single_attr, source_domain.class_vars,
                        source_domain.metas)

        # Sparse matrix with one column and no stored values.
        zero_X = csr_matrix((len(source), 1))
        built = Corpus(domain, X=zero_X, Y=source.Y, metas=source.metas)

        self.assertEqual(zero_X.nnz, 0)
        self.assertEqual(built.X.shape, zero_X.shape)
Esempio n. 43
0
    def test_compute_values_to_different_domain(self):
        """The domain produced by ``self.method`` can be applied to 'andersen'."""
        target = Corpus.from_file('andersen')

        # Neither corpus has attributes before the method is applied.
        self.assertFalse(self.corpus.domain.attributes)
        self.assertFalse(target.domain.attributes)

        scored = self.method.transform(self.corpus)
        converted = target.transform(scored.domain)

        self.assertTrue(scored.domain.attributes)
        self.assertEqual(scored.domain.attributes,
                         converted.domain.attributes)
Esempio n. 44
0
 def test_ngrams(self):
     """With an n-gram range of (1, 3) uni-, bi- and trigrams become features."""
     vect = BowVectorizer()
     corpus = Corpus.from_file('deerwester')
     # Raw string: '\w' in a plain literal is an invalid escape sequence.
     pr = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                                  ngrams_range=(1, 3))
     pr(corpus, inplace=True)
     result = vect.transform(corpus)
     attrs = [attr.name for attr in result.domain.attributes]
     # Single token, then the first two and the first three tokens joined.
     self.assertIn(corpus.tokens[0][1], attrs)
     self.assertIn(' '.join(corpus.tokens[0][:2]), attrs)
     self.assertIn(' '.join(corpus.tokens[0][:3]), attrs)
Esempio n. 45
0
    def test_documents_from_features(self):
        """Documents built from the class variable contain exactly one class value."""
        corpus = Corpus.from_file('book-excerpts')
        documents = corpus.documents_from_features([corpus.domain.class_var])
        seen_types = {type(document) for document in documents}

        class_values = corpus.domain.class_var.values
        # For each document, count how many class values occur in it.
        has_single_class = [
            sum(value in document for value in class_values) == 1
            for document in documents
        ]
        self.assertTrue(all(has_single_class))
        self.assertEqual(len(documents), len(corpus))
        self.assertEqual(len(seen_types), 1)
        self.assertIn(str, seen_types)
Esempio n. 46
0
 def test_corpus_remove_text_features(self):
     """
     Text features without a matching metas column must be dropped.
     GH-324
     GH-325
     """
     source = Corpus.from_file('deerwester')
     # New domain built without any metas.
     no_metas = Domain(attributes=source.domain.attributes,
                       class_vars=source.domain.class_vars)
     transformed = source.transform(no_metas)
     self.assertFalse(len(transformed.text_features))
     # Copying the transformed corpus must not raise.
     transformed.copy()
Esempio n. 47
0
 def test_corpus_not_eq(self):
     """Altering any single constructor argument yields an unequal corpus."""
     base = Corpus.from_file('bookexcerpts')

     # Last document dropped.
     other = Corpus(base.documents[:-1], base.X, base.Y, base.metas,
                    base.domain)
     self.assertNotEqual(base, other)

     # X stacked on top of itself.
     doubled_X = np.vstack((base.X, base.X))
     other = Corpus(base.documents, doubled_X, base.Y, base.metas,
                    base.domain)
     self.assertNotEqual(base, other)

     # Y stacked on top of itself.
     doubled_Y = np.vstack((base.Y, base.Y))
     other = Corpus(base.documents, base.X, doubled_Y, base.metas,
                    base.domain)
     self.assertNotEqual(base, other)

     # Metas transposed.
     other = Corpus(base.documents, base.X, base.Y, base.metas.T,
                    base.domain)
     self.assertNotEqual(base, other)

     # Domain with a different meta variable.
     altered_domain = Domain(base.domain.attributes, base.domain.class_var,
                             [StringVariable('text2')])
     other = Corpus(base.documents, base.X, base.Y, base.metas,
                    altered_domain)
     self.assertNotEqual(base, other)
Esempio n. 48
0
    def test_extend_corpus(self):
        """Extending a corpus with itself doubles the rows and adds a class value."""
        corpus = Corpus.from_file('book-excerpts')
        classes_before = len(corpus.domain.class_var.values)
        snapshot = corpus.copy()
        # Class labels of the existing rows, with the first one replaced by
        # a value that is not among them.
        labels = [corpus.domain.class_var.values[int(index)]
                  for index in corpus.Y]
        labels[0] = 'teenager'
        corpus.extend_corpus(corpus.metas, labels)

        self.assertEqual(len(corpus), len(snapshot)*2)
        self.assertEqual(corpus.Y.shape[0], snapshot.Y.shape[0]*2)
        self.assertEqual(corpus.metas.shape[0], snapshot.metas.shape[0]*2)
        self.assertEqual(corpus.metas.shape[1], snapshot.metas.shape[1])
        self.assertEqual(len(snapshot.domain.class_var.values),
                         classes_before+1)
Esempio n. 49
0
    def test_copy(self):
        """A copy is a distinct object that equals the original's current state."""
        corpus = Corpus.from_file('deerwester')

        # Raw string: '\w' and '\s' in a plain literal are invalid escape
        # sequences; the pattern text itself is unchanged.
        p = preprocess.Preprocessor(
            tokenizer=preprocess.RegexpTokenizer(r'\w+\s}'))
        # Preprocess only the copy: the two corpora must then differ.
        copied = corpus.copy()
        p(copied, inplace=True)
        self.assertIsNot(copied, corpus)
        self.assertNotEqual(copied, corpus)

        # Preprocess the original, then copy: the two must be equal.
        p(corpus, inplace=True)
        copied = corpus.copy()
        self.assertIsNot(copied, corpus)
        self.assertEqual(copied, corpus)
Esempio n. 50
0
    def test_domain(self):
        """Feature names are sorted and weighted only for tokens in the document."""
        vectorizer = BowVectorizer()
        corpus = Corpus.from_file('deerwester')

        bow = vectorizer.transform(corpus)
        names = [variable.name for variable in bow.domain.attributes]
        self.assertEqual(names, sorted(names))

        dense = bow.X.toarray()
        for doc_index in range(len(corpus)):
            # Names of the features with a weight above the threshold.
            present = [name for weight, name in zip(dense[doc_index], names)
                       if weight > .001]
            for name in present:
                self.assertIn(name, corpus.tokens[doc_index])
Esempio n. 51
0
    def test_extend_corpus(self):
        """Appending the corpus to itself doubles it and registers a new class."""
        dataset = os.path.join(DATASET_PATH, 'bookexcerpts.tab')
        corpus = Corpus.from_file(dataset)
        initial_class_count = len(corpus.domain.class_var.values)
        before = corpus.copy()
        # Existing labels, with the first replaced by a previously unseen one.
        labels = [corpus.domain.class_var.values[int(index)]
                  for index in corpus.Y]
        labels[0] = 'teenager'
        corpus.extend_corpus(corpus.documents, corpus.metas, labels)

        self.assertEqual(len(corpus), len(before)*2)
        self.assertEqual(corpus.Y.shape[0], before.Y.shape[0]*2)
        self.assertEqual(corpus.metas.shape[0], before.metas.shape[0]*2)
        self.assertEqual(corpus.metas.shape[1], before.metas.shape[1])
        self.assertEqual(len(before.domain.class_var.values),
                         initial_class_count+1)
Esempio n. 52
0
 def open_file(self, path):
     """Read the corpus at ``path`` and fill the used/unused attribute lists.

     Both lists and the read error are cleared first.  A falsy ``path``
     stops there; any failure while loading is reported through
     ``Error.read_file``.
     """
     self.Error.read_file.clear()
     self.used_attrs[:] = []
     self.unused_attrs[:] = []
     if not path:
         return
     try:
         self.corpus = Corpus.from_file(path)
         file_name = os.path.basename(path)
         self.corpus.name = os.path.splitext(file_name)[0]
         summary = "Corpus of {} documents.".format(len(self.corpus))
         self.info_label.setText(summary)
         self.used_attrs.extend(self.corpus.text_features)
         # String metas that are not text features go to the unused list.
         string_metas = [
             meta for meta in self.corpus.domain.metas
             if meta.is_string and meta not in self.corpus.text_features
         ]
         self.unused_attrs.extend(string_metas)
     except BaseException as err:
         self.Error.read_file(path, str(err))
    def test_variable_attributes(self):
        """Verify that ``var_attrs`` are attached to every added feature.

        Features are added to two freshly loaded copies of the same corpus;
        only the second call passes ``var_attrs``.  The feature variables of
        the second corpus must then carry exactly one more attribute than
        those of the first, and that attribute is ``'foo'``.
        """
        c1 = Corpus.from_file('deerwester')
        c2 = Corpus.from_file('deerwester')
        # One row per document, each row holding the values 0..3.
        X = np.array([list(range(4)) for _ in range(len(c1))])
        X = sp.csr_matrix(X)

        # Maps a column index of X to the name of the feature it becomes.
        dictionary = {
            0: 'd',
            1: 'c',
            2: 'b',
            3: 'a',
        }

        # The return values are not used; the domains of c1 and c2 are
        # inspected directly afterwards.
        BaseVectorizer.add_features(c1, X, dictionary,
                                    compute_values=None, var_attrs=None)
        BaseVectorizer.add_features(c2, X, dictionary,
                                    compute_values=None, var_attrs={'foo': 1})

        n_attrs_before = len(c1.domain.attributes[0].attributes)
        n_attrs_after = len(c2.domain.attributes[0].attributes)
        # assertEqual, not assertTrue: assertTrue(x, 1) would treat the 1 as
        # the failure message and pass for any non-zero difference.
        self.assertEqual(n_attrs_after - n_attrs_before, 1)

        for a in c2.domain.attributes:
            self.assertIn('foo', a.attributes)
Esempio n. 54
0
    def test_extend_attributes(self):
        """Each ``extend_attributes`` call on the corpus adds three columns to ``X``."""
        # Start from a corpus that has no features.
        corpus = Corpus.from_file('book-excerpts')
        n_rows = len(corpus)
        feature_names = ['1', '2', '3']
        X = np.random.random((n_rows, 3))
        corpus.extend_attributes(X, feature_names)
        self.assertEqual(corpus.X.shape, (len(corpus), 3))

        # Extend a corpus that already has features.
        corpus.extend_attributes(X, ['1', '2', '3'])
        self.assertEqual(corpus.X.shape, (len(corpus), 6))

        # Extend with a sparse matrix; X must end up sparse.
        sparse_X = csr_matrix(X)
        corpus.extend_attributes(sparse_X, ['1', '2', '3'])
        self.assertEqual(corpus.X.shape, (len(corpus), 9))
        self.assertTrue(issparse(corpus.X))
Esempio n. 55
0
    def test_asserting_errors(self):
        """Invalid constructor input and invalid text features raise errors."""
        corpus = Corpus.from_file('book-excerpts')

        # A float in place of the first positional argument.
        with self.assertRaises(TypeError):
            Corpus(1.0, corpus.Y, corpus.metas, corpus.domain,
                   corpus.text_features)

        # An X with twice as many rows as the remaining arguments.
        doubled_X = np.vstack((corpus.X, corpus.X))
        with self.assertRaises(ValueError):
            Corpus(corpus.domain, doubled_X, corpus.Y, corpus.metas,
                   corpus.W, corpus.text_features)

        # A text feature that is not taken from the corpus domain.
        with self.assertRaises(ValueError):
            corpus.set_text_features([StringVariable('foobar')])

        # The same meta listed twice.
        first_meta = corpus.domain.metas[0]
        with self.assertRaises(ValueError):
            corpus.set_text_features([first_meta, first_meta])
Esempio n. 56
0
    def open_file(self, path):
        """Load the corpus at ``path``, split its metas and send it on.

        Metas that are text features go to ``used_attrs``, all others to
        ``unused_attrs``.  Any failure is shown as widget error 1.
        """
        self.error(1, '')
        self.used_attrs[:] = []
        self.unused_attrs[:] = []

        try:
            self.corpus = Corpus.from_file(path)
            for meta in self.corpus.domain.metas:
                is_text = meta in self.corpus.text_features
                bucket = self.used_attrs if is_text else self.unused_attrs
                bucket.append(meta)
            summary = "Corpus of {} documents.".format(len(self.corpus))
            self.info_label.setText(summary)
            self.send(Output.CORPUS, self.corpus)
        except BaseException as err:
            self.error(1, str(err))
Esempio n. 57
0
    def open_file(self, path):
        """Load the corpus at ``path`` and fill the used/unused attribute lists.

        Text features become the used attributes; every other variable or
        meta of the domain becomes an unused one.  Any failure is shown as
        widget error 1.
        """
        self.error(1, "")
        self.used_attrs[:] = []
        self.unused_attrs[:] = []

        try:
            self.corpus = Corpus.from_file(path)
            summary = "Corpus of {} documents.".format(len(self.corpus))
            self.info_label.setText(summary)
            self.used_attrs.extend(self.corpus.text_features)
            # Walk the variables first, then the metas, keeping non-text ones.
            non_text = []
            candidates = chain(self.corpus.domain.variables,
                               self.corpus.domain.metas)
            for feature in candidates:
                if feature not in self.corpus.text_features:
                    non_text.append(feature)
            self.unused_attrs.extend(non_text)
        except BaseException as err:
            self.error(1, str(err))
Esempio n. 58
0
    def test_ngrams_iter(self):
        """Check n-gram iteration for several ranges, joiners and POS tags."""
        c = Corpus.from_file('deerwester')
        c.ngram_range = (1, 1)
        # Unigrams: lower-cased, whitespace-split documents.
        self.assertEqual(list(c.ngrams), [doc.lower().split() for doc in c.documents])
        expected = [[(token.lower(), ) for token in doc.split()] for doc in c.documents]
        self.assertEqual(list(c.ngrams_iterator(join_with=None)), expected)
        c.ngram_range = (2, 3)

        expected_ngrams = [('machine', 'interface'), ('for', 'lab'),
                           ('machine', 'interface', 'for'), ('abc', 'computer', 'applications')]

        # Materialise the first document's n-grams once instead of
        # re-running the iterator for every expected n-gram.
        first_doc_tuples = list(c.ngrams_iterator(join_with=None))[0]
        first_doc_joined = list(c.ngrams_iterator(join_with='-'))[0]
        for ngram in expected_ngrams:
            self.assertIn(ngram, first_doc_tuples)
            self.assertIn('-'.join(ngram), first_doc_joined)

        self.pos_tagger.tag_corpus(c)
        c.ngram_range = (1, 1)
        for doc in c.ngrams_iterator(join_with='_', include_postags=True):
            for token in doc:
                # assertRegex replaces the assertRegexpMatches alias, which
                # was removed in Python 3.12; the pattern is a raw string to
                # avoid the invalid '\w' escape.
                self.assertRegex(token, r'\w+_[A-Z]+')