def test_variable_attributes(self):
    """var_attrs passed to add_features must be attached to every newly
    created variable, and must add exactly one extra attribute entry."""
    c1 = Corpus.from_file('deerwester')
    c2 = Corpus.from_file('deerwester')
    X = np.array([list(range(4)) for _ in range(len(c1))])
    X = sp.csr_matrix(X)
    dictionary = {0: 'd', 1: 'c', 2: 'b', 3: 'a'}
    c1 = BaseVectorizer.add_features(c1, X, dictionary,
                                     compute_values=None, var_attrs=None)
    c2 = BaseVectorizer.add_features(c2, X, dictionary,
                                     compute_values=None,
                                     var_attrs={'foo': 1})
    n_attrs_before = len(c1.domain.attributes[0].attributes)
    n_attrs_after = len(c2.domain.attributes[0].attributes)
    # FIX: assertTrue(x, 1) treated 1 as the failure *message*, so the
    # intended "difference equals 1" check was never performed.
    self.assertEqual(n_attrs_after - n_attrs_before, 1)
    for a in c2.domain.attributes:
        self.assertIn('foo', a.attributes)
def test_corpus_from_file_abs_path(self):
    """Loading by absolute path must yield the same corpus as by name."""
    by_name = Corpus.from_file('book-excerpts')
    here = os.path.dirname(__file__)
    abs_file = os.path.abspath(
        os.path.join(here, '..', 'datasets', 'book-excerpts.tab'))
    by_path = Corpus.from_file(abs_file)
    self.assertEqual(by_name, by_path)
def test_corpus_from_file_abs_path(self):
    """An absolute dataset path loads the same data as the bare name."""
    named = Corpus.from_file('book-excerpts')
    dataset_dir = os.path.dirname(__file__)
    dataset = os.path.abspath(
        os.path.join(dataset_dir, '..', 'datasets', 'book-excerpts.tab'))
    from_path = Corpus.from_file(dataset)
    np.testing.assert_array_equal(named.X, from_path.X)
    np.testing.assert_array_equal(named.metas, from_path.metas)
    self.assertEqual(named.documents, from_path.documents)
def test_infer_text_features(self):
    """Corpora loaded from files should auto-detect their text feature."""
    for dataset, expected_name in (('friends-transcripts', 'Quote'),
                                   ('deerwester', 'Text')):
        corpus = Corpus.from_file(dataset)
        features = corpus.text_features
        self.assertEqual(len(features), 1)
        self.assertEqual(features[0].name, expected_name)
def test_compute_values_to_different_domain(self):
    """BoW compute values must survive a transform onto another corpus."""
    source = Corpus.from_file('deerwester')
    destination = Corpus.from_file('book-excerpts')
    # both corpora start without regular attributes
    self.assertFalse(source.domain.attributes)
    self.assertFalse(destination.domain.attributes)
    bow = BowVectorizer().transform(source)
    projected = destination.transform(bow.domain)
    self.assertEqual(bow.domain.attributes, projected.domain.attributes)
def test_corpus_from_file(self):
    """Bundled datasets load with the expected sizes and domain shapes."""
    for name, n_docs in (('book-excerpts', 140), ('deerwester', 9)):
        corpus = Corpus.from_file(name)
        self.assertEqual(len(corpus), n_docs)
        self.assertEqual(len(corpus.domain), 1)
        self.assertEqual(len(corpus.domain.metas), 1)
        self.assertEqual(corpus.metas.shape, (n_docs, 1))
def setUp(self):
    """Build the shared test corpus and the preprocessor pipeline list."""
    self.corpus = Corpus.from_file("deerwester")
    self.pp_list = [
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer(),
        preprocess.SnowballStemmer(),
        preprocess.NGrams(),
        tag.AveragedPerceptronTagger(),
    ]
def test_documents_from_sparse_features(self):
    """documents_from_features must also work when X is sparse."""
    table = Table.from_file('brown-selected')
    corpus = Corpus.from_file('brown-selected')
    with corpus.unlocked():
        corpus.X = csr_matrix(corpus.X)

    # documents assembled from X, Y and metas together
    docs = corpus.documents_from_features(
        [table.domain.attributes[0], table.domain.class_var,
         table.domain.metas[0]])
    self.assertEqual(len(docs), len(table))
    rows = zip(table.X[:, 0], corpus.Y, corpus.metas[:, 0], docs)
    for attr_val, class_val, meta_val, doc in rows:
        self.assertIn(corpus.domain.class_var.str_val(class_val), doc)
        self.assertIn(corpus.domain.attributes[0].str_val(attr_val), doc)
        self.assertIn(corpus.domain.metas[0].str_val(meta_val), doc)

    # documents assembled from the sparse X column alone
    docs = corpus.documents_from_features([table.domain.attributes[0]])
    self.assertEqual(len(docs), len(table))
    for attr_val, doc in zip(table.X[:, 0], docs):
        self.assertIn(corpus.domain.attributes[0].str_val(attr_val), doc)
def test_corpus_from_init(self):
    """The legacy positional Corpus(...) constructor still works but warns."""
    original = Corpus.from_file('book-excerpts')
    with self.assertWarns(FutureWarning):
        rebuilt = Corpus(original.domain, original.X, original.Y,
                         original.metas, original.W,
                         original.text_features)
    np.testing.assert_array_equal(original.X, rebuilt.X)
    np.testing.assert_array_equal(original.metas, rebuilt.metas)
    self.assertEqual(original.documents, rebuilt.documents)
def test_corpus_from_file(self):
    """A plain .txt corpus ends up with all its columns in metas."""
    corpus = Corpus.from_file(os.path.join(DATASET_PATH, 'bookexcerpts.txt'))
    self.assertEqual(len(corpus), 140)
    self.assertEqual(len(corpus.domain), 0)
    self.assertEqual(len(corpus.domain.metas), 2)
    self.assertEqual(corpus.metas.shape, (140, 2))
def test_corpus_from_file_just_text(self):
    """A text-only .tab file yields a single meta column and an empty X."""
    corpus = Corpus.from_file(os.path.join(DATASET_PATH, 'deerwester.tab'))
    self.assertEqual(len(corpus), 9)
    self.assertEqual(len(corpus.domain), 0)
    self.assertEqual(len(corpus.domain.metas), 1)
    self.assertEqual(corpus.metas.shape, (9, 1))
def open_file(self, path=None, data=None):
    """Load a corpus either from an in-memory table (*data*) or from a
    file path (*path*), then rebuild the used/unused attribute models.

    Read failures are reported via self.Error.read_file instead of
    propagating; a corpus without text features clears the output.
    """
    self.closeContext()
    self.Error.clear()
    # reset both list models in place
    self.unused_attrs_model[:] = []
    self.used_attrs_model[:] = []
    if data:
        self.corpus = Corpus.from_table(data.domain, data)
    elif path:
        try:
            self.corpus = Corpus.from_file(path)
            # name the corpus after the file (sans extension)
            self.corpus.name = os.path.splitext(os.path.basename(path))[0]
        except BaseException as err:
            # deliberately broad: surface any reader failure in the UI
            self.Error.read_file(path, str(err))
    else:
        # nothing to load
        return
    self.update_info()
    self.used_attrs = list(self.corpus.text_features)
    if not self.corpus.text_features:
        self.Error.corpus_without_text_features()
        self.Outputs.corpus.send(None)
        return
    self.openContext(self.corpus)
    self.used_attrs_model.extend(self.used_attrs)
    # remaining string metas are offered as candidate text features
    self.unused_attrs_model.extend([
        f for f in self.corpus.domain.metas
        if f.is_string and f not in self.used_attrs_model
    ])
def test_preprocess(self):
    """A Preprocessor with a POS tagger must populate corpus.pos_tags."""
    # FIX: raw string for the regex — '\w' inside a plain string literal
    # is an invalid escape sequence (SyntaxWarning on modern Python).
    pr = preprocess.Preprocessor(
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        pos_tagger=tag.AveragedPerceptronTagger())
    corpus = Corpus.from_file('deerwester')
    pr(corpus, inplace=True)
    self.assertIsNotNone(corpus.pos_tags)
def test_titles(self):
    """titles falls back to 'Document N' and honours a set title variable."""
    c = Corpus.from_file('book-excerpts')
    # no title feature set
    titles = c.titles
    self.assertEqual(len(titles), len(c))
    for title in titles:
        self.assertIn('Document ', title)

    # title feature set
    c.set_title_variable(c.domain[0])
    titles = c.titles
    self.assertEqual(len(titles), len(c))
    # FIX: loop variables were named `c`/`a`, shadowing the corpus
    # variable `c` above; renamed to `i` to avoid the shadowing.
    # first 50 are children
    for title, i in zip(titles[:50], range(1, 51)):
        self.assertEqual(f"children ({i})", title)
    # others are adults
    for title, i in zip(titles[50:100], range(1, 51)):
        self.assertEqual(f"adult ({i})", title)
    # next 20 are children again
    for title, i in zip(titles[100:120], range(51, 71)):
        self.assertEqual(f"children ({i})", title)
    # others are adults
    for title, i in zip(titles[120:140], range(51, 71)):
        self.assertEqual(f"adult ({i})", title)
def test_corpus_not_eq(self):
    """A corpus differing in any single component must compare unequal."""
    c = Corpus.from_file('book-excerpts')
    n_doc = c.X.shape[0]

    no_text_features = Corpus(c.domain, c.X, c.Y, c.metas, c.W, [])
    self.assertNotEqual(c, no_text_features)

    other_x = Corpus(c.domain, np.ones((n_doc, 1)), c.Y, c.metas, c.W,
                     c.text_features)
    self.assertNotEqual(c, other_x)

    other_y = Corpus(c.domain, c.X, np.ones((n_doc, 1)), c.metas, c.W,
                     c.text_features)
    self.assertNotEqual(c, other_y)

    broken_metas = np.copy(c.metas)
    broken_metas[0, 0] = ''
    other_metas = Corpus(c.domain, c.X, c.Y, broken_metas, c.W,
                         c.text_features)
    self.assertNotEqual(c, other_metas)

    new_meta = [StringVariable('text2')]
    broken_domain = Domain(c.domain.attributes, c.domain.class_var, new_meta)
    other_domain = Corpus(broken_domain, c.X, c.Y, c.metas, c.W, new_meta)
    self.assertNotEqual(c, other_domain)

    other_ngrams = c.copy()
    other_ngrams.ngram_range = (2, 4)
    self.assertNotEqual(c, other_ngrams)
def open_file(self, path=None, data=None):
    """Load a corpus from an in-memory table (*data*) or a file path
    (*path*) and rebuild the used/unused attribute models.

    Read failures are reported via self.Error.read_file instead of
    propagating; a corpus without text features clears the output.
    """
    self.closeContext()
    self.Error.clear()
    # reset both list models in place
    self.unused_attrs_model[:] = []
    self.used_attrs_model[:] = []
    if data:
        self.corpus = Corpus.from_table(data.domain, data)
    elif path:
        try:
            self.corpus = Corpus.from_file(path)
            # name the corpus after the file (sans extension)
            self.corpus.name = os.path.splitext(os.path.basename(path))[0]
        except BaseException as err:
            # deliberately broad: surface any reader failure in the UI
            self.Error.read_file(path, str(err))
    else:
        # nothing to load
        return
    self.update_info()
    self.used_attrs = list(self.corpus.text_features)
    if not self.corpus.text_features:
        self.Error.corpus_without_text_features()
        self.Outputs.corpus.send(None)
        return
    self.openContext(self.corpus)
    self.used_attrs_model.extend(self.used_attrs)
    # remaining string metas are offered as candidate text features
    self.unused_attrs_model.extend(
        [f for f in self.corpus.domain.metas
         if f.is_string and f not in self.used_attrs_model])
def test_extend_attributes(self):
    """
    Test correctness of extending attributes, variables must have unique
    values and must not happen inplace
    """
    # corpus without features
    corpus = Corpus.from_file('book-excerpts')
    new_cols = np.random.random((len(corpus), 3))
    extended = corpus.extend_attributes(new_cols, ['1', '2', '3'])
    self.assertEqual(extended.X.shape, (len(corpus), 3))

    # add to non empty corpus; clashing names get a " (k)" suffix
    extended = extended.extend_attributes(new_cols, ['1', '2', '4'])
    self.assertEqual(extended.X.shape, (len(corpus), 6))
    self.assertListEqual([a.name for a in extended.domain.attributes],
                         ['1', '2', '3', '1 (1)', '2 (1)', '4'])
    self.assertEqual(0, len(corpus.domain.attributes))

    # extend sparse
    extended = extended.extend_attributes(csr_matrix(new_cols),
                                          ['1', '2', '3'])
    self.assertEqual(extended.X.shape, (len(corpus), 9))
    self.assertTrue(issparse(extended.X))
    self.assertListEqual(
        [a.name for a in extended.domain.attributes],
        ['1', '2', '3', '1 (1)', '2 (1)', '4', '1 (2)', '2 (2)', '3 (1)'])
    self.assertEqual(0, len(corpus.domain.attributes))
def test_transform(self):
    """transform returns a Corpus with one variable per token type."""
    corpus = Corpus.from_file('deerwester')
    bow = BowVectorizer().transform(corpus)
    self.assertIsInstance(bow, Corpus)
    self.assertEqual(len(bow.domain.variables), 43)
def test_args(self):
    """A constant wglobal must be a no-op for both COUNT and BINARY
    local weighting; L1 norm must make rows sum to one."""
    corpus = Corpus.from_file('deerwester')
    # register a constant global weighting function
    BowVectorizer.wglobals['const'] = lambda df, N: 1

    for wlocal in (BowVectorizer.COUNT, BowVectorizer.BINARY):
        vect = BowVectorizer(norm=BowVectorizer.NONE, wlocal=wlocal,
                             wglobal='const')
        self.assertEqualCorpus(
            vect.transform(corpus),
            BowVectorizer(wlocal=wlocal).transform(corpus))

    vect = BowVectorizer(norm=BowVectorizer.L1,
                         wlocal=BowVectorizer.COUNT,
                         wglobal='const')
    x = vect.transform(corpus).X
    self.assertAlmostEqual(abs(x.sum(axis=1) - 1).sum(), 0)
def test_transform(self):
    """transform returns a Corpus whose domain has one entry per token."""
    corpus = Corpus.from_file('deerwester')
    bow = BowVectorizer().transform(corpus)
    self.assertIsInstance(bow, Corpus)
    self.assertEqual(len(bow.domain), 43)
def test_pp_documents(self):
    """pp_documents mirrors documents until preprocessing changes them."""
    corpus = Corpus.from_file('book-excerpts')
    self.assertEqual(corpus.documents, corpus.pp_documents)
    transformed = preprocess.BASE_TRANSFORMER(corpus)
    # raw documents are untouched; preprocessed ones differ
    self.assertEqual(corpus.documents, transformed.documents)
    self.assertNotEqual(corpus.pp_documents, transformed.pp_documents)
def test_corpus_from_file(self):
    """from_file loads bundled corpora and leaves dataset_dirs untouched."""
    dirs_snapshot = dataset_dirs.copy()
    corpus = Corpus.from_file('book-excerpts')
    # from_file temporarily changes dataset_dirs;
    # it must be restored afterwards
    self.assertListEqual(dirs_snapshot, dataset_dirs)
    self.assertEqual(len(corpus), 140)
    self.assertEqual(len(corpus.domain.variables), 1)
    self.assertEqual(len(corpus.domain.metas), 1)
    self.assertEqual(corpus.metas.shape, (140, 1))

    corpus = Corpus.from_file('deerwester')
    self.assertEqual(len(corpus), 9)
    self.assertEqual(len(corpus.domain.variables), 1)
    self.assertEqual(len(corpus.domain.metas), 1)
    self.assertEqual(corpus.metas.shape, (9, 1))
def test_documents(self):
    """documents yields exactly one plain string per corpus row."""
    corpus = Corpus.from_file('book-excerpts')
    docs = corpus.documents
    kinds = {type(doc) for doc in docs}
    self.assertEqual(len(docs), len(corpus))
    self.assertEqual(len(kinds), 1)
    self.assertIn(str, kinds)
def test_reset_pos_tags(self):
    """Re-tokenizing must drop POS tags computed earlier."""
    corpus = Corpus.from_file('deerwester')
    tagged = tag.AveragedPerceptronTagger()(corpus)
    self.assertTrue(len(tagged.pos_tags))
    tokenized = preprocess.RegexpTokenizer(pattern=r'\w')(corpus)
    self.assertFalse(tokenized.pos_tags)
def test_languages(self):
    """detect_languages fills in one language code per document."""
    corpus = Corpus.from_file('deerwester')
    self.assertIsNone(corpus.languages)
    corpus.detect_languages()
    self.assertEqual(len(corpus.languages), len(corpus))
    self.assertListEqual(corpus.languages, ['en'] * len(corpus))
def test_titles_read_document(self):
    """
    When we read the document with a title marked it should have titles
    set correctly.
    """
    corpus = Corpus.from_file('election-tweets-2016')
    self.assertEqual(len(corpus), len(corpus.titles))
def test_corpus_from_file_just_text(self):
    """A text-only .txt file yields a single meta column and an empty X."""
    corpus = Corpus.from_file(os.path.join(DATASET_PATH, 'deerwester.txt'))
    self.assertEqual(len(corpus), 9)
    self.assertEqual(len(corpus.domain), 0)
    self.assertEqual(len(corpus.domain.metas), 1)
    self.assertEqual(corpus.metas.shape, (9, 1))
def test_empty_corpus(self):
    """
    Empty data. GH-247
    """
    empty = Corpus.from_file("deerwester")[:0]
    transformed = BowVectorizer(norm=BowVectorizer.L1).transform(empty)
    # vectorizing an empty corpus must be a no-op
    self.assertEqual(transformed, empty)
def test_titles_no_numbers(self):
    """
    The case when no number is used since the title appears only once.
    """
    corpus = Corpus.from_file('andersen')
    # title feature set
    corpus.set_title_variable(corpus.domain.metas[0])
    self.assertEqual("The Little Match-Seller", corpus.titles[0])
def test_POSTagger(self):
    """tag_corpus attaches exactly one POS tag per token."""
    corpus = Corpus.from_file('deerwester')
    result = tag.AveragedPerceptronTagger().tag_corpus(corpus)
    self.assertTrue(hasattr(result, 'pos_tags'))
    for tokens, tags in zip(result.tokens, result.pos_tags):
        self.assertEqual(len(tokens), len(tags))
def test_compute_values(self):
    """BoW columns recompute identically through Corpus.from_table."""
    corpus = Corpus.from_file('deerwester')
    bow = BowVectorizer().transform(corpus)
    recomputed = Corpus.from_table(bow.domain, corpus)
    self.assertEqual(bow.domain, recomputed.domain)
    # sparse matrices: no entry may differ
    self.assertEqual((bow.X != recomputed.X).nnz, 0)
def setUp(self):
    """Create a trivial transformer (string reversal) and a test corpus."""
    class ReverseStringTransformer(preprocess.BaseTransformer):
        name = 'reverse'

        def _preprocess(self, string):
            # reverse the whole document text
            return string[::-1]

    self.transformer = ReverseStringTransformer()
    self.corpus = Corpus.from_file("deerwester")
def test_create_bow(self):
    """The bow helper builds a TF-IDF matrix and reports progress."""
    corpus = Corpus.from_file('deerwester')
    bag_of_words = self.bow(corpus, use_tfidf=True)
    self.assertIsNotNone(bag_of_words.X)
    n_rows, n_cols = bag_of_words.X.shape
    self.assertEqual(9, n_rows)
    self.assertEqual(42, n_cols)
    self.assertEqual(self.progress_callbacks, 4)
    self.assertEqual(self.error_callbacks, 0)
def test_init_preserve_shape_of_empty_x(self):
    """Constructing with an all-zero sparse X must preserve its shape."""
    corpus = Corpus.from_file('book-excerpts')
    domain = corpus.domain
    new_domain = Domain((ContinuousVariable('c1'),), domain.class_vars,
                        domain.metas)
    empty_X = csr_matrix((len(corpus), 1))
    rebuilt = Corpus(new_domain, X=empty_X, Y=corpus.Y, metas=corpus.metas)
    self.assertEqual(empty_X.nnz, 0)
    self.assertEqual(rebuilt.X.shape, empty_X.shape)
def test_compute_values_to_different_domain(self):
    """Sentiment compute values must transfer onto a different corpus."""
    destination = Corpus.from_file('andersen')
    # both corpora start without regular attributes
    self.assertFalse(self.corpus.domain.attributes)
    self.assertFalse(destination.domain.attributes)
    sentiment = self.method.transform(self.corpus)
    projected = destination.transform(sentiment.domain)
    self.assertTrue(sentiment.domain.attributes)
    self.assertEqual(sentiment.domain.attributes,
                     projected.domain.attributes)
def test_ngrams(self):
    """BoW attributes must include unigrams, bigrams and trigrams."""
    vect = BowVectorizer()
    corpus = Corpus.from_file('deerwester')
    # FIX: raw string for the regex — '\w' inside a plain string literal
    # is an invalid escape sequence (SyntaxWarning on modern Python).
    pr = preprocess.Preprocessor(
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        ngrams_range=(1, 3))
    pr(corpus, inplace=True)
    result = vect.transform(corpus)
    attrs = [attr.name for attr in result.domain.attributes]
    self.assertIn(corpus.tokens[0][1], attrs)
    self.assertIn(' '.join(corpus.tokens[0][:2]), attrs)
    self.assertIn(' '.join(corpus.tokens[0][:3]), attrs)
def test_documents_from_features(self):
    """Each generated document contains exactly one class value."""
    corpus = Corpus.from_file('book-excerpts')
    docs = corpus.documents_from_features([corpus.domain.class_var])
    kinds = {type(doc) for doc in docs}
    self.assertTrue(all(
        sum(cls in doc for cls in corpus.domain.class_var.values) == 1
        for doc in docs))
    self.assertEqual(len(docs), len(corpus))
    self.assertEqual(len(kinds), 1)
    self.assertIn(str, kinds)
def test_corpus_remove_text_features(self):
    """
    Remove those text features which do not have a column in metas.
    GH-324
    GH-325
    """
    corpus = Corpus.from_file('deerwester')
    no_metas = Domain(attributes=corpus.domain.attributes,
                      class_vars=corpus.domain.class_vars)
    stripped = corpus.transform(no_metas)
    self.assertFalse(len(stripped.text_features))
    # Make sure that copying works.
    stripped.copy()
def test_corpus_not_eq(self):
    """Any structural difference makes two corpora compare unequal."""
    c = Corpus.from_file('bookexcerpts')

    fewer_docs = Corpus(c.documents[:-1], c.X, c.Y, c.metas, c.domain)
    self.assertNotEqual(c, fewer_docs)
    doubled_x = Corpus(c.documents, np.vstack((c.X, c.X)), c.Y, c.metas,
                       c.domain)
    self.assertNotEqual(c, doubled_x)
    doubled_y = Corpus(c.documents, c.X, np.vstack((c.Y, c.Y)), c.metas,
                       c.domain)
    self.assertNotEqual(c, doubled_y)
    transposed_metas = Corpus(c.documents, c.X, c.Y, c.metas.T, c.domain)
    self.assertNotEqual(c, transposed_metas)
    broken_domain = Domain(c.domain.attributes, c.domain.class_var,
                           [StringVariable('text2')])
    other_domain = Corpus(c.documents, c.X, c.Y, c.metas, broken_domain)
    self.assertNotEqual(c, other_domain)
def test_extend_corpus(self):
    """extend_corpus doubles the rows and can add a new class value."""
    corpus = Corpus.from_file('book-excerpts')
    n_classes = len(corpus.domain.class_var.values)
    original = corpus.copy()
    new_y = [corpus.domain.class_var.values[int(i)] for i in corpus.Y]
    new_y[0] = 'teenager'  # a class value unseen in the original corpus
    corpus.extend_corpus(corpus.metas, new_y)

    self.assertEqual(len(corpus), len(original) * 2)
    self.assertEqual(corpus.Y.shape[0], original.Y.shape[0] * 2)
    self.assertEqual(corpus.metas.shape[0], original.metas.shape[0] * 2)
    self.assertEqual(corpus.metas.shape[1], original.metas.shape[1])
    self.assertEqual(len(original.domain.class_var.values), n_classes + 1)
def test_copy(self):
    """copy() yields an independent corpus that compares equal."""
    corpus = Corpus.from_file('deerwester')
    # FIX: raw string for the regex pattern — '\w' and '\s' in a plain
    # string are invalid escape sequences (SyntaxWarning on modern Python).
    p = preprocess.Preprocessor(
        tokenizer=preprocess.RegexpTokenizer(r'\w+\s}'))

    # preprocessing only the copy must leave the original untouched
    copied = corpus.copy()
    p(copied, inplace=True)
    self.assertIsNot(copied, corpus)
    self.assertNotEqual(copied, corpus)

    # a copy taken after preprocessing must compare equal
    p(corpus, inplace=True)
    copied = corpus.copy()
    self.assertIsNot(copied, corpus)
    self.assertEqual(copied, corpus)
def test_domain(self):
    """BoW attributes are sorted and every nonzero weight maps to a
    token actually present in that document."""
    corpus = Corpus.from_file('deerwester')
    result = BowVectorizer().transform(corpus)
    attrs = [attr.name for attr in result.domain.attributes]
    self.assertEqual(attrs, sorted(attrs))

    dense = result.X.toarray()
    for row, doc_tokens in zip(dense, corpus.tokens):
        for weight, attr in zip(row, attrs):
            if weight > .001:
                self.assertIn(attr, doc_tokens)
def test_extend_corpus(self):
    """extend_corpus on a file-loaded corpus doubles every component."""
    corpus = Corpus.from_file(os.path.join(DATASET_PATH, 'bookexcerpts.tab'))
    n_classes = len(corpus.domain.class_var.values)
    original = corpus.copy()
    new_y = [corpus.domain.class_var.values[int(i)] for i in corpus.Y]
    new_y[0] = 'teenager'  # a class value unseen in the original corpus
    corpus.extend_corpus(corpus.documents, corpus.metas, new_y)

    self.assertEqual(len(corpus), len(original) * 2)
    self.assertEqual(corpus.Y.shape[0], original.Y.shape[0] * 2)
    self.assertEqual(corpus.metas.shape[0], original.metas.shape[0] * 2)
    self.assertEqual(corpus.metas.shape[1], original.metas.shape[1])
    self.assertEqual(len(original.domain.class_var.values), n_classes + 1)
def open_file(self, path):
    """Load a corpus from *path* and split its string metas into used
    (current text features) and unused candidate lists.

    Read failures are reported via self.Error.read_file instead of
    propagating.
    """
    self.Error.read_file.clear()
    # reset both attribute lists in place
    self.used_attrs[:] = []
    self.unused_attrs[:] = []
    if path:
        try:
            self.corpus = Corpus.from_file(path)
            # name the corpus after the file (sans extension)
            self.corpus.name = os.path.splitext(os.path.basename(path))[0]
            self.info_label.setText("Corpus of {} documents.".format(len(self.corpus)))
            self.used_attrs.extend(self.corpus.text_features)
            # remaining string metas are offered as candidate text features
            self.unused_attrs.extend([f for f in self.corpus.domain.metas
                                      if f.is_string
                                      and f not in self.corpus.text_features])
        except BaseException as err:
            # deliberately broad: surface any reader failure in the UI
            self.Error.read_file(path, str(err))
def test_variable_attributes(self):
    """var_attrs passed to add_features must appear on every new variable
    and add exactly one extra attribute entry."""
    c1 = Corpus.from_file('deerwester')
    c2 = Corpus.from_file('deerwester')
    X = np.array([list(range(4)) for _ in range(len(c1))])
    X = sp.csr_matrix(X)
    dictionary = {0: 'd', 1: 'c', 2: 'b', 3: 'a'}
    BaseVectorizer.add_features(c1, X, dictionary,
                                compute_values=None, var_attrs=None)
    BaseVectorizer.add_features(c2, X, dictionary,
                                compute_values=None, var_attrs={'foo': 1})
    n_attrs_before = len(c1.domain.attributes[0].attributes)
    n_attrs_after = len(c2.domain.attributes[0].attributes)
    # FIX: assertTrue(x, 1) treated 1 as the failure *message*, so the
    # intended "difference equals 1" check was never performed.
    self.assertEqual(n_attrs_after - n_attrs_before, 1)
    for a in c2.domain.attributes:
        self.assertIn('foo', a.attributes)
def test_extend_attributes(self):
    """extend_attributes grows X step by step, dense or sparse."""
    # corpus without features
    corpus = Corpus.from_file('book-excerpts')
    new_cols = np.random.random((len(corpus), 3))
    corpus.extend_attributes(new_cols, ['1', '2', '3'])
    self.assertEqual(corpus.X.shape, (len(corpus), 3))

    # add to non empty corpus
    corpus.extend_attributes(new_cols, ['1', '2', '3'])
    self.assertEqual(corpus.X.shape, (len(corpus), 6))

    # extend sparse
    corpus.extend_attributes(csr_matrix(new_cols), ['1', '2', '3'])
    self.assertEqual(corpus.X.shape, (len(corpus), 9))
    self.assertTrue(issparse(corpus.X))
def test_asserting_errors(self):
    """Invalid constructor arguments and text features must raise."""
    corpus = Corpus.from_file('book-excerpts')

    # a non-domain first argument is a TypeError
    with self.assertRaises(TypeError):
        Corpus(1.0, corpus.Y, corpus.metas, corpus.domain,
               corpus.text_features)

    # mismatched row counts are a ValueError
    too_large_x = np.vstack((corpus.X, corpus.X))
    with self.assertRaises(ValueError):
        Corpus(corpus.domain, too_large_x, corpus.Y, corpus.metas,
               corpus.W, corpus.text_features)

    # a text feature must belong to the domain
    with self.assertRaises(ValueError):
        corpus.set_text_features([StringVariable('foobar')])

    # duplicate text features are rejected
    with self.assertRaises(ValueError):
        corpus.set_text_features([corpus.domain.metas[0],
                                  corpus.domain.metas[0]])
def open_file(self, path):
    """Load a corpus from *path*, partition its metas into used/unused
    text features, and send the corpus downstream.

    Errors are reported via widget error slot 1 rather than raised.
    """
    self.error(1, '')  # clear any previous error
    # reset both attribute lists in place
    self.used_attrs[:] = []
    self.unused_attrs[:] = []
    try:
        self.corpus = Corpus.from_file(path)
        # split metas by whether they are already used as text features
        for f in self.corpus.domain.metas:
            if f in self.corpus.text_features:
                self.used_attrs.append(f)
            else:
                self.unused_attrs.append(f)
        self.info_label.setText("Corpus of {} documents.".format(len(self.corpus)))
        self.send(Output.CORPUS, self.corpus)
    except BaseException as err:
        # deliberately broad: surface any reader failure in the UI
        self.error(1, str(err))
def open_file(self, path):
    """Load a corpus from *path*; every variable (regular and meta) that
    is not a text feature is listed as unused.

    Errors are reported via widget error slot 1 rather than raised.
    """
    self.error(1, "")  # clear any previous error
    # reset both attribute lists in place
    self.used_attrs[:] = []
    self.unused_attrs[:] = []
    try:
        self.corpus = Corpus.from_file(path)
        self.info_label.setText("Corpus of {} documents.".format(len(self.corpus)))
        self.used_attrs.extend(self.corpus.text_features)
        # everything else — attributes, class vars and metas — is unused
        self.unused_attrs.extend(
            [
                f
                for f in chain(self.corpus.domain.variables,
                               self.corpus.domain.metas)
                if f not in self.corpus.text_features
            ]
        )
    except BaseException as err:
        # deliberately broad: surface any reader failure in the UI
        self.error(1, str(err))
def test_ngrams_iter(self):
    """ngrams_iterator respects ngram_range, join_with and POS tags."""
    c = Corpus.from_file('deerwester')
    c.ngram_range = (1, 1)
    self.assertEqual(list(c.ngrams),
                     [doc.lower().split() for doc in c.documents])
    expected = [[(token.lower(),) for token in doc.split()]
                for doc in c.documents]
    self.assertEqual(list(c.ngrams_iterator(join_with=None)), expected)

    c.ngram_range = (2, 3)
    expected_ngrams = [('machine', 'interface'), ('for', 'lab'),
                       ('machine', 'interface', 'for'),
                       ('abc', 'computer', 'applications')]
    for ngram in expected_ngrams:
        self.assertIn(ngram, list(c.ngrams_iterator(join_with=None))[0])
        self.assertIn('-'.join(ngram),
                      list(c.ngrams_iterator(join_with='-'))[0])

    self.pos_tagger.tag_corpus(c)
    c.ngram_range = (1, 1)
    for doc in c.ngrams_iterator(join_with='_', include_postags=True):
        for token in doc:
            # FIX: assertRegexpMatches is deprecated (removed in Python
            # 3.12) — use assertRegex; raw string avoids the invalid
            # '\w' escape sequence.
            self.assertRegex(token, r'\w+_[A-Z]+')