def test_corpus_not_eq(self):
    c = Corpus.from_file('book-excerpts')
    n_doc = c.X.shape[0]

    c2 = Corpus(c.domain, c.X, c.Y, c.metas, c.W, [])
    self.assertNotEqual(c, c2)

    c2 = Corpus(c.domain, np.ones((n_doc, 1)), c.Y, c.metas, c.W,
                c.text_features)
    self.assertNotEqual(c, c2)

    c2 = Corpus(c.domain, c.X, np.ones((n_doc, 1)), c.metas, c.W,
                c.text_features)
    self.assertNotEqual(c, c2)

    broken_metas = np.copy(c.metas)
    broken_metas[0, 0] = ''
    c2 = Corpus(c.domain, c.X, c.Y, broken_metas, c.W, c.text_features)
    self.assertNotEqual(c, c2)

    new_meta = [StringVariable('text2')]
    broken_domain = Domain(c.domain.attributes, c.domain.class_var, new_meta)
    c2 = Corpus(broken_domain, c.X, c.Y, c.metas, c.W, new_meta)
    self.assertNotEqual(c, c2)

    c2 = c.copy()
    c2.ngram_range = (2, 4)
    self.assertNotEqual(c, c2)
def test_asserting_errors(self):
    c = Corpus.from_file('book-excerpts')

    with self.assertRaises(TypeError):
        Corpus(1.0, c.Y, c.metas, c.domain, c.text_features)

    too_large_x = np.vstack((c.X, c.X))
    with self.assertRaises(ValueError):
        Corpus(c.domain, too_large_x, c.Y, c.metas, c.W, c.text_features)

    with self.assertRaises(ValueError):
        c.set_text_features([StringVariable('foobar')])

    with self.assertRaises(ValueError):
        c.set_text_features([c.domain.metas[0], c.domain.metas[0]])
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata
    )

    meta_vars = []
    for field_name, _ in includes_metadata:
        if field_name == 'pub_date':
            meta_vars.append(TimeVariable(field_name))
        else:
            meta_vars.append(StringVariable.make(field_name))

    class_vars = [
        DiscreteVariable('section_name', values=list(set(class_values)))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)
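# A minimal, standalone sketch of the construction above (assuming only
# Orange and numpy; the record values here are made up for illustration
# and `_records_to_corpus_entries` is not involved):
import numpy as np
from Orange.data import Domain, DiscreteVariable, StringVariable

class_values = ['History', 'Biology', 'History']
meta_values = np.array([['abstract one'], ['abstract two'],
                        ['abstract three']], dtype=object)

class_var = DiscreteVariable('section_name', values=sorted(set(class_values)))
domain = Domain([], class_vars=[class_var], metas=[StringVariable('abstract')])

# to_val maps each label to its index in `values`; [:, None] turns the
# flat array into the one-column Y that Corpus expects.
Y = np.array([class_var.to_val(cv) for cv in class_values])[:, None]
corpus = Corpus(domain=domain, Y=Y, metas=meta_values)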
def _create_corpus(self):
    corpus = None
    names = ["name", "path", "content"]
    data = []
    category_data = []
    text_categories = list(set(t.category for t in self._text_data))
    values = list(set(text_categories))
    category_var = DiscreteVariable.make("category", values=values)
    for textdata in self._text_data:
        data.append([textdata.name, textdata.path, textdata.content])
        category_data.append(category_var.to_val(textdata.category))
    if len(text_categories) > 1:
        category_data = np.array(category_data)
    else:
        category_var = []
        category_data = np.empty((len(data), 0))
    domain = Domain([], category_var,
                    [StringVariable.make(name) for name in names])
    domain["name"].attributes["title"] = True
    data = np.array(data, dtype=object)
    if len(data):
        corpus = Corpus(domain, Y=category_data, metas=data,
                        text_features=[domain.metas[2]])
    return corpus
def test_corpus_from_init(self):
    c = Corpus.from_file('book-excerpts')
    with self.assertWarns(FutureWarning):
        c2 = Corpus(c.domain, c.X, c.Y, c.metas, c.W, c.text_features)
    np.testing.assert_array_equal(c.X, c2.X)
    np.testing.assert_array_equal(c.metas, c2.metas)
    self.assertEqual(c.documents, c2.documents)
def _generate_corpus(records, required_text_fields):
    """
    Generates a corpus from the input NYT records.

    :param records: The input NYT records.
    :type records: list
    :param required_text_fields: A list of the available NYT text fields.
    :type required_text_fields: list
    :return: :class:`orangecontrib.text.corpus.Corpus`
    """
    metas, class_values = _parse_record_json(records, required_text_fields)

    # Create domain.
    meta_vars = [StringVariable.make(field) for field in required_text_fields]
    meta_vars += [
        StringVariable.make("pub_date"),
        StringVariable.make("country")
    ]
    class_vars = [
        DiscreteVariable("section_name", values=list(set(class_values)))
    ]

    domain = Domain([], class_vars=class_vars, metas=meta_vars)
    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(None, Y, metas, domain, meta_vars)  # used all features
def main():
    from Orange.data import Table, Domain, ContinuousVariable, StringVariable

    words = 'hey~mr. tallyman tally~me banana daylight come and me wanna go home'
    words = np.array([w.replace('~', ' ') for w in words.split()],
                     dtype=object, ndmin=2).T
    weights = np.random.random((len(words), 1))

    data = np.zeros((len(words), 0))
    metas = []
    for i, w in enumerate(weights.T):
        data = np.column_stack((data, words, w))
        metas = metas + [StringVariable('Topic' + str(i)),
                         ContinuousVariable('weights')]
    domain = Domain([], metas=metas)
    table = Table.from_numpy(domain,
                             X=np.zeros((len(words), 0)),
                             metas=data)
    app = QApplication([''])
    w = OWWordCloud()
    w.on_topic_change(table)

    domain = Domain([], metas=[StringVariable('text')])
    data = Corpus(domain=domain, metas=np.array([[' '.join(words.flat)]]))
    # data = Corpus.from_numpy(domain, X=np.zeros((1, 0)), metas=np.array([[' '.join(words.flat)]]))
    w.on_corpus_change(data)
    w.handleNewSignals()
    w.show()
    app.exec()
def handle_languages(self):
    if self.corpus is not None:
        domain = self.corpus.domain
        if self.detect_languages:
            if self.corpus.languages is None:
                self.corpus.detect_languages()
            curr_attributes = list(domain.attributes)
            curr_class_var = [domain.class_var] if domain.class_var else []
            curr_metas = list(domain.metas)
            curr_variables = curr_attributes + curr_class_var + curr_metas
            curr_names = [var.name for var in curr_variables]
            new_name = get_unique_names(curr_names, "Language")

            variable_attrs = {'language-feature': True}
            new_variable = StringVariable(new_name)
            new_variable.attributes.update(variable_attrs)

            new_domain = Domain(
                attributes=domain.attributes,
                class_vars=domain.class_var,
                metas=list(domain.metas) + [new_variable]
            )
            metas = np.hstack([
                self.corpus.metas,
                np.array(self.corpus.languages).reshape(-1, 1)
            ])
            self.corpus = Corpus(new_domain, self.corpus.X.copy(),
                                 self.corpus.Y.copy(), metas,
                                 self.corpus.W.copy(),
                                 copy(self.corpus.text_features))
        else:
            lang_feat_idx = None
            for i, f in enumerate(domain.metas):
                if ('language-feature' in f.attributes
                        and f.attributes['language-feature']):
                    lang_feat_idx = i
                    break
            if lang_feat_idx is not None:
                new_domain = Domain(
                    attributes=domain.attributes,
                    class_vars=domain.class_var,
                    metas=list(np.delete(list(domain.metas), lang_feat_idx))
                )
                self.corpus = Corpus(
                    new_domain, self.corpus.X.copy(), self.corpus.Y.copy(),
                    np.delete(self.corpus.metas, lang_feat_idx, axis=1),
                    self.corpus.W.copy(), copy(self.corpus.text_features)
                )
    self.Outputs.corpus.send(self.corpus)
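# The column-append step above, isolated from the widget machinery: a
# sketch assuming a plain numpy object array of metas and one detected
# language per row (the values here are hypothetical).
import numpy as np

metas = np.array([['first doc'], ['second doc']], dtype=object)
languages = ['en', 'sl']

# reshape(-1, 1) turns the language list into a column so hstack can
# attach it as a new meta alongside the existing ones
combined = np.hstack([metas, np.array(languages).reshape(-1, 1)])
assert combined.shape == (2, 2)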
def test_init_preserve_shape_of_empty_x(self):
    c = Corpus.from_file('book-excerpts')
    d = c.domain
    new_domain = Domain((ContinuousVariable('c1'),), d.class_vars, d.metas)

    empty_X = csr_matrix((len(c), 1))
    new = Corpus(new_domain, X=empty_X, Y=c.Y, metas=c.metas)
    self.assertEqual(empty_X.nnz, 0)
    self.assertEqual(new.X.shape, empty_X.shape)
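# Why the test asserts on .nnz: building a csr_matrix from a shape tuple
# allocates no stored elements at all, so the corpus must preserve the
# declared shape rather than infer it from (absent) data. A quick
# standalone check:
from scipy.sparse import csr_matrix

m = csr_matrix((140, 1))
assert m.nnz == 0
assert m.shape == (140, 1)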
def get_data(self):
    domain = Domain([], metas=[
        StringVariable("Conc. {}".format(self.word)),
        StringVariable("Document")
    ])
    data = []
    docs = []
    for row in range(self.rowCount()):
        txt = []
        for column in range(self.columnCount()):
            index = self.index(row, column)
            txt.append(str(self.data(index)))
        data.append([" ".join(txt)])
        docs.append([self.corpus.titles[self.word_index[row][0]]])
    conc = np.array(np.hstack((data, docs)), dtype=object)
    return Corpus(domain, metas=conc, text_features=[domain.metas[0]])
def _create_corpus(self) -> Corpus:
    corpus = None
    names = ["name", "path", "content"] if not self.is_conllu else [
        "name", "path", "utterance", "content"
    ]
    data = []
    category_data = []
    text_categories = list(set(t.category for t in self._text_data))
    values = list(set(text_categories))
    category_var = DiscreteVariable.make("category", values=values)
    for textdata in self._text_data:
        datum = [
            # some characters are written as decomposed (č is char c and a
            # separate char for the caron); with NFC normalization we
            # normalize them to be written as precomposed (č is one
            # unicode char - 0x10D)
            # https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
            normalize('NFC', textdata.name),
            normalize('NFC', textdata.path),
            normalize('NFC', textdata.content)
        ]
        if self.is_conllu:
            datum.insert(2, normalize('NFC', textdata.doc_id))
        data.append(datum)
        category_data.append(category_var.to_val(textdata.category))
    if len(text_categories) > 1:
        category_data = np.array(category_data)
    else:
        category_var = []
        category_data = np.empty((len(data), 0))
    domain = Domain([], category_var,
                    [StringVariable.make(name) for name in names])
    domain["name"].attributes["title"] = True
    data = np.array(data, dtype=object)
    if len(data):
        corpus = Corpus(domain, Y=category_data, metas=data,
                        text_features=[domain.metas[-1]])
    return corpus
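# What the NFC normalization above does, shown on a single character: a
# decomposed "č" (ASCII "c" followed by a combining caron) becomes the
# single precomposed code point U+010D.
from unicodedata import normalize

decomposed = 'c\u030C'  # 'c' + COMBINING CARON
assert len(decomposed) == 2
assert normalize('NFC', decomposed) == '\u010D'  # precomposed 'č'
assert len(normalize('NFC', decomposed)) == 1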
def test_titles(self):
    c = Corpus.from_file('book-excerpts')

    # no title feature set
    titles = c.titles
    self.assertEqual(len(titles), len(c))
    for title in titles:
        self.assertIn('Document ', title)

    # inferred title from heuristics
    expected = list(map(str, range(len(c))))
    c2 = Corpus(Domain([], [], (StringVariable('heading'),)),
                None, None, np.c_[expected])
    titles = c2.titles
    self.assertEqual(titles, expected)

    # title feature set
    c.domain[0].attributes['title'] = True
    titles = c.titles
    self.assertEqual(len(titles), len(c))
    for title in titles:
        self.assertIn(title, c.domain.class_var.values)
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_vars = []
    time_var = None
    for field_name, _ in includes_metadata:
        if field_name == PUBMED_FIELD_DATE:
            time_var = TimeVariable(field_name)
            meta_vars.append(time_var)
        else:
            meta_vars.append(StringVariable.make(field_name))
            if field_name == PUBMED_FIELD_TITLE:
                meta_vars[-1].attributes["title"] = True

    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata,
        time_var=time_var,
    )

    class_vars = [
        DiscreteVariable(
            'section',
            values=list(map(str, set(filter(None, class_values)))))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)
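# How the TimeVariable above ends up carrying dates, in isolation: a
# sketch assuming Orange's TimeVariable.parse, which converts an ISO
# date string into a float timestamp that the variable can render back.
from Orange.data import TimeVariable

tv = TimeVariable('pub_date')
val = tv.parse('2015-06-30')  # float, seconds since the Unix epoch
assert tv.repr_val(val) == '2015-06-30'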
def run_initial_query(self):
    self.warning(1)
    self.error(1)
    # Only execute if the NYT object is present (safety lock).
    # Otherwise this method cannot be called anyway.
    if self.nyt_api:
        # Query keywords.
        qkw = self.query_combo.currentText()

        # Text fields.
        text_includes_params = [
            self.includes_headline, self.includes_lead_paragraph,
            self.includes_snippet, self.includes_abstract,
            self.includes_keywords
        ]
        if True not in text_includes_params:
            self.warning(1, "You must select at least one text field.")
            return

        # Set the query url.
        self.nyt_api.set_query_url(qkw, self.year_from, self.year_to,
                                   text_includes_params)

        # Execute the query.
        res, cached, error = self.nyt_api.execute_query(0)

        if res:
            # Construct a corpus for the output.
            documents, metas, meta_vars, class_values = parse_record_json(
                res, text_includes_params)
            class_vars = [
                DiscreteVariable("section_name",
                                 values=list(set(class_values)))
            ]
            Y = np.array([class_vars[0].to_val(cv)
                          for cv in class_values])[:, None]
            Y[np.isnan(Y)] = 0
            domain = Domain([], class_vars=class_vars, metas=meta_vars)
            self.output_corpus = Corpus(documents, None, Y, metas, domain)
            self.send(Output.CORPUS, self.output_corpus)

            # Update the response info.
            self.all_hits = res["response"]["meta"]["hits"]
            self.num_retrieved = len(res["response"]["docs"])
            info_label = "Records: {}\nRetrieved: {}".format(
                self.all_hits, self.num_retrieved)
            if self.all_hits > 1000:
                info_label += " (max 1000)"
            self.query_info_label.setText(info_label)

            # Enable 'retrieve remaining' button.
            if self.num_retrieved < min(self.all_hits, 1000):
                self.retrieve_other_button.setText(
                    'Retrieve remaining records ({})'.format(
                        min(self.all_hits, 1000) - self.num_retrieved))
                self.retrieve_other_button.setEnabled(True)
                self.retrieve_other_button.setFocus()
            else:
                self.retrieve_other_button.setText('All records retrieved')
                self.retrieve_other_button.setEnabled(False)

            # Add the query to history.
            if qkw not in self.recent_queries:
                self.recent_queries.insert(0, qkw)
        elif error:
            if isinstance(error, HTTPError):
                self.error(1, "An error occurred (HTTP {})".format(error.code))
            elif isinstance(error, URLError):
                self.error(1, "An error occurred (URL {})".format(error.reason))
def test_corpus_from_init(self):
    c = Corpus.from_file('book-excerpts')
    c2 = Corpus(c.domain, c.X, c.Y, c.metas, c.text_features)
    self.assertEqual(c, c2)
class BowVectorizationTest(unittest.TestCase):
    def test_transform(self):
        vect = BowVectorizer()
        corpus = Corpus.from_file('deerwester')

        result = vect.transform(corpus)
        self.assertIsInstance(result, Corpus)
        self.assertEqual(len(result.domain.variables), 43)

    def test_binary(self):
        vect = BowVectorizer(wlocal=BowVectorizer.BINARY)
        corpus = Corpus.from_file('deerwester')
        result = vect.transform(corpus)
        self.assertEqual(result.X.max(), 1.)

    def test_empty_tokens(self):
        corpus = Corpus.from_file('deerwester')
        corpus.text_features = []
        bag_of_words = BowVectorizer().transform(corpus, copy=False)
        self.assertIs(corpus, bag_of_words)

    def test_domain(self):
        vect = BowVectorizer()
        corpus = Corpus.from_file('deerwester')

        result = vect.transform(corpus)
        attrs = [attr.name for attr in result.domain.attributes]
        self.assertEqual(attrs, sorted(attrs))

        X = result.X.toarray()
        for i in range(len(corpus)):
            for contains, attr in zip(X[i], attrs):
                if contains > .001:
                    self.assertIn(attr, corpus.tokens[i])

    def test_ngrams(self):
        vect = BowVectorizer()
        corpus = Corpus.from_file('deerwester')
        corpus = preprocess.RegexpTokenizer(r'\w+')(corpus)
        corpus = preprocess.NGrams(ngrams_range=(1, 3))(corpus)
        result = vect.transform(corpus)
        attrs = [attr.name for attr in result.domain.attributes]
        self.assertIn(corpus.tokens[0][1], attrs)
        self.assertIn(' '.join(corpus.tokens[0][:2]), attrs)
        self.assertIn(' '.join(corpus.tokens[0][:3]), attrs)

    def test_report(self):
        vect = BowVectorizer()
        self.assertGreater(len(vect.report()), 0)

    def test_args(self):
        corpus = Corpus.from_file('deerwester')

        BowVectorizer.wglobals['const'] = lambda df, N: 1

        vect = BowVectorizer(norm=BowVectorizer.NONE,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')
        self.assertEqualCorpus(
            vect.transform(corpus),
            BowVectorizer(wlocal=BowVectorizer.COUNT).transform(corpus))

        vect = BowVectorizer(norm=BowVectorizer.NONE,
                             wlocal=BowVectorizer.BINARY,
                             wglobal='const')
        self.assertEqualCorpus(
            vect.transform(corpus),
            BowVectorizer(wlocal=BowVectorizer.BINARY).transform(corpus))

        vect = BowVectorizer(norm=BowVectorizer.L1,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')
        x = vect.transform(corpus).X
        self.assertAlmostEqual(abs(x.sum(axis=1) - 1).sum(), 0)

    def test_compute_values(self):
        corpus = Corpus.from_file('deerwester')
        vect = BowVectorizer()

        bow = vect.transform(corpus)
        computed = Corpus.from_table(bow.domain, corpus)

        self.assertEqual(bow.domain, computed.domain)
        self.assertEqual((bow.X != computed.X).nnz, 0)

    def test_compute_values_to_different_domain(self):
        source = Corpus.from_file('deerwester')
        destination = Corpus.from_file('book-excerpts')

        self.assertFalse(source.domain.attributes)
        self.assertFalse(destination.domain.attributes)

        bow = BowVectorizer().transform(source)
        computed = destination.transform(bow.domain)

        self.assertEqual(bow.domain.attributes, computed.domain.attributes)

    def assertEqualCorpus(self, first, second, msg=None):
        np.testing.assert_allclose(first.X.todense(),
                                   second.X.todense(),
                                   err_msg=msg)

    def test_empty_corpus(self):
        """
        Empty data.
        GH-247
        """
        corpus = Corpus.from_file("deerwester")[:0]
        vect = BowVectorizer(norm=BowVectorizer.L1)
        out = vect.transform(corpus)

        self.assertEqual(out, corpus)

    def tests_duplicated_names(self):
        """
        BOW adds words to the domain; if the same attribute name already
        appears in the domain, the existing attribute is renamed by
        appending a number to its name.
        """
        corpus = Corpus.from_file("deerwester")
        corpus = corpus.extend_attributes(np.ones((len(corpus), 1)), ["human"])
        corpus = corpus.extend_attributes(np.ones((len(corpus), 1)),
                                          ["testtest"])
        vect = BowVectorizer()
        out = vect.transform(corpus)
        # the first attribute was in the dataset before bow and should be
        # renamed
        self.assertEqual("human (1)", out.domain[0].name)
        self.assertEqual("testtest", out.domain[1].name)
        # all attributes from [1:] are bow attributes and should include
        # "human"
        self.assertIn("human", [v.name for v in out.domain.attributes[1:]])

    def test_compute_values_same_tfidf_regardless_num_documents(self):
        """
        When computing TF-IDF from compute values, TF-IDF should give the
        same results regardless of the length of the new corpus - IDF
        weighting should consider only counts from the original corpus.
        """
        corpus = Corpus.from_file('deerwester')
        train_corpus = corpus[:5]
        test_corpus = corpus[5:]
        vect = BowVectorizer(wglobal=BowVectorizer.IDF)

        bow = vect.transform(train_corpus)
        computed1 = Corpus.from_table(bow.domain, test_corpus[1:])
        computed2 = Corpus.from_table(bow.domain, test_corpus)

        self.assertEqual(computed1.domain, computed2.domain)
        self.assertEqual(bow.domain, computed2.domain)
        self.assertEqual((computed1.X != computed2.X[1:]).nnz, 0)

    # fmt: off
    domain = Domain([], metas=[StringVariable("text")])
    small_corpus_train = Corpus(
        domain,
        np.empty((4, 0)),
        metas=np.array([["this is a nice day day"],
                        ["the day is nice"],
                        ["i love a beautiful day"],
                        ["this apple is mine"]]))
    terms = [
        "this", "is", "a", "nice", "day", "the", "i", "love", "beautiful",
        "apple", "mine"
    ]
    train_counts = np.array([[1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
                             [0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0],
                             [0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0],
                             [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]])
    small_corpus_test = Corpus(
        domain,
        np.empty((3, 0)),
        metas=np.array([["this is a nice day day"],
                        ["day nice summer mine"],
                        ["apple is cool"]]))
    test_counts = np.array([[1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
                            [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
                            [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]])
    # fmt: on

    def assert_bow_same(self, corpus, values, terms):
        self.assertSetEqual(set(terms),
                            set(a.name for a in corpus.domain.attributes))
        for i, a in enumerate(terms):
            self.assertListEqual(
                corpus.get_column_view(a)[0].tolist(),
                values[:, i].tolist(),
                f"BOW differ for term {a}",
            )

    def test_count_correctness(self):
        """Test if computed counts are correct for train and test dataset"""
        bow = BowVectorizer().transform(self.small_corpus_train)
        self.assert_bow_same(bow, self.train_counts, self.terms)

        # computed from compute_values - result contains only terms from
        # the train dataset
        bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
        self.assert_bow_same(bow_test, self.test_counts, self.terms)

    def test_tfidf_correctness(self):
        """
        Test if computed tf-idfs are correct for the train and test dataset.

        When computing tf-idf on the test dataset (via compute values),
        weights (idf) must be computed based on counts from the training
        dataset.
        """
        bow = BowVectorizer(wglobal=BowVectorizer.IDF).transform(
            self.small_corpus_train)

        document_appearance = (self.train_counts != 0).sum(0)
        n = len(self.train_counts)
        idfs_train = self.train_counts * np.log(n / document_appearance)
        self.assert_bow_same(bow, idfs_train, self.terms)

        bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
        # weights computed based on counts from the training dataset
        idfs_test = self.test_counts * np.log(n / document_appearance)
        self.assert_bow_same(bow_test, idfs_test, self.terms)
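# The idf weighting that both correctness tests rely on, written out for
# a single term: a sketch assuming natural log and no smoothing,
# mirroring np.log(n / document_appearance) above.
import numpy as np

day_counts = np.array([2, 1, 1, 0])  # "day" counts per training document
df = (day_counts != 0).sum()         # appears in 3 of the 4 documents
idf = np.log(len(day_counts) / df)   # log(4 / 3), about 0.2877
tfidf = day_counts * idf             # first document: 2 * log(4 / 3)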
# Runs in Orange's Python Script widget, where `in_data` holds the input
# corpus and `out_data` is sent to the output.
from Orange.data import Domain
from orangecontrib.text.corpus import Corpus

CUT_VALUE = 10

"""
Remove columns that do not have any value above CUT_VALUE
"""
print("Num values in original data:", len(in_data.X.data))
print("Num attributes in original data:", len(in_data.domain.attributes))

column_max = in_data.X.max(axis=0).toarray().flatten()
attributes_mask = column_max > CUT_VALUE
out_data = Corpus(
    Domain(
        [a for a, inc in zip(in_data.domain.attributes, attributes_mask)
         if inc],
        in_data.domain.class_var,
        in_data.domain.metas),
    in_data.X[:, attributes_mask],
    Y=in_data.Y,
    metas=in_data.metas,
    text_features=in_data.text_features)

print("Num values after removing columns:", len(out_data.X.data))

"""
This part is optional: Remove values that are not above CUT_VALUE
"""
cx = out_data.X.tocoo()
for i, j, v in zip(cx.row, cx.col, cx.data):
    if v <= CUT_VALUE:
        out_data.X[i, j] = 0
out_data.X.eliminate_zeros()
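# An equivalent, vectorized way to do the optional second step, assuming
# out_data.X stays a scipy CSR matrix: masking .data directly avoids the
# per-entry assignment above, which is slow on CSR matrices.
out_data.X.data[out_data.X.data <= CUT_VALUE] = 0
out_data.X.eliminate_zeros()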
def sendData(self):
    """Convert input(s) and send output"""
    if not (self.segmentation or self.corpus):
        self.infoBox.setText(u'Widget needs input.', 'warning')
        self.send('Textable segmentation', None, self)
        self.send('Text Mining corpus', None)
        return

    msg_seg = msg_corpus = ""
    num_iterations = 0
    if self.corpus:
        num_iterations += len(self.corpus)
    if self.segmentation:
        num_iterations += len(self.segmentation)
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=num_iterations)

    # Convert corpus to segmentation...
    if self.corpus:
        self.clearCreatedInputs()
        new_segments = list()
        text_feature = self.corpus.text_features[self.segmentContent]
        for row in self.corpus:
            content = row[text_feature].value
            if content == "":
                continue
            new_input = Input(row[text_feature].value)
            new_segment_annotations = dict()
            for attr in self.corpus.domain:
                attr_str = str(row[attr])
                if attr_str != "?":
                    new_segment_annotations[str(attr)] = attr_str
            for meta_attr in self.corpus.domain.metas:
                meta_attr_str = str(row[meta_attr])
                if meta_attr != text_feature and meta_attr_str != "?":
                    new_segment_annotations[str(meta_attr)] = meta_attr_str
            new_segments.append(Segment(new_input[0].str_index,
                                        new_input[0].start,
                                        new_input[0].end,
                                        new_segment_annotations))
            self.createdInputs.append(new_input)
            progressBar.advance()
        new_segmentation = Segmentation(new_segments, self.captionTitle)
        msg_seg = u'%i segment@p' % len(new_segmentation)
        msg_seg = pluralize(msg_seg, len(new_segmentation))
        self.send('Textable segmentation', new_segmentation, self)
    else:
        self.send('Textable segmentation', None, self)

    # Convert segmentation to corpus...
    if self.segmentation:
        metas = list()
        attributes = list()
        meta_keys = list()
        attribute_keys = list()
        for key in self.segmentation.get_annotation_keys():
            possible_values = set()
            for segment in self.segmentation:
                try:
                    possible_values.add(str(segment.annotations[key]))
                except KeyError:
                    pass
            if (self.limitNumCategories
                    and len(possible_values) > self.maxNumCategories):
                metas.append(StringVariable(key))
                meta_keys.append(key)
            else:
                attributes.append(
                    DiscreteVariable(key, values=list(possible_values)))
                attribute_keys.append(key)
        metas.append(StringVariable("textable_text"))
        domain = Domain(attributes, [], metas)
        rows = list()
        for segment in self.segmentation:
            row = [
                str(segment.annotations.get(annotation_key, None))
                for annotation_key in attribute_keys
            ]
            row.extend([
                str(segment.annotations.get(annotation_key, None))
                for annotation_key in meta_keys
            ])
            row.append(segment.get_content())
            rows.append(row)
            progressBar.advance()
        table = Table(domain, rows)
        if textMiningIsInstalled:
            corpus = Corpus(domain, X=table.X, metas=table.metas,
                            text_features=[metas[-1]])
            msg_corpus = u'%i document@p' % len(self.segmentation)
            msg_corpus = pluralize(msg_corpus, len(self.segmentation))
            self.send('Text Mining corpus', corpus)
    else:
        self.send('Text Mining corpus', None)

    progressBar.finish()
    self.controlArea.setDisabled(False)

    if msg_seg or msg_corpus:
        message = msg_seg
        if msg_seg and msg_corpus:
            message += " and "
        message += msg_corpus
        message += " sent to output."
        self.infoBox.setText(message)

    self.sendButton.resetSettingsChangedFlag()