def test_asserting_errors(self):
    c = Corpus.from_file('book-excerpts')

    too_large_x = np.vstack((c.X, c.X))
    with self.assertRaises(ValueError):
        Corpus.from_numpy(c.domain, too_large_x, c.Y, c.metas, c.W,
                          c.text_features)

    with self.assertRaises(ValueError):
        c.set_text_features([StringVariable('foobar')])

    with self.assertRaises(ValueError):
        c.set_text_features([c.domain.metas[0], c.domain.metas[0]])
def main():
    from Orange.data import Table, Domain, ContinuousVariable, StringVariable

    words = 'hey~mr. tallyman tally~me banana daylight come and me wanna go home'
    words = np.array([w.replace('~', ' ') for w in words.split()],
                     dtype=object, ndmin=2).T
    weights = np.random.random((len(words), 1))

    data = np.zeros((len(words), 0))
    metas = []
    for i, w in enumerate(weights.T):
        data = np.column_stack((data, words, w))
        metas = metas + [StringVariable('Topic' + str(i)),
                         ContinuousVariable('weights')]
    domain = Domain([], metas=metas)
    table = Table.from_numpy(domain,
                             X=np.zeros((len(words), 0)),
                             metas=data)

    app = QtGui.QApplication([''])
    w = OWWordCloud()
    w.on_topics_change(table)
    domain = Domain([], metas=[StringVariable('text')])
    data = Corpus.from_numpy(domain, X=np.zeros((1, 0)),
                             metas=np.array([[' '.join(words.flat)]]))
    w.on_corpus_change(data)
    w.show()
    app.exec()
def test_set_title_from_domain(self):
    """
    When the domain is set up from data (e.g. via from_numpy), the _title
    variable must be set.
    """
    domain = Domain([], metas=[StringVariable("title"), StringVariable("a")])
    metas = [["title1", "a"], ["title2", "b"]]
    corpus = Corpus.from_numpy(
        domain, X=np.empty((2, 0)), metas=np.array(metas)
    )
    assert_array_equal(["Document 1", "Document 2"], corpus.titles)

    domain["title"].attributes["title"] = True
    corpus = Corpus.from_numpy(
        domain, X=np.empty((2, 0)), metas=np.array(metas)
    )
    assert_array_equal(["title1", "title2"], corpus.titles)
def test_titles_from_rows(self):
    domain = Domain([], metas=[StringVariable("title"), StringVariable("a")])
    metas = [["title1", "a"], ["title2", "b"], ["titles3", "c"]]
    corpus = Corpus.from_numpy(
        domain, X=np.empty((3, 0)), metas=np.array(metas)
    )
    corpus = Corpus.from_table_rows(corpus, [0, 2])
    assert_array_equal(["Document 1", "Document 3"], corpus.titles)
def test_init_preserve_shape_of_empty_x(self):
    c = Corpus.from_file('book-excerpts')
    d = c.domain
    new_domain = Domain((ContinuousVariable('c1'),), d.class_vars, d.metas)

    empty_X = csr_matrix((len(c), 1))
    new = Corpus.from_numpy(new_domain, X=empty_X, Y=c.Y, metas=c.metas)
    self.assertEqual(empty_X.nnz, 0)
    self.assertEqual(new.X.shape, empty_X.shape)
def test_corpus_from_numpy(self):
    domain = Domain([], metas=[StringVariable("title"), StringVariable("a")])
    corpus = Corpus.from_numpy(
        domain,
        np.empty((2, 0)),
        metas=np.array([["title1", "a"], ["title2", "b"]]),
    )
    self.assertEqual(2, len(corpus))
    assert_array_equal(["Document 1", "Document 2"], corpus.titles)
    self.assertListEqual([StringVariable("title")], corpus.text_features)
    self.assertIsNone(corpus._tokens)
    self.assertListEqual([], corpus.used_preprocessor.preprocessors)
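# A minimal standalone sketch (not part of the test suite) of the same
# construction, but with text_features passed explicitly instead of relying
# on the default inference shown in the test above. The "comment" column
# name is a hypothetical example.
import numpy as np
from Orange.data import Domain, StringVariable
from orangecontrib.text.corpus import Corpus

title_var = StringVariable("title")
comment_var = StringVariable("comment")
domain = Domain([], metas=[title_var, comment_var])
corpus = Corpus.from_numpy(
    domain,
    np.empty((2, 0)),
    metas=np.array([["title1", "first comment"], ["title2", "second comment"]]),
    text_features=[comment_var],  # treat only the "comment" column as text
)
print(len(corpus), corpus.text_features)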
def _create_corpus(self) -> Corpus:
    corpus = None
    names = (
        ["name", "path", "content"] if not self.is_conllu
        else ["name", "path", "utterance", "content"]
    )
    data = []
    category_data = []
    text_categories = list(set(t.category for t in self._text_data))
    values = list(set(text_categories))
    category_var = DiscreteVariable.make("category", values=values)
    for textdata in self._text_data:
        datum = [
            # some characters are written as decomposed (č is char c
            # and separate char for caron), with NFC normalization we
            # normalize them to be written as precomposed (č is one
            # unicode char - 0x10D)
            # https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
            normalize('NFC', textdata.name),
            normalize('NFC', textdata.path),
            normalize('NFC', textdata.content)
        ]
        if self.is_conllu:
            datum.insert(2, normalize('NFC', textdata.doc_id))
        data.append(datum)
        category_data.append(category_var.to_val(textdata.category))

    if len(text_categories) > 1:
        category_data = np.array(category_data)
    else:
        category_var = []
        category_data = np.empty((len(data), 0))

    domain = Domain([], category_var,
                    [StringVariable.make(name) for name in names])
    domain["name"].attributes["title"] = True
    data = np.array(data, dtype=object)
    if len(data):
        corpus = Corpus.from_numpy(domain,
                                   X=np.empty((len(category_data), 0)),
                                   Y=category_data,
                                   metas=data,
                                   text_features=[domain.metas[-1]])
    return corpus
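# A small standalone illustration of the NFC normalization used above, not
# part of the importer: a decomposed "c" followed by a combining caron is
# folded into the single precomposed character U+010D (č).
from unicodedata import normalize

decomposed = "c\u030C"                           # 'c' + COMBINING CARON
print(len(decomposed))                           # 2 code points
print(normalize("NFC", decomposed))              # č
print(normalize("NFC", decomposed) == "\u010D")  # True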
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_vars = []
    time_var = None
    for field_name, _ in includes_metadata:
        if field_name == PUBMED_FIELD_DATE:
            time_var = TimeVariable(field_name)
            meta_vars.append(time_var)
        else:
            meta_vars.append(StringVariable.make(field_name))
            if field_name == PUBMED_FIELD_TITLE:
                meta_vars[-1].attributes["title"] = True

    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata,
        time_var=time_var,
    )

    class_vars = [
        DiscreteVariable('section',
                         values=list(map(str, set(filter(None, class_values)))))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus.from_numpy(domain=domain,
                             X=np.empty((len(Y), 0)),
                             Y=Y,
                             metas=meta_values)
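# A tiny aside on TimeVariable, used above for the publication-date field:
# its parse() method converts an ISO date string into the float value stored
# in the table. This is a standalone illustration, unrelated to the PubMed
# record parsing itself.
from Orange.data import TimeVariable

t = TimeVariable("date")
print(t.parse("2021-05-01"))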
def get_data(self):
    domain = Domain([], metas=[
        StringVariable("Conc. {}".format(self.word)),
        StringVariable("Document")
    ])
    data = []
    docs = []
    for row in range(self.rowCount()):
        txt = []
        for column in range(self.columnCount()):
            index = self.index(row, column)
            txt.append(str(self.data(index)))
        data.append([" ".join(txt)])
        docs.append([self.corpus.titles[self.word_index[row][0]]])

    conc = (np.array(np.hstack((data, docs)), dtype=object)
            if data else np.empty((0, 2)))
    return Corpus.from_numpy(
        domain,
        X=np.empty((len(conc), 0)),
        metas=conc,
        text_features=[domain.metas[0]],
    )
""" input: Corpus preprocessed with Preprocess Text. Tokenizer is set to Sentences. output: Corpus where sentences are now documents. requires: Text add-on """ import numpy as np from Orange.data import Domain, StringVariable from orangecontrib.text.corpus import Corpus tokens = in_data.tokens title = [i for i in in_data.domain.metas if "title" in i.attributes][0] new_domain = Domain(attributes=[], metas=[StringVariable('Sentences'), title) titles = [] content = [] for i, doc in enumerate(tokens): for t in doc: titles.append(in_data[i][title.name].value) content.append(t) metas = np.column_stack((content, titles)) out_data = Corpus.from_numpy(domain=new_domain, X=np.empty((len(content), 0)), metas=metas) out_data.set_text_features([StringVariable('Sentences')]) out_data.set_title_variable(title)