def open_file(self, path=None, data=None):
    """Load a corpus from ``data`` or from the file at ``path``.

    ``data`` takes precedence over ``path``. When neither is given the
    widget state is only reset (context closed, errors and attribute
    models cleared) and nothing is loaded.

    Args:
        path: Location of a corpus file readable by ``Corpus.from_file``.
        data: Table-like object converted with ``Corpus.from_table``.
    """
    self.closeContext()
    self.Error.clear()
    self.unused_attrs_model[:] = []
    self.used_attrs_model[:] = []

    if data:
        self.corpus = Corpus.from_table(data.domain, data)
    elif path:
        try:
            self.corpus = Corpus.from_file(path)
            self.corpus.name = os.path.splitext(os.path.basename(path))[0]
        except Exception as err:
            # Reading failed: report it and stop, instead of carrying on
            # with a missing or previously loaded corpus.
            self.Error.read_file(path, str(err))
            return
    else:
        return

    self.update_info()
    self.used_attrs = list(self.corpus.text_features)
    if not self.corpus.text_features:
        self.Error.corpus_without_text_features()
        self.Outputs.corpus.send(None)
        return

    self.openContext(self.corpus)
    self.used_attrs_model.extend(self.used_attrs)
    self.unused_attrs_model.extend(
        [f for f in self.corpus.domain.metas
         if f.is_string and f not in self.used_attrs_model])
def test_compute_values(self):
    """Rebuilding a corpus in the bag-of-words domain reproduces the same X."""
    source = Corpus.from_file('deerwester')
    vectorizer = BowVectorizer()
    expected = vectorizer.transform(source)
    recomputed = Corpus.from_table(expected.domain, source)

    self.assertEqual(expected.domain, recomputed.domain)
    differing_entries = (expected.X != recomputed.X).nnz
    self.assertEqual(differing_entries, 0)
def test_infer_text_features(self):
    """Each checked dataset exposes exactly one text feature with the expected name."""
    expectations = (
        ('friends-transcripts', 'Quote'),
        ('deerwester', 'Text'),
    )
    for dataset, feature_name in expectations:
        features = Corpus.from_file(dataset).text_features
        self.assertEqual(len(features), 1)
        self.assertEqual(features[0].name, feature_name)
def test_compute_values_to_different_domain(self):
    """Transforming another corpus into a bag-of-words domain keeps its attributes."""
    src = Corpus.from_file('deerwester')
    dst = Corpus.from_file('book-excerpts')
    # Neither corpus has attributes before vectorization.
    for corpus in (src, dst):
        self.assertFalse(corpus.domain.attributes)

    vectorized = BowVectorizer().transform(src)
    projected = dst.transform(vectorized.domain)
    self.assertEqual(vectorized.domain.attributes,
                     projected.domain.attributes)
def test_corpus_from_file(self):
    """Check document count, domain size and metas shape of two bundled corpora."""
    for dataset, n_docs in (('book-excerpts', 140), ('deerwester', 9)):
        loaded = Corpus.from_file(dataset)
        self.assertEqual(len(loaded), n_docs)
        self.assertEqual(len(loaded.domain), 1)
        self.assertEqual(len(loaded.domain.metas), 1)
        self.assertEqual(loaded.metas.shape, (n_docs, 1))
def main():
    """Manual demo: show an OWWordCloud fed with a toy topic table and a corpus."""
    from Orange.data import Table, Domain, ContinuousVariable, StringVariable

    lyric = 'hey~mr. tallyman tally~me banana daylight come and me wanna go home'
    # '~' stands for a space inside one entry; the array is an (n, 1) column.
    entries = [piece.replace('~', ' ') for piece in lyric.split()]
    words = np.array(entries, dtype=object, ndmin=2).T
    n_words = len(words)
    weights = np.random.random((n_words, 1))

    topic_metas = np.zeros((n_words, 0))
    meta_vars = []
    for idx, weight_column in enumerate(weights.T):
        topic_metas = np.column_stack((topic_metas, words, weight_column))
        meta_vars = meta_vars + [StringVariable('Topic' + str(idx)),
                                 ContinuousVariable('weights')]
    topic_domain = Domain([], metas=meta_vars)
    topic_table = Table.from_numpy(topic_domain,
                                   X=np.zeros((n_words, 0)),
                                   metas=topic_metas)

    app = QtGui.QApplication([''])
    widget = OWWordCloud()
    widget.on_topics_change(topic_table)

    text_domain = Domain([], metas=[StringVariable('text')])
    joined_text = ' '.join(words.flat)
    corpus = Corpus.from_numpy(text_domain,
                               X=np.zeros((1, 0)),
                               metas=np.array([[joined_text]]))
    widget.on_corpus_change(corpus)
    widget.show()
    app.exec()
def test_transform(self):
    """Vectorizing 'deerwester' returns a Corpus whose domain has length 43."""
    vectorizer = BowVectorizer()
    source = Corpus.from_file('deerwester')
    transformed = vectorizer.transform(source)

    self.assertIsInstance(transformed, Corpus)
    self.assertEqual(len(transformed.domain), 43)
def test_corpus_from_file(self):
    """'bookexcerpts.txt' loads as 140 documents with two meta columns."""
    dataset = os.path.join(DATASET_PATH, 'bookexcerpts.txt')
    corpus = Corpus.from_file(dataset)

    self.assertEqual(len(corpus), 140)
    self.assertEqual(len(corpus.domain), 0)
    self.assertEqual(len(corpus.domain.metas), 2)
    self.assertEqual(corpus.metas.shape, (140, 2))
def test_corpus_from_file_just_text(self):
    """'deerwester.tab' loads as 9 documents with a single meta column."""
    dataset = os.path.join(DATASET_PATH, 'deerwester.tab')
    corpus = Corpus.from_file(dataset)

    self.assertEqual(len(corpus), 9)
    self.assertEqual(len(corpus.domain), 0)
    self.assertEqual(len(corpus.domain.metas), 1)
    self.assertEqual(corpus.metas.shape, (9, 1))
def test_documents(self):
    """``documents`` yields one ``str`` per corpus row."""
    corpus = Corpus.from_file('book-excerpts')
    documents = corpus.documents
    doc_types = {type(doc) for doc in documents}

    self.assertEqual(len(documents), len(corpus))
    self.assertEqual(len(doc_types), 1)
    self.assertIn(str, doc_types)
def test_corpus_not_eq(self):
    """A corpus differs from variants that change one component at a time."""
    base = Corpus.from_file('book-excerpts')
    n_doc = base.X.shape[0]

    # No text features.
    variant = Corpus(base.domain, base.X, base.Y, base.metas, base.W, [])
    self.assertNotEqual(base, variant)

    # Different X.
    variant = Corpus(base.domain, np.ones((n_doc, 1)), base.Y, base.metas,
                     base.W, base.text_features)
    self.assertNotEqual(base, variant)

    # Different Y.
    variant = Corpus(base.domain, base.X, np.ones((n_doc, 1)), base.metas,
                     base.W, base.text_features)
    self.assertNotEqual(base, variant)

    # One meta value blanked out.
    altered_metas = np.copy(base.metas)
    altered_metas[0, 0] = ''
    variant = Corpus(base.domain, base.X, base.Y, altered_metas, base.W,
                     base.text_features)
    self.assertNotEqual(base, variant)

    # Different meta variable in the domain.
    replacement_meta = [StringVariable('text2')]
    altered_domain = Domain(base.domain.attributes, base.domain.class_var,
                            replacement_meta)
    variant = Corpus(altered_domain, base.X, base.Y, base.metas, base.W,
                     replacement_meta)
    self.assertNotEqual(base, variant)

    # Different n-gram range on an otherwise identical copy.
    variant = base.copy()
    variant.ngram_range = (2, 4)
    self.assertNotEqual(base, variant)
def test_create_bow(self):
    """TF-IDF bag of words for 'deerwester' is 9 x 42 and reports progress four times."""
    source = Corpus.from_file('deerwester')
    bow = self.bow(source, use_tfidf=True)

    self.assertIsNotNone(bow.X)
    shape = bow.X.shape
    self.assertEqual(9, shape[0])
    self.assertEqual(42, shape[1])

    self.assertEqual(self.progress_callbacks, 4)
    self.assertEqual(self.error_callbacks, 0)
def test_POSTagger(self):
    """Tagging adds ``pos_tags`` with one tag per token in every document."""
    source = Corpus.from_file('deerwester')
    tagger = tag.AveragedPerceptronTagger()
    tagged = tagger.tag_corpus(source)

    self.assertTrue(hasattr(tagged, 'pos_tags'))
    # for token in itertools.chain(*result.tokens):
    #     self.assertRegexpMatches(token, '[a-z]+_[A-Z]+')
    for doc_tokens, doc_tags in zip(tagged.tokens, tagged.pos_tags):
        self.assertEqual(len(doc_tokens), len(doc_tags))
def set_data(self, data=None):
    """Accept new input, converting non-Corpus data, then commit.

    With ``data=None`` the widget is reset, ``self.corpus`` becomes ``None``
    and ``commit`` is still called.
    """
    self.reset_widget()
    if data is None:
        self.corpus = None
    else:
        if isinstance(data, Corpus):
            self.corpus = data
        else:
            self.corpus = Corpus.from_table(data.domain, data)
        self.load_features()
        self.regenerate_docs()
    self.commit()
def test_from_table(self):
    """``Corpus.from_table`` keeps length and metas and picks the first meta as text."""
    table = Table.from_file('brown-selected')
    self.assertIsInstance(table, Table)

    corpus = Corpus.from_table(table.domain, table)
    self.assertIsInstance(corpus, Corpus)
    self.assertEqual(len(table), len(corpus))
    np.testing.assert_equal(table.metas, corpus.metas)

    first_meta = table.domain.metas[0]
    self.assertEqual(corpus.text_features, [first_meta])
def test_empty_corpus(self):
    """
    Empty data.
    GH-247
    """
    empty = Corpus.from_file("deerwester")[:0]
    vectorizer = BowVectorizer(norm=BowVectorizer.L1)
    transformed = vectorizer.transform(empty)
    self.assertEqual(transformed, empty)
def on_data(self, data):
    """Store new input (converted to a Corpus if needed) and schedule a redraw."""
    if data and not isinstance(data, Corpus):
        data = Corpus.from_table(data.domain, data)
    self.data = data
    self._repopulate_attr_combo(data)

    if data:
        QTimer.singleShot(0, self.on_attr_change)
        return

    # No data: clear the selected region and render an empty map.
    self.region_selected('')
    QTimer.singleShot(
        0, lambda: self.webview.evalJS('DATA = {}; renderMap();'))
def test_init_preserve_shape_of_empty_x(self):
    """A Corpus built with an all-zero sparse X keeps that matrix's shape."""
    source = Corpus.from_file('book-excerpts')
    source_domain = source.domain
    domain = Domain((ContinuousVariable('c1'),),
                    source_domain.class_vars,
                    source_domain.metas)
    sparse_x = csr_matrix((len(source), 1))
    rebuilt = Corpus(domain, X=sparse_x, Y=source.Y, metas=source.metas)

    self.assertEqual(sparse_x.nnz, 0)
    self.assertEqual(rebuilt.X.shape, sparse_x.shape)
def set_data(self, data=None):
    """Reset the widget, load ``data`` if given, and send the corpus to output."""
    # Clear any old data.
    self.reset_widget()

    if data is not None:
        if isinstance(data, Table):
            self.corpus = Corpus.from_table(data.domain, data)
        else:
            self.corpus = data
        self.load_features()
        self.regenerate_documents()

    # Send the corpus to output.
    self.send(Output.CORPUS, self.corpus)
def test_compute_values_to_different_domain(self):
    """Sentiment attributes carry over when another corpus is transformed to the domain."""
    target = Corpus.from_file('andersen')
    self.assertFalse(self.corpus.domain.attributes)
    self.assertFalse(target.domain.attributes)

    scored = self.method.transform(self.corpus)
    projected = target.transform(scored.domain)

    self.assertTrue(scored.domain.attributes)
    self.assertEqual(scored.domain.attributes,
                     projected.domain.attributes)
def test_documents_from_features(self):
    """Documents built from the class variable are strings containing exactly one class value."""
    corpus = Corpus.from_file('book-excerpts')
    class_var = corpus.domain.class_var
    documents = corpus.documents_from_features([class_var])
    doc_types = {type(doc) for doc in documents}

    one_label_each = all(
        sum(label in doc for label in corpus.domain.class_var.values) == 1
        for doc in documents)
    self.assertTrue(one_label_each)
    self.assertEqual(len(documents), len(corpus))
    self.assertEqual(len(doc_types), 1)
    self.assertIn(str, doc_types)
def test_ngrams(self):
    """Uni-, bi- and tri-grams of the first document appear as attribute names."""
    vect = BowVectorizer()
    corpus = Corpus.from_file('deerwester')
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    pr = preprocess.Preprocessor(
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        ngrams_range=(1, 3))
    pr(corpus, inplace=True)
    result = vect.transform(corpus)

    attrs = [attr.name for attr in result.domain.attributes]
    first_doc = corpus.tokens[0]
    self.assertIn(first_doc[1], attrs)
    self.assertIn(' '.join(first_doc[:2]), attrs)
    self.assertIn(' '.join(first_doc[:3]), attrs)
def test_corpus_remove_text_features(self):
    """
    Remove those text features which do not have a column in metas.
    GH-324
    GH-325
    """
    corpus = Corpus.from_file('deerwester')
    source_domain = corpus.domain
    metaless_domain = Domain(attributes=source_domain.attributes,
                             class_vars=source_domain.class_vars)
    reduced = corpus.transform(metaless_domain)
    self.assertFalse(len(reduced.text_features))
    # Make sure that copying works.
    reduced.copy()
def search(self, query, date_from=None, date_to=None, max_docs=None,
           on_progress=None, should_break=None):
    """
    Download documents matching ``query`` page by page.

    Args:
        query (str): Search query.
        date_from (date): Start date limit.
        date_to (date): End date limit.
        max_docs (int): Maximal number of documents returned.
        on_progress (callback): Called after every iteration of downloading.
        should_break (callback): Callback for breaking the computation
            before the end. If it evaluates to True, downloading is
            stopped and document downloaded till now are returned in
            a Corpus.

    Returns:
        Corpus: Search results, or None when the first page yields no data.

    Raises:
        RuntimeError: If the API key is not valid.
    """
    if not self.api_key_valid():
        raise RuntimeError('The API key is not valid.')
    if max_docs is None or max_docs > MAX_DOCS:
        max_docs = MAX_DOCS

    def report(done, total):
        if callable(on_progress):
            on_progress(done, total)

    # TODO create corpus on the fly and extend, so it stops faster.
    first_page, cached = self._fetch_page(query, date_from, date_to, 0)
    if first_page is None:
        return None
    records = list(first_page['response']['docs'])
    # Never ask for more documents than the service reports as hits.
    max_docs = min(first_page['response']['meta']['hits'], max_docs)
    report(len(records), max_docs)

    n_pages = math.ceil(max_docs/BATCH_SIZE)
    for page in range(1, n_pages):
        if callable(should_break) and should_break():
            break
        page_data, cached = self._fetch_page(query, date_from, date_to, page)
        if page_data is None:
            break
        records.extend(page_data['response']['docs'])
        report(len(records), max_docs)
        # Pause only after pages that were not served from the cache.
        if not cached:
            sleep(SLEEP)

    records = records[:max_docs]
    return Corpus.from_documents(records, 'NY Times', self.attributes,
                                 self.class_vars, self.metas,
                                 title_indices=[-1])
def test_corpus_not_eq(self):
    """A corpus differs from variants that change one constructor argument at a time."""
    base = Corpus.from_file('bookexcerpts')

    # One document fewer.
    variant = Corpus(base.documents[:-1], base.X, base.Y, base.metas,
                     base.domain)
    self.assertNotEqual(base, variant)

    # X stacked on itself.
    doubled_x = np.vstack((base.X, base.X))
    variant = Corpus(base.documents, doubled_x, base.Y, base.metas,
                     base.domain)
    self.assertNotEqual(base, variant)

    # Y stacked on itself.
    doubled_y = np.vstack((base.Y, base.Y))
    variant = Corpus(base.documents, base.X, doubled_y, base.metas,
                     base.domain)
    self.assertNotEqual(base, variant)

    # Transposed metas.
    variant = Corpus(base.documents, base.X, base.Y, base.metas.T,
                     base.domain)
    self.assertNotEqual(base, variant)

    # Different meta variable in the domain.
    altered_domain = Domain(base.domain.attributes, base.domain.class_var,
                            [StringVariable('text2')])
    variant = Corpus(base.documents, base.X, base.Y, base.metas,
                     altered_domain)
    self.assertNotEqual(base, variant)
def test_copy(self):
    """A copy is a distinct object and compares equal only when preprocessed alike."""
    corpus = Corpus.from_file('deerwester')
    # Raw string: '\w' and '\s' in a normal literal are invalid escapes.
    p = preprocess.Preprocessor(
        tokenizer=preprocess.RegexpTokenizer(r'\w+\s}'))

    # Preprocess only the copy: it must now differ from the original.
    copied = corpus.copy()
    p(copied, inplace=True)
    self.assertIsNot(copied, corpus)
    self.assertNotEqual(copied, corpus)

    # Preprocess the original, then copy: the two must be equal.
    p(corpus, inplace=True)
    copied = corpus.copy()
    self.assertIsNot(copied, corpus)
    self.assertEqual(copied, corpus)
def test_extend_corpus(self):
    """Extending a corpus with its own rows doubles it and registers the new class value."""
    corpus = Corpus.from_file('book-excerpts')
    class_values = corpus.domain.class_var.values
    n_classes = len(class_values)
    snapshot = corpus.copy()

    labels = [corpus.domain.class_var.values[int(i)] for i in corpus.Y]
    labels[0] = 'teenager'  # a value not used as a label so far
    corpus.extend_corpus(corpus.metas, labels)

    self.assertEqual(len(corpus), len(snapshot)*2)
    self.assertEqual(corpus.Y.shape[0], snapshot.Y.shape[0]*2)
    self.assertEqual(corpus.metas.shape[0], snapshot.metas.shape[0]*2)
    self.assertEqual(corpus.metas.shape[1], snapshot.metas.shape[1])
    self.assertEqual(len(snapshot.domain.class_var.values), n_classes+1)
def test_domain(self):
    """Attribute names are sorted and every non-zero column names a token of that document."""
    vectorizer = BowVectorizer()
    corpus = Corpus.from_file('deerwester')
    transformed = vectorizer.transform(corpus)

    names = [attr.name for attr in transformed.domain.attributes]
    self.assertEqual(names, sorted(names))

    dense = transformed.X.toarray()
    for doc_idx in range(len(corpus)):
        present = [name for weight, name in zip(dense[doc_idx], names)
                   if weight > .001]
        for name in present:
            self.assertIn(name, corpus.tokens[doc_idx])
def test_extend_corpus(self):
    """Extending a corpus with its own documents doubles it and registers the new class value."""
    dataset = os.path.join(DATASET_PATH, 'bookexcerpts.tab')
    corpus = Corpus.from_file(dataset)
    n_classes = len(corpus.domain.class_var.values)
    snapshot = corpus.copy()

    labels = [corpus.domain.class_var.values[int(i)] for i in corpus.Y]
    labels[0] = 'teenager'  # a value not used as a label so far
    corpus.extend_corpus(corpus.documents, corpus.metas, labels)

    self.assertEqual(len(corpus), len(snapshot)*2)
    self.assertEqual(corpus.Y.shape[0], snapshot.Y.shape[0]*2)
    self.assertEqual(corpus.metas.shape[0], snapshot.metas.shape[0]*2)
    self.assertEqual(corpus.metas.shape[1], snapshot.metas.shape[1])
    self.assertEqual(len(snapshot.domain.class_var.values), n_classes+1)
def open_file(self, path):
    """Read a corpus from ``path`` and refresh the used/unused attribute lists.

    The read-file error and both attribute lists are cleared first. A falsy
    ``path`` stops there. Any ordinary exception raised while loading is
    reported through ``self.Error.read_file`` instead of propagating.

    Args:
        path: Location of the corpus file.
    """
    self.Error.read_file.clear()
    self.used_attrs[:] = []
    self.unused_attrs[:] = []
    if not path:
        return

    try:
        self.corpus = Corpus.from_file(path)
        self.corpus.name = os.path.splitext(os.path.basename(path))[0]
        self.info_label.setText(
            "Corpus of {} documents.".format(len(self.corpus)))
        text_features = self.corpus.text_features
        self.used_attrs.extend(text_features)
        self.unused_attrs.extend(
            [f for f in self.corpus.domain.metas
             if f.is_string and f not in text_features])
    except Exception as err:
        # Exception, not BaseException: KeyboardInterrupt and SystemExit
        # must not be reported as file-reading errors.
        self.Error.read_file(path, str(err))
def test_set_text_features(self):
    """``set_text_features(None)`` returns the same value as ``_infer_text_features()`` on a copy."""
    corpus = Corpus.from_file('friends-transcripts')[:100]
    twin = corpus.copy()
    returned = corpus.set_text_features(None)
    inferred = twin._infer_text_features()
    self.assertEqual(returned, inferred)
def test_empty_corpus(self):
    """Transforming an empty corpus still adds the new columns and returns no rows."""
    empty = Corpus.from_file('deerwester')[:0]
    scored = self.method.transform(empty)

    self.assertEqual(len(scored.domain),
                     len(self.corpus.domain) + self.new_cols)
    self.assertEqual(len(scored), 0)
def test_has_tokens(self):
    """``has_tokens`` is false for a fresh corpus and true once tokens are stored."""
    fresh = Corpus.from_file('deerwester')
    self.assertFalse(fresh.has_tokens())

    # default tokenizer
    default_tokens = fresh.tokens
    fresh.store_tokens(default_tokens)
    self.assertTrue(fresh.has_tokens())
self.profiler.new_token() self.token = self.profiler.token self.refresh_token_info() self.commit() def token_changed(self): self.profiler.token = self.token self.refresh_token_info() self.commit() def refresh_token_info(self): self.credit = str(self.profiler.get_credit()) def send_report(self): self.report_items([ ('Documents', self.n_documents), ('Attribute', self.strings_attrs[self.tweet_attr] if len(self.strings_attrs) > self.tweet_attr else ''), ('Emotions', self.model_name), ('Output', self.output_mode), ]) if __name__ == '__main__': app = QtGui.QApplication([]) corpus = Corpus.from_file('Election-2016-Tweets.tab') widget = OWTweetProfiler() widget.set_corpus(corpus[:100]) widget.show() app.exec()
def setUp(self):
    """Create an OWTBagOfWords widget and load the 'deerwester' corpus."""
    widget = self.create_widget(OWTBagOfWords)
    self.widget = widget
    self.corpus = Corpus.from_file('deerwester')
""" input: Corpus preprocessed with Preprocess Text. Tokenizer is set to Sentences. output: Corpus where sentences are now documents. requires: Text add-on """ import numpy as np from Orange.data import Domain, StringVariable from orangecontrib.text.corpus import Corpus tokens = in_data.tokens title = [i for i in in_data.domain.metas if "title" in i.attributes][0] new_domain = Domain(attributes=[], metas=[StringVariable('Sentences'), title) titles = [] content = [] for i, doc in enumerate(tokens): for t in doc: titles.append(in_data[i][title.name].value) content.append(t) metas = np.column_stack((content, titles)) out_data = Corpus.from_numpy(domain=new_domain, X=np.empty((len(content), 0)), metas=metas) out_data.set_text_features([StringVariable('Sentences')]) out_data.set_title_variable(title)
def setUp(self):
    """Load 'deerwester' and configure the SentiArt method, which adds seven columns."""
    corpus = Corpus.from_file('deerwester')
    self.corpus = corpus
    self.method = SentiArt()
    self.new_cols = 7
def test_extend_corpus_non_empty_X(self):
    """``extend_corpus`` raises ValueError for the first ten 'election-tweets-2016' rows."""
    tweets = Corpus.from_file('election-tweets-2016')[:10]
    with self.assertRaises(ValueError):
        tweets.extend_corpus(tweets.metas, tweets.Y)
def setUp(self):
    """Load the 'deerwester' corpus and create an OWTopicModeling widget."""
    corpus = Corpus.from_file("deerwester")
    self.corpus = corpus
    self.widget = self.create_widget(OWTopicModeling)
f"{cc_len}" input_string = (f"{cor_output_len or 0} documents\n" f"{n_selected or 0} selected words\n" f"{cc_len} words with counts") self.info.set_output_summary(input_numbers, input_string) def send_report(self): if self.webview: html = self.webview.html() start = html.index(">", html.index("<body")) + 1 end = html.index("</body>") body = html[start:end] # create an empty div of appropriate height to compensate for # absolute positioning of words in the html height = self.webview._evalJS( "document.getElementById('canvas').clientHeight") self.report_html += "<div style='position: relative; height: " \ f"{height}px;'>{body}</div>" self.report_table(self.tableview) def sizeHint(self) -> QtCore.QSize: return super().sizeHint().expandedTo(QSize(900, 500)) if __name__ == "__main__": from orangewidget.utils.widgetpreview import WidgetPreview corpus = Corpus.from_file("book-excerpts") WidgetPreview(OWWordCloud).run(corpus)
def setUp(self):
    """Create an OWConcordance widget and load the 'deerwester' corpus."""
    widget = self.create_widget(OWConcordance)
    self.widget = widget  # type: OWConcordance
    self.corpus = Corpus.from_file('deerwester')
def setUp(self):
    """Create an OWSentimentAnalysis widget and load the 'deerwester' corpus."""
    widget = self.create_widget(OWSentimentAnalysis)
    self.widget = widget
    self.corpus = Corpus.from_file('deerwester')
("Matching documents", self.n_matching), ("Matches", self.n_matches))) def showEvent(self, event): super().showEvent(event) self.update_splitter() def update_splitter(self): """ Update splitter that document list on the left never take more than 1/3 of the space. It is only set on showEvent. If user later changes sizes it stays as it is. """ w1, w2 = self.splitter.sizes() ws = w1 + w2 if w2 < 2 / 3 * ws: self.splitter.setSizes([int(ws * 1 / 3), int(ws * 2 / 3)]) if __name__ == '__main__': from orangecontrib.text.preprocess import BASE_TOKENIZER from orangecontrib.text.tag.pos import AveragedPerceptronTagger from orangewidget.utils.widgetpreview import WidgetPreview corpus = Corpus.from_file('book-excerpts') corpus = corpus[:3] tagger = AveragedPerceptronTagger() tagged_corpus = tagger(BASE_TOKENIZER(corpus)) tagged_corpus.ngram_range = (1, 2) WidgetPreview(OWCorpusViewer).run(tagged_corpus)
def setUp(self):
    """Load the 'deerwester' corpus used by the tests."""
    corpus = Corpus.from_file('deerwester')
    self.corpus = corpus
def setUp(self):
    """Create an OWWordCloud widget, load 'deerwester' and build a topic fixture."""
    widget = self.create_widget(OWWordCloud)
    self.widget = widget
    self.corpus = Corpus.from_file('deerwester')
    self.topic = self.create_topic()
def test_binary(self):
    """With binary local weighting the largest value in X is 1."""
    vectorizer = BowVectorizer(wlocal=BowVectorizer.BINARY)
    source = Corpus.from_file('deerwester')
    transformed = vectorizer.transform(source)
    largest = transformed.X.max()
    self.assertEqual(largest, 1.)
def test_compute_values(self):
    """Rebuilding the corpus in the sentiment domain reproduces the same X."""
    scored = self.method.transform(self.corpus)
    recomputed = Corpus.from_table(scored.domain, self.corpus)

    self.assertEqual(scored.domain, recomputed.domain)
    all_equal = (scored.X == recomputed.X).all()
    self.assertTrue(all_equal)
QtGui.QPalette.HighlightedText)) textRect = style.subElementRect(QStyle.SE_ItemViewItemText, options) painter.save() painter.translate(textRect.topLeft()) painter.setClipRect(textRect.translated(-textRect.topLeft())) doc.documentLayout().draw(painter, ctx) painter.restore() def sizeHint(self, option, index): options = QStyleOptionViewItem(option) self.initStyleOption(options, index) doc = QtGui.QTextDocument() doc.setHtml(options.text) doc.setTextWidth(options.rect.width()) return QtCore.QSize(doc.idealWidth(), doc.size().height()) if __name__ == '__main__': from AnyQt.QtWidgets import QApplication app = QApplication([]) widget = OWTopicModeling() # widget.set_data(Corpus.from_file('book-excerpts')) widget.set_data(Corpus.from_file('deerwester')) widget.show() app.exec() widget.saveSettings()
def test_empty_tokens(self):
    """With no text features and ``copy=False`` the same corpus object is returned."""
    source = Corpus.from_file('deerwester')
    source.text_features = []
    vectorizer = BowVectorizer()
    result = vectorizer.transform(source, copy=False)
    self.assertIs(source, result)
def setUp(self):
    """Load 'deerwester' and configure the MultiSentiment method, which adds one column."""
    corpus = Corpus.from_file('deerwester')
    self.corpus = corpus
    self.method = MultiSentiment()
    self.new_cols = 1
def test_corpus_from_file_with_tab(self):
    """Loading by bare name and by name with '.tab' gives equal corpora."""
    by_name = Corpus.from_file('book-excerpts')
    by_file_name = Corpus.from_file('book-excerpts.tab')
    self.assertEqual(by_name, by_file_name)
def setUp(self):
    """Load 'deerwester' and configure the English LiuHuSentiment method, which adds one column."""
    corpus = Corpus.from_file('deerwester')
    self.corpus = corpus
    self.method = LiuHuSentiment('English')
    self.new_cols = 1
def test_corpus_from_file_abs_path(self):
    """Loading by dataset name and by absolute file path gives equal corpora."""
    by_name = Corpus.from_file('book-excerpts')

    here = os.path.dirname(__file__)
    relative = os.path.join(here, '..', 'datasets', 'book-excerpts.tab')
    absolute = os.path.abspath(relative)
    by_path = Corpus.from_file(absolute)

    self.assertEqual(by_name, by_path)
def test_corpus_from_init(self):
    """A Corpus constructed from another corpus's parts equals the original."""
    loaded = Corpus.from_file('book-excerpts')
    rebuilt = Corpus(loaded.domain, loaded.X, loaded.Y, loaded.metas,
                     loaded.text_features)
    self.assertEqual(loaded, rebuilt)
def create_configuration_layout(self):
    """Build the form with the simhash size and shingle length spin boxes."""
    form = QFormLayout()

    size_spin = gui.spin(self, self, "f", minv=8,
                         maxv=SimhashVectorizer.max_f, step=8)
    size_spin.editingFinished.connect(self.f_spin_changed)
    form.addRow('Simhash size:', size_spin)

    shingle_spin = gui.spin(self, self, 'shingle_len', minv=1, maxv=100)
    shingle_spin.editingFinished.connect(self.on_change)
    form.addRow('Shingle length:', shingle_spin)

    return form


def init_method(self):
    """Instantiate ``self.Method`` with the current shingle length and size."""
    return self.Method(shingle_len=self.shingle_len, f=self.f)


def f_spin_changed(self):
    """Round ``self.f`` to the nearest multiple of 8, then signal the change."""
    # simhash needs f value to be multiple of 8, correct if it is not
    multiples = round(self.f / 8)
    self.f = 8 * multiples
    self.on_change()


if __name__ == '__main__':
    from orangewidget.utils.widgetpreview import WidgetPreview

    preview_corpus = Corpus.from_file("book-excerpts")
    WidgetPreview(OWSimhash).run(preview_corpus)
def setUp(self):
    """Create an OWCorpusViewer widget and load the 'deerwester' corpus."""
    widget = self.create_widget(OWCorpusViewer)
    self.widget = widget
    self.corpus = Corpus.from_file('deerwester')
def test_corpus_from_file_missing(self):
    """Loading 'missing_file' raises FileNotFoundError."""
    self.assertRaises(FileNotFoundError, Corpus.from_file, 'missing_file')
def sendData(self):
    """Convert input(s) and send output.

    A corpus input is converted to a Textable segmentation (one segment per
    row with non-empty text; the row's other values become annotations). A
    segmentation input is converted to a Text Mining corpus whose text
    feature is the segment content. Without any input, ``None`` is sent on
    both outputs and a warning is shown.
    """
    if not (self.segmentation or self.corpus):
        self.infoBox.setText(u'Widget needs input.', 'warning')
        self.send('Textable segmentation', None, self)
        self.send('Text Mining corpus', None)
        return

    msg_seg = msg_corpus = ""

    # One progress step per corpus row plus one per segment.
    num_iterations = 0
    if self.corpus:
        num_iterations += len(self.corpus)
    if self.segmentation:
        num_iterations += len(self.segmentation)
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=num_iterations)

    # Convert corpus to segmentation...
    if self.corpus:
        self.clearCreatedInputs()
        new_segments = list()
        text_feature = self.corpus.text_features[self.segmentContent]
        for row in self.corpus:
            content = row[text_feature].value
            if content == "":
                continue
            new_input = Input(content)
            new_segment_annotations = dict()
            # "?" is treated as a missing value and not copied.
            for attr in self.corpus.domain:
                attr_str = str(row[attr])
                if attr_str != "?":
                    new_segment_annotations[str(attr)] = attr_str
            for meta_attr in self.corpus.domain.metas:
                meta_attr_str = str(row[meta_attr])
                if (meta_attr != text_feature and
                        meta_attr_str != "?"):
                    new_segment_annotations[str(meta_attr)] = meta_attr_str
            new_segments.append(
                Segment(new_input[0].str_index,
                        new_input[0].start,
                        new_input[0].end,
                        new_segment_annotations))
            self.createdInputs.append(new_input)
            progressBar.advance()
        new_segmentation = Segmentation(new_segments, self.captionTitle)
        msg_seg = u'%i segment@p' % len(new_segmentation)
        msg_seg = pluralize(msg_seg, len(new_segmentation))
        self.send('Textable segmentation', new_segmentation)
    else:
        self.send('Textable segmentation', None)

    # Convert segmentation to corpus...
    if self.segmentation:
        metas = list()
        attributes = list()
        meta_keys = list()
        attribute_keys = list()
        for key in self.segmentation.get_annotation_keys():
            possible_values = set()
            for segment in self.segmentation:
                try:
                    possible_values.add(str(segment.annotations[key]))
                except KeyError:
                    pass
            # Keys with too many distinct values become string metas
            # instead of discrete attributes.
            if (self.limitNumCategories and
                    len(possible_values) > self.maxNumCategories):
                metas.append(StringVariable(key))
                meta_keys.append(key)
            else:
                attributes.append(
                    DiscreteVariable(key, values=list(possible_values)))
                attribute_keys.append(key)
        metas.append(StringVariable("textable_text"))
        domain = Domain(attributes, [], metas)
        rows = list()
        for segment in self.segmentation:
            row = [
                str(segment.annotations.get(annotation_key, None))
                for annotation_key in attribute_keys
            ]
            row.extend([
                str(segment.annotations.get(annotation_key, None))
                for annotation_key in meta_keys
            ])
            row.append(segment.get_content())
            rows.append(row)
            # Call the method; a bare attribute reference does nothing.
            progressBar.advance()
        table = Table(domain, rows)
        if textMiningIsInstalled:
            corpus = Corpus(domain,
                            X=table.X,
                            metas=table.metas,
                            text_features=[metas[-1]])
            msg_corpus = u'%i document@p' % len(self.segmentation)
            msg_corpus = pluralize(msg_corpus, len(self.segmentation))
            self.send('Text Mining corpus', corpus)
        else:
            # No corpus was built, so there is nothing to send.
            self.send('Text Mining corpus', None)
    else:
        self.send('Text Mining corpus', None)

    progressBar.finish()
    self.controlArea.setDisabled(False)

    if msg_seg or msg_corpus:
        message = msg_seg
        if msg_seg and msg_corpus:
            message += " and "
        message += msg_corpus
        message += " sent to output."
        self.infoBox.setText(message)

    self.sendButton.resetSettingsChangedFlag()
def setUp(self):
    """Load 'slo-opinion-corpus' and configure the Slovenian LiuHuSentiment method, which adds one column."""
    corpus = Corpus.from_file('slo-opinion-corpus')
    self.corpus = corpus
    self.method = LiuHuSentiment('Slovenian')
    self.new_cols = 1