def test_init(self):
    """CorpusToNetwork stores the corpus and rejects non-Corpus input."""
    corpus = Corpus.from_file('deerwester')
    converter = CorpusToNetwork(corpus)
    self.assertIsInstance(converter.corpus, Corpus)
    self.assertEqual(len(converter.corpus), 9)
    # Anything that is not a Corpus must be rejected.
    with self.assertRaises(Exception):
        converter = CorpusToNetwork(corpus.domain)
def test_titles_no_newline(self):
    """A newline embedded in the title is shown as a space in the view."""
    corpus = Corpus.from_file("andersen")
    with corpus.unlocked():
        corpus.metas[0, 0] = corpus.metas[0, 0] + "\ntest"
    corpus.set_title_variable("Title")
    self.send_signal(self.widget.Inputs.corpus, corpus)
    displayed = self.widget.view.model().index(0, 0).data()
    self.assertEqual("The Little Match-Seller test", displayed)
def main():
    """Manual preview of OWSentimentAnalysis on a three-document corpus."""
    app = QApplication([])
    widget = OWSentimentAnalysis()
    sample = Corpus.from_file('book-excerpts')[:3]
    widget.set_corpus(sample)
    widget.show()
    app.exec()
def main():
    """Manual preview of OWSentimentAnalysis on a three-document corpus.

    Fix: the bundled dataset is named 'book-excerpts' (hyphenated), as used
    everywhere else in this file; 'bookexcerpts' would make
    Corpus.from_file fail to locate the dataset.
    """
    app = QApplication([])
    widget = OWSentimentAnalysis()
    corpus = Corpus.from_file('book-excerpts')
    corpus = corpus[:3]
    widget.set_corpus(corpus)
    widget.show()
    app.exec()
def test_append_to_existing_X(self):
    """New statistics columns are appended to the corpus' existing X."""
    data = Corpus.from_file("election-tweets-2016")
    self.send_signal(self.widget.Inputs.corpus, data)
    self.wait_until_finished()
    statistics = self.get_output(self.widget.Outputs.corpus)
    # Two new features on top of the original attribute columns.
    expected_shape = (data.X.shape[0], data.X.shape[1] + 2)
    self.assertTupleEqual(expected_shape, statistics.X.shape)
def test_empty(self):
    """An empty corpus produces an empty network."""
    empty = Corpus.from_file('deerwester')[:0]
    network = CorpusToNetwork(empty)(
        document_nodes=True, window_size=1, threshold=1, freq_threshold=1)
    self.assertEqual(network.number_of_nodes(), 0)
    self.assertEqual(network.number_of_edges(), 0)
def test_corpus_not_normalized(self):
    """Warning shows for a raw corpus and clears for a normalized one."""
    raw = Corpus.from_file("book-excerpts")
    self.send_signal(self.widget.Inputs.corpus, raw)
    self.assertTrue(self.widget.Warning.corpus_not_normalized.is_shown())
    # Sending the preprocessed corpus must clear the warning again.
    self.send_signal(self.widget.Inputs.corpus, self.corpus)
    self.assertFalse(self.widget.Warning.corpus_not_normalized.is_shown())
def setUp(self):
    """Patch SemanticSearch with a dummy and prepare widget inputs."""
    target = ("orangecontrib.text.widgets.owsemanticviewer."
              "SemanticSearch")
    self.patcher = patch(target, new=DummySearch)
    self.patcher.start()
    self.widget = self.create_widget(OWSemanticViewer)
    self.corpus = Corpus.from_file("deerwester")
    self.words = create_words_table(["foo", "graph", "minors", "trees"])
def test_output_unique(self):
    """A clashing feature name is disambiguated with a '(1)' suffix."""
    corpus = Corpus.from_file("book-excerpts")
    # Pre-existing "Word count" column forces a name clash on the output.
    column = np.array([1 for _ in range(len(corpus))])
    corpus = corpus.add_column(ContinuousVariable("Word count"), column)
    words = create_words_table(["doctor", "rum", "house"])
    self.send_signal(self.widget.Inputs.corpus, corpus)
    self.send_signal(self.widget.Inputs.words, words)
    self.wait_until_finished()
    output = self.get_output(self.widget.Outputs.selected_documents)
    self.assertIn("Word count (1)", output.domain)
def test_title_already_in_dataset(self):
    """A dataset with a designated title attribute selects it by default."""
    data = Corpus.from_file("election-tweets-2016")
    self.send_signal(self.widget.Inputs.data, data)
    # "Content" carries the title flag in this dataset.
    self.assertEqual(data.domain["Content"], self.widget.title_variable)
    self.check_output("Content")
def test_cache(self):
    """Calling with identical parameters returns the cached network."""
    converter = CorpusToNetwork(Corpus.from_file('deerwester'))
    params = dict(document_nodes=True, window_size=1,
                  threshold=1, freq_threshold=1)
    first = converter(**params)
    second = converter(**params)
    # Same object, not merely an equal one.
    self.assertIs(first, second)
def test_call_word(self):
    """Word-node mode yields a Network with one item-table row per node."""
    corpus = Corpus.from_file('deerwester')
    converter = CorpusToNetwork(corpus)
    network = converter(document_nodes=False, window_size=1,
                        threshold=1, freq_threshold=1)
    items = converter.get_current_items(False)
    self.assertIsInstance(network, Network)
    self.assertIsInstance(items, Table)
    self.assertEqual(len(items), network.number_of_nodes())
    # A word network has more nodes than there are documents.
    self.assertGreater(network.number_of_nodes(), len(corpus))
def main():
    """Manual preview of OWWordEnrichment with a BoW-vectorized corpus."""
    corpus = Corpus.from_file('book-excerpts')
    corpus_vect = BowVectorizer().transform(corpus)
    app = QApplication([])
    widget = OWWordEnrichment()
    widget.set_data(corpus_vect)
    # First ten documents act as the selected subset.
    widget.set_data_selected(corpus_vect[:10])
    widget.handleNewSignals()
    widget.show()
    app.exec()
def test_output_status(self): """ Test input, output info """ # when input signal data = Corpus.from_file("election-tweets-2016") out_sum = self.widget.info.set_output_summary = Mock() self.send_signal(self.widget.Inputs.data, data) self.wait_until_finished() out_sum.assert_called_with( str(len(data)), "6444 document(s)\n4 text features(s)\n7 other feature(s)\n" "Classification; discrete class with 2 values.") out_sum.reset_mock() # corpus without class data1 = Corpus(Domain(data.domain.attributes, metas=data.domain.metas), data.X, metas=data.metas, text_features=data.text_features) self.send_signal(self.widget.Inputs.data, data1) self.wait_until_finished() out_sum.assert_called_with( str(len(data)), "6444 document(s)\n4 text features(s)\n7 other feature(s)") out_sum.reset_mock() # corpus with continuous class data1 = Corpus(Domain(data.domain.attributes, ContinuousVariable("a"), metas=data.domain.metas), data.X, np.random.rand(len(data), 1), metas=data.metas, text_features=data.text_features) self.send_signal(self.widget.Inputs.data, data1) self.wait_until_finished() out_sum.assert_called_with( str(len(data)), "6444 document(s)\n4 text features(s)\n7 other feature(s)\n" "Regression; numerical class.") out_sum.reset_mock() # default dataset is on the output self.send_signal(self.widget.Inputs.data, None) self.wait_until_finished() out_sum.assert_called_with( "140", "140 document(s)\n1 text features(s)\n0 other feature(s)\n" "Classification; discrete class with 2 values.") out_sum.reset_mock()
def test_hide_attributes(self):
    """Output attributes' 'hidden' flag follows the hide checkbox."""
    signal = "语料库(Corpus)"
    self.send_signal(signal, self.corpus)
    out_attrs = self.get_output(signal).domain.attributes
    self.assertTrue(all(f.attributes['hidden'] for f in out_attrs))
    # Unchecking the box exposes every attribute.
    self.widget.controls.hidden_cb.setChecked(False)
    out_attrs = self.get_output(signal).domain.attributes
    self.assertFalse(any(f.attributes['hidden'] for f in out_attrs))
    # The unchecked state persists for a newly sent corpus.
    new_corpus = Corpus.from_file('book-excerpts')[:10]
    self.send_signal(signal, new_corpus)
    out_attrs = self.get_output(signal).domain.attributes
    self.assertFalse(any(f.attributes['hidden'] for f in out_attrs))
def setUp(self) -> None:
    """Create the widget, a stemmed test corpus, and a word list."""
    self.widget: OWScoreDocuments = self.create_widget(OWScoreDocuments)
    # create corpus: lowercase, strip accents, then stem
    corpus = Corpus.from_file("book-excerpts")
    for pp in (preprocess.LowercaseTransformer(),
               preprocess.StripAccentsTransformer(),
               preprocess.SnowballStemmer()):
        corpus = pp(corpus)
    self.corpus = corpus
    # create words table used as the scoring input
    self.words = self.create_words_table(
        ["house", "doctor", "boy", "way", "Rum"])
def test_input_status(self):
    """Input summary reflects the number of instances on the input."""
    corpus = Corpus.from_file("election-tweets-2016")
    summary_mock = self.widget.info.set_input_summary = Mock()
    self.send_signal(self.widget.Inputs.data, corpus)
    summary_mock.assert_called_with(
        str(len(corpus)), f"{len(corpus)} data instances on input")
    summary_mock.reset_mock()
    # A single instance uses the singular wording.
    self.send_signal(self.widget.Inputs.data, corpus[:1])
    summary_mock.assert_called_with("1", "1 data instance on input")
    summary_mock.reset_mock()
    # Removing the input resets the summary to "no input".
    self.send_signal(self.widget.Inputs.data, None)
    summary_mock.assert_called_with(self.widget.info.NoInput)
    summary_mock.reset_mock()
def test_no_text_feature(self):
    """A corpus without text features shows no error; all features unused.

    The widget loads book-excerpts at startup and stores context
    settings; resetting them prevents the context from pre-selecting
    the Text variable into used attributes.
    """
    self.widget.settingsHandler.reset_to_original(self.widget)
    corpus = Corpus.from_file("book-excerpts")
    corpus.text_features = []
    self.send_signal(self.widget.Inputs.data, corpus)
    self.wait_until_finished()
    error = self.widget.Error.corpus_without_text_features
    self.assertFalse(error.is_shown())
    self.assertEqual(0, len(list(self.widget.used_attrs_model)))
    self.assertListEqual([corpus.domain["Text"]],
                         list(self.widget.unused_attrs_model))
def test_params(self):
    """Node/edge counts respond to threshold, window and frequency."""
    converter = CorpusToNetwork(Corpus.from_file('deerwester'))
    base = converter(document_nodes=False, window_size=1,
                     threshold=1, freq_threshold=1)
    # A stricter co-occurrence threshold removes edges.
    strict = converter(document_nodes=False, window_size=1,
                       threshold=100, freq_threshold=1)
    self.assertGreater(base.number_of_edges(), strict.number_of_edges())
    # A wider window adds edges.
    wide = converter(document_nodes=False, window_size=10,
                     threshold=1, freq_threshold=1)
    self.assertLess(base.number_of_edges(), wide.number_of_edges())
    # A high frequency threshold drops nodes.
    rare = converter(document_nodes=False, window_size=1,
                     threshold=1, freq_threshold=100)
    self.assertGreater(base.number_of_nodes(), rare.number_of_nodes())
def test_preprocess_words(self):
    """Input words are run through the corpus' preprocessors."""
    corpus = Corpus.from_file("book-excerpts")
    words = [
        "House",
        "dóctor",
        "boy",
        "way",
        "Rum https://google.com",
        "https://google.com",
        "<p>abra<b>cadabra</b><p>",
    ]
    # Lowercasing, accent stripping, URL removal and HTML stripping.
    for pp in (preprocess.LowercaseTransformer(),
               preprocess.StripAccentsTransformer(),
               preprocess.UrlRemover(),
               preprocess.HtmlTransformer()):
        corpus = pp(corpus)
    self.assertListEqual(
        ["house", "doctor", "boy", "way", "rum", "abracadabra"],
        _preprocess_words(corpus, words, dummy_callback),
    )
    # Stemming maps derived forms onto their stems.
    words = ["House", "dóctor", "boys", "way", "Rum"]
    corpus = preprocess.SnowballStemmer()(corpus)
    self.assertListEqual(
        ["hous", "doctor", "boy", "way", "rum"],
        _preprocess_words(corpus, words, dummy_callback),
    )
def setUp(self):
    """Start each test with a fresh SBERT instance and an empty cache."""
    self.sbert = SBERT()
    # Clear the embedding cache so results never leak between tests.
    self.sbert.clear_cache()
    self.corpus = Corpus.from_file('deerwester')
def setUp(self) -> None:
    """Create the statistics widget and its test corpora."""
    self.widget = self.create_widget(OWStatistics)
    self.book_data = Corpus.from_file("book-excerpts")
    # Builds the small synthetic corpus used by most tests.
    self._create_simple_data()
# NOTE(review): this line is the tail of a method defined above this chunk.
self.__exit__(None, None, None)


class _ServerEmbedder(ServerEmbedderCommunicator):
    """Communicator that sends documents to the embedding server as JSON."""

    def __init__(self, aggregator: str, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.content_type = 'application/json'
        # Name of the server-side aggregation (e.g. 'Max', 'Mean').
        self.aggregator = aggregator

    async def _encode_data_instance(
            self, data_instance: Any) -> Optional[bytes]:
        """Serialize one document's tokens for the request payload.

        The token list is JSON-encoded, zlib-compressed and
        base64-encoded; oversized documents are skipped (None).
        """
        data_string = json.dumps(list(data_instance))
        # level=-1 selects zlib's default compression level.
        data = base64.b64encode(
            zlib.compress(
                data_string.encode('utf-8', 'replace'), level=-1)
        ).decode('utf-8', 'replace')
        if sys.getsizeof(data) > 50000:
            # Document in corpus is too large. Size limit is 50 KB
            # (after compression). - document skipped
            return None
        data_dict = {"data": data, "aggregator": self.aggregator}
        json_string = json.dumps(data_dict)
        return json_string.encode('utf-8', 'replace')


if __name__ == '__main__':
    # Smoke test: embed the deerwester corpus with max aggregation.
    with DocumentEmbedder(language='en', aggregator='Max') as embedder:
        embedder.clear_cache()
        embedder(Corpus.from_file('deerwester'))
# NOTE(review): this chunk starts inside a class body (header not visible);
# `name` and the methods below belong to the surrounding sentiment class.
name = 'Vader'

@wait_nltk_data
def __init__(self):
    # The decorator ensures required NLTK data is downloaded first.
    self.vader = SentimentIntensityAnalyzer()

def transform(self, corpus, copy=True):
    """Append one column per VADER score ('pos', 'neg', ...) to corpus.

    When `copy` is True the input corpus is left untouched and a copy
    is extended instead.
    """
    scores = []
    for text in corpus.documents:
        pol_sc = self.vader.polarity_scores(text)
        scores.append([pol_sc[x] for x in self.sentiments])
    X = np.array(scores).reshape((-1, len(self.sentiments)))
    # set compute values so the transform can be re-applied to new data
    shared_cv = SharedTransform(self)
    cv = [
        VectorizationComputeValue(shared_cv, col)
        for col in self.sentiments
    ]
    if copy:
        corpus = corpus.copy()
    corpus.extend_attributes(X, self.sentiments, compute_values=cv)
    return corpus

if __name__ == "__main__":
    # NOTE(review): the demo instantiates Liu_Hu_Sentiment although the
    # class above is the Vader analyzer — looks like a copy-paste; verify.
    corpus = Corpus.from_file('deerwester')
    liu = Liu_Hu_Sentiment('Slovenian')
    corpus2 = liu.transform(corpus[:5])
# NOTE(review): chunk starts mid-method — these lines build and send the
# Words output table from the currently selected rows.
sort_column, reverse = self.sort_column_order
data = sorted(model, key=lambda a: a[sort_column], reverse=reverse)
# Keep only selected words; rotate the word (first) column to the end.
data = [s[1:] + s[:1] for s in data if s[0] in self.selected_words]
words = Table.from_list(domain, data)
words.name = "Words"
self.Outputs.words.send(words)

def send_report(self):
    """Add the corpus, the word list and the keyword table to the report."""
    if not self.corpus:
        return
    self.report_data("Corpus", self.corpus)
    if self.words is not None:
        self.report_paragraph("Words", ", ".join(self.words))
    self.report_table("Keywords", self.view, num_format="{:.3f}")

if __name__ == "__main__":  # pylint: disable=ungrouped-imports
    from Orange.widgets.utils.widgetpreview import WidgetPreview
    # Build a minimal words table that can be fed to the widget preview.
    words_var_ = StringVariable(WORDS_COLUMN_NAME)
    words_var_.attributes = {"type": "words"}
    lists = [[w] for w in ["human", "graph", "minors", "trees"]]
    words_ = Table.from_list(Domain([], metas=[words_var_]), lists)
    words_.name = "Words"
    WidgetPreview(OWKeywords).run(
        set_corpus=Corpus.from_file("deerwester"),  # deerwester book-excerpts
        # set_words=words_
    )
def setUp(self):
    """Create the widget and two corpora of different sizes."""
    self.widget = self.create_widget(OWCorpusToNetwork)
    self.corpus = Corpus.from_file('deerwester')
    # Larger corpus for tests that need more documents.
    self.larger_corpus = Corpus.from_file('book-excerpts')
# NOTE(review): chunk starts mid-method — assembling the computed
# statistics into the output corpus; `to_stack` is defined above this chunk.
attributes = []
comput_values = []
not_computed = []
for rule in self.applied_rules:
    # check for safety reasons - in practice should not happen
    if rule in self.result_dict:
        res = self.result_dict[rule]
        if res is None:
            # Statistic failed or was skipped; collect its name for a warning.
            not_computed.append(STATISTICS_NAMES[rule[0]])
        else:
            data, variables, comp_value = res
            to_stack.append(data)
            attributes += variables
            comput_values.append(comp_value)
if not_computed:
    self.Warning.not_computed(", ".join(not_computed))
# Empty (n, 0) array keeps extend_attributes valid when nothing was computed.
new_corpus = self.corpus.extend_attributes(
    np.hstack(to_stack) if to_stack
    else np.empty((len(self.corpus), 0)),
    attributes, compute_values=comput_values
)
self.Outputs.corpus.send(new_corpus)
# summary
self.info.set_output_summary(
    len(new_corpus), format_summary_details(new_corpus)
)

if __name__ == "__main__":
    WidgetPreview(OWStatistics).run(Corpus.from_file("book-excerpts"))
# NOTE(review): these two lines are the tail of a method defined above
# this chunk.
self.cancel()
self._invalidated = True

def onDeleteWidget(self):
    """Stop background work before the widget is destroyed."""
    self.shutdown()
    super().onDeleteWidget()

if __name__ == "__main__":
    from Orange.projection import PCA
    from Orange.widgets.utils.widgetpreview import WidgetPreview
    from orangecontrib.text.preprocess import LowercaseTransformer, \
        RegexpTokenizer, StopwordsFilter, FrequencyFilter
    from orangecontrib.text.vectorization import BowVectorizer

    # Preview: preprocess, vectorize and PCA-project the corpus, then
    # append the two projection components as meta attributes.
    corpus_ = Corpus.from_file("book-excerpts")
    for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
               StopwordsFilter("English"), FrequencyFilter(0.1)):
        corpus_ = pp(corpus_)
    transformed_corpus = BowVectorizer().transform(corpus_)
    pca = PCA(n_components=2)
    pca_model = pca(transformed_corpus)
    projection = pca_model(transformed_corpus)
    domain_ = Domain(
        transformed_corpus.domain.attributes,
        transformed_corpus.domain.class_vars,
        chain(transformed_corpus.domain.metas,
              projection.domain.attributes))
    corpus_ = corpus_.transform(domain_)
def on_done(self, result: Any) -> None:
    """Publish the finished network and update the output summary."""
    self._task_state = "waiting"
    self.button.setText("Start")
    network = result[0]
    self._send_output_signals(result)
    node_count = network.number_of_nodes()
    edge_count = network.number_of_edges()
    kind = "Directed" if network.edges[0].directed else "Undirected"
    self.info.set_output_summary(
        "{} / {}".format(node_count, edge_count),
        "{} network with {} nodes and {} edges.".format(
            kind, node_count, edge_count))

def on_partial_result(self, result: Any):
    # Partial results are not used; abort the running task instead.
    self.cancel()

def on_exception(self, ex: Exception):
    # Surface only the exception type name to the user.
    self.Error.unexpected_error(type(ex).__name__)
    self.cancel()

def onDeleteWidget(self):
    del self._corpus_to_network
    super().onDeleteWidget()

if __name__ == "__main__":
    from orangewidget.utils.widgetpreview import WidgetPreview
    WidgetPreview(OWCorpusToNetwork).run(Corpus.from_file("book-excerpts"))
def setUp(self):
    """Instantiate the vectorization test widget and load a corpus."""
    self.widget = self.create_widget(TestableBaseVectWidget)
    self.corpus = Corpus.from_file('deerwester')
# POS tagger if "pos_tagger" in settings: pos_tagger = settings.pop("pos_tagger") if pos_tagger["enabled"]: params = {"method": pos_tagger["method_index"]} preprocessors.append(("tag.pos", params)) settings["storedsettings"]["preprocessors"] = preprocessors if version < 3: preprocessors = settings["storedsettings"]["preprocessors"] for pp_name, pp_settings in preprocessors: if pp_name == "preprocess.filter": start = pp_settings["start"] end = pp_settings["end"] if end <= 1: pp_settings["rel_start"] = start pp_settings["rel_end"] = end else: pp_settings["abs_start"] = start pp_settings["abs_end"] = end del pp_settings["start"] del pp_settings["end"] if __name__ == "__main__": from Orange.widgets.utils.widgetpreview import WidgetPreview WidgetPreview(OWPreprocess).run(set_data=Corpus.from_file("deerwester"))
class Vader_Sentiment:
    """Sentiment scorer backed by NLTK's VADER analyzer."""

    sentiments = ('pos', 'neg', 'neu', 'compound')
    name = 'Vader'

    @wait_nltk_data
    def __init__(self):
        # The decorator ensures required NLTK data is available first.
        self.vader = SentimentIntensityAnalyzer()

    def transform(self, corpus, copy=True):
        """Return corpus extended with the four VADER score columns."""
        rows = []
        for document in corpus.documents:
            polarity = self.vader.polarity_scores(document)
            rows.append([polarity[key] for key in self.sentiments])
        X = np.array(rows).reshape((-1, len(self.sentiments)))
        # Compute values allow the transform to be re-applied to new data.
        shared = SharedTransform(self)
        compute_values = [VectorizationComputeValue(shared, column)
                          for column in self.sentiments]
        if copy:
            corpus = corpus.copy()
        corpus.extend_attributes(X, self.sentiments,
                                 compute_values=compute_values)
        return corpus


if __name__ == "__main__":
    # NOTE(review): the demo uses Liu_Hu_Sentiment, not the Vader class
    # defined above — presumably imported elsewhere in this module; verify.
    corpus = Corpus.from_file('deerwester')
    liu = Liu_Hu_Sentiment('Slovenian')
    corpus2 = liu.transform(corpus[:5])