def test_output_status(self):
    """
    Check the output summary reported for different inputs: a corpus with
    a discrete class, without a class, with a numeric class, and no input.
    """
    def feed(corpus):
        # push the corpus to the widget and wait for it to finish
        self.send_signal(self.widget.Inputs.data, corpus)
        self.wait_until_finished()

    # when input signal
    data = Corpus.from_file("election-tweets-2016")
    out_sum = Mock()
    self.widget.info.set_output_summary = out_sum
    feed(data)
    out_sum.assert_called_with(
        str(len(data)),
        "6444 document(s)\n4 text features(s)\n7 other feature(s)\n"
        "Classification; discrete class with 2 values.",
    )
    out_sum.reset_mock()

    # corpus without class
    classless_domain = Domain(
        data.domain.attributes, metas=data.domain.metas
    )
    data1 = Corpus(
        classless_domain,
        data.X,
        metas=data.metas,
        text_features=data.text_features,
    )
    feed(data1)
    out_sum.assert_called_with(
        str(len(data)),
        "6444 document(s)\n4 text features(s)\n7 other feature(s)",
    )
    out_sum.reset_mock()

    # corpus with continuous class
    regression_domain = Domain(
        data.domain.attributes,
        ContinuousVariable("a"),
        metas=data.domain.metas,
    )
    data1 = Corpus(
        regression_domain,
        data.X,
        np.random.rand(len(data), 1),
        metas=data.metas,
        text_features=data.text_features,
    )
    feed(data1)
    out_sum.assert_called_with(
        str(len(data)),
        "6444 document(s)\n4 text features(s)\n7 other feature(s)\n"
        "Regression; numerical class.",
    )
    out_sum.reset_mock()

    # default dataset is on the output
    feed(None)
    out_sum.assert_called_with(
        "140",
        "140 document(s)\n1 text features(s)\n0 other feature(s)\n"
        "Classification; discrete class with 2 values.",
    )
    out_sum.reset_mock()
def _send_output(self) -> None:
    """
    Send the corpus extended with score columns and the selected documents.

    With no input corpus both outputs receive None. Otherwise every
    gathered score label becomes an additional numeric meta column; when
    there are no labels the input corpus is forwarded as it is.
    """
    if self.corpus is None:
        # nothing on the input, clear both outputs
        self.Outputs.corpus.send(None)
        self.Outputs.selected_documents.send(None)
        return

    scores, labels = self._gather_scores()
    out_corpus = self.corpus
    if labels:
        old_domain = out_corpus.domain
        # one numeric meta per score, named so it does not clash with
        # variables already in the domain
        score_vars = tuple(
            ContinuousVariable(get_unique_names(old_domain, label))
            for label in labels
        )
        new_domain = Domain(
            old_domain.attributes,
            old_domain.class_var,
            metas=old_domain.metas + score_vars,
        )
        new_metas = np.hstack([self.corpus.metas, scores])
        out_corpus = Corpus(
            new_domain, self.corpus.X, self.corpus.Y, new_metas
        )
        Corpus.retain_preprocessing(self.corpus, out_corpus)

    annotated = create_annotated_table(out_corpus, self.selected_rows)
    self.Outputs.corpus.send(annotated)

    if self.selected_rows:
        selected = out_corpus[self.selected_rows]
    else:
        selected = None
    self.Outputs.selected_documents.send(selected)
def _preprocess_words(
    corpus: Corpus, words: List[str], callback: Callable
) -> List[str]:
    """
    Apply the corpus' word-level preprocessors to a list of words.

    The corpus' tokens can be preprocessed, so preprocessors that change
    words (e.g. normalization) must be applied to ``words`` as well,
    otherwise the words would not match the tokens. ``callback`` is called
    after each applied preprocessor with the fraction of preprocessors
    applied so far.
    """
    # workaround to preprocess words
    # TODO: currently preprocessors work only on corpus, when there will be
    #  more cases like this think about implementation of preprocessors for
    #  a list of strings
    word_var = StringVariable("words")
    word_domain = Domain([], metas=[word_var])
    word_rows = np.array([[word] for word in words])
    words_corpus = Corpus(
        word_domain, metas=word_rows, text_features=[word_var]
    )

    # only transformers and normalizers preprocess on the word level
    word_level_types = (BaseTransformer, BaseNormalizer)
    applicable = [
        preprocessor
        for preprocessor in corpus.used_preprocessor.preprocessors
        if isinstance(preprocessor, word_level_types)
    ]

    total = len(applicable)
    for done, preprocessor in enumerate(applicable, start=1):
        words_corpus = preprocessor(words_corpus)
        callback(done / total)

    # entries whose token list is empty are left out of the result
    return [tokens[0] for tokens in words_corpus.tokens if len(tokens) > 0]
def dummy_fetch(self, cursors, max_tweets, search_author, callback):
    """
    Stand-in for a fetch call: ignores all arguments and returns a fixed
    corpus of three documents together with the number 3.
    """
    domain = Domain([], metas=[StringVariable("Content")])
    contents = np.array([["Abc"], ["Cde"], ["Gf"]])
    corpus = Corpus(domain, metas=contents)
    return corpus, 3
def create_corpus(texts: List[str]) -> Corpus:
    """
    Build a sample corpus with one document per given text, stored in a
    single "Text" meta variable, and return it after applying
    LowercaseTransformer.
    """
    text_var = StringVariable("Text")
    # one row per text, single meta column
    text_column = np.array(texts).reshape(-1, 1)
    corpus = Corpus(
        Domain([], metas=[text_var]),
        metas=text_column,
        text_features=[text_var],
    )
    lowercase = preprocess.LowercaseTransformer()
    return lowercase(corpus)
def embedding_mock(_, corpus, __):
    """
    Stand-in embedder producing all-ones, 10-dimensional embeddings.

    For a list the result is an array with one row per item. For anything
    else (a Corpus) the result is a tuple of a new Corpus with ten numeric
    attributes named "0".."9" filled with ones, and None.
    """
    n_rows, n_dims = len(corpus), 10
    if not isinstance(corpus, list):
        # corpus is Corpus
        attributes = [ContinuousVariable(str(i)) for i in range(n_dims)]
        embedded = Corpus(
            domain=Domain(attributes), X=np.ones((n_rows, n_dims))
        )
        return embedded, None
    return np.ones((n_rows, n_dims))
def _create_simple_data(self) -> None:
    """
    Create a simple dataset with 4 documents and store it in `self.corpus`.

    The documents live in a single "text" meta variable, which is also the
    only text feature; the corpus has no attributes.
    """
    documents = [
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
        "Duis viverra elit eu mi blandit, {et} sollicitudin nisi ",
        " a porta\tleo. Duis vitae ultrices massa. Mauris ut pulvinar a",
        "tortor. Class (aptent) taciti\nsociosqu ad lit1ora torquent per",
    ]
    # one row per document, single meta column
    metas = np.array(documents).reshape(-1, 1)
    text_var = StringVariable("text")
    # X has a row per document but zero columns
    empty_x = np.empty((len(metas), 0))
    self.corpus = Corpus(
        Domain([], metas=[text_var]),
        X=empty_x,
        metas=metas,
        text_features=[text_var],
    )
def _send_output(self, scores: np.ndarray, labels: List[str]) -> None:
    """
    Create corpus with scores and output it.

    Parameters
    ----------
    scores
        Score values; they are stacked column-wise after the existing
        metas of the corpus (presumably one column per label - confirm
        against the caller).
    labels
        Names for the score columns. When empty, the input corpus is
        forwarded unchanged.

    Sends None when there is no input corpus, regardless of ``labels``.
    """
    if self.corpus is None:
        # without a corpus there is nothing to attach scores to; checking
        # this first avoids dereferencing None when labels are non-empty
        self.Outputs.corpus.send(None)
        return
    if not labels:
        self.Outputs.corpus.send(self.corpus)
        return

    # imported locally so the block does not rely on a module-level import
    from Orange.data.util import get_unique_names

    d = self.corpus.domain
    # a label may equal the name of a variable already in the domain
    # (e.g. when scored output is scored again); make the names unique
    # instead of building a domain with duplicated names
    score_vars = tuple(
        ContinuousVariable(get_unique_names(d, label)) for label in labels
    )
    domain = Domain(
        d.attributes,
        d.class_var,
        metas=d.metas + score_vars,
    )
    corpus = Corpus(
        domain,
        self.corpus.X,
        self.corpus.Y,
        np.hstack([self.corpus.metas, scores]),
    )
    Corpus.retain_preprocessing(self.corpus, corpus)
    self.Outputs.corpus.send(corpus)