コード例 #1
0
ファイル: test_owcorpus.py プロジェクト: RuhAm/orange3-text
    def test_output_status(self):
        """
        Check the output summary for inputs with different class variables.

        The summary setter of the widget is replaced by a mock; after each
        signal the test asserts the arguments of the latest call.
        """
        corpus = Corpus.from_file("election-tweets-2016")
        # swap the summary setter for a mock so that its calls can be checked
        summary = Mock()
        self.widget.info.set_output_summary = summary
        n_docs = str(len(corpus))
        data_input = self.widget.Inputs.data

        # corpus exactly as loaded from the file
        self.send_signal(data_input, corpus)
        self.wait_until_finished()
        summary.assert_called_with(
            n_docs,
            "6444 document(s)\n4 text features(s)\n7 other feature(s)\n"
            "Classification; discrete class with 2 values.")
        summary.reset_mock()

        # same attributes and metas, but no class variable
        classless_domain = Domain(
            corpus.domain.attributes, metas=corpus.domain.metas)
        classless = Corpus(
            classless_domain,
            corpus.X,
            metas=corpus.metas,
            text_features=corpus.text_features)
        self.send_signal(data_input, classless)
        self.wait_until_finished()
        summary.assert_called_with(
            n_docs,
            "6444 document(s)\n4 text features(s)\n7 other feature(s)")
        summary.reset_mock()

        # same attributes and metas, with a continuous class variable;
        # the class values are random because only the summary is checked
        numeric_domain = Domain(
            corpus.domain.attributes,
            ContinuousVariable("a"),
            metas=corpus.domain.metas)
        random_targets = np.random.rand(len(corpus), 1)
        regression = Corpus(
            numeric_domain,
            corpus.X,
            random_targets,
            metas=corpus.metas,
            text_features=corpus.text_features)
        self.send_signal(data_input, regression)
        self.wait_until_finished()
        summary.assert_called_with(
            n_docs,
            "6444 document(s)\n4 text features(s)\n7 other feature(s)\n"
            "Regression; numerical class.")
        summary.reset_mock()

        # removing the input: the widget is expected to report its default
        # dataset (140 documents)
        self.send_signal(data_input, None)
        self.wait_until_finished()
        summary.assert_called_with(
            "140", "140 document(s)\n1 text features(s)\n0 other feature(s)\n"
            "Classification; discrete class with 2 values.")
        summary.reset_mock()
コード例 #2
0
    def _send_output(self) -> None:
        """
        Send the corpus extended with score columns and the selected documents.

        With no corpus, ``None`` is sent on both outputs. Otherwise the scores
        from ``self._gather_scores()`` are appended to the metas as continuous
        variables, one per label. The resulting corpus is sent annotated with
        the current selection, and the selected rows are sent separately
        (``None`` when nothing is selected).
        """
        if self.corpus is None:
            self.Outputs.corpus.send(None)
            self.Outputs.selected_documents.send(None)
            return

        scores, labels = self._gather_scores()
        source = self.corpus
        if not labels:
            # no scores to add; pass the corpus through unchanged
            out_corpus = source
        else:
            source_domain = source.domain
            # get_unique_names is given the domain together with each label
            score_vars = tuple(
                ContinuousVariable(get_unique_names(source_domain, label))
                for label in labels
            )
            extended_domain = Domain(
                source_domain.attributes,
                source_domain.class_var,
                metas=source_domain.metas + score_vars,
            )
            extended_metas = np.hstack([source.metas, scores])
            out_corpus = Corpus(
                extended_domain, source.X, source.Y, extended_metas)
            Corpus.retain_preprocessing(source, out_corpus)

        annotated = create_annotated_table(out_corpus, self.selected_rows)
        self.Outputs.corpus.send(annotated)

        if self.selected_rows:
            selected = out_corpus[self.selected_rows]
        else:
            selected = None
        self.Outputs.selected_documents.send(selected)
コード例 #3
0
def _preprocess_words(corpus: Corpus, words: List[str],
                      callback: Callable) -> List[str]:
    """
    Apply the corpus' word-level preprocessors to a list of words.

    Tokens of the corpus may already be preprocessed, so raw words would not
    match them. Preprocessors that change the words themselves
    (e.g. normalization) are therefore applied to ``words`` too.

    ``callback`` is called after every applied preprocessor with the fraction
    of preprocessors applied so far. Words that end up with no tokens are
    left out of the returned list.
    """
    # workaround to preprocess words
    # TODO: currently preprocessors work only on corpus, when there will be more
    #  cases like this think about implementation of preprocessors for a list
    #  of strings
    word_var = StringVariable("words")
    word_column = np.array([[word] for word in words])
    word_corpus = Corpus(
        Domain([], metas=[word_var]),
        metas=word_column,
        text_features=[word_var],
    )

    # only transformers and normalizers preprocess on the word level
    word_level = []
    for preprocessor in corpus.used_preprocessor.preprocessors:
        if isinstance(preprocessor, (BaseTransformer, BaseNormalizer)):
            word_level.append(preprocessor)

    total = len(word_level)
    for step, preprocessor in enumerate(word_level, start=1):
        word_corpus = preprocessor(word_corpus)
        callback(step / total)

    # keep the first token of every word that still has any tokens
    return [tokens[0] for tokens in word_corpus.tokens if len(tokens)]
コード例 #4
0
ファイル: test_owtwitter.py プロジェクト: RuhAm/orange3-text
def dummy_fetch(self, cursors, max_tweets, search_author, callback):
    """
    Fetch replacement for tests: ignores every argument.

    Returns a tuple of a fixed corpus with three documents in the single
    "Content" meta column and the integer 3 (presumably the number of
    fetched tweets - confirm against the real fetch function).
    """
    content_var = StringVariable("Content")
    contents = np.array([["Abc"], ["Cde"], ["Gf"]])
    corpus = Corpus(Domain([], metas=[content_var]), metas=contents)
    return corpus, 3
コード例 #5
0
 def create_corpus(texts: List[str]) -> Corpus:
     """
     Build a sample corpus whose only column is the text meta "Text".

     Each item of ``texts`` becomes one document. The corpus is returned
     after being passed through ``preprocess.LowercaseTransformer``.
     """
     text_var = StringVariable("Text")
     # one row per text, one column
     text_column = np.array(texts).reshape(-1, 1)
     corpus = Corpus(
         Domain([], metas=[text_var]),
         metas=text_column,
         text_features=[text_var],
     )
     lowercase = preprocess.LowercaseTransformer()
     return lowercase(corpus)
コード例 #6
0
def embedding_mock(_, corpus, __):
    """
    Embedder replacement that produces all-ones embeddings of width 10.

    For a list the result is an array of ones with one row per item. Any
    other input is treated as a Corpus: the result is a tuple of a new Corpus
    with ten continuous attributes named "0".."9" filled with ones, and
    ``None``. The first and the last argument are ignored.
    """
    n_dims = 10
    if isinstance(corpus, list):
        return np.ones((len(corpus), n_dims))

    # corpus is Corpus
    columns = [ContinuousVariable(str(i)) for i in range(n_dims)]
    embedded = Corpus(
        domain=Domain(columns),
        X=np.ones((len(corpus), n_dims)),
    )
    return embedded, None
コード例 #7
0
 def _create_simple_data(self) -> None:
     """
     Create a simple dataset with 4 documents and store it in `self.corpus`.

     The documents live in the single text meta "text"; the corpus has no
     attributes (X has zero columns).
     """
     documents = [
         "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
         "Duis viverra elit eu mi blandit, {et} sollicitudin nisi ",
         " a porta\tleo. Duis vitae ultrices massa. Mauris ut pulvinar a",
         "tortor. Class (aptent) taciti\nsociosqu ad lit1ora torquent per",
     ]
     # one row per document, one column
     metas = np.array(documents).reshape(-1, 1)
     text_var = StringVariable("text")
     self.corpus = Corpus(
         Domain([], metas=[text_var]),
         X=np.empty((len(metas), 0)),
         metas=metas,
         text_features=[text_var],
     )
コード例 #8
0
 def _send_output(self, scores: np.ndarray, labels: List[str]) -> None:
     """
     Create corpus with scores and output it

     Parameters
     ----------
     scores
         Score values that are stacked horizontally to the right of the
         corpus' metas. Only used when ``labels`` is not empty.
     labels
         Names of the score columns; one continuous meta variable is created
         for each of them.

     When ``self.corpus`` is None, None is sent. When there are no labels,
     the corpus is sent unchanged.
     """
     # Check for a missing corpus first: with non-empty labels the code below
     # would otherwise fail on `self.corpus.domain`.
     if self.corpus is None:
         self.Outputs.corpus.send(None)
         return

     if not labels:
         self.Outputs.corpus.send(self.corpus)
         return

     d = self.corpus.domain
     score_vars = tuple(ContinuousVariable(label) for label in labels)
     domain = Domain(
         d.attributes,
         d.class_var,
         metas=d.metas + score_vars,
     )
     corpus = Corpus(
         domain,
         self.corpus.X,
         self.corpus.Y,
         np.hstack([self.corpus.metas, scores]),
     )
     # carry the preprocessing of the input corpus over to the new one
     Corpus.retain_preprocessing(self.corpus, corpus)
     self.Outputs.corpus.send(corpus)