Code example #1
File: test_keywords.py Project: biolab/orange3-text
def corpus_mock(tokens):
    """Create a corpus with one document per token list, stored in a "texts" meta."""
    corpus = Corpus.from_numpy(
        Domain([], metas=[StringVariable("texts")]),
        np.empty((len(tokens), 0)),  # no attribute columns; row count matches the documents
        metas=np.array([[" ".join(t)] for t in tokens]),
    )
    return corpus
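
A minimal usage sketch for this helper; the call below is hypothetical, not from the test file, and assumes the snippet's imports (numpy as np, Orange's Domain/StringVariable, orange3-text's Corpus):

corpus = corpus_mock([["human", "machine"], ["interface"]])
print(len(corpus))         # 2 -- one row per token list
print(corpus.metas[0, 0])  # "human machine" -- tokens joined into one document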
Code example #2
    def _send_output(self) -> None:
        """
        Create corpus with scores and output it
        """
        if self.corpus is None:
            self.Outputs.corpus.send(None)
            self.Outputs.selected_documents.send(None)
            return

        scores, labels = self._gather_scores()
        if labels:
            d = self.corpus.domain
            domain = Domain(
                d.attributes,
                d.class_var,
                metas=d.metas + tuple(
                    ContinuousVariable(get_unique_names(d, l))
                    for l in labels),
            )
            out_corpus = Corpus.from_numpy(
                domain,
                self.corpus.X,
                self.corpus.Y,
                np.hstack([self.corpus.metas, scores]),
            )
            Corpus.retain_preprocessing(self.corpus, out_corpus)
        else:
            out_corpus = self.corpus

        self.Outputs.corpus.send(
            create_annotated_table(out_corpus, self.selected_rows))
        self.Outputs.selected_documents.send(
            out_corpus[self.selected_rows] if self.selected_rows else None)
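
The non-obvious step above is get_unique_names, which keeps the new score columns from clashing with existing variable names. A small sketch of that idea, assuming Orange's Orange.data.util.get_unique_names helper (the one the snippet appears to use):

from Orange.data import ContinuousVariable, Domain, StringVariable
from Orange.data.util import get_unique_names

d = Domain([], metas=[StringVariable("texts"), ContinuousVariable("TF-IDF")])
print(get_unique_names(d, "TF-IDF"))  # "TF-IDF (1)" -- name already taken
print(get_unique_names(d, "YAKE!"))   # "YAKE!" -- no clash, returned unchanged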
Code example #3
def _preprocess_words(corpus: Corpus, words: List[str],
                      callback: Callable) -> List[str]:
    """
    Corpus's tokens can be preprocessed. Since they will not match correctly
    with words preprocessors that change words (e.g. normalization) must
    be applied to words too.
    """
    # workaround to preprocess words
    # TODO: currently preprocessors work only on corpus, when there will be more
    #  cases like this think about implementation of preprocessors for a list
    #  of strings
    words_feature = StringVariable("words")
    words_c = Corpus.from_numpy(
        Domain([], metas=[words_feature]),
        np.empty((len(words), 0)),
        metas=np.array([[w] for w in words]),
        text_features=[words_feature],
    )
    # only transformers and normalizers preprocess on the word level
    pps = [
        pp for pp in corpus.used_preprocessor.preprocessors
        if isinstance(pp, (BaseTransformer, BaseNormalizer))
    ]
    for i, pp in enumerate(pps):
        words_c = pp(words_c)
        callback((i + 1) / len(pps))
    return [w[0] for w in words_c.tokens if len(w)]
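
A hypothetical end-to-end call, building a one-document corpus with a lowercase transformer so the query words get the same treatment as the corpus tokens (imports as in the snippets above):

import numpy as np
from Orange.data import Domain, StringVariable
from orangecontrib.text import Corpus, preprocess

var = StringVariable("text")
corpus = Corpus.from_numpy(
    Domain([], metas=[var]),
    np.empty((1, 0)),
    metas=np.array([["Running mice ran"]]),
    text_features=[var],
)
corpus = preprocess.LowercaseTransformer()(corpus)
# Only the transformer matches the isinstance filter above, so the words
# are lowercased to match the corpus tokens, e.g. ['running', 'mice']:
print(_preprocess_words(corpus, ["Running", "MICE"], lambda p: None))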
Code example #4
def create_corpus(texts: List[str]) -> Corpus:
    """Create a sample corpus from the passed texts"""
    text_var = StringVariable("Text")
    domain = Domain([], metas=[text_var])
    c = Corpus.from_numpy(
        domain,
        X=np.empty((len(texts), 0)),
        metas=np.array(texts).reshape(-1, 1),
        text_features=[text_var],
    )
    return preprocess.LowercaseTransformer()(c)
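
Hypothetical usage; with the lowercase transformer applied, the default tokens should come out lowercased:

c = create_corpus(["Human machine interface", "EPS user INTERFACE system"])
print(list(c.tokens[1]))  # e.g. ['eps', 'user', 'interface', 'system']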
Code example #5
File: owcreatecorpus.py Project: biolab/orange3-text
def commit(self):
    """Create a new corpus and output it"""
    doc_var = StringVariable("Document")
    title_var = StringVariable("Title")
    domain = Domain([], metas=[title_var, doc_var])
    corpus = Corpus.from_numpy(
        domain,
        np.empty((len(self.texts), 0)),
        metas=np.array(self.texts),
        text_features=[doc_var],
    )
    corpus.set_title_variable(title_var)
    self.Outputs.corpus.send(corpus)
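
One assumption worth making explicit: for the metas array to line up with the two meta columns, self.texts must be a sequence of (title, document) pairs. A quick shape sketch with hypothetical data:

import numpy as np

texts = [("First", "Lorem ipsum."), ("Second", "Dolor sit amet.")]
metas = np.array(texts)
print(metas.shape)  # (2, 2) -- rows are documents; columns are Title, Document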
Code example #6
def _create_simple_data(self) -> None:
    """
    Create a simple dataset with 4 documents. Save it to `self.corpus`.
    """
    metas = np.array([
        ["Lorem ipsum dolor sit amet, consectetur adipiscing elit."],
        ["Duis viverra elit eu mi blandit, {et} sollicitudin nisi "],
        [" a porta\tleo. Duis vitae ultrices massa. Mauris ut pulvinar a"],
        [
            "tortor. Class (aptent) taciti\nsociosqu ad lit1ora torquent per"
        ],
    ])
    text_var = StringVariable("text")
    domain = Domain([], metas=[text_var])
    self.corpus = Corpus.from_numpy(
        domain,
        X=np.empty((len(metas), 0)),
        metas=metas,
        text_features=[text_var],
    )
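
A hypothetical check on this fixture, assuming Corpus.documents returns one string per row of the text meta with whitespace intact:

# Inside the test case that defines _create_simple_data:
self._create_simple_data()
docs = self.corpus.documents
print(len(docs))        # 4
print("\t" in docs[2])  # True -- the tab survives into the document text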
Code example #7
File: twitter.py Project: biolab/orange3-text
    def _create_corpus(self) -> Optional[Corpus]:
        if len(self.tweets) == 0:
            return None

        def to_val(attr, val):
            if isinstance(attr, DiscreteVariable):
                attr.val_from_str_add(val)
            return attr.to_val(val)

        m = [attr for attr, _ in METAS]
        domain = Domain(attributes=[], class_vars=[], metas=m)

        metas = np.array(
            [
                [to_val(attr, t) for (attr, _), t in zip(METAS, ts)]
                for ts in self.tweets.values()
            ],
            dtype=object,
        )
        x = np.empty((len(metas), 0))

        return Corpus.from_numpy(domain, x, metas=metas, text_features=self.text_features)
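
The to_val helper above is the subtle part: for discrete variables it registers unseen values before converting them to indices. A standalone sketch of that behavior with Orange's DiscreteVariable (the values shown are illustrative):

from Orange.data import DiscreteVariable

lang = DiscreteVariable("language", values=("en",))
lang.val_from_str_add("de")  # registers the unseen value: values become ("en", "de")
print(lang.to_val("de"))     # 1 -- index of the newly added value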
Code example #8
File: owkeywords.py Project: biolab/orange3-text
def run(
        corpus: Optional[Corpus],
        words: Optional[List],
        cached_keywords: Dict,
        scoring_methods: Set,
        scoring_methods_kwargs: Dict,
        agg_method: int,
        state: TaskState
) -> Results:
    results = Results(scores=[], labels=[], all_keywords={})
    if not corpus:
        return results

    # passed by reference (and not copied) - to save partial results
    results.all_keywords = cached_keywords
    if not scoring_methods:
        return results

    def callback(i: float, status=""):
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        if state.is_interruption_requested():
            raise Exception

    callback(0, "Calculating...")
    scores = {}
    documents = corpus.documents
    step = 1 / len(scoring_methods)
    for method_name, func in ScoringMethods.ITEMS:
        if method_name in scoring_methods:
            if method_name not in results.all_keywords:
                i = len(results.labels)
                cb = wrap_callback(callback, start=i * step,
                                   end=(i + 1) * step)

                needs_tokens = method_name in ScoringMethods.TOKEN_METHODS
                kw = {"progress_callback": cb}
                kw.update(scoring_methods_kwargs.get(method_name, {}))

                keywords = func(corpus if needs_tokens else documents, **kw)
                results.all_keywords[method_name] = keywords

            keywords = results.all_keywords[method_name]
            scores[method_name] = \
                dict(AggregationMethods.aggregate(keywords, agg_method))

            results.labels.append(method_name)

    scores = pd.DataFrame(scores)
    if words:
        # Normalize words
        for preprocessor in corpus.used_preprocessor.preprocessors:
            if isinstance(preprocessor, BaseNormalizer):
                dummy = Corpus.from_numpy(
                    Domain((), metas=[StringVariable("Words")]),
                    X=np.empty((len(words), 0)),
                    metas=np.array(words)[:, None]
                )
                words = list(preprocessor(dummy).tokens.flatten())

        # Filter scores using words
        existing_words = [w for w in set(words) if w in scores.index]
        scores = scores.loc[existing_words] if existing_words \
            else scores.iloc[:0]

    results.scores = scores.reset_index().sort_values(
        by=[results.labels[0], "index"],
        ascending=[False, True]
    ).values.tolist()

    return results
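
Finally, the progress stitching: wrap_callback (assumed here to rescale a scorer's 0-1 progress into its [start, end] slice, as Orange.util.wrap_callback does) gives each scoring method its own segment of the overall bar. A hypothetical stand-in showing the arithmetic:

# Illustrative stand-in for wrap_callback, not the Orange implementation.
def wrap(cb, start, end):
    return lambda p, **kwargs: cb(start + p * (end - start), **kwargs)

seen = []
cb = wrap(seen.append, start=0.5, end=1.0)  # second of two methods
for p in (0.0, 0.5, 1.0):
    cb(p)
print(seen)  # [0.5, 0.75, 1.0]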