Esempio n. 1
0
    def __call__(self, corpus: Corpus) -> Corpus:
        """
        Return a copy of the corpus marked as processed by this preprocessor.

        This base implementation does no text processing of its own: it
        copies the corpus, assigns the input's ``ids`` to the copy and stores
        ``self`` in the copy's ``used_preprocessor``. Subclasses should extend
        it and invoke the _preprocess method on a document or token(s).

        :param corpus: Corpus
            Corpus to preprocess.
        :return: Corpus
            Copy of the input corpus carrying the same ids.
        """
        original_ids = corpus.ids
        result = corpus.copy()
        # Re-assign the ids explicitly so that the copy is guaranteed to
        # carry the same ids as the input (presumably copy() does not keep
        # them -- verify against Corpus.copy).
        result.ids = original_ids
        result.used_preprocessor = self
        return result
Esempio n. 2
0
    def __call__(self, corpus: Corpus, copy: bool = True,
                 processed_callback=None) -> Corpus:
        """Adds matrix of document embeddings to a corpus.

        Parameters
        ----------
        corpus : Corpus
            Corpus on which transform is performed.
        copy : bool
            If set to True, a copy of corpus is made before embedding.
        processed_callback : callable, optional
            Passed through unchanged to the embedder's ``embedd_data``.

        Returns
        -------
        Corpus
            The documents of the corpus (selected by indexing) for which an
            embedding was obtained, extended with one attribute per
            embedding dimension (``Dim1``, ``Dim2``, ...). When no document
            was embedded the selection is empty and no attributes are added.

        Raises
        ------
        ValueError
            If corpus is not instance of Corpus.
        RuntimeError
            If document in corpus is larger than
            50 KB after compression (not raised in this method; presumably
            raised by the embedder -- verify against its implementation).

        Warns
        -----
        RuntimeWarning
            If at least one document was not embedded; such documents are
            left out of the returned corpus.
        """
        if not isinstance(corpus, Corpus):
            raise ValueError("Input should be instance of Corpus.")
        corpus = corpus.copy() if copy else corpus
        embs = self._embedder.embedd_data(
            list(corpus.ngrams),
            processed_callback=processed_callback)

        # Check if some documents in corpus weren't embedded for some
        # reason (the embedder reports them as None). This is a very rare
        # case. Only the successfully embedded documents are collected, so
        # no NaN placeholder rows are needed, the embedder's result is not
        # modified, and the embedding dimension is never required when
        # nothing was embedded.
        inds = []
        vectors = []
        send_warning = False
        for i, emb in enumerate(embs):
            if emb is None:
                send_warning = True
            else:
                inds.append(i)
                vectors.append(emb)

        variable_attrs = {
            'hidden': True,
            'skip-normalization': True,
            'embedding-feature': True
        }
        new_corpus = corpus[inds]

        if inds:
            # if at least one embedding is not None,
            # extend attributes; the dimension is taken from the first
            # available embedding
            dim = len(vectors[0])
            new_corpus.extend_attributes(
                np.array(vectors),
                ['Dim{}'.format(i + 1) for i in range(dim)],
                var_attrs=variable_attrs)

        if send_warning:
            warnings.warn(("Some documents were not embedded for " +
                           "unknown reason. Those documents " +
                           "are skipped."),
                          RuntimeWarning)

        return new_corpus