def __call__(self, corpus: Corpus) -> Corpus:
    """
    Return a copy of the corpus tagged with this preprocessor.

    Subclasses are expected to extend this method and invoke their
    ``_preprocess`` method on a document or token(s).

    :param corpus: Corpus
        Corpus to preprocess; the object passed in is not modified here,
        all assignments are made on the copy.
    :return: Corpus
        Preprocessed corpus.
    """
    # Read the ids before copying and assign them onto the copy
    # (presumably ``copy()`` does not carry them over -- confirm in Corpus).
    original_ids = corpus.ids
    result = corpus.copy()
    result.ids = original_ids
    # Record which preprocessor produced this corpus.
    result.used_preprocessor = self
    return result
def __call__(self, corpus: Corpus, copy: bool = True,
             processed_callback=None) -> Corpus:
    """Adds matrix of document embeddings to a corpus.

    Parameters
    ----------
    corpus : Corpus
        Corpus on which transform is performed.
    copy : bool
        If set to True, a copy of corpus is made before it is processed.
    processed_callback : callable, optional
        Passed through unchanged to the embedder's ``embedd_data``.

    Returns
    -------
    Corpus
        The (possibly copied) corpus indexed by the positions of the
        documents that were embedded, with one ``Dim<k>`` feature per
        embedding dimension added. Documents without an embedding are
        left out; if no document was embedded, no feature is added.

    Raises
    ------
    ValueError
        If corpus is not instance of Corpus.
    RuntimeError
        If document in corpus is larger than 50 KB after compression
        (NOTE(review): not raised in this method; presumably comes from
        the embedder -- confirm).

    Warns
    -----
    RuntimeWarning
        If at least one document was not embedded.
    """
    if not isinstance(corpus, Corpus):
        raise ValueError("Input should be instance of Corpus.")
    corpus = corpus.copy() if copy else corpus
    embs = self._embedder.embedd_data(
        list(corpus.ngrams), processed_callback=processed_callback)

    # Positions of the documents that actually received an embedding.
    # Some documents in corpus may not be embedded for some reason;
    # this is a very rare case.
    inds = [i for i, emb in enumerate(embs) if emb is not None]
    new_corpus = corpus[inds]

    if inds:
        # Build the matrix from the embedded rows only. This avoids
        # padding the embedder's result in place with NaN rows and never
        # calls np.zeros with dim=None when nothing was embedded.
        embedded = np.array([embs[i] for i in inds])
        dim = len(embs[inds[0]])
        variable_attrs = {
            'hidden': True,
            'skip-normalization': True,
            'embedding-feature': True,
        }
        new_corpus.extend_attributes(
            embedded,
            ['Dim{}'.format(i + 1) for i in range(dim)],
            var_attrs=variable_attrs)

    if len(inds) != len(embs):
        warnings.warn(("Some documents were not embedded for " +
                       "unknown reason. Those documents " +
                       "are skipped."), RuntimeWarning)
    return new_corpus