def transform(self, X, Y=None):
    """Encode each document as the average of its sentence-block embeddings.

    Each document is split into sentences, the sentences are grouped into
    consecutive chunks of ``self.block_size``, each chunk is encoded with
    ``self.model``, and the chunk vectors are average-pooled into a single
    document vector.  Results are cached (``from_cache``/``to_cache``)
    keyed on ``X``.

    Parameters
    ----------
    X : sequence of str
        Raw documents.
    Y : optional
        Labels; passed through unchanged when provided.

    Returns
    -------
    np.ndarray of document vectors, or ``(np.ndarray, Y)`` when Y is given.
    """
    success, _X = self.from_cache(X)
    if not success:
        _X = []
        for document in X:
            sents = splitter(document)
            # Chunk sentences by slicing: avoids the O(n^2) cost of the
            # original `while sents: sents.pop(0)` loop.
            blocks = [
                ' '.join(sents[i:i + self.block_size])
                for i in range(0, len(sents), self.block_size)
            ]
            # BUG FIX: the original unconditionally appended the trailing
            # partial block, so a document whose sentence count was an
            # exact multiple of block_size got a spurious empty-string
            # block encoded into the average.  Slicing never produces an
            # empty chunk; keep a single empty block only for an empty
            # document so model.encode still receives one item (matching
            # the original's behavior in that case).
            if not blocks:
                blocks = ['']
            encoded_blocks = self.model.encode(blocks)
            doc_vecs = pooling(
                encoded_blocks,
                (encoded_blocks.shape[0], 1),
                method='avg'
            )
            _X.append(list(doc_vecs)[0])
        self.to_cache(X, _X)
    # BUG FIX: `if Y:` raises ValueError for a multi-element numpy array
    # and silently drops an empty label list; test identity against None.
    if Y is not None:
        return np.array(_X), Y
    return np.array(_X)
def __call__(self, x):
    """Preprocess a document into a cleaned, space-joined token string.

    Tokens are filtered per the instance flags (``remove_nonalpha``,
    ``remove_entities``, ``remove_stopwords``, ``lowercase``), then
    consecutive duplicate tokens are collapsed.

    Parameters
    ----------
    x : str
        Raw document text.

    Returns
    -------
    str
        The filtered tokens joined by single spaces.
    """
    bow = []
    for sent in splitter(x):
        for i, token in enumerate(tokenizer(sent)):
            if self.remove_nonalpha and not token.isalpha():
                continue
            # Heuristic entity filter: a capitalized token that is NOT
            # the first word of its sentence (i > 0) is treated as a
            # named entity.  "Capitalized" = first char differs from its
            # lowercase form.
            if (self.remove_entities and i
                    and token[0] != token[0].lower()):
                continue
            if (self.remove_stopwords and token.lower() in STOPWORDS):
                continue
            bow.append(token if not self.lowercase else token.lower())
    # Collapse runs of identical consecutive tokens.  Iterating directly
    # replaces the original `while bow: bow.pop(0)` loop, which was
    # accidentally O(n^2) (list.pop(0) shifts the whole list each call).
    deduped = []
    prev = None
    for token in bow:
        if token != prev:
            deduped.append(token)
        prev = token
    return ' '.join(deduped)
def transform(self, X, Y=None):
    """Encode each document as the pooled embedding of its sentences.

    Every document is split into sentences, the sentences are encoded
    with ``self.model``, and the sentence vectors are pooled (method
    'mdn' — presumably median pooling; confirm against `pooling`'s
    implementation) into a single document vector.  Results are cached
    (``from_cache``/``to_cache``) keyed on ``X``.

    Parameters
    ----------
    X : sequence of str
        Raw documents.
    Y : optional
        Labels; passed through unchanged when provided.

    Returns
    -------
    np.ndarray of document vectors, or ``(np.ndarray, Y)`` when Y is given.
    """
    success, _X = self.from_cache(X)
    if not success:
        _X = []
        for document in X:
            sents = splitter(document)
            encoded_sents = self.model.encode(sents)
            doc_vecs = pooling(
                encoded_sents,
                (encoded_sents.shape[0], 1),
                method='mdn'
            )
            _X.append(list(doc_vecs)[0])
        self.to_cache(X, _X)
    # BUG FIX: `if Y:` raises ValueError for a multi-element numpy array
    # and silently drops an empty label list; test identity against None.
    if Y is not None:
        return np.array(_X), Y
    return np.array(_X)
def title(X, docid):
    """Return a short title for document ``docid``.

    The first sentence of the document is used when available; otherwise
    a 90-character preview with a trailing ellipsis is returned.

    Parameters
    ----------
    X : sequence of str
        The document collection.
    docid : int
        Index of the document to title.
    """
    # BUG FIX: the original had an unreachable second `return` and raised
    # IndexError when splitter produced no sentences; the dead statement
    # is restored as the intended fallback.
    sents = splitter(X[docid])
    if sents:
        return sents[0]
    return '%s...' % X[docid][:90]