def transform(self, txts):
    """Infer a Doc2Vec embedding for every text and stack them row-wise.

    Each text is normalized and tokenized before inference. Returns a
    2-D numpy array with one inferred vector per input text.
    """
    vectors = []
    for text in txts:
        # Re-seed the model's RNG before every inference call so repeated
        # runs produce identical vectors; see
        # https://github.com/RaRe-Technologies/gensim/issues/447
        self.d2v.random.seed(conf.SEED)
        tokens = micro_tokenize(normalize(text))
        vectors.append(self.d2v.infer_vector(tokens))
    return numpy.vstack(vectors)
def preprocess(row):
    """Normalize and tokenize the first two fields of *row* joined by a space.

    Falsy fields (empty string / None) are skipped; when both are falsy the
    tokenizer receives the empty string. Returns the resulting token list.
    """
    # Equivalent to the original if/elif ladder: keep only truthy fields
    # and separate them with a single space.
    txt = ' '.join(part for part in (row[0], row[1]) if part)
    return micro_tokenize(normalize(txt))
def transform(self, txts):
    """Build a sparse document-by-cluster count matrix for *txts*.

    Each text is normalized and tokenized; every word is mapped to its
    cluster id via ``self.word2cid`` (unknown words fall back to the
    'UNK' cluster). Returns a ``scipy.sparse.csr_matrix`` of shape
    ``(len(txts), self.n_clust)`` whose (i, c) entry is the number of
    words of text i assigned to cluster c.
    """
    # Local import: the file's import header is outside this view.
    from collections import Counter

    unk = self.word2cid['UNK']
    counts = Counter()
    for doc_idx, txt in enumerate(txts):
        words = micro_tokenize(normalize(txt))
        # Count (document, cluster) pairs in one idiomatic pass instead
        # of the manual "if key in dict" increment pattern.
        counts.update((doc_idx, self.word2cid.get(w, unk)) for w in words)

    # Sorted keys keep the matrix construction deterministic.
    keys = sorted(counts)
    values = [counts[k] for k in keys]
    row_ind = [r for r, _ in keys]
    col_ind = [c for _, c in keys]
    return scipy.sparse.csr_matrix(
        (values, (row_ind, col_ind)),
        shape=(len(txts), self.n_clust),
    )
def preprocess(txt):
    """Normalize and tokenize *txt*, never returning an empty token list."""
    tokens = micro_tokenize(normalize(txt))
    # Zero-length sequences can crash training (tf.gather), so fall back
    # to a single dummy token when the tokenizer yields nothing.
    return tokens if tokens else ['asdfasdf']