import numpy

def transform(self, txts):
    res = []
    for txt in txts:
        # re-seed before every call so infer_vector is deterministic,
        # see https://github.com/RaRe-Technologies/gensim/issues/447
        self.d2v.random.seed(conf.SEED)
        v = self.d2v.infer_vector(micro_tokenize(normalize(txt)))
        res.append(v)
    return numpy.vstack(res)
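A minimal usage sketch, assuming a fitted wrapper object (here called `d2v_transformer`, a hypothetical name) holding a trained gensim Doc2Vec model in `self.d2v`:

# Hypothetical usage; `d2v_transformer` and the inputs are illustrative only.
vecs = d2v_transformer.transform(['first document', 'second document'])
print(vecs.shape)  # (2, vector_size of the underlying Doc2Vec model)

Re-seeding before every infer_vector call trades a little speed for reproducible vectors across runs.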
def preprocess(row):
    # concatenate the two text fields of a row, skipping empty/missing ones
    if row[0] and row[1]:
        txt = row[0] + ' ' + row[1]
    elif row[0]:
        txt = row[0]
    elif row[1]:
        txt = row[1]
    else:
        txt = ''
    return micro_tokenize(normalize(txt))
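An illustrative call, assuming `row` is a two-field record where either field may be empty or None:

# Hypothetical input; the second field is missing, so only the first is used.
tokens = preprocess(('some title text', None))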
import scipy.sparse

def transform(self, txts):
    # count cluster-id occurrences per document: (doc index, cluster id) -> count
    data = {}
    unk = self.word2cid['UNK']
    for i, txt in enumerate(txts):
        words = micro_tokenize(normalize(txt))
        cids = [self.word2cid.get(w, unk) for w in words]
        for c in cids:
            data[(i, c)] = data.get((i, c), 0) + 1
    # assemble a sparse document-by-cluster count matrix
    keys = sorted(data.keys())
    values = [data[k] for k in keys]
    row_ind = [k[0] for k in keys]
    col_ind = [k[1] for k in keys]
    X = scipy.sparse.csr_matrix((values, (row_ind, col_ind)),
                                shape=(len(txts), self.n_clust))
    return X
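A short usage sketch, assuming a fitted object (hypothetically named `clust_transformer`) whose `word2cid` maps words to cluster ids with an 'UNK' entry, and whose `n_clust` is the total number of clusters:

# Hypothetical usage; names and inputs are illustrative only.
X = clust_transformer.transform(['some text', 'more text'])
print(X.shape)  # (2, n_clust): one row per document, one column per cluster
print(X.nnz)    # number of non-zero (document, cluster) count entries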
Example #4
def preprocess(txt):
    words = micro_tokenize(normalize(txt))
    # sequences of length 0 can make the training crash (tf.gather),
    # so substitute a dummy out-of-vocabulary token
    if len(words) == 0:
        words = ['asdfasdf']
    return words
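An illustrative call showing the fallback; the dummy token is assumed to land in the UNK bucket downstream:

# Hypothetical usage; an empty input yields the placeholder sequence.
preprocess('')  # -> ['asdfasdf']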