def pre_process(self, text):
    # Tokenize the text; optionally keep only dictionary tokens and drop stopwords
    if self.no_dictionary:
        x_processed = tu.normalize_punctuation(text).split()
    else:
        x_processed = [word for word in tu.normalize_punctuation(text).split()
                       if word in self.dictionary.token2id and word not in self.stoplist]
    return x_processed
def fit(self, X, y=None):
    logging.info("Fitting on %i texts" % len(X))
    x_clean = [tu.normalize_punctuation(text).split() for text in X]
    # Train a word2vec model if one was not supplied
    if self.w2v_model is None:
        self.w2v_model = w2v_models.build_word2vec(x_clean, size=100, window=10,
                                                   min_count=1, dataname="test")
    # Build a filtering dictionary unless no frequency filtering is requested
    if self.no_below == 1 and self.no_above == 1:
        self.no_dictionary = True
    else:
        self.dictionary = corpora.Dictionary(x_clean)
        self.dictionary.filter_extremes(no_above=self.no_above, no_below=self.no_below)

    # Set the coordinates (start, stop) of every feature block in the output vector
    size = self.w2v_model.layer1_size
    self.feature_crd = {'00_avg': (0, size),
                        '01_std': (size, 2 * size)}
    feature_cnt = 2
    start = 2 * size
    l = size
    for i in range(1, self.diffmax0):
        # plain diff0 features are currently disabled
        #name = "%02d_diff0_%i" % (feature_cnt, i)
        #feature_cnt += 1
        #val = (start, start + l)
        #self.feature_crd[name] = val
        #start += l
        name = "%02d_diff0_std_%i" % (feature_cnt, i)
        feature_cnt += 1
        val = (start, start + l)
        self.feature_crd[name] = val
        start += l
    for i in range(1, self.diffmax1):
        name = "%02d_diff1_%i" % (feature_cnt, i)
        feature_cnt += 1
        val = (start, start + l)
        self.feature_crd[name] = val
        start += l
        name = "%02d_diff1_std_%i" % (feature_cnt, i)
        feature_cnt += 1
        val = (start, start + l)
        self.feature_crd[name] = val
        start += l
    self.length = start
    logging.info("Total feature length %i" % self.length)
    logging.info("W2V: got a model %s" % (self.w2v_model,))
    return self
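# A worked illustration of the feature layout produced by fit() above, under the
# assumed (hypothetical) configuration size=100, diffmax0=3, diffmax1=2; the exact
# numbers depend on the word2vec layer size and the diffmax settings actually used:
#
#   self.feature_crd == {
#       '00_avg':          (0, 100),    # mean of word vectors
#       '01_std':          (100, 200),  # std of word vectors
#       '02_diff0_std_1':  (200, 300),
#       '03_diff0_std_2':  (300, 400),
#       '04_diff1_1':      (400, 500),
#       '05_diff1_std_1':  (500, 600),
#   }
#   self.length == 600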
def transform(self, X):
    # Text pre-processing
    x_clean = [tu.normalize_punctuation(text).split() for text in X]
    logging.info("DPGMM: text preprocessed")
    # Vectorize using the W2V model and DPGMM cluster-based features
    if self.dpgmm is not None:
        logging.info("Vectorizing a corpus")
        size = self.w2v_model.layer1_size
        if len(X) > 0:
            vecs = np.concatenate([self.clusterize(z) for z in x_clean], axis=0)
        else:
            vecs = np.zeros(size).reshape((1, size))
        logging.info("DPGMM: returning pre-processed data of shape %s" % (vecs.shape,))
    else:
        logging.info("DPGMM: no model was provided.")
        vecs = np.zeros((len(X), 1))
    return vecs
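# A minimal usage sketch, assuming this vectorizer is constructed elsewhere in the
# repository; the class name `W2VDPGMMVectorizer` and the constructor arguments
# shown here are hypothetical and only illustrate the fit/transform flow:
#
#     vectorizer = W2VDPGMMVectorizer(no_below=1, no_above=1, diffmax0=3, diffmax1=2)
#     vectorizer.fit(train_texts)             # trains word2vec, lays out feature_crd
#     features = vectorizer.transform(texts)  # numpy array of per-text feature vectors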