# Imports used by the snippets below. `w2v_models`, `tu` (text utilities) and
# `io` (result serialization) are project-local modules whose import paths are
# not shown in this excerpt.
import os
import logging

import numpy as np
from gensim import corpora
from sklearn.mixture import DPGMM  # sklearn < 0.20; removed in later versions
from sklearn.preprocessing import StandardScaler


def build_w2v_model(w2v_corpus_list, dataname="", window=0, size=0, min_count=0,
                    rebuild=False, explore=False):
    w2v_model_name = w2v_models.make_w2v_model_name(dataname=dataname, size=size,
                                                    window=window, min_count=min_count)
    logging.info("Looking for model %s" % w2v_model_name)
    # re-use a cached model unless a rebuild was requested
    if (not rebuild or explore) and os.path.isfile(w2v_model_name):
        w2v_model = w2v_models.load_w2v(w2v_model_name)
        logging.info("Model loaded")
    else:
        w2v_corpus = np.array([tu.normalize_punctuation(text).split()
                               for text in np.concatenate(w2v_corpus_list)])
        w2v_model = w2v_models.build_word2vec(w2v_corpus, size=size, window=window,
                                              min_count=min_count, dataname=dataname)
        logging.info("Model created")
    # normalize the vectors in place; the model can no longer be trained after this
    w2v_model.init_sims(replace=True)
    #check_w2v_model(w2v_model=w2v_model)
    return w2v_model
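# A minimal usage sketch (illustrative, not from the original source):
# `train_texts` and `extra_texts` stand for array-likes of raw strings, and
# the hyperparameter values are assumptions chosen for demonstration.
#
#   model = build_w2v_model([train_texts, extra_texts], dataname="reviews",
#                           window=10, size=100, min_count=5)
#   model.most_similar("good", topn=3)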
# fit() of the w2v feature extractor: trains a word2vec model if one was not
# supplied and lays out the (start, stop) column range of each feature block.
def fit(self, X, y=None):
    logging.info("Fitting w2v-based features")
    x_clean = [tu.normalize_punctuation(text).split() for text in X]
    if self.w2v_model is None:
        self.w2v_model = w2v_models.build_word2vec(x_clean, size=100, window=10,
                                                   min_count=1, dataname="test")
    if self.no_below == 1 and self.no_above == 1:
        self.no_dictionary = True
    else:
        self.dictionary = corpora.Dictionary(x_clean)
        self.dictionary.filter_extremes(no_above=self.no_above, no_below=self.no_below)

    # setting the coordinates for the different feature blocks (start, stop)
    size = self.w2v_model.layer1_size
    self.feature_crd = {'00_avg': (0, size),
                        '01_std': (size, 2 * size)}
    feature_cnt = 2
    start = 2 * size
    l = size
    for i in range(1, self.diffmax0):
        #name = "%02d_diff0_%i" % (feature_cnt, i)
        #feature_cnt += 1
        #val = (start, start + l)
        #self.feature_crd[name] = val
        #start += l
        name = "%02d_diff0_std_%i" % (feature_cnt, i)
        feature_cnt += 1
        val = (start, start + l)
        self.feature_crd[name] = val
        start += l
    for i in range(1, self.diffmax1):
        name = "%02d_diff1_%i" % (feature_cnt, i)
        feature_cnt += 1
        val = (start, start + l)
        self.feature_crd[name] = val
        start += l
        name = "%02d_diff1_std_%i" % (feature_cnt, i)
        feature_cnt += 1
        val = (start, start + l)
        self.feature_crd[name] = val
        start += l
    self.length = start
    logging.info("Total feature length %i" % self.length)
    logging.info("W2V: got a model %s" % (self.w2v_model,))
    return self
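# Worked example of the resulting layout (illustrative numbers): with
# layer1_size == 100, diffmax0 == 2 and diffmax1 == 2, fit() produces
#
#   '00_avg'         -> (0, 100)
#   '01_std'         -> (100, 200)
#   '02_diff0_std_1' -> (200, 300)
#   '03_diff1_1'     -> (300, 400)
#   '04_diff1_std_1' -> (400, 500)
#
# so self.length == 500, i.e. (2 + (diffmax0 - 1) + 2*(diffmax1 - 1)) * size.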
# DPGMMClusterModel.fit(): clusters the word2vec vectors of the dictionary
# words with a Dirichlet-process GMM, optionally re-clustering large clusters.
def fit(self, X, y=None):
    # either construct a dictionary from X and trim it
    if self.dictionary is None:
        self.dictionary = corpora.Dictionary(X)
    # or use an existing dictionary and trim the given set of words
    self.dictionary.filter_extremes(no_below=self.no_below, no_above=self.no_above, keep_n=9000)
    if self.w2v_model is None:
        w2v_corpus = [[word for word in text if self.should_cluster_word(word)] for text in X]
        self.w2v_model = w2v_models.build_word2vec(w2v_corpus, size=100, window=10,
                                                   min_count=self.no_below,
                                                   dataname=self.dataname + "_dpgmm")

    # construct the list of words to cluster: should_cluster_word() drops rare
    # and frequent words, words of length 1, and stopwords
    word_list = np.array([word for word in self.dictionary.token2id.iterkeys()
                          if self.should_cluster_word(word)])
    # This was the reclustering clause - it needs to be rewritten
    # else:
    #     # note the double loop here!!
    #     word_list = np.array([word for text in X for word in text if self.should_cluster_word(word)])

    vec_list = [self.w2v_model[word] for word in word_list]
    logging.info("DPGMM received %i words" % len(vec_list))

    # save the word representations
    filename = "w2v_vocab_%s_%.1f_%.0f.lcsv" % (self.dataname, self.no_above, self.no_below)
    io.save_words_representations(filename, word_list, vec_list)

    self.scaler = StandardScaler()
    vecs = self.scaler.fit_transform(np.array(vec_list))

    self.dpgmm = DPGMM(n_components=self.n_components, covariance_type='diag',
                       alpha=self.alpha, n_iter=1000, tol=0.0001)
    self.dpgmm.fit(vecs)
    logging.info("DPGMM converged: %s" % self.dpgmm.converged_)

    # save information about the clusters that were found
    self.cluster_info = []
    y_ = self.dpgmm.predict(vecs)
    for i, cluster_center in enumerate(self.dpgmm.means_):
        cluster_words = word_list[y_ == i]
        cluster_size = len(cluster_words)
        if cluster_size > self.recluster_thresh and self.recluster_thresh > 0:
            logging.info("DPGMM: reclustering %i words for cluster %i" % (len(cluster_words), i))
            sub_dpgmm = DPGMMClusterModel(w2v_model=self.w2v_model,
                                          n_components=self.n_sub_components,
                                          dictionary=self.dictionary,
                                          dataname="%s-%i" % (self.dataname, i),
                                          stoplist=self.stoplist)
            # recluster the words of this cluster. Note the double array
            sub_dpgmm.fit([cluster_words])
            self.subdpgmms.append(sub_dpgmm)
            self.reclustered.append(i)
        if cluster_size > 0:
            #cluster_center_original = self.scaler.inverse_transform(cluster_center)
            #similar_words = self.w2v_model.most_similar_cosmul(positive=[cluster_center_original], topn=cluster_size)
            #central_words = [word for word, _ in similar_words if word in cluster_words]
            central_words = cluster_words[0:10]
        else:
            central_words = []
        self.cluster_info.append({'cnt': i, 'size': cluster_size, 'words': central_words})
    filename = "clusters_%s_%i_%.1f_%.0f.txt" % (self.dataname, self.n_components,
                                                 self.no_above, self.no_below)
    io.save_cluster_info(filename, self.cluster_info)

    # setting up the coordinates for the features
    self.feature_crd = {'global': range(0, self.n_components),
                        'reclustered': [i for i in range(0, self.n_components +
                                                         self.n_sub_components * len(self.reclustered))
                                        if i not in self.reclustered]}
    return self
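# Note: sklearn.mixture.DPGMM was deprecated in scikit-learn 0.18 and removed
# in 0.20. A rough modern equivalent (a sketch, not a drop-in replacement;
# `alpha` maps onto `weight_concentration_prior`) would be:
#
#   from sklearn.mixture import BayesianGaussianMixture
#   dpgmm = BayesianGaussianMixture(n_components=self.n_components,
#                                   covariance_type='diag',
#                                   weight_concentration_prior_type='dirichlet_process',
#                                   weight_concentration_prior=self.alpha,
#                                   max_iter=1000, tol=0.0001)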