import logging
import os

import numpy as np

import w2v_models                 # project-local word2vec wrappers
import text_utils as tu           # project-local text helpers (module name assumed)


def build_w2v_model(w2v_corpus_list, dataname="", window=0, size=0, min_count=0, rebuild=False, explore=False):
    w2v_model_name = w2v_models.make_w2v_model_name(dataname=dataname, size=size, window=window,
                                                    min_count=min_count)
    logging.info("Looking for model %s" % w2v_model_name)
    # reuse a saved model unless a rebuild was requested (explore mode always reuses)
    if (not rebuild or explore) and os.path.isfile(w2v_model_name):
        w2v_model = w2v_models.load_w2v(w2v_model_name)
        logging.info("Model Loaded")
    else:
        # flatten the corpora, then normalize punctuation and tokenize each text
        w2v_corpus = [tu.normalize_punctuation(text).split()
                      for text in np.concatenate(w2v_corpus_list)]
        w2v_model = w2v_models.build_word2vec(w2v_corpus, size=size, window=window, min_count=min_count, dataname=dataname)
        logging.info("Model created")
    # L2-normalize the vectors in place: saves memory, but no further training is possible
    w2v_model.init_sims(replace=True)

    #check_w2v_model(w2v_model=w2v_model)
    return w2v_model
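
A minimal usage sketch (illustrative only; the corpus values and dataname below are made up, and build_w2v_model is assumed to be importable from the surrounding module):

    corpus_a = ["first document text", "second document text"]
    corpus_b = ["another batch of text"]

    # trains a new model, or loads one if a matching model file already exists
    model = build_w2v_model([corpus_a, corpus_b], dataname="demo",
                            window=10, size=100, min_count=1)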
Example n. 2
    def fit(self, X, y=None):
        # assumes: import logging; from gensim import corpora; plus project-local
        # tu (text utilities) and w2v_models helper modules
        logging.info("W2V transformer: fitting %i documents" % len(X))

        # normalize punctuation and tokenize each document
        x_clean = [tu.normalize_punctuation(text).split() for text in X]

        if self.w2v_model is None:
            self.w2v_model = w2v_models.build_word2vec(x_clean, size=100, window=10, min_count=1, dataname="test")

        # no_below == 1 and no_above == 1 means no filtering, so skip the dictionary entirely
        if self.no_below == 1 and self.no_above == 1:
            self.no_dictionary = True
        else:
            self.dictionary = corpora.Dictionary(x_clean)
            self.dictionary.filter_extremes(no_above=self.no_above, no_below=self.no_below)

        # setting the coordinates for different models (start, stop)
        size = self.w2v_model.layer1_size
        self.feature_crd = {'00_avg': (0, size),
                            '01_std': (size, 2*size)}
        feature_cnt = 2
        start = 2 * size
        block = size  # every feature block spans one embedding width
        for i in range(1, self.diffmax0):
            name = "%02d_diff0_std_%i" % (feature_cnt, i)
            feature_cnt += 1
            self.feature_crd[name] = (start, start + block)
            start += block
        for i in range(1, self.diffmax1):
            name = "%02d_diff1_%i" % (feature_cnt, i)
            feature_cnt += 1
            self.feature_crd[name] = (start, start + block)
            start += block
            name = "%02d_diff1_std_%i" % (feature_cnt, i)
            feature_cnt += 1
            self.feature_crd[name] = (start, start + block)
            start += block
        self.length = start
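        # Worked example (illustrative): with size=100, diffmax0=3, diffmax1=2 the
        # layout comes out as 00_avg (0, 100), 01_std (100, 200),
        # 02_diff0_std_1 (200, 300), 03_diff0_std_2 (300, 400),
        # 04_diff1_1 (400, 500), 05_diff1_std_1 (500, 600), so self.length == 600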
        logging.info("Total feature length %i " % self.length )
        logging.info("W2V: got a model %s " % (self.w2v_model,))
        return self
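
Each entry of feature_crd maps a feature-block name to its (start, stop) column range, so individual blocks can be sliced out of the transformed matrix. A minimal sketch, assuming a matrix X_features laid out by this transformer (the variable names here are illustrative, not from the original code):

    import numpy as np

    X_features = np.zeros((5, 600))                # (n_docs, self.length), hypothetical
    feature_crd = {'00_avg': (0, 100), '01_std': (100, 200)}

    start, stop = feature_crd['01_std']
    std_block = X_features[:, start:stop]          # only the per-dimension std features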
Example n. 3
    # assumes: import logging; import numpy as np; from gensim import corpora;
    # from sklearn.preprocessing import StandardScaler; from sklearn.mixture import DPGMM
    # (scikit-learn < 0.20); plus project-local w2v_models and io helper modules
    def fit(self, X, y=None):
        # either construct a dictionary from X and trim it,
        # or trim an existing dictionary against the given set of words
        if self.dictionary is None:
            self.dictionary = corpora.Dictionary(X)
        # no_below is an absolute document count, no_above a corpus fraction,
        # and keep_n caps the vocabulary at the 9000 most frequent survivors
        self.dictionary.filter_extremes(no_below=self.no_below, no_above=self.no_above, keep_n=9000)

        if self.w2v_model is None:
            w2v_corpus = [[word for word in text if self.should_cluster_word(word)] for text in X]
            self.w2v_model = w2v_models.build_word2vec(w2v_corpus, size=100, window=10, min_count=self.no_below,
                                                       dataname=self.dataname+"_dpgmm")

        word_list = np.array([word for word in self.dictionary.token2id.keys()
                              if self.should_cluster_word(word)])

        # TODO: rewrite the old reclustering clause, which built word_list straight
        # from X with a double loop over texts and words

        # word_list already excludes rare/frequent words, single-character words,
        # and stopwords (see should_cluster_word); look up a vector for each
        vec_list = [self.w2v_model[word] for word in word_list]

        logging.info("DPGMM received %i words" % len(vec_list))

        # save word representations
        filename = "w2v_vocab_%s_%.1f_%.0f.lcsv" % (self.dataname, self.no_above, self.no_below)
        io.save_words_representations(filename, word_list, vec_list)

        # standardize the word vectors before clustering
        self.scaler = StandardScaler()
        vecs = self.scaler.fit_transform(np.array(vec_list))

        # DPGMM is the Dirichlet-process Gaussian mixture from scikit-learn < 0.20
        # (removed in 0.20 in favour of BayesianGaussianMixture)
        self.dpgmm = DPGMM(n_components=self.n_components, covariance_type='diag', alpha=self.alpha,
                           n_iter=1000, tol=0.0001)
        self.dpgmm.fit(vecs)
        logging.info("DPGMM converged: %s" % self.dpgmm.converged_)

        # save information about found clusters
        self.cluster_info = []
        y_ = self.dpgmm.predict(vecs)

        for i, cluster_center in enumerate(self.dpgmm.means_):
            cluster_words = word_list[y_ == i]
            cluster_size = len(cluster_words)
            if self.recluster_thresh > 0 and cluster_size > self.recluster_thresh:
                logging.info("DPGMM: reclustering %i words for cluster %i" % (len(cluster_words), i))
                sub_dpgmm = DPGMMClusterModel(w2v_model=self.w2v_model,
                                              n_components=self.n_sub_components,
                                              dictionary=self.dictionary,
                                              dataname="%s-%i" % (self.dataname, i), stoplist=self.stoplist)
                # recluster words.  Note the double array
                sub_dpgmm.fit([cluster_words])
                self.subdpgmms.append(sub_dpgmm)
                self.reclustered.append(i)
            if cluster_size > 0:
                # a richer version would rank words by cosine similarity to the
                # (inverse-scaled) cluster center via most_similar_cosmul;
                # for now just take the first ten words as representatives
                central_words = cluster_words[0:10]
            else:
                central_words = []
            self.cluster_info.append({'cnt': i, 'size': cluster_size, 'words': central_words})

        filename = "clusters_%s_%i_%.1f_%.0f.txt" % (self.dataname, self.n_components, self.no_above, self.no_below)
        io.save_cluster_info(filename, self.cluster_info)

        # feature coordinates: 'global' covers the top-level cluster features,
        # 'reclustered' everything except clusters that were split into sub-clusters
        self.feature_crd = {'global': range(0, self.n_components),
                            'reclustered': [i for i in range(0, self.n_components + self.n_sub_components*len(self.reclustered))
                                            if i not in self.reclustered]}

        return self
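
The helper should_cluster_word is referenced above but never shown. Based on the comments in fit, a minimal sketch might look like the following (the dictionary membership test, stoplist attribute, and length check are assumptions reconstructed from those comments, not the original implementation):

    def should_cluster_word(self, word):
        # keep a word only if it survived dictionary trimming (which drops rare and
        # overly frequent words), is longer than one character, and is not a stopword
        return (len(word) > 1
                and word not in self.stoplist
                and word in self.dictionary.token2id)

Since DPGMM was removed in scikit-learn 0.20, the closest modern replacement is BayesianGaussianMixture; a rough equivalent of the call above (the parameter mapping, especially alpha to weight_concentration_prior, is approximate):

    from sklearn.mixture import BayesianGaussianMixture

    # rough modern equivalent of DPGMM(covariance_type='diag', n_iter=1000, tol=0.0001)
    n_components, alpha = 30, 1.0   # illustrative values
    dpgmm = BayesianGaussianMixture(n_components=n_components,
                                    covariance_type='diag',
                                    weight_concentration_prior=alpha,
                                    max_iter=1000, tol=1e-4)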