def w2v_cluster_tweet_vocab(filename, window=0, size=0, dataname="", n_components=0, min_count=1, rebuild=False): print "Clustering" x_data, y_data, stoplist, _ = make_x_y(filename, ["text"]) w2v_corpus = np.array([tu.normalize_punctuation(text).split() for text in x_data]) #w2v_model = build_w2v_model(w2v_corpus, dataname=dataname, window=window, size=size, min_count=min_count, # rebuild=rebuild, explore=False) dpgmm = transformers.DPGMMClusterModel(w2v_model=None, n_components=n_components, dataname=dataname, stoplist=stoplist, recluster_thresh=0, no_above=0.9, no_below=5, alpha=5) dpgmm.fit(w2v_corpus)
def build_w2v_model(w2v_corpus_list, dataname="", window=0, size=0, min_count=0, rebuild=False, explore=False):
    """Load a cached word2vec model from disk if permitted, else train a new one.

    The cached model is reused when it exists on disk and either rebuild was
    not requested or explore is set. The returned model has its similarity
    vectors precomputed in place.
    """
    model_path = w2v_models.make_w2v_model_name(dataname=dataname, size=size,
                                                window=window, min_count=min_count)
    logging.info("Looking for model %s" % model_path)
    use_cached = (not rebuild or explore) and os.path.isfile(model_path)
    if use_cached:
        w2v_model = w2v_models.load_w2v(model_path)
        logging.info("Model Loaded")
    else:
        # Flatten the list of corpora, then normalize and tokenize each text.
        all_texts = np.concatenate(w2v_corpus_list)
        w2v_corpus = np.array([tu.normalize_punctuation(text).split() for text in all_texts])
        w2v_model = w2v_models.build_word2vec(w2v_corpus, size=size, window=window,
                                              min_count=min_count, dataname=dataname)
        logging.info("Model created")
    # Precompute normalized vectors; replace=True discards the raw vectors
    # (gensim convention) so the model becomes read-only for similarity queries.
    w2v_model.init_sims(replace=True)
    return w2v_model
def build_and_vectorize_dpgmm(x_data=None, y_data=None, unlabeled_data=None, dataname="", n_components=0,
                              rebuild=False, action="classify", stoplist=None, min_count=1,
                              recluster_thresh=0, no_above=0.9, no_below=5):
    """Fit a DPGMM cluster model on the pooled labeled+unlabeled texts and
    vectorize x_data with it.

    The fitted model and the vectorized data are pickled to
    "<dataname>_dpgmm" and "<dataname>_dpgmm_data" respectively.

    Returns:
        (dpgmm_data, feature_crd): the transformed x_data and the model's
        feature coordinate description.
    """
    # Pool labeled and unlabeled texts so clustering sees the full vocabulary.
    w2v_corpus = np.array([tu.normalize_punctuation(text).split()
                           for text in np.concatenate([x_data, unlabeled_data])])
    # BUG FIX: recluster_thresh was previously hard-coded to 0 here,
    # silently ignoring the function parameter (default is still 0, so
    # existing default-call behavior is unchanged).
    dpgmm = transformers.DPGMMClusterModel(w2v_model=None, n_components=n_components, dataname=dataname,
                                           stoplist=stoplist, recluster_thresh=recluster_thresh,
                                           no_above=no_above, no_below=no_below, alpha=5)
    dpgmm.fit(w2v_corpus)
    # BUG FIX: the model used to be pickled BEFORE fit(), so the file on
    # disk held an unfitted model. Dump after fitting, and close the file
    # handle deterministically (it was previously leaked).
    with open(dataname + "_dpgmm", 'wb') as model_file:
        pickle.dump(dpgmm, model_file)
    dpgmm_data = dpgmm.transform(x_data)
    with open(dataname + "_dpgmm_data", 'wb') as data_file:
        pickle.dump(dpgmm_data, data_file)
    return dpgmm_data, dpgmm.feature_crd
def check_w2v_model(filename="", w2v_model=None, window=0, size=0, min_count=1, dataname="", rebuild=True): print "Checking model for consistency" if w2v_model is None: x_data, y_data, stoplist = make_x_y(filename, ["text"]) w2v_corpus = np.array([tu.normalize_punctuation(text).split() for text in x_data]) logging.info("Pre-processing 2 done") logging.info("First line: %s" % w2v_corpus[0]) logging.info("Last line: %s" % w2v_corpus[-1]) w2v_model = build_w2v_model(w2v_corpus, dataname=dataname, window=window, size=size, min_count=min_count, rebuild=rebuild, explore=False) test_words = open("/Users/verasazonova/Work/PycharmProjects/tweet_mining/tweet_mining/tests.txt", 'r').readlines() for word_list in test_words: pos_words = word_list.split(':')[0].split() neg_words = word_list.split(':')[1].split() list_similar = w2v_models.test_word2vec(w2v_model, word_list=pos_words, neg_list=neg_words) print "%s - %s" % (pos_words, neg_words) for word, similarity in list_similar: print similarity, repr(word)