def train_clustering(n_topics=100, method='gmm'): ### Load extra data dataloader = csv_dataloader(datafile='data/extra_statuses.csv') CACHE_FILE = 'output/extra_cache.pk' if not os.path.exists(CACHE_FILE): dataloader.read_csv(applyfun=preprocess, verbose=True) dataloader.save(CACHE_FILE) else: dataloader.load(CACHE_FILE) dataloader.summary() tokens = sum(dataloader.data.viewvalues(), []) print '#Tokens from training data: ' + str(len(tokens)) print 'Readin done' ### Load pre-train word2vector model word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300) print 'Pretrained word2vec loaded' ### Convert word to vector train_vectors = word2vec.batch_convert(tokens) print '#Vectors from training data: ' + str(len(train_vectors)) save(train_vectors, 'output/extra_vectors.pk') ### Train Clustering clusters = get_clusters(method=method, n_topics=n_topics) if not os.path.exists('output/clustering_'+ method + '_' + str(n_topics) + '.pk'): print 'Training Clusters...' clusters.fit(train_vectors) clusters.save('output/clustering_'+ method + '_' + str(n_topics) + '.pk') clusters.summary() else: print 'Cluster Model Loaded...' clusters.load('output/clustering_'+ method + '_' + str(n_topics) + '.pk')
def draw_word2vec(): ### Load data dataloader = csv_dataloader() dataloader.load("output/data_cache.pk") print "Read in finished" ### Load pre-train word2vector model word2vec = get_word2vec(model="data/GoogleNews-vectors-negative300.bin", binary=True, size=300) print "Pretrained word2vec loaded" all_tokens = sum(dataloader.data.viewvalues(), []) print "#Tokens: " + str(len(all_tokens)) fdist = FreqDist(all_tokens) tokens = fdist.keys()[1:500] print tokens tokens_has_vectors = [] for token in tokens: if word2vec[token] is not None: tokens_has_vectors.append(token) print "#Unique Tokens \w Vectors: " + str(len(tokens_has_vectors)) vectors = word2vec.encode(tokens_has_vectors) print "#Unique Vectors: " + str(len(vectors)) print ("Computing MDS embedding") clf = manifold.MDS(n_components=2, n_init=1, max_iter=2000) # clf = manifold.Isomap(n_components=2, max_iter=100) vectors_mds = clf.fit_transform(vectors) print ("Done. Stress: %f" % clf.stress_) plot_embedding(vectors_mds, tokens_has_vectors, "MDS embedding of the words")
def draw_gmm(): ### Load data dataloader = csv_dataloader() dataloader.load('output/data_cache.pk') print "Read in finished" ### Load pre-train word2vector model word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300) print 'Pretrained word2vec loaded' ### Reverse engineering, build vector2word dictionary # vec2word = [] # words =[] # for voc, obj in word2vec.model.vocab: # words.append(voc) # vec2word.append(word2vec.model.syn0[obj.index]) all_vectors = word2vec.model.syn0 ### Train BoW n_topics = 25 model_file = 'output/clustering_gmm_100.pk'; clusters = get_clusters(method='gmm', n_topics=n_topics) clusters.load(model_file) print clusters.clusters.means_ knn = NearestNeighbors(n_neighbors=10) knn.fit(all_vectors) save(knn, 'output\draw_gmm_knn.pk') nns = knn.kneighbors(clusters.clusters.means_, return_distance=False) for i in range(np.shape(nns)[0]): print 'Topic ' + str(i+1) + ': ', for j in range(np.shape(nns)[1]): print str(word2vec.model.index2word[nns[i,j]]) + ' ', print ''
def main(n_fold=10): ### Load data dataloader = csv_dataloader(extrafile='data/fixed_train_gender_class.csv', extra=True) if not os.path.exists('output/data_cache.pk'): dataloader.read_csv(applyfun=preprocess) dataloader.save('output/data_cache.pk') else: dataloader.load('output/data_cache.pk') dataloader.summary() print "Read in finished" ### Load pre-train word2vector model word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300) print 'Pretrained word2vec loaded' ### Train BoW n_topics = 25 model_file = 'output/clustering_gmm_25.pk'; clusters = get_clusters(method='gmm', n_topics=n_topics) if not os.path.exists(model_file): ### Convert word to vector tokens = sum(dataloader.data.viewvalues(), []) print '#Tokens from training data: ' + str(len(tokens)) train_vectors = word2vec.encode(tokens) print '#Vectors from training data: ' + str(len(train_vectors)) print 'Training Clusters...' clusters.fit(train_vectors) clusters.save(model_file) clusters.summary() else: print 'Cluster Model Loaded...' clusters.load(model_file) ### Calculate LIWC hist LIWC = get_LIWC() #print LIWC.calculate_hist(tokens, normalize=False) ### ============================================================ ### n fold ### ============================================================ nfolds = dataloader.nfold(n_fold) fscores = [] models = [] for fold_ind in range(n_fold): print '======================== FOLD ' + str(fold_ind+1) + '========================' test_id = nfolds[fold_ind] train_id = [] for i in range(n_fold): if i != fold_ind: train_id += nfolds[i] ### ============================================================ ### Train Part ### ============================================================ print 'Training>>>>>>>>>>>>>>>>>>>>>>>>>' train_data = dataloader.data_retrieve(train_id) ### Balance Train Data _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2) encode_pos = encode_feature(train_data, train_pos_id, [word2vec, clusters, LIWC]) encode_pos = SMOTE(encode_pos, 200, len(train_pos_id)/4) label_pos = np.ones(len(encode_pos)) encode_neg = encode_feature(train_data, train_pos_id, [word2vec, clusters, LIWC]) label_neg = np.zeros(len(encode_neg)) encode = np.concatenate((encode_pos, encode_neg), axis=0) label = np.concatenate((label_pos, label_neg), axis=0) print encode.shape print label.shape encode = preprocessing.scale(encode) classifier = svm.LinearSVC(verbose=True) #weight = label+1 # pos:neg = 2:1 for imbalanced training classifier.fit(encode, label) print 'F1 score: ' + str(f1_score(label, classifier.predict(encode))) ### ============================================================ ### Test Part ### ============================================================ print 'Testing>>>>>>>>>>>>>>>>>>>>>>>>>' test_data = dataloader.data_retrieve(test_id) encode = encode_feature(test_data, test_id, [word2vec, clusters, LIWC]) label = dataloader.label_retrieve(test_id) encode = preprocessing.scale(encode) print 'F1 score: ' + str(f1_score(label, classifier.predict(encode))) fscores.append(f1_score(label, classifier.predict(encode))) models.append(classifier) print 'MEAN F1 score: ' + str(np.mean(fscores)) print 'BEST F1 score: ' + str(np.max(fscores)) + ' by Model ' + str(np.argmax(fscores)+1) print 'VAR F1 score: ' + str(np.var(fscores)) save(models[np.argmax(fscores)], 'output/model_clustering_' + str(fscores[np.argmax(fscores)]) + '.pk')