def train_LDA(n_topics=100):
    """Fit an LDA topic model on the extra-status corpus, or load it from cache.

    Every stage (parsed CSV, word-to-id dictionary, topic model) is pickled;
    a stage whose pickle already exists is loaded instead of recomputed.

    Args:
        n_topics: Number of topics handed to ``get_topics``; it is also part
            of the model's file name.
    """
    import itertools  # function-scope import: only the flattening below needs it

    cache_file = 'output/extra_cache.pk'
    # NOTE(review): the other entry points in this file keep the dictionary at
    # 'output/word2id.pk'; this one uses the working directory -- confirm.
    word2id_file = 'word2id.pk'
    model_file = 'output/lda_all_' + str(n_topics) + '.pk'

    ### Load extra data
    dataloader = csv_dataloader(datafile='data/extra_statuses.csv')
    if os.path.exists(cache_file):
        dataloader.load(cache_file)
    else:
        dataloader.read_csv(applyfun=preprocess, verbose=True)
        dataloader.save(cache_file)

    # Flatten in linear time: sum(lists, []) copies the growing accumulator
    # on every addition, which is quadratic in the number of tokens.
    tokens = list(itertools.chain.from_iterable(dataloader.ldata.viewvalues()))
    print('#Tokens from training data: ' + str(len(tokens)))
    print('Readin done')

    ### Get word2id first
    word2id = get_word2id()
    if os.path.exists(word2id_file):
        word2id.load(word2id_file)
    else:
        word2id.fit(tokens)
        word2id.save(word2id_file)
    ids = word2id.ids()
    print("#Id: " + str(len(ids.keys())))

    ### Train LDA
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    if os.path.exists(model_file):
        topics.load(model_file)
    else:
        print('Training LDA...')
        topics.fit(tokens)
        topics.save(model_file)
        topics.summary()
def train_clustering(n_topics=100, method='gmm'):
    """Fit a clustering model on word2vec vectors of the extra-status corpus.

    The parsed CSV and the fitted model are cached as pickles. The token
    vectors are recomputed and written to 'output/extra_vectors.pk' on every
    call, whether or not a fitted model is already cached.

    Args:
        n_topics: Number of clusters handed to ``get_clusters``; also part of
            the model's file name.
        method: Clustering method name handed to ``get_clusters``; also part
            of the model's file name.
    """
    import itertools  # function-scope import: only the flattening below needs it

    cache_file = 'output/extra_cache.pk'
    # Built once instead of three times so the check, save and load cannot drift apart.
    model_file = 'output/clustering_' + method + '_' + str(n_topics) + '.pk'

    ### Load extra data
    dataloader = csv_dataloader(datafile='data/extra_statuses.csv')
    if os.path.exists(cache_file):
        dataloader.load(cache_file)
    else:
        dataloader.read_csv(applyfun=preprocess, verbose=True)
        dataloader.save(cache_file)
    dataloader.summary()

    # Flatten in linear time: sum(lists, []) copies the growing accumulator
    # on every addition, which is quadratic in the number of tokens.
    tokens = list(itertools.chain.from_iterable(dataloader.data.viewvalues()))
    print('#Tokens from training data: ' + str(len(tokens)))
    print('Readin done')

    ### Load pre-train word2vector model
    word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    print('Pretrained word2vec loaded')

    ### Convert word to vector
    train_vectors = word2vec.batch_convert(tokens)
    print('#Vectors from training data: ' + str(len(train_vectors)))
    save(train_vectors, 'output/extra_vectors.pk')

    ### Train Clustering
    clusters = get_clusters(method=method, n_topics=n_topics)
    if os.path.exists(model_file):
        print('Cluster Model Loaded...')
        clusters.load(model_file)
    else:
        print('Training Clusters...')
        clusters.fit(train_vectors)
        clusters.save(model_file)
        clusters.summary()
def train_wordfrequency(n_dims = 50):
    """Print the first 100 keys of the normalised positive / negative word frequencies.

    Loads the cached training data, splits it with ``balance(..., 'full')``,
    builds a ``FreqDist`` per subset and normalises each against the
    frequency distribution of the whole corpus.

    Args:
        n_dims: Not used by the body as written.
    """
    def _flatten(doc_map):
        # Concatenate every per-document token list into one list.
        return sum(doc_map.viewvalues(), [])

    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print("Read in finished")

    all_ids = dataloader.id
    _, pos_id, neg_id = dataloader.balance(all_ids, 'full')
    pos_subset = dataloader.data_retrieve(pos_id)
    neg_subset = dataloader.data_retrieve(neg_id)

    tokens = _flatten(dataloader.data)
    tokens_pos = _flatten(pos_subset['data'])
    tokens_neg = _flatten(neg_subset['data'])

    fdist_base = FreqDist(tokens)
    fdist_pos = normalize(FreqDist(tokens_pos), fdist_base)
    fdist_neg = normalize(FreqDist(tokens_neg), fdist_base)

    print(list(fdist_pos.viewkeys())[:100])
    print(list(fdist_neg.viewkeys())[:100])

    # Token-level labels and corpus; built here but not used further in this function.
    labels = [1] * len(tokens_pos) + [0] * len(tokens_neg)
    corpus = tokens_pos + tokens_neg
def draw_word2vec():
    """Plot a 2-D MDS embedding of word2vec vectors for a slice of the vocabulary.

    Takes entries 1..499 of ``FreqDist(...).keys()`` over the cached training
    tokens, keeps those for which the word2vec wrapper returns a vector, and
    plots their MDS projection.
    """
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load("output/data_cache.pk")
    print("Read in finished")

    ### Load pre-train word2vector model
    word2vec = get_word2vec(model="data/GoogleNews-vectors-negative300.bin", binary=True, size=300)
    print("Pretrained word2vec loaded")

    all_tokens = sum(dataloader.data.viewvalues(), [])
    print("#Tokens: " + str(len(all_tokens)))

    vocabulary = FreqDist(all_tokens).keys()
    # NOTE(review): the slice starts at index 1, so the first key is skipped -- confirm intent.
    tokens = vocabulary[1:500]
    print(tokens)

    # Keep only the tokens the word2vec wrapper can resolve to a vector.
    tokens_has_vectors = [tok for tok in tokens if word2vec[tok] is not None]
    print("#Unique Tokens \\w Vectors: " + str(len(tokens_has_vectors)))

    vectors = word2vec.encode(tokens_has_vectors)
    print("#Unique Vectors: " + str(len(vectors)))

    print("Computing MDS embedding")
    # Alternative tried previously: manifold.Isomap(n_components=2, max_iter=100)
    embedder = manifold.MDS(n_components=2, n_init=1, max_iter=2000)
    vectors_mds = embedder.fit_transform(vectors)
    print("Done. Stress: %f" % embedder.stress_)

    plot_embedding(vectors_mds, tokens_has_vectors, "MDS embedding of the words")
def train_LLDA(n_topics=2):
    """Fit an LLDA model on the cached training data, or load it from 'output/LLDA_8.pk'.

    Args:
        n_topics: Not used by the body as written; the model is created with
            ``max_iter=2`` only and the file name is fixed.
    """
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print("Read in finished")

    train_id = dataloader.id
    train_data = dataloader.data_retrieve(train_id)

    # One document and one label per training id, in id order.
    corpus = [train_data['ldata'][sample_id] for sample_id in train_id]
    labels = [score2label(train_data['score'][sample_id]) for sample_id in train_id]

    ### Train LLDA
    topics = get_topics2(method='LLDA', max_iter=2)
    model_file = 'output/LLDA_8.pk'
    if os.path.exists(model_file):
        topics.load(model_file)
    else:
        print('Training LLDA...')
        topics.fit(corpus, labels, verbose=True)
        topics.save(model_file)
    topics.summary()
def predict():
    """Classify the test statuses and write the predictions to 'output/result.csv'.

    Features are the LDA topic encoding plus the LIWC encoding, standardised
    with ``preprocessing.scale``; the classifier is loaded from a fixed
    pickle. One CSV row is written per prediction.
    """
    cache_path = 'output/test_cache.pk'
    dataloader = csv_dataloader(datafile='data/test_statuses.csv', extrafile='data/test_metadata.csv',
                                nolabel=True, extra=True)
    if os.path.exists(cache_path):
        dataloader.load(cache_path)
    else:
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save(cache_path)
    dataloader.summary()
    print("Read in finished")

    word2id = get_word2id()
    word2id.load('output/word2id.pk')
    ids = word2id.ids()

    n_topics = 100
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    topics.load('output/lda_all_100.pk')

    # A word2vec + GMM clustering encoder ('output/clustering_gmm_100.pk') was
    # used here previously in place of the LDA topics; it is currently disabled.

    ### Calculate LIWC hist
    LIWC = get_LIWC()

    test_id = dataloader.id
    test_data = dataloader.data_retrieve(test_id)

    ## Generate Test Data Encodings
    encode = encode_feature(test_data, test_id, [topics, LIWC])
    print(encode)
    encode = preprocessing.scale(encode)

    classifier = load('output/model_LDA_100_0.461538461538.pk')
    predict_label = classifier.predict(encode)
    predict_prob = classifier.predict_proba(encode)

    with open('output/result.csv', 'w+') as out:
        out.write('userID, binaryPrediction, confidence, regression\n')
        for row, predicted in enumerate(predict_label):
            sign = '+' if predicted == 1 else '-'
            # Column 1 of predict_proba is reported as the confidence.
            out.write(test_id[row] + ', ' + sign + ', ' + str(predict_prob[row][1]) + ', ' + 'N\n')

    print(str(sum(predict_label)) + '/' + str(len(predict_label)))
def draw_LIWC_hist():
    """Plot LIWC category histograms for the cached training data.

    Panel (a) shows the un-normalised histogram over all training tokens;
    panel (b) shows the normalised histograms of the positive and negative
    subsets side by side, with a legend and the category names on the x axis.
    """
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print("Read in finished")

    train_id = dataloader.id
    train_data = dataloader.data_retrieve(train_id)
    _, pos_id, neg_id = dataloader.balance(train_id, 'full')
    train_data_pos = dataloader.data_retrieve(pos_id)
    train_data_neg = dataloader.data_retrieve(neg_id)

    tokens = sum(train_data['data'].viewvalues(), [])
    tokens_pos = sum(train_data_pos['data'].viewvalues(), [])
    tokens_neg = sum(train_data_neg['data'].viewvalues(), [])

    ### Calculate LIWC hist
    LIWC = get_LIWC()
    LIWC_hist = LIWC.encode(tokens, normalize=False)
    LIWC_hist_pos = LIWC.encode(tokens_pos, normalize=True)
    LIWC_hist_neg = LIWC.encode(tokens_neg, normalize=True)

    # Bar positions follow the histogram length instead of a hard-coded
    # category count, so the plot keeps working if the lexicon changes.
    width = 0.3
    positions = np.arange(len(LIWC_hist))

    fig = plt.figure()

    ax = fig.add_subplot(2, 1, 1)
    ax.bar(positions + width, LIWC_hist, width)
    ax.set_ylabel('Frequency')
    ax.set_title('(a)')

    ax = fig.add_subplot(2, 1, 2)
    bar1 = ax.bar(positions + width, LIWC_hist_pos, width, color='r')
    bar2 = ax.bar(positions + 2 * width, LIWC_hist_neg, width, color='g')
    # A legend needs ax.legend(); ax.set_label() only names the Axes artist
    # itself and draws nothing.
    ax.legend((bar1[0], bar2[0]), ('Positive', 'Negative'))
    labels = list(LIWC.category.viewvalues())
    ax.set_xticks(positions + 2 * width)
    ax.set_xticklabels(labels, rotation='vertical')
    ax.set_xlabel('Category')
    ax.set_ylabel('Percentage')
    ax.set_title('(b)')
    ax.grid(True)

    plt.show()
def draw_gmm():
    """Print, for every GMM component mean, its 10 nearest words in word2vec space.

    A nearest-neighbour index is fitted on all word2vec vectors, saved to
    'output/draw_gmm_knn.pk', and queried with the component means of the
    cached clustering model.
    """
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print("Read in finished")

    ### Load pre-train word2vector model
    word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    print('Pretrained word2vec loaded')

    all_vectors = word2vec.model.syn0

    ### Train BoW
    # NOTE(review): n_topics is 25 but the model file name says 100 -- confirm
    # which of the two is intended.
    n_topics = 25
    model_file = 'output/clustering_gmm_100.pk'
    clusters = get_clusters(method='gmm', n_topics=n_topics)
    clusters.load(model_file)
    print(clusters.clusters.means_)

    knn = NearestNeighbors(n_neighbors=10)
    knn.fit(all_vectors)
    # Forward slash like every other path in this file; with a backslash the
    # pickle lands outside output/ on POSIX systems.
    save(knn, 'output/draw_gmm_knn.pk')

    nns = knn.kneighbors(clusters.clusters.means_, return_distance=False)
    index2word = word2vec.model.index2word
    for topic_no, neighbour_ids in enumerate(nns, 1):
        # Spacing reproduces the former chained `print x,` output exactly.
        words = ''.join(' ' + str(index2word[idx]) + ' ' for idx in neighbour_ids)
        print('Topic ' + str(topic_no) + ': ' + words + ' ')
def main(n_fold=10):
    """Cross-validate a LinearSVC on LDA + LIWC features and save the best fold's model.

    The parsed training CSV, the word-to-id dictionary and the LDA model are
    cached as pickles. For every fold the training ids are balanced, the
    positive class is oversampled with ``SMOTE``, a ``LinearSVC`` is fitted
    and its F1 score on the held-out fold is recorded. The classifier with
    the highest held-out F1 is saved under 'output/model_LDA_<n>_<f1>.pk'.

    Args:
        n_fold: Number of cross-validation folds.
    """
    import itertools  # function-scope import: only the flattening below needs it

    data_cache = 'output/data_cache.pk'
    word2id_file = 'output/word2id.pk'

    ### Load training data
    dataloader = csv_dataloader(extrafile='data/fixed_train_gender_class.csv', extra=True)
    if os.path.exists(data_cache):
        dataloader.load(data_cache)
    else:
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save(data_cache)
    dataloader.summary()
    print("Read in finished")

    ### Get word2id first
    # Linear-time flatten; sum(lists, []) is quadratic in the token count.
    tokens = list(itertools.chain.from_iterable(dataloader.ldata.viewvalues()))
    word2id = get_word2id()
    if os.path.exists(word2id_file):
        word2id.load(word2id_file)
    else:
        word2id.fit(tokens)
        word2id.save(word2id_file)
    ids = word2id.ids()
    print("#Id: " + str(len(ids.keys())))
    print('#Tokens from training data: ' + str(len(tokens)))

    ### Calculate LIWC hist
    LIWC = get_LIWC()

    ### Train and load LDA
    n_topics = 25
    model_file = 'output/lda_all_25.pk'
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    if os.path.exists(model_file):
        topics.load(model_file)
    else:
        print('Training LDA...')
        topics.fit(tokens)
        topics.save(model_file)
        topics.summary()

    ### ============================================================
    ###                         n fold
    ### ============================================================
    nfolds = dataloader.nfold(n_fold)
    fscores = []
    models = []
    for fold_ind in range(n_fold):
        print('======================== FOLD ' + str(fold_ind + 1) + '========================')
        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]

        ### ============================================================
        ###                         Train Part
        ### ============================================================
        print('Training>>>>>>>>>>>>>>>>>>>>>>>>>')
        train_data = dataloader.data_retrieve(train_id)

        ### Balance Train Data
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)

        encode_pos = encode_feature(train_data, train_pos_id, [topics, LIWC])
        # `//` keeps the integer division explicit.
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id) // 4)
        label_pos = np.ones(len(encode_pos))

        # The negative class must come from the negative ids; encoding the
        # positive ids here would give identical features both labels.
        encode_neg = encode_feature(train_data, train_neg_id, [topics, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print(encode.shape)
        print(label.shape)

        ### Train
        encode = preprocessing.scale(encode)
        classifier = svm.LinearSVC(verbose=True)
        classifier.fit(encode, label)
        print('F1 score: ' + str(f1_score(label, classifier.predict(encode))))

        ### ============================================================
        ###                         Test Part
        ### ============================================================
        print('Testing>>>>>>>>>>>>>>>>>>>>>>>>>')
        test_data = dataloader.data_retrieve(test_id)

        ### Generate Test Data Encodings
        encode = encode_feature(test_data, test_id, [topics, LIWC])
        label = dataloader.label_retrieve(test_id)

        ### Test
        # NOTE(review): the held-out fold is standardised on its own
        # statistics, not the training fold's -- confirm this is intended.
        encode = preprocessing.scale(encode)
        test_f1 = f1_score(label, classifier.predict(encode))
        print('F1 score: ' + str(test_f1))
        fscores.append(test_f1)
        models.append(classifier)

    best = np.argmax(fscores)
    print('MEAN F1 score: ' + str(np.mean(fscores)))
    print('BEST F1 score: ' + str(np.max(fscores)) + ' by Model ' + str(best + 1))
    print('VAR F1 score: ' + str(np.var(fscores)))

    save(models[best], 'output/model_LDA_' + str(n_topics) + '_' + str(fscores[best]) + '.pk')
def main(n_fold=10):
    """Cross-validate a LinearSVC on LLDA + LIWC features and save the best fold's model.

    The parsed training CSV and the LLDA model are cached as pickles. For
    every fold the training ids are balanced, the positive class is
    oversampled with ``SMOTE``, a ``LinearSVC`` is fitted and its F1 score on
    the held-out fold is recorded. The classifier with the highest held-out
    F1 is saved under 'output/model_LLDA_8_<f1>.pk'.

    Args:
        n_fold: Number of cross-validation folds.
    """
    data_cache = "output/data_cache.pk"

    ### Load training data
    dataloader = csv_dataloader(extrafile="data/fixed_train_gender_class.csv", extra=True)
    if os.path.exists(data_cache):
        dataloader.load(data_cache)
    else:
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save(data_cache)
    dataloader.summary()
    print("Read in finished")

    ### Calculate LIWC hist
    LIWC = get_LIWC()

    ### Train and Load LLDA
    topics = get_topics2(method="LLDA", max_iter=20, n_topics=8)
    model_file = "output/LLDA_8_20.pk"
    if os.path.exists(model_file):
        topics.load(model_file)
    else:
        train_id = dataloader.id
        train_data = dataloader.data_retrieve(train_id)
        # One document and one label per training id, in id order.
        corpus = [train_data["data"][sample_id] for sample_id in train_id]
        labels = [score2label(train_data["score"][sample_id]) for sample_id in train_id]
        print("Training LLDA...")
        topics.fit(corpus, labels, verbose=True)
        topics.save(model_file)
        print("Saved")
    topics.summary()

    ### ============================================================
    ###                         n fold
    ### ============================================================
    nfolds = dataloader.nfold(n_fold)
    fscores = []
    models = []
    for fold_ind in range(n_fold):
        print("======================== FOLD " + str(fold_ind + 1) + "========================")
        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]

        ### ============================================================
        ###                         Train Part
        ### ============================================================
        print("Training>>>>>>>>>>>>>>>>>>>>>>>>>")
        train_data = dataloader.data_retrieve(train_id)

        ### Balance Train Data
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)

        encode_pos = encode_feature(train_data, train_pos_id, [topics, LIWC])
        # `//` keeps the integer division explicit.
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id) // 4)
        label_pos = np.ones(len(encode_pos))

        # The negative class must come from the negative ids; encoding the
        # positive ids here would give identical features both labels.
        encode_neg = encode_feature(train_data, train_neg_id, [topics, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print(encode.shape)
        print(label.shape)

        ### Train
        encode = preprocessing.scale(encode)
        classifier = svm.LinearSVC(verbose=True)
        classifier.fit(encode, label)
        print("F1 score: " + str(f1_score(label, classifier.predict(encode))))

        ### ============================================================
        ###                         Test Part
        ### ============================================================
        print("Testing>>>>>>>>>>>>>>>>>>>>>>>>>")
        test_data = dataloader.data_retrieve(test_id)

        ### Generate Test Data Encodings
        encode = encode_feature(test_data, test_id, [topics, LIWC])
        label = dataloader.label_retrieve(test_id)

        ### Test
        # NOTE(review): the held-out fold is standardised on its own
        # statistics, not the training fold's -- confirm this is intended.
        encode = preprocessing.scale(encode)
        test_f1 = f1_score(label, classifier.predict(encode))
        print("F1 score: " + str(test_f1))
        fscores.append(test_f1)
        models.append(classifier)

    best = np.argmax(fscores)
    print("MEAN F1 score: " + str(np.mean(fscores)))
    print("BEST F1 score: " + str(np.max(fscores)) + " by Model " + str(best + 1))
    print("VAR F1 score: " + str(np.var(fscores)))

    save(models[best], "output/model_LLDA_8_" + str(fscores[best]) + ".pk")
def main(n_fold=10):
    """Cross-validate a LinearSVC on word2vec-cluster + LIWC features and save the best model.

    The parsed training CSV and the GMM clustering model are cached as
    pickles. For every fold the training ids are balanced, the positive class
    is oversampled with ``SMOTE``, a ``LinearSVC`` is fitted and its F1 score
    on the held-out fold is recorded. The classifier with the highest
    held-out F1 is saved under 'output/model_clustering_<f1>.pk'.

    Args:
        n_fold: Number of cross-validation folds.
    """
    import itertools  # function-scope import: only the flattening below needs it

    data_cache = 'output/data_cache.pk'

    ### Load data
    dataloader = csv_dataloader(extrafile='data/fixed_train_gender_class.csv', extra=True)
    if os.path.exists(data_cache):
        dataloader.load(data_cache)
    else:
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save(data_cache)
    dataloader.summary()
    print("Read in finished")

    ### Load pre-train word2vector model
    word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    print('Pretrained word2vec loaded')

    ### Train BoW
    n_topics = 25
    model_file = 'output/clustering_gmm_25.pk'
    clusters = get_clusters(method='gmm', n_topics=n_topics)
    if os.path.exists(model_file):
        print('Cluster Model Loaded...')
        clusters.load(model_file)
    else:
        ### Convert word to vector
        # Linear-time flatten; sum(lists, []) is quadratic in the token count.
        tokens = list(itertools.chain.from_iterable(dataloader.data.viewvalues()))
        print('#Tokens from training data: ' + str(len(tokens)))
        train_vectors = word2vec.encode(tokens)
        print('#Vectors from training data: ' + str(len(train_vectors)))
        print('Training Clusters...')
        clusters.fit(train_vectors)
        clusters.save(model_file)
        clusters.summary()

    ### Calculate LIWC hist
    LIWC = get_LIWC()

    ### ============================================================
    ###                         n fold
    ### ============================================================
    nfolds = dataloader.nfold(n_fold)
    fscores = []
    models = []
    for fold_ind in range(n_fold):
        print('======================== FOLD ' + str(fold_ind + 1) + '========================')
        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]

        ### ============================================================
        ###                         Train Part
        ### ============================================================
        print('Training>>>>>>>>>>>>>>>>>>>>>>>>>')
        train_data = dataloader.data_retrieve(train_id)

        ### Balance Train Data
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)

        encode_pos = encode_feature(train_data, train_pos_id, [word2vec, clusters, LIWC])
        # `//` keeps the integer division explicit.
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id) // 4)
        label_pos = np.ones(len(encode_pos))

        # The negative class must come from the negative ids; encoding the
        # positive ids here would give identical features both labels.
        encode_neg = encode_feature(train_data, train_neg_id, [word2vec, clusters, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print(encode.shape)
        print(label.shape)

        encode = preprocessing.scale(encode)
        classifier = svm.LinearSVC(verbose=True)
        classifier.fit(encode, label)
        print('F1 score: ' + str(f1_score(label, classifier.predict(encode))))

        ### ============================================================
        ###                         Test Part
        ### ============================================================
        print('Testing>>>>>>>>>>>>>>>>>>>>>>>>>')
        test_data = dataloader.data_retrieve(test_id)
        encode = encode_feature(test_data, test_id, [word2vec, clusters, LIWC])
        label = dataloader.label_retrieve(test_id)

        # NOTE(review): the held-out fold is standardised on its own
        # statistics, not the training fold's -- confirm this is intended.
        encode = preprocessing.scale(encode)
        test_f1 = f1_score(label, classifier.predict(encode))
        print('F1 score: ' + str(test_f1))
        fscores.append(test_f1)
        models.append(classifier)

    best = np.argmax(fscores)
    print('MEAN F1 score: ' + str(np.mean(fscores)))
    print('BEST F1 score: ' + str(np.max(fscores)) + ' by Model ' + str(best + 1))
    print('VAR F1 score: ' + str(np.var(fscores)))

    save(models[best], 'output/model_clustering_' + str(fscores[best]) + '.pk')