import os

import numpy as np
from sklearn import preprocessing, svm
from sklearn.metrics import f1_score

# csv_dataloader, preprocess, get_word2id, get_topics, get_LIWC, get_word2vec,
# get_clusters, encode_feature, SMOTE, save and load are project-local helpers
# assumed to be importable alongside this script.


def train_LDA(n_topics=100):
    ### Load extra data
    dataloader = csv_dataloader(datafile='data/extra_statuses.csv')
    if not os.path.exists('output/extra_cache.pk'):
        dataloader.read_csv(applyfun=preprocess, verbose=True)
        dataloader.save('output/extra_cache.pk')
    else:
        dataloader.load('output/extra_cache.pk')
    tokens = sum(dataloader.ldata.viewvalues(), [])
    print '#Tokens from training data: ' + str(len(tokens))
    print 'Read in done'

    ### Get word2id first
    word2id = get_word2id()
    if not os.path.exists('output/word2id.pk'):
        word2id.fit(tokens)
        word2id.save('output/word2id.pk')
    else:
        word2id.load('output/word2id.pk')
    ids = word2id.ids()
    print '#Id: ' + str(len(ids))

    ### Train LDA
    model_file = 'output/lda_all_' + str(n_topics) + '.pk'
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    if not os.path.exists(model_file):
        print 'Training LDA...'
        topics.fit(tokens)
        topics.save(model_file)
        topics.summary()
    else:
        topics.load(model_file)
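
# get_word2id() above returns a project-local vocabulary object. A minimal
# sketch of the interface this script relies on (fit / ids / save / load),
# assuming a simple token -> integer id mapping; the real class may differ.
import pickle


class Word2IdSketch(object):
    def __init__(self):
        self.vocab = {}

    def fit(self, tokens):
        # Assign a fresh id to each unseen token, in order of first appearance.
        for t in tokens:
            if t not in self.vocab:
                self.vocab[t] = len(self.vocab)

    def ids(self):
        # get_topics() takes id2word=ids, so expose the inverse id -> token map.
        return dict((i, t) for t, i in self.vocab.iteritems())

    def save(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.vocab, f)

    def load(self, filename):
        with open(filename, 'rb') as f:
            self.vocab = pickle.load(f)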
def predict():
    dataloader = csv_dataloader(datafile='data/test_statuses.csv',
                                extrafile='data/test_metadata.csv',
                                nolabel=True, extra=True)
    if not os.path.exists('output/test_cache.pk'):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save('output/test_cache.pk')
    else:
        dataloader.load('output/test_cache.pk')
    dataloader.summary()
    print 'Read in finished'

    word2id = get_word2id()
    word2id.load('output/word2id.pk')
    ids = word2id.ids()

    n_topics = 100
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    topics.load('output/lda_all_' + str(n_topics) + '.pk')

    # ### Load pre-trained word2vec model
    # word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin',
    #                         binary=True, size=300)
    # print 'Pretrained word2vec loaded'
    #
    # model_file = 'output/clustering_gmm_100.pk'
    # clusters = get_clusters(method='gmm', n_topics=n_topics)
    # clusters.load(model_file)
    # print 'Cluster model loaded'

    ### Calculate LIWC hist
    LIWC = get_LIWC()

    test_id = dataloader.id
    test_data = dataloader.data_retrieve(test_id)

    ### Generate test data encodings
    encode = encode_feature(test_data, test_id, [topics, LIWC])
    encode = preprocessing.scale(encode)

    # NOTE: the loaded classifier must implement predict_proba(); sklearn's
    # LinearSVC (used in main()) does not, so the model saved under this name
    # is assumed to be probability-capable (e.g. SVC(probability=True)).
    classifier = load('output/model_LDA_100_0.461538461538.pk')
    predict_label = classifier.predict(encode)
    predict_prob = classifier.predict_proba(encode)

    with open('output/result.csv', 'w+') as f:
        f.write('userID, binaryPrediction, confidence, regression\n')
        for i in range(len(predict_label)):
            line = test_id[i] + ', '
            line += '+, ' if predict_label[i] == 1 else '-, '
            line += str(predict_prob[i][1]) + ', '
            line += 'N\n'
            f.write(line)

    print str(sum(predict_label)) + '/' + str(len(predict_label))
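
# save() / load() are used as free functions (load() above, save() at the end
# of main()) but are defined elsewhere in the project. A minimal sketch,
# assuming they are plain pickle wrappers; the real helpers may add
# compression or error handling. Named *_sketch to avoid shadowing them.
import pickle


def save_sketch(obj, filename):
    # Serialize any picklable object (e.g. a fitted classifier) to disk.
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_sketch(filename):
    # Restore the object written by save_sketch().
    with open(filename, 'rb') as f:
        return pickle.load(f)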
def show_topics(n_topics=100):
    ### Get word2id first
    word2id = get_word2id()
    word2id.load('output/word2id.pk')
    ids = word2id.ids()
    print '#Id: ' + str(len(ids))

    ### Show LDA topics
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    topics.load('output/lda_all_' + str(n_topics) + '.pk')
    topics.summary(n_topics=n_topics)
def main(n_fold=10):
    ### Load training data
    dataloader = csv_dataloader(extrafile='data/fixed_train_gender_class.csv', extra=True)
    if not os.path.exists('output/data_cache.pk'):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save('output/data_cache.pk')
    else:
        dataloader.load('output/data_cache.pk')
    dataloader.summary()
    print 'Read in finished'

    ### Get word2id first
    tokens = sum(dataloader.ldata.viewvalues(), [])
    word2id = get_word2id()
    if not os.path.exists('output/word2id.pk'):
        word2id.fit(tokens)
        word2id.save('output/word2id.pk')
    else:
        word2id.load('output/word2id.pk')
    ids = word2id.ids()
    print '#Id: ' + str(len(ids))
    print '#Tokens from training data: ' + str(len(tokens))

    ### Calculate LIWC hist
    LIWC = get_LIWC()
    # print LIWC.calculate_hist(tokens, normalize=False)

    ### Train or load LDA
    n_topics = 25
    model_file = 'output/lda_all_' + str(n_topics) + '.pk'
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    if not os.path.exists(model_file):
        print 'Training LDA...'
        topics.fit(tokens)
        topics.save(model_file)
        topics.summary()
    else:
        topics.load(model_file)

    ### ============================================================
    ### n-fold cross-validation
    ### ============================================================
    nfolds = dataloader.nfold(n_fold)
    fscores = []
    models = []
    for fold_ind in range(n_fold):
        print '======================== FOLD ' + str(fold_ind + 1) + ' ========================'
        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]

        ### ============================================================
        ### Train part
        ### ============================================================
        print 'Training>>>>>>>>>>>>>>>>>>>>>>>>>'
        train_data = dataloader.data_retrieve(train_id)

        ### Balance train data, then oversample the positive class with SMOTE
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)
        encode_pos = encode_feature(train_data, train_pos_id, [topics, LIWC])
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id) / 4)
        label_pos = np.ones(len(encode_pos))
        encode_neg = encode_feature(train_data, train_neg_id, [topics, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print encode.shape
        print label.shape

        ### Train
        encode = preprocessing.scale(encode)
        classifier = svm.LinearSVC(verbose=True)
        classifier.fit(encode, label)
        print 'Train F1 score: ' + str(f1_score(label, classifier.predict(encode)))

        ### ============================================================
        ### Test part
        ### ============================================================
        print 'Testing>>>>>>>>>>>>>>>>>>>>>>>>>'
        test_data = dataloader.data_retrieve(test_id)

        ### Generate test data encodings
        encode = encode_feature(test_data, test_id, [topics, LIWC])
        label = dataloader.label_retrieve(test_id)

        ### Test
        encode = preprocessing.scale(encode)
        fscore = f1_score(label, classifier.predict(encode))
        print 'Test F1 score: ' + str(fscore)
        fscores.append(fscore)
        models.append(classifier)

    print 'MEAN F1 score: ' + str(np.mean(fscores))
    print 'BEST F1 score: ' + str(np.max(fscores)) + ' by Model ' + str(np.argmax(fscores) + 1)
    print 'VAR F1 score: ' + str(np.var(fscores))
    save(models[np.argmax(fscores)],
         'output/model_LDA_' + str(n_topics) + '_' + str(np.max(fscores)) + '.pk')
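
# Hypothetical end-to-end driver (an assumption; the project may invoke these
# steps separately). Order matters: main() and predict() read artifacts that
# train_LDA() writes, and predict() additionally loads a classifier saved by a
# previous run of main() under a hardcoded, score-stamped filename.
if __name__ == '__main__':
    train_LDA(n_topics=100)    # fit or load output/lda_all_100.pk and word2id
    show_topics(n_topics=100)  # print the learned topics for inspection
    main(n_fold=10)            # 10-fold CV; saves the best-scoring classifier
    predict()                  # label the test set into output/result.csv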