# NOTE(review): this chunk starts mid-script -- `set` (the dataset selector),
# the read_* loaders, extractSaveValues, load_numpy_matrix, comment_data_path
# and feature_set_path are all defined/imported earlier in the file.

# Select and load the raw comment dataset:
#   1 = News24 main set, 2 = News24 toy subset, 3 = Slashdot.
# Each branch sorts the comments chronologically, resets the index and groups
# them by thread root for downstream per-thread processing.
if set == 1:
    df_comments = read_news24_comments(comment_data_path + 'trainTestDataSet.txt')
    # Old pandas (<0.17) in-place sort API: DataFrame.sort, not sort_values
    df_comments.sort('date', inplace=True)
    df_comments.reset_index(inplace=True, drop=True)
    df_thread_groupby = df_comments.groupby('thread_root_id')
    set_tag = "_news24"
    tag = '_main'
elif set == 2:
    # Toy subset of the News24 data -- same set_tag, different file tag
    df_comments = read_toy_comments(comment_data_path + 'trainTestDataSet.txt',
                                    comment_data_path + 'toyComments.csv')
    df_comments.sort('date', inplace=True)
    df_comments.reset_index(inplace=True, drop=True)
    df_thread_groupby = df_comments.groupby('thread_root_id')
    set_tag = "_news24"
    tag = '_toy'
elif set == 3:
    df_comments = read_slashdot_comments(comment_data_path + 'slashdotDataSet_latest.txt')
    df_comments.sort('date', inplace=True)
    df_comments.reset_index(inplace=True, drop=True)
    df_thread_groupby = df_comments.groupby('thread_root_id')
    set_tag = "_slashdot"
    tag = '_slashdot'

# Python 2 print statements: show the loaded frame's shape and first rows
print df_comments.shape
print df_comments.head()

# Get values and split train-test
# Persist the per-comment target-value vector for this dataset, reload it,
# and build a single stratified 60/40 train/test split.
# NOTE(review): old sklearn cross_validation API -- labels first, n_iter=1.
extractSaveValues(df_comments, feature_set_path + "valueVector" + tag, set)
y = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '.npy')
sss = StratifiedShuffleSplit(y , 1, test_size=0.40, random_state=42)
'''
Created on 22 Jul 2015

@author: Dirk
'''
# FIX(review): the module docstring was missing its opening triple-quote,
# which makes the header a syntax error; restored to match the sibling script.
from collections import Counter
import re

from FeatureExtraction.mainExtractor import read_slashdot_comments,\
    read_comments, read_toy_comments
from nltk.tokenize import word_tokenize

from config import comment_data_path


if __name__ == '__main__':
    # Load the full Slashdot comment set; the alternative News24 / toy
    # loaders are kept commented out for manual switching.
    articleList, commentList, commentCount = read_slashdot_comments(comment_data_path + 'slashdotDataSet.txt', skip=False)
    #articleList, commentList, parList, commentCount = read_comments(comment_data_path + 'trainTestDataSet.txt', skip=True)
    #articleList, commentList, parList, commentCount = read_toy_comments(comment_data_path + 'trainTestDataSet.txt', comment_data_path + 'toyComments.csv')

    # Corpus statistics accumulators
    totalComms = 0
    totalWords = 0
    totalArt = 0
    numberAnon = 0
    y_values = []
    # commentList maps an article key to its list of comment objects
    # (art[0] = article key, art[1] = comments).
    for art in commentList.items():
        totalArt += 1
        for comm in art[1]:
            totalComms += 1
            # Slashdot's placeholder author for unauthenticated posters
            if comm.author.lower() == 'anonymous coward':
                numberAnon += 1
'''
Created on 22 Jul 2015

@author: Dirk
'''
from collections import Counter
import re

from FeatureExtraction.mainExtractor import read_slashdot_comments,\
    read_comments, read_toy_comments
from nltk.tokenize import word_tokenize

from config import comment_data_path


if __name__ == '__main__':
    # Load the full Slashdot comment set; the alternative News24 / toy
    # loaders are kept commented out for manual switching.
    articleList, commentList, commentCount = read_slashdot_comments(
        comment_data_path + 'slashdotDataSet.txt', skip=False)
    #articleList, commentList, parList, commentCount = read_comments(comment_data_path + 'trainTestDataSet.txt', skip=True)
    #articleList, commentList, parList, commentCount = read_toy_comments(comment_data_path + 'trainTestDataSet.txt', comment_data_path + 'toyComments.csv')

    # Corpus statistics accumulators (chunk likely continues past this view,
    # where these totals are presumably reported).
    totalComms = 0
    totalWords = 0
    totalArt = 0
    numberAnon = 0
    y_values = []
    # commentList maps an article key to its list of comment objects
    # (art[0] = article key, art[1] = comments).
    for art in commentList.items():
        totalArt += 1
        for comm in art[1]:
            totalComms += 1
            # Slashdot's placeholder author for unauthenticated posters
            if comm.author.lower() == 'anonymous coward':
                numberAnon += 1
            # Word count via NLTK tokenization of the comment body
            totalWords += len(word_tokenize(comm.body))
def extract_words(vectorizor, train_list, test_list):
    """Fit *vectorizor* on the training texts and transform both splits.

    Returns (train_matrix, test_matrix, feature_names). The test texts are
    only transformed, never fitted, so no vocabulary leaks from the test set.
    """
    count_vect = vectorizor.fit(train_list)
    train = count_vect.transform(train_list)
    test = count_vect.transform(test_list)
    # Old sklearn API; get_feature_names() was later renamed
    # get_feature_names_out()
    return train, test, count_vect.get_feature_names()


# Dataset selector: 1 = News24 main, 2 = News24 toy subset, 3 = Slashdot.
# NOTE(review): `set` shadows the builtin -- renaming would be safer, but the
# name may be referenced by code beyond this visible chunk.
set = 1

if __name__ == '__main__':
    if set == 1:
        articleList, commentList, parentList, commentCount = read_comments(comment_data_path + 'trainTestDataSet.txt', skip_mtn=False)
    elif set == 2:
        articleList, commentList, parentList, commentCount = read_toy_comments(comment_data_path + 'trainTestDataSet.txt', comment_data_path + 'toyComments.csv')
    elif set == 3:
        # Slashdot loader returns no parent list and caps at 100k comments
        articleList, commentList, commentCount = read_slashdot_comments(comment_data_path + 'slashdotDataSet.txt', limit=100000)

    # Values
    # Build the target vector and one stratified 5%/95% train/test split.
    # NOTE(review): old sklearn cross_validation API -- labels first, n_iter=1.
    y = extract_values(articleList, commentList, commentCount, set)
    sss = StratifiedShuffleSplit(y, 1, test_size=0.95, random_state=42)
    y_train = []
    y_test = []
    for train, test in sss:
        # Persist the index arrays so later runs reuse exactly the same split
        np.save('train_vect', train)
        np.save('test_vect', test)
        y_train = y[train]
        y_test = y[test]

    # Bag-of-words preprocessing over every comment, then reload the saved
    # split indices (chunk is truncated here -- train_list is filled below
    # this view).
    processed_comment_list = extract_global_bag_of_words_processed(commentList)
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
# setup() # To process all the comments if set == 1: articleList, commentList, parentList, commentCount = read_comments( comment_data_path + 'trainTestDataSet.txt') set_tag = "_news24" tag = '_main' elif set == 2: articleList, commentList, parentList, commentCount = read_toy_comments( comment_data_path + 'trainTestDataSet.txt', comment_data_path + 'toyComments.csv') set_tag = "_news24" tag = '_toy' elif set == 3: articleList, commentList, commentCount = read_slashdot_comments( comment_data_path + 'slashdotDataSet.txt', limit=100000) set_tag = "_slashdot" tag = '_slashdot' print "Processed", commentCount, "Comments" extractSaveValues(articleList, commentList, commentCount, feature_set_path + "valueVector" + tag, set) y = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '.npy') sss = StratifiedShuffleSplit(y, 1, test_size=0.40, random_state=42) for train, test in sss: print train np.save('train_vect', train) np.save('test_vect', test) y_train = y[train] y_test = y[test]