import numpy as np


def extract_words(vectorizor, train_list, test_list):
    """Fit *vectorizor* on the training documents and transform both splits.

    Parameters
    ----------
    vectorizor : object
        Any vectorizer exposing ``fit``/``transform``/``get_feature_names``
        (e.g. a sklearn ``CountVectorizer``).
    train_list, test_list : sequence of str
        Raw training / test documents.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix, feature_names)`` — the training split
        transformed by the vocabulary learned on it, the test split
        transformed by that same vocabulary, and the vocabulary itself.
    """
    count_vect = vectorizor.fit(train_list)
    train = count_vect.transform(train_list)
    test = count_vect.transform(test_list)
    return train, test, count_vect.get_feature_names()


# NOTE(review): `set` shadows the builtin; kept because the rest of the
# project references this name -- TODO rename to e.g. `data_set_id`.
set = 1

if __name__ == '__main__':
    # Load the comment corpus selected by `set`:
    #   1 = full News24 train/test set, 2 = toy subset, 3 = Slashdot dump.
    if set == 1:
        articleList, commentList, parentList, commentCount = read_comments(
            comment_data_path + 'trainTestDataSet.txt', skip_mtn=False)
    elif set == 2:
        articleList, commentList, parentList, commentCount = read_toy_comments(
            comment_data_path + 'trainTestDataSet.txt',
            comment_data_path + 'toyComments.csv')
    elif set == 3:
        articleList, commentList, commentCount = read_slashdot_comments(
            comment_data_path + 'slashdotDataSet.txt', limit=100000)

    # Values
    y = extract_values(articleList, commentList, commentCount, set)

    # Single stratified 5%/95% split; the loop runs exactly once (n_iter=1).
    sss = StratifiedShuffleSplit(y, 1, test_size=0.95, random_state=42)
    y_train = []
    y_test = []
    for train, test in sss:
        # Persist the index vectors so later stages reuse the same split.
        np.save('train_vect', train)
        np.save('test_vect', test)
        y_train = y[train]
        y_test = y[test]

    processed_comment_list = extract_global_bag_of_words_processed(commentList)
# NOTE(review): `set` shadows the builtin; kept because the rest of the
# project references this name -- TODO rename to e.g. `data_set_id`.
set = 3

if __name__ == '__main__':
    # py2/py3-compatible call form; output identical to `print 'START'`.
    print('START')

    # To process all the comments: select the data set by `set`
    # (1 = News24, 2 = toy subset, 3 = Slashdot).
    if set == 1:
        df_comments = read_news24_comments(
            comment_data_path + 'trainTestDataSet.txt')
        set_tag = "_news24"
        tag = '_main'
    elif set == 2:
        df_comments = read_toy_comments(
            comment_data_path + 'trainTestDataSet.txt',
            comment_data_path + 'toyComments.csv')
        set_tag = "_news24"
        tag = '_toy'
    elif set == 3:
        df_comments = read_slashdot_comments(
            comment_data_path + 'slashdotDataSet_latest.txt')
        set_tag = "_slashdot"
        tag = '_slashdot'

    # Post-processing is identical for every data set (was duplicated in
    # each branch above): order by date, renumber rows, group by thread.
    # NOTE(review): DataFrame.sort() was deprecated in pandas 0.17 and
    # removed in 0.20 -- on modern pandas use
    # df_comments.sort_values('date', inplace=True).
    df_comments.sort('date', inplace=True)
    df_comments.reset_index(inplace=True, drop=True)
    df_thread_groupby = df_comments.groupby('thread_root_id')

    print(df_comments.shape)
# NOTE(review): `set` shadows the builtin; kept because the rest of the
# project references this name -- TODO rename to e.g. `data_set_id`.
set = 3

if __name__ == '__main__':
    # py2/py3-compatible call form; output identical to `print 'START'`.
    print('START')
    # setup()

    # To process all the comments: select the data set by `set`
    # (1 = full News24 train/test set, 2 = toy subset, 3 = Slashdot dump).
    if set == 1:
        articleList, commentList, parentList, commentCount = read_comments(
            comment_data_path + 'trainTestDataSet.txt')
        set_tag = "_news24"
        tag = '_main'
    elif set == 2:
        articleList, commentList, parentList, commentCount = read_toy_comments(
            comment_data_path + 'trainTestDataSet.txt',
            comment_data_path + 'toyComments.csv')
        set_tag = "_news24"
        tag = '_toy'
    elif set == 3:
        articleList, commentList, commentCount = read_slashdot_comments(
            comment_data_path + 'slashdotDataSet.txt', limit=100000)
        set_tag = "_slashdot"
        tag = '_slashdot'

    # %s-formatting reproduces the py2 `print a, b, c` output byte-for-byte
    # while also being valid Python 3.
    print('Processed %s Comments' % commentCount)

    # Persist the target vector, then reload it for the split below.
    extractSaveValues(articleList, commentList, commentCount,
                      feature_set_path + "valueVector" + tag, set)
    y = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '.npy')

    # Single stratified 60%/40% split, seeded for reproducibility.
    sss = StratifiedShuffleSplit(y, 1, test_size=0.40, random_state=42)