import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit  # pre-0.18 scikit-learn API
# read_comments, read_toy_comments, read_slashdot_comments, extract_values and
# extract_global_bag_of_words_processed are project-local helpers (imports not shown).

# Tail of the bag-of-words feature extractor (the function head is not shown
# in this fragment):
    train = count_vect.transform(train_list)
    test = count_vect.transform(test_list)
    return train, test, count_vect.get_feature_names()


set = 1  # 1 = full comment set, 2 = toy set, 3 = Slashdot set

if __name__ == '__main__':
    if set == 1:
        articleList, commentList, parentList, commentCount = read_comments(
            comment_data_path + 'trainTestDataSet.txt', skip_mtn=False)
    elif set == 2:
        articleList, commentList, parentList, commentCount = read_toy_comments(
            comment_data_path + 'trainTestDataSet.txt',
            comment_data_path + 'toyComments.csv')
    elif set == 3:
        articleList, commentList, commentCount = read_slashdot_comments(
            comment_data_path + 'slashdotDataSet.txt', limit=100000)

    # Values
    y = extract_values(articleList, commentList, commentCount, set)

    # Pre-0.18 splitter: constructed from y and iterated directly, yielding
    # (train_indices, test_indices) pairs. The indices are persisted so the
    # same split can be reused across runs.
    sss = StratifiedShuffleSplit(y, 1, test_size=0.95, random_state=42)
    y_train = []
    y_test = []
    for train, test in sss:
        np.save('train_vect', train)
        np.save('test_vect', test)
        y_train = y[train]
        y_test = y[test]

    processed_comment_list = extract_global_bag_of_words_processed(commentList)
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(processed_comment_list[v])
    for v in test_v:
        test_list.append(processed_comment_list[v])
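# The split above uses scikit-learn's pre-0.18 cross_validation API, where the
# splitter is built from y and iterated directly. A minimal self-contained
# sketch of the same 5%/95% stratified split under the current
# model_selection API (an equivalent rewrite for reference, not part of the
# original code; the toy labels are made up):

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

y = np.array([0, 0, 0, 1, 1, 1] * 10)  # toy labels standing in for extract_values output
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.95, random_state=42)
train, test = next(sss.split(np.zeros(len(y)), y))  # X is only used for its sample count
y_train, y_test = y[train], y[test]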
def extractSaveValues(df_comments, filename, datatype):
    valueVector = extract_values(df_comments, datatype)
    print "Extracted values"
    save_numpy_matrix(filename, valueVector)
# Second variant of extractSaveValues, taking the separate article/comment
# structures produced by the readers above rather than a single DataFrame:
def extractSaveValues(articleList, commentList, commentCount, filename, datatype):
    valueVector = extract_values(articleList, commentList, commentCount, datatype)
    print "Extracted values"
    save_numpy_matrix(filename, valueVector)
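# save_numpy_matrix is a project-local helper whose definition is not shown in
# this fragment. A minimal sketch of what it presumably does, assuming it is a
# thin wrapper around numpy's .npy serialization (hypothetical, for
# illustration only):

import numpy as np

def save_numpy_matrix(filename, matrix):
    # Persist the matrix to '<filename>.npy' so np.load() can restore it.
    np.save(filename, matrix)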
# Tail of an earlier variant of the bag-of-words feature extractor, with
# debugging output left in:
    test = count_vect.transform(test_list)
    # print count_vect.get_feature_names()[1000:1010]
    # print count_vect.get_feature_names()
    print "Train:", train.shape
    print "Test:", test.shape
    print count_vect.vocabulary_
    return train, test, count_vect.get_feature_names()


if __name__ == '__main__':
    articleList, commentList, parentList, commentCount = read_comments(
        comment_data_path + 'trainTestDataSet.txt')

    # Values: column 3 of the extracted value matrix is used as the target.
    y = extract_values(articleList, commentList, parentList, commentCount)[:, 3]
    sss = StratifiedShuffleSplit(y, 1, test_size=0.40, random_state=42)
    y_train = []
    y_test = []
    for train, test in sss:
        print train
        np.save('train_vect', train)
        np.save('test_vect', test)
        y_train = y[train]
        y_test = y[test]

    processed_comment_list = extract_global_bag_of_words_processed(commentList)
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(processed_comment_list[v])
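# Only the tail of the feature extractor appears above. A minimal sketch of
# the missing head, assuming scikit-learn's CountVectorizer with default
# settings (the function name and vectorizer options are guesses; the original
# definition is not shown):

from sklearn.feature_extraction.text import CountVectorizer

def extract_global_bag_of_words(train_list, test_list):
    # Fit the vocabulary on the training comments only, then map both splits
    # into the shared bag-of-words space.
    count_vect = CountVectorizer()
    train = count_vect.fit_transform(train_list)
    test = count_vect.transform(test_list)
    return train, test, count_vect.get_feature_names()

# Usage against the lists built in the main block above:
#     train, test, feature_names = extract_global_bag_of_words(train_list, test_list)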