    # Transform both splits with the vocabulary fitted on the training data
    train = count_vect.transform(train_list)
    test = count_vect.transform(test_list)

    return train, test, count_vect.get_feature_names()
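This fragment never shows `count_vect` being constructed. For context, here is a minimal self-contained sketch of the same fit-on-train, transform-both pattern with scikit-learn's CountVectorizer (the function name is ours, not from the original project; on scikit-learn >= 1.0 the accessor is `get_feature_names_out()` rather than the `get_feature_names()` used above):

from sklearn.feature_extraction.text import CountVectorizer

def vectorize_bag_of_words(train_list, test_list):
    # Fit the vocabulary on the training documents only, then reuse it
    # for the test documents so both splits share one feature space.
    count_vect = CountVectorizer()
    train = count_vect.fit_transform(train_list)
    test = count_vect.transform(test_list)
    return train, test, count_vect.get_feature_names_out()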

# Which data set to load: 1 = full train/test set, 2 = toy comments, 3 = Slashdot
data_set = 1
if __name__ == '__main__':
    if data_set == 1:
        articleList, commentList, parentList, commentCount = read_comments(comment_data_path + 'trainTestDataSet.txt', skip_mtn=False)
    elif data_set == 2:
        articleList, commentList, parentList, commentCount = read_toy_comments(comment_data_path + 'trainTestDataSet.txt', comment_data_path + 'toyComments.csv')
    elif data_set == 3:
        articleList, commentList, commentCount = read_slashdot_comments(comment_data_path + 'slashdotDataSet.txt', limit=100000)

    # Target values for the chosen data set
    y = extract_values(articleList, commentList, commentCount, data_set)
    # Pre-0.18 sklearn.cross_validation API: labels and iteration count
    # go straight into the constructor
    sss = StratifiedShuffleSplit(y, 1, test_size=0.95, random_state=42)
    y_train = []
    y_test = []
    for train, test in sss:
        # Persist the split indices so later stages reuse the same split
        np.save('train_vect', train)
        np.save('test_vect', test)
        y_train = y[train]
        y_test = y[test]
    
    processed_comment_list = extract_global_bag_of_words_processed(commentList)
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(processed_comment_list[v])
    for v in test_v:
        test_list.append(processed_comment_list[v])
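The `StratifiedShuffleSplit(y, 1, ...)` constructor above is the pre-0.18 `sklearn.cross_validation` API. On current scikit-learn the class lives in `sklearn.model_selection`, takes `n_splits` instead of the labels, and yields index arrays from `split()`; a sketch of the equivalent split-and-save step under that assumption:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

def split_and_save(y, test_size=0.95):
    # y is assumed to be a 1-D numpy array of class labels, as above
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
    # split() requires an X argument; a zero placeholder is enough
    # when only the indices are needed
    train_idx, test_idx = next(sss.split(np.zeros(len(y)), y))
    np.save('train_vect', train_idx)
    np.save('test_vect', test_idx)
    return y[train_idx], y[test_idx]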
Example #2
def extractSaveValues(df_comments, filename, datatype):
    # Extract the target value vector and persist it with save_numpy_matrix
    valueVector = extract_values(df_comments, datatype)
    print "Extracted values"
    save_numpy_matrix(filename, valueVector)
Example #3
def extractSaveValues(articleList, commentList, commentCount, filename,
                      datatype):
    valueVector = extract_values(articleList, commentList, commentCount,
                                 datatype)
    print "Extracted values"
    save_numpy_matrix(filename, valueVector)
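Both variants of extractSaveValues follow the same extract-then-persist pattern; a hypothetical call site (the file name and datatype value here are illustrative, not from the original project):

# Cache the target vector for the full data set under a chosen name
extractSaveValues(articleList, commentList, commentCount,
                  'valueVector_full', datatype=1)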
Example #5
def extractSaveValues(articleList, commentList, commentCount, filename, datatype):
    valueVector = extract_values(articleList, commentList, commentCount, datatype)
    print "Extracted values"
    save_numpy_matrix(filename, valueVector) 

    train = count_vect.transform(train_list)
    test = count_vect.transform(test_list)

    #print count_vect.get_feature_names()[1000:1010]
    #print count_vect.get_feature_names()
    print "Train:", train.shape
    print "Test:", test.shape
    print count_vect.vocabulary_

    return train, test, count_vect.get_feature_names()
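`transform()` returns scipy.sparse matrices, which is why the shapes above print as (n_documents, n_vocabulary_terms). A small sanity check one could add, written as a helper over the variables this function already has:

from scipy import sparse

def check_bow_shapes(train, test, count_vect):
    # Both splits were transformed with the vocabulary fitted on the
    # training data, so their column counts must agree
    assert sparse.issparse(train) and sparse.issparse(test)
    assert train.shape[1] == test.shape[1] == len(count_vect.vocabulary_)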

if __name__ == '__main__':
    articleList, commentList, parentList, commentCount = read_comments(comment_data_path + 'trainTestDataSet.txt')
    
    # Use column 3 of the extracted value matrix as the prediction target
    y = extract_values(articleList, commentList, parentList, commentCount)[:, 3]
    sss = StratifiedShuffleSplit(y, 1, test_size=0.40, random_state=42)
    y_train = []
    y_test = []
    for train, test in sss:
        print train
        np.save('train_vect', train)
        np.save('test_vect', test)
        y_train = y[train]
        y_test = y[test]
    
    processed_comment_list = extract_global_bag_of_words_processed(commentList)
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(processed_comment_list[v])
    for v in test_v:
        test_list.append(processed_comment_list[v])
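The two append loops can be written more compactly as list comprehensions over the saved index arrays; a sketch, assuming `processed_comment_list` supports integer indexing as above:

train_list = [processed_comment_list[i] for i in train_v]
test_list = [processed_comment_list[i] for i in test_v]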