Example #1
  if set == 1:
      df_comments = read_news24_comments(comment_data_path + 'trainTestDataSet.txt')
      df_comments.sort('date', inplace=True)
      df_comments.reset_index(inplace=True, drop=True)
      df_thread_groupby = df_comments.groupby('thread_root_id')
      set_tag = "_news24"
      tag = '_main'
  elif set == 2:
      df_comments = read_toy_comments(comment_data_path + 'trainTestDataSet.txt', comment_data_path + 'toyComments.csv')
      df_comments.sort('date', inplace=True)
      df_comments.reset_index(inplace=True, drop=True)
      df_thread_groupby = df_comments.groupby('thread_root_id')
      set_tag = "_news24"
      tag = '_toy'
  elif set == 3:
      df_comments = read_slashdot_comments(comment_data_path + 'slashdotDataSet_latest.txt')
      df_comments.sort('date', inplace=True)
      df_comments.reset_index(inplace=True, drop=True)
      df_thread_groupby = df_comments.groupby('thread_root_id')
      set_tag = "_slashdot"
      tag = '_slashdot'
      
 
  print df_comments.shape
  print df_comments.head()         
  
  
  # Get values and split train-test
  extractSaveValues(df_comments, feature_set_path + "valueVector" + tag, set)
  y = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '.npy') 
  sss = StratifiedShuffleSplit(y, 1, test_size=0.40, random_state=42)
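Note that this snippet targets pandas < 0.20 (DataFrame.sort was later removed in favour of sort_values) and the pre-0.18 sklearn.cross_validation version of StratifiedShuffleSplit, which took y directly and was iterated over. A minimal sketch of the same two steps against the current APIs, assuming Python 3, reusing df_comments, feature_set_path, and tag as defined above, and substituting numpy.load for the repo's load_numpy_matrix helper:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Modern pandas: sort_values replaces the removed DataFrame.sort.
df_comments = df_comments.sort_values('date').reset_index(drop=True)

# Modern scikit-learn: configure the splitter, then call split(X, y).
y = np.load(feature_set_path + 'valueVector' + tag + '.npy')
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.40, random_state=42)
# Only the indices are needed, so a dummy X of the right length suffices.
train_idx, test_idx = next(sss.split(np.zeros(len(y)), y))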
Example #2
'''
Created on 22 Jul 2015

@author: Dirk
'''
from collections import Counter
import re

from FeatureExtraction.mainExtractor import read_slashdot_comments,\
    read_comments, read_toy_comments
from nltk.tokenize import word_tokenize

from config import comment_data_path


if __name__ == '__main__':
    articleList, commentList, commentCount = read_slashdot_comments(comment_data_path + 'slashdotDataSet.txt', skip=False)
    #articleList, commentList, parList, commentCount = read_comments(comment_data_path + 'trainTestDataSet.txt', skip=True)
    #articleList, commentList, parList, commentCount = read_toy_comments(comment_data_path + 'trainTestDataSet.txt', comment_data_path + 'toyComments.csv')
    
    
    totalComms = 0
    totalWords = 0
    totalArt = 0
    numberAnon = 0
    y_values = []
    for art in commentList.items():        
        totalArt += 1
        for comm in art[1]:
            totalComms += 1
            if comm.author.lower() == 'anonymous coward':
                numberAnon += 1
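The listing is cut off inside the loop; a minimal sketch of the summary it appears to be building toward, using only the counters the example already defines (the exact report format is an assumption):

    # After the loop: corpus-level statistics.
    print 'Articles:', totalArt
    print 'Comments:', totalComms
    print 'Anonymous comments: %d (%.1f%%)' % (numberAnon, 100.0 * numberAnon / max(totalComms, 1))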
Example #3
'''
Created on 22 Jul 2015

@author: Dirk
'''
from collections import Counter
import re

from FeatureExtraction.mainExtractor import read_slashdot_comments,\
    read_comments, read_toy_comments
from nltk.tokenize import word_tokenize

from config import comment_data_path

if __name__ == '__main__':
    articleList, commentList, commentCount = read_slashdot_comments(
        comment_data_path + 'slashdotDataSet.txt', skip=False)
    #articleList, commentList, parList, commentCount = read_comments(comment_data_path + 'trainTestDataSet.txt', skip=True)
    #articleList, commentList, parList, commentCount = read_toy_comments(comment_data_path + 'trainTestDataSet.txt', comment_data_path + 'toyComments.csv')

    totalComms = 0
    totalWords = 0
    totalArt = 0
    numberAnon = 0
    y_values = []
    for art in commentList.items():  # each item: (article id, list of comment objects)
        totalArt += 1
        for comm in art[1]:
            totalComms += 1
            if comm.author.lower() == 'anonymous coward':  # Slashdot's label for unregistered posters
                numberAnon += 1
            totalWords += len(word_tokenize(comm.body))
Example #4
def extract_words(vectorizor, train_list, test_list):
    # Fit the vectorizer on the training documents only (no test-set leakage),
    # then transform both splits with the fitted vocabulary.
    count_vect = vectorizor.fit(train_list)
    train = count_vect.transform(train_list)
    test = count_vect.transform(test_list)

    return train, test, count_vect.get_feature_names()

set = 1  # dataset selector: 1 = News24 main, 2 = toy, 3 = Slashdot (note: shadows the built-in 'set')
if __name__ == '__main__':
    if set == 1:
        articleList, commentList, parentList, commentCount = read_comments(comment_data_path + 'trainTestDataSet.txt', skip_mtn=False)
    elif set == 2:
        articleList, commentList, parentList, commentCount = read_toy_comments(comment_data_path + 'trainTestDataSet.txt', comment_data_path + 'toyComments.csv')
    elif set == 3:
        articleList, commentList, commentCount = read_slashdot_comments(comment_data_path + 'slashdotDataSet.txt', limit=100000)
    
    # Values
    y = extract_values(articleList, commentList, commentCount, set)
    sss = StratifiedShuffleSplit(y, 1, test_size=0.95, random_state=42)
    y_train = []
    y_test = []
    for train, test in sss:
        np.save('train_vect', train)
        np.save('test_vect', test)
        y_train = y[train]
        y_test = y[test]
    
    processed_comment_list = extract_global_bag_of_words_processed(commentList)  
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
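The example breaks off while assembling train_list; a plausible continuation, assuming train_v and test_v are index arrays into processed_comment_list and that extract_words is handed a scikit-learn CountVectorizer (the max_features value is illustrative):

    from sklearn.feature_extraction.text import CountVectorizer

    train_list = [processed_comment_list[i] for i in train_v]
    test_list = [processed_comment_list[i] for i in test_v]

    # The vocabulary is fitted on the training split only (see extract_words above).
    train_bow, test_bow, vocab = extract_words(
        CountVectorizer(max_features=5000), train_list, test_list)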
Example #5
    # setup()

    # To process all the comments
    if set == 1:
        articleList, commentList, parentList, commentCount = read_comments(
            comment_data_path + 'trainTestDataSet.txt')
        set_tag = "_news24"
        tag = '_main'
    elif set == 2:
        articleList, commentList, parentList, commentCount = read_toy_comments(
            comment_data_path + 'trainTestDataSet.txt',
            comment_data_path + 'toyComments.csv')
        set_tag = "_news24"
        tag = '_toy'
    elif set == 3:
        articleList, commentList, commentCount = read_slashdot_comments(
            comment_data_path + 'slashdotDataSet.txt', limit=100000)
        set_tag = "_slashdot"
        tag = '_slashdot'

    print "Processed", commentCount, "Comments"

    extractSaveValues(articleList, commentList, commentCount,
                      feature_set_path + "valueVector" + tag, set)
    y = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '.npy')
    sss = StratifiedShuffleSplit(y, 1, test_size=0.40, random_state=42)
    for train, test in sss:
        print train
        np.save('train_vect', train)
        np.save('test_vect', test)
        y_train = y[train]
        y_test = y[test]
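With stratified sampling and test_size=0.40, each class in y should split roughly 60/40 between the two index sets; a quick sanity check using only names already in scope (Counter comes from the standard library):

    from collections import Counter

    train_counts = Counter(y_train)
    test_counts = Counter(y_test)
    for label in sorted(train_counts):
        total = train_counts[label] + test_counts.get(label, 0)
        # Each class should sit close to the 0.60 train fraction.
        print label, train_counts[label] / float(total)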