Example no. 1
def extractTopicModelData(articleList, commentList, commentCount, set_tag, tag):
    processed_comment_list = extract_global_bag_of_words_processed(commentList)       
    print len(processed_comment_list)
    
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(processed_comment_list[v])
    for v in test_v:
        test_list.append(processed_comment_list[v])    
        
    lda = models.LdaModel.load(model_path + set_tag.replace("_","") + "_lda_model")
    
    dictionary = corpora.Dictionary.load(model_path + set_tag.replace("_","") + "_dictionary")
    train = [dictionary.doc2bow(text) for text in train_list]
    test = [dictionary.doc2bow(text) for text in test_list]

    docTopicProbMat_train = lda[train]
    docTopicProbMat_test = lda[test]

    # corpus2dense requires the number of terms (here, the number of topics) as its second argument
    train_lda = matutils.corpus2dense(docTopicProbMat_train, num_terms=lda.num_topics)
    test_lda = matutils.corpus2dense(docTopicProbMat_test, num_terms=lda.num_topics)
      
    print train_lda.shape
    print test_lda.shape
    
    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_train", train_lda) 
    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_test", test_lda) 
    
    print "DONE LDA"
Example no. 2
def extractTopicModelData(articleList, commentList, commentCount, set_tag,
                          tag):
    processed_comment_list = extract_global_bag_of_words_processed(commentList)
    print len(processed_comment_list)

    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(processed_comment_list[v])
    for v in test_v:
        test_list.append(processed_comment_list[v])

    lda = models.LdaModel.load(model_path + set_tag.replace("_", "") +
                               "_lda_model")

    dictionary = corpora.Dictionary.load(model_path +
                                         set_tag.replace("_", "") +
                                         "_dictionary")
    train = [dictionary.doc2bow(text) for text in train_list]
    test = [dictionary.doc2bow(text) for text in test_list]

    docTopicProbMat_train = lda[train]
    docTopicProbMat_test = lda[test]

    # corpus2dense requires the number of terms (here, the number of topics)
    train_lda = matutils.corpus2dense(docTopicProbMat_train,
                                      num_terms=lda.num_topics)
    test_lda = matutils.corpus2dense(docTopicProbMat_test,
                                     num_terms=lda.num_topics)

    print train_lda.shape
    print test_lda.shape

    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_train",
                    train_lda)
    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_test",
                    test_lda)

    print "DONE LDA"
Example no. 3
def extractWordData(df_comments, tag):
    processed_comment_list = extract_global_bag_of_words_processed(df_comments)    
    print len(processed_comment_list)
    
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(processed_comment_list[v])
    for v in test_v:
        test_list.append(processed_comment_list[v])        
        
    
    # train_list = [' '.join(sent) for sent in train_list]  
    # test_list = [' '.join(sent) for sent in test_list]    

    print len(train_list)
    print len(test_list)
    
    print 'Unigram Binary'
    bwd_train, bwd_test = extract_words(CountVectorizer(analyzer=UnigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print 'Unigram Frequency'
    fwd_train, fwd_test = extract_words(CountVectorizer(analyzer=UnigramAnalyzer(), dtype=float), train_list, test_list)
    print 'Unigram TFIDF'
    twd_train, twd_test = extract_words(TfidfVectorizer(analyzer=UnigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    print 'Bigram Binary'
    bbwd_train, bbwd_test = extract_words(CountVectorizer(analyzer=UnigramBigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print 'Bigram TFIDF'
    btwd_train, btwd_test = extract_words(TfidfVectorizer(analyzer=UnigramBigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    print 'Trigram Binary'
    tbwd_train, tbwd_test = extract_words(CountVectorizer(analyzer=UnigramBigramTrigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print 'Trigram TFIDF'
    ttwd_train, ttwd_test = extract_words(TfidfVectorizer(analyzer=UnigramBigramTrigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    print 'Bigram Only Binary'
    bowd_train, bowd_test = extract_words(CountVectorizer(analyzer=BigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print 'Bigram Only TFIDF'
    bowd2_train, bowd2_test = extract_words(TfidfVectorizer(analyzer=BigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    print 'Trigram Only Binary'
    towd_train, towd_test = extract_words(CountVectorizer(analyzer=TrigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print 'Trigram Only TFIDF'
    towd2_train, towd2_test = extract_words(TfidfVectorizer(analyzer=TrigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
  
    print(feature_set_path + "binaryWordData_train", bwd_train[123, :]) 
    print(feature_set_path + "freqWordData_train", fwd_train[123, :]) 
    print(feature_set_path + "tfidfWordData_train", twd_train[123, :]) 
    print(feature_set_path + "bigramBinaryWordData_train", bbwd_train[123, :]) 
    print(feature_set_path + "bigramTfidfWordData_train", btwd_train[123, :]) 
    print(feature_set_path + "trigramBinaryWordData_train", tbwd_train[123, :]) 
    print(feature_set_path + "trigramTfidfWordData_train", ttwd_train[123, :]) 
    
    print(feature_set_path + "bigramOnlyBinaryWordData_train", bowd_train[123, :])
    print(feature_set_path + "bigramOnlyTfidfWordData_train", bowd2_train[123, :])
    print(feature_set_path + "trigramOnlyBinaryWordData_train", towd_train[123, :])
    print(feature_set_path + "trigramOnlyTfidfWordData_train", towd2_train[123, :])
    
   
    
    
    save_sparse_csr(feature_set_path + "binaryWordData" + tag + "_train", bwd_train) 
    save_sparse_csr(feature_set_path + "freqWordData" + tag + "_train", fwd_train) 
    save_sparse_csr(feature_set_path + "tfidfWordData" + tag + "_train", twd_train) 
    save_sparse_csr(feature_set_path + "bigramBinaryWordData" + tag + "_train", bbwd_train) 
    save_sparse_csr(feature_set_path + "bigramTfidfWordData" + tag + "_train", btwd_train) 
    save_sparse_csr(feature_set_path + "trigramBinaryWordData" + tag + "_train", tbwd_train) 
    save_sparse_csr(feature_set_path + "trigramTfidfWordData" + tag + "_train", ttwd_train)  
    
    save_sparse_csr(feature_set_path + "bigramOnlyBinaryWordData" + tag + "_train", bowd_train)
    save_sparse_csr(feature_set_path + "bigramOnlyTfidfWordData" + tag + "_train", bowd2_train)
    save_sparse_csr(feature_set_path + "trigramOnlyBinaryWordData" + tag + "_train", towd_train)
    save_sparse_csr(feature_set_path + "trigramOnlyTfidfWordData" + tag + "_train", towd2_train)
    
   
    save_sparse_csr(feature_set_path + "binaryWordData" + tag + "_test", bwd_test) 
    save_sparse_csr(feature_set_path + "freqWordData" + tag + "_test", fwd_test) 
    save_sparse_csr(feature_set_path + "tfidfWordData" + tag + "_test", twd_test) 
    save_sparse_csr(feature_set_path + "bigramBinaryWordData" + tag + "_test", bbwd_test) 
    save_sparse_csr(feature_set_path + "bigramTfidfWordData" + tag + "_test", btwd_test) 
    save_sparse_csr(feature_set_path + "trigramBinaryWordData" + tag + "_test", tbwd_test) 
    save_sparse_csr(feature_set_path + "trigramTfidfWordData" + tag + "_test", ttwd_test) 
    
    save_sparse_csr(feature_set_path + "bigramOnlyBinaryWordData" + tag + "_test", bowd_test)
    save_sparse_csr(feature_set_path + "bigramOnlyTfidfWordData" + tag + "_test", bowd2_test)
    save_sparse_csr(feature_set_path + "trigramOnlyBinaryWordData" + tag + "_test", towd_test)
    save_sparse_csr(feature_set_path + "trigramOnlyTfidfWordData" + tag + "_test", towd2_test)
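
The Unigram/Bigram/Trigram analyzers passed to CountVectorizer and TfidfVectorizer above are custom callables that are not shown on this page. scikit-learn accepts any callable that maps a document to a sequence of features, so a minimal sketch of their interface (an assumption, given that the documents in these examples appear to be token lists already) might look like:

class UnigramBigramAnalyzer(object):
    def __call__(self, doc):
        # doc is assumed to be a list of tokens already
        tokens = doc if isinstance(doc, list) else doc.split()
        bigrams = [' '.join(pair) for pair in zip(tokens, tokens[1:])]
        return tokens + bigrams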
Example no. 4
    # Fragment: this snippet begins partway through an if/elif chain over `set`.
    elif set == 2:  # branch value assumed for context
        articleList, commentList, parentList, commentCount = read_toy_comments(
            comment_data_path + 'trainTestDataSet.txt',
            comment_data_path + 'toyComments.csv')
    elif set == 3:
        articleList, commentList, commentCount = read_slashdot_comments(
            comment_data_path + 'slashdotDataSet.txt', limit=100000)

    # Values
    y = extract_values(articleList, commentList, commentCount, set)
    sss = StratifiedShuffleSplit(y, 1, test_size=0.95, random_state=42)
    y_train = []
    y_test = []
    for train, test in sss:
        np.save('train_vect', train)
        np.save('test_vect', test)
        y_train = y[train]
        y_test = y[test]

    processed_comment_list = extract_global_bag_of_words_processed(commentList)
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(processed_comment_list[v])
    for v in test_v:
        test_list.append(processed_comment_list[v])

    train, test, terms = extract_words(
        CountVectorizer(analyzer=UnigramAnalyzer(), dtype=float), train_list,
        test_list)
    print train.shape
    model = LDA()
    model.fit(train.toarray(), y_train)
    values = []
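
The fragment above uses the pre-0.18 scikit-learn interfaces: StratifiedShuffleSplit constructed with y and iterated directly (from sklearn.cross_validation) and LDA (from sklearn.lda). A rough modern equivalent, sketched with placeholder labels in place of the real extract_values output, would be:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

y = np.repeat([0, 1], 50)  # placeholder labels; the real y comes from extract_values(...)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.95, random_state=42)
for train, test in sss.split(np.zeros(len(y)), y):
    np.save('train_vect', train)
    np.save('test_vect', test)
    y_train, y_test = y[train], y[test]

model = LinearDiscriminantAnalysis()  # replaces the old LDA() class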