Example #1
def extractCharacterData(df_comments, tag):
    # Split the global comment list into train/test partitions using the
    # index vectors saved by an earlier step.
    comment_list = extract_global_bag_of_words(df_comments)
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = [comment_list[v] for v in train_v]
    test_list = [comment_list[v] for v in test_v]

    print(len(train_list))
    print(len(test_list))

    print('Character Ngrams Binary')
    cb_train, cb_test = extract_words(CountVectorizer(analyzer=CharacterAnalyzer(), binary=True, dtype=float), train_list, test_list)
    save_sparse_csr(feature_set_path + "binaryCharacterData" + tag + "_train", cb_train)
    save_sparse_csr(feature_set_path + "binaryCharacterData" + tag + "_test", cb_test)

    print('Character Ngrams TFIDF')
    ct_train, ct_test = extract_words(TfidfVectorizer(analyzer=CharacterAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    save_sparse_csr(feature_set_path + "tfidfCharacterData" + tag + "_train", ct_train)
    save_sparse_csr(feature_set_path + "tfidfCharacterData" + tag + "_test", ct_test)

    print('Character Skipgrams Binary')
    sb_train, sb_test = extract_words(CountVectorizer(analyzer=CharacterSkipGramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    save_sparse_csr(feature_set_path + "binaryCharacterSkipgramData" + tag + "_train", sb_train)
    save_sparse_csr(feature_set_path + "binaryCharacterSkipgramData" + tag + "_test", sb_test)

    print('Character Skipgrams TFIDF')
    st_train, st_test = extract_words(TfidfVectorizer(analyzer=CharacterSkipGramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    save_sparse_csr(feature_set_path + "tfidfCharacterSkipgramData" + tag + "_train", st_train)
    save_sparse_csr(feature_set_path + "tfidfCharacterSkipgramData" + tag + "_test", st_test)
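The helper extract_words is not defined in these snippets. Judging by its call sites (a vectorizer plus the train and test lists, returning a train/test matrix pair), it presumably fits the vocabulary on the training split and reuses it for the test split. A minimal sketch under that assumption:

def extract_words(vectorizer, train_list, test_list):
    # Learn the vocabulary from the training comments only, then map the
    # test comments into the same feature space.
    train_matrix = vectorizer.fit_transform(train_list)
    test_matrix = vectorizer.transform(test_list)
    return train_matrix, test_matrix

Fitting on the training split alone keeps test-set vocabulary from leaking into the learned features.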
Example #2
def extractLexicalBigramData(articleList, commentList, commentCount):
    # articleList and commentCount are unused here; the train/test split
    # comes from the saved index vectors alone.
    comment_list = extract_global_bag_of_words(commentList)
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = [comment_list[v] for v in train_v]
    test_list = [comment_list[v] for v in test_v]

    print(len(train_list))
    print(len(test_list))

    print('Lexical Ngrams Binary')
    cb_train, cb_test = extract_words(
        CountVectorizer(analyzer=LexicalBigramUnigramAnalyzer(),
                        binary=True,
                        dtype=float), train_list, test_list)
    save_sparse_csr(feature_set_path + "binaryLexicalBigramsData_train",
                    cb_train)
    save_sparse_csr(feature_set_path + "binaryLexicalBigramsData_test",
                    cb_test)

    print('Lexical Ngrams TFIDF')
    ct_train, ct_test = extract_words(
        TfidfVectorizer(analyzer=LexicalBigramUnigramAnalyzer(),
                        use_idf=True,
                        smooth_idf=True,
                        dtype=float), train_list, test_list)
    save_sparse_csr(feature_set_path + "tfidfLexicalBigramsData_train",
                    ct_train)
    save_sparse_csr(feature_set_path + "tfidfLexicalBigramsData_test", ct_test)
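save_sparse_csr is likewise external to these snippets. A common recipe, which these examples presumably follow, stores the three CSR component arrays plus the shape in a single .npz archive; a matching loader rebuilds the matrix:

import numpy as np
from scipy.sparse import csr_matrix

def save_sparse_csr(filename, matrix):
    # np.savez appends '.npz' to the filename automatically.
    np.savez(filename, data=matrix.data, indices=matrix.indices,
             indptr=matrix.indptr, shape=matrix.shape)

def load_sparse_csr(filename):
    loader = np.load(filename + '.npz')
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])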
Example #3
def extractWordData(df_comments, tag):
    processed_comment_list = extract_global_bag_of_words_processed(df_comments)
    print(len(processed_comment_list))

    # Split into train/test partitions via the saved index vectors.
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = [processed_comment_list[v] for v in train_v]
    test_list = [processed_comment_list[v] for v in test_v]

    # train_list = [' '.join(sent) for sent in train_list]
    # test_list = [' '.join(sent) for sent in test_list]

    print(len(train_list))
    print(len(test_list))
    
    print('Unigram Binary')
    bwd_train, bwd_test = extract_words(CountVectorizer(analyzer=UnigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print('Unigram Frequency')
    fwd_train, fwd_test = extract_words(CountVectorizer(analyzer=UnigramAnalyzer(), dtype=float), train_list, test_list)
    print('Unigram TFIDF')
    twd_train, twd_test = extract_words(TfidfVectorizer(analyzer=UnigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    print('Bigram Binary')
    bbwd_train, bbwd_test = extract_words(CountVectorizer(analyzer=UnigramBigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print('Bigram TFIDF')
    btwd_train, btwd_test = extract_words(TfidfVectorizer(analyzer=UnigramBigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    print('Trigram Binary')
    tbwd_train, tbwd_test = extract_words(CountVectorizer(analyzer=UnigramBigramTrigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print('Trigram TFIDF')
    ttwd_train, ttwd_test = extract_words(TfidfVectorizer(analyzer=UnigramBigramTrigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    print('Bigram Only Binary')
    bowd_train, bowd_test = extract_words(CountVectorizer(analyzer=BigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print('Bigram Only TFIDF')
    bowd2_train, bowd2_test = extract_words(TfidfVectorizer(analyzer=BigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    print('Trigram Only Binary')
    towd_train, towd_test = extract_words(CountVectorizer(analyzer=TrigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print('Trigram Only TFIDF')
    towd2_train, towd2_test = extract_words(TfidfVectorizer(analyzer=TrigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
  
    # Sanity check: dump one sample row (index 123) of each training matrix
    # before saving; assumes the training split has at least 124 rows.
    print(feature_set_path + "binaryWordData_train", bwd_train[123, :])
    print(feature_set_path + "freqWordData_train", fwd_train[123, :])
    print(feature_set_path + "tfidfWordData_train", twd_train[123, :])
    print(feature_set_path + "bigramBinaryWordData_train", bbwd_train[123, :])
    print(feature_set_path + "bigramTfidfWordData_train", btwd_train[123, :])
    print(feature_set_path + "trigramBinaryWordData_train", tbwd_train[123, :])
    print(feature_set_path + "trigramTfidfWordData_train", ttwd_train[123, :])
    print(feature_set_path + "bigramOnlyBinaryWordData_train", bowd_train[123, :])
    print(feature_set_path + "bigramOnlyTfidfWordData_train", bowd2_train[123, :])
    print(feature_set_path + "trigramOnlyBinaryWordData_train", towd_train[123, :])
    print(feature_set_path + "trigramOnlyTfidfWordData_train", towd2_train[123, :])

    save_sparse_csr(feature_set_path + "binaryWordData" + tag + "_train", bwd_train) 
    save_sparse_csr(feature_set_path + "freqWordData" + tag + "_train", fwd_train) 
    save_sparse_csr(feature_set_path + "tfidfWordData" + tag + "_train", twd_train) 
    save_sparse_csr(feature_set_path + "bigramBinaryWordData" + tag + "_train", bbwd_train) 
    save_sparse_csr(feature_set_path + "bigramTfidfWordData" + tag + "_train", btwd_train) 
    save_sparse_csr(feature_set_path + "trigramBinaryWordData" + tag + "_train", tbwd_train) 
    save_sparse_csr(feature_set_path + "trigramTfidfWordData" + tag + "_train", ttwd_train)  
    
    save_sparse_csr(feature_set_path + "bigramOnlyBinaryWordData" + tag + "_train", bowd_train)
    save_sparse_csr(feature_set_path + "bigramOnlyTfidfWordData" + tag + "_train", bowd2_train)
    save_sparse_csr(feature_set_path + "trigramOnlyBinaryWordData" + tag + "_train", towd_train)
    save_sparse_csr(feature_set_path + "trigramOnlyTfidfWordData" + tag + "_train", towd2_train)
    
   
    save_sparse_csr(feature_set_path + "binaryWordData" + tag + "_test", bwd_test) 
    save_sparse_csr(feature_set_path + "freqWordData" + tag + "_test", fwd_test) 
    save_sparse_csr(feature_set_path + "tfidfWordData" + tag + "_test", twd_test) 
    save_sparse_csr(feature_set_path + "bigramBinaryWordData" + tag + "_test", bbwd_test) 
    save_sparse_csr(feature_set_path + "bigramTfidfWordData" + tag + "_test", btwd_test) 
    save_sparse_csr(feature_set_path + "trigramBinaryWordData" + tag + "_test", tbwd_test) 
    save_sparse_csr(feature_set_path + "trigramTfidfWordData" + tag + "_test", ttwd_test) 
    
    save_sparse_csr(feature_set_path + "bigramOnlyBinaryWordData" + tag + "_test", bowd_test)
    save_sparse_csr(feature_set_path + "bigramOnlyTfidfWordData" + tag + "_test", bowd2_test)
    save_sparse_csr(feature_set_path + "trigramOnlyBinaryWordData" + tag + "_test", towd_test)
    save_sparse_csr(feature_set_path + "trigramOnlyTfidfWordData" + tag + "_test", towd2_test)
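The custom analyzers (CharacterAnalyzer, UnigramAnalyzer, BigramAnalyzer, and the rest) are also defined outside these snippets. scikit-learn accepts any callable as the analyzer argument: it receives a raw document and returns the list of tokens to count. As an illustration only, a hypothetical character n-gram analyzer in that style, assuming the documents are plain strings, might look like:

class CharNgramAnalyzer(object):
    # Hypothetical stand-in; the original CharacterAnalyzer may differ.
    def __init__(self, ngram_range=(2, 4)):
        self.ngram_range = ngram_range

    def __call__(self, doc):
        # Emit every contiguous character n-gram for n in the given range.
        lo, hi = self.ngram_range
        return [doc[i:i + n]
                for n in range(lo, hi + 1)
                for i in range(len(doc) - n + 1)]

CountVectorizer(analyzer=CharNgramAnalyzer(), binary=True, dtype=float) would then record each character n-gram's presence per comment, mirroring the binary character features built above.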