Example #1
def test_chi2():
    # Test Chi2 feature extraction

    chi2 = mkchi2(k=1).fit(X, y)
    assert_array_equal(chi2.get_support(indices=True), [0])
    assert_array_equal(chi2.transform(X), np.array(X)[:, [0]])

    chi2 = mkchi2(k=2).fit(X, y)
    assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2])

    Xsp = csr_matrix(X, dtype=np.float64)
    chi2 = mkchi2(k=2).fit(Xsp, y)
    assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2])
    Xtrans = chi2.transform(Xsp)
    assert_array_equal(Xtrans.shape, [Xsp.shape[0], 2])

    # == doesn't work on scipy.sparse matrices
    Xtrans = Xtrans.toarray()
    Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray()
    assert_array_almost_equal(Xtrans, Xtrans2)
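This test relies on module-level fixtures that are not shown above. A minimal sketch of plausible definitions, mirroring the fixtures in scikit-learn's own test file (the exact arrays in the upstream test may differ):

import numpy as np
from functools import partial
from scipy.sparse import csr_matrix
from sklearn.feature_selection import SelectKBest, chi2
from numpy.testing import assert_array_equal, assert_array_almost_equal

# Feature 0 separates the classes strongly, feature 1 is constant,
# feature 2 is mildly informative -- so k=1 picks [0] and k=2 picks [0, 2].
X = [[2, 1, 2], [9, 1, 1], [6, 1, 2], [0, 1, 2]]
y = [0, 1, 2, 2]

mkchi2 = partial(SelectKBest, chi2)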
Example #3
def ex_feature(train_set, test_set, t_train, t_test, hash=False, use_tf=False, K=2000):
    '''
    Extract features from train_set and test_set using term frequency or tf-idf.
    A stop-word file (chinese_stopword.txt) is required.
    :param train_set: numpy array or sparse matrix of shape [n_samples, n_features]
                    Training data
    :param test_set: numpy array or sparse matrix of shape [n_samples, n_features]
                    Test data
    :param t_train: numpy array of shape [n_samples, n_targets]
                    Training target values
    :param t_test: numpy array of shape [n_samples, n_targets]
                    Test target values
    :param hash: use HashingVectorizer
    :param use_tf: use term frequency to reduce dimensionality
    :param K: number of features to keep (``max_features`` on the term-frequency
              path, the ``SelectKBest`` k on the chi2 path)
    :return: train_set and test_set after feature extraction
    '''
    with open('chinese_stopword.txt', 'r', encoding='utf-8-sig') as f:
        stop_words = f.read().splitlines()

    data_train_size_mb = size_mb(train_set)  # size_mb is assumed to be defined at module level
    data_test_size_mb = size_mb(test_set)
    start_time = time.time()

    print('extracting features......')
    if hash:
        from sklearn.feature_extraction.text import HashingVectorizer
        # non_negative=True was removed in scikit-learn 0.21; alternate_sign=False is the replacement
        vectorizer = HashingVectorizer(alternate_sign=False)
        x_train = vectorizer.fit_transform(train_set)
        x_test = vectorizer.transform(test_set)  # HashingVectorizer is stateless, but transform is the idiomatic call
    else:
        tfidf_transformer = TfidfTransformer()
        if use_tf:
            vectorizer = CountVectorizer(max_features=K, stop_words=stop_words, decode_error='strict')
            x_train_tf_matrix = vectorizer.fit_transform(train_set)
            x_train = tfidf_transformer.fit_transform(x_train_tf_matrix)
            x_test_tf_matrix = vectorizer.transform(test_set)  # reuse the vectorizer fitted on the training set
            x_test = tfidf_transformer.transform(x_test_tf_matrix)  # transform, not fit_transform: keep the training IDF weights
        else:
            from sklearn.feature_selection import SelectKBest
            from sklearn.feature_selection import chi2
            from sklearn.feature_extraction.text import TfidfVectorizer
            vectorizer = TfidfVectorizer(stop_words=stop_words)
            x_train_tfidf_matrix = vectorizer.fit_transform(train_set)
            x_test_tfidf_matrix = vectorizer.transform(test_set)
            selector = SelectKBest(chi2, k=K)  # do not name this chi2: that would shadow the score function
            x_train = selector.fit_transform(x_train_tfidf_matrix, t_train)
            x_test = selector.transform(x_test_tfidf_matrix)

    end_time = time.time()

    print('extracting features took %.2f s at %0.2f MB/s' % (
        end_time - start_time, (data_train_size_mb + data_test_size_mb) / (end_time - start_time)))
    return x_train, x_test
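The function assumes a few module-level helpers that the snippet does not show. A hypothetical setup and call, where size_mb follows the helper used in scikit-learn's text-classification example (all names here are assumptions):

import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def size_mb(docs):
    # rough size of a list of strings in megabytes
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

# Usage sketch: train_docs/test_docs are lists of (segmented) documents.
# x_train, x_test = ex_feature(train_docs, test_docs, y_train, y_test, use_tf=False, K=2000)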
Example #4
def test_chi2():
    """Test Chi2 feature extraction"""

    chi = sklearn.feature_selection.chi2(X, y)
    print(chi)

    chi2 = mkchi2(k=1).fit(X, y)
    print(chi2.get_support(indices=True), [0])
    print(chi2.transform(X), np.array(X)[:, [0]])

    chi2 = mkchi2(k=2).fit(X, y)
    print(sorted(chi2.get_support(indices=True)), [0, 2])

    Xsp = csr_matrix(X, dtype=np.float64)  # np.float was removed in NumPy 1.24
    chi2 = mkchi2(k=2).fit(Xsp, y)
    print(sorted(chi2.get_support(indices=True)), [0, 2])
    Xtrans = chi2.transform(Xsp)
    print(Xtrans.shape, [Xsp.shape[0], 2])

    # == doesn't work on scipy.sparse matrices
    Xtrans = Xtrans.toarray()
    Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray()
    assert_array_almost_equal(Xtrans, Xtrans2)  # elementwise comparison; a plain equality check is ambiguous on arrays
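Note that sklearn.feature_selection.chi2 returns a pair of arrays, which can be unpacked explicitly instead of printing the raw tuple:

# chi2 returns (chi2 statistics, p-values), one entry per feature
scores, pvalues = sklearn.feature_selection.chi2(X, y)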
Example #6
### Splitting data into train and test sets using a StratifiedShuffleSplit
### (modern scikit-learn API: pass n_splits to the constructor, iterate over sss.split(X, Y))
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

### Using the generated indices to create test and train datasets
for train_index, test_index in sss.split(X, Y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]


### Select the K best features based on a chi-squared test
feature_chi = 72    ## keep the best two-thirds of the features
selector = SelectKBest(chi2, k=feature_chi)  # renamed from chi2 to avoid shadowing the score function
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)
#print(X_train)


### Defining a function to print statistics which helps us benchmark classifier performance
def benchmark(clf):
    clf_descr = str(clf).split('(')[0]  ## store the name of the classifier
    print(clf_descr)                    ## print the name
    t0 = time()                     ## store the current time in t0
    clf.fit(X_train, y_train)       ## fit the classifier to the training data
    train_time = time() - t0        ## calculate the time taken to train
    print("train time: %0.3fs" % train_time)    ## print statistic

    t0 = time()                     ## store the current time in t0
    pred = clf.predict(X_test)      ## use the trained classifier to predict classes for the test data
    test_time = time() - t0         ## calculate the time taken to predict on the test data
    # The original snippet is truncated here; a plausible completion reports the
    # prediction time and accuracy (assumes `from sklearn import metrics`).
    print("test time:  %0.3fs" % test_time)
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    return clf_descr, score, train_time, test_time
Example #7
import pickle

data_path = '/Users/zhangzhaopeng/统计学习/机器学习/Text_Classification/data_preprocessing.pkl'
with open(data_path, 'rb') as fp:
    x_train, x_test, y_train, y_test = pickle.load(fp)

## Chi-squared feature selection
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
vectorizer = CountVectorizer(min_df=2)
x_train_tf = vectorizer.fit_transform(x_train)
x_test_tf = vectorizer.transform(x_test)
selector = SelectKBest(chi2, k=4000)  # renamed from chi2 to avoid shadowing the score function
x_train_chi2 = selector.fit_transform(x_train_tf, y_train)
x_test_chi2 = selector.transform(x_test_tf)

## naive bayes
naive_chi2 = naive_bayes.MultinomialNB().fit(x_train_chi2, y_train)
naive_chi2_preds = naive_chi2.predict(x_test_chi2)
count_accu = 0
for i in range(len(y_test)):
    if y_test[i] == naive_chi2_preds[i]:
        count_accu += 1
naive_accu_chi2 = count_accu / len(y_test)
#naive_accu2 = metrics.accuracy_score(naive_preds, y_test)
print("Test set accuracy: ", naive_accu_chi2)
# confusion matrix (the original loop is truncated; this completion assumes binary 0/1 labels)
conf_arr_naive_chi2 = [[0, 0], [0, 0]]
for i in range(len(y_test)):
    conf_arr_naive_chi2[y_test[i]][naive_chi2_preds[i]] += 1
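The same matrix can be obtained with scikit-learn's built-in helper:

from sklearn.metrics import confusion_matrix
conf_arr_naive_chi2 = confusion_matrix(y_test, naive_chi2_preds)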
Example #8
# SelectKBest with the chi-squared scoring function to choose the 10 best features.

scaler = MinMaxScaler()  # chi2 requires non-negative feature values, hence the min-max scaling
scaled_features = scaler.fit_transform(features)

#print(scaled_features)

features_train, features_test, labels_train, labels_test = \
    train_test_split(scaled_features, labels, test_size=0.1, random_state=42)

# Manually tried several k values (the number of top features to select); for chi-squared,
# k=10 returned the best results across the different methods and classifiers.

selector = SelectKBest(chi2, k=10)  # pass k by keyword (positional use is no longer supported); also avoid shadowing chi2
features_train = selector.fit_transform(features_train, labels_train)
features_test = selector.transform(features_test)

# keep the selected feature names
# i+1 because "poi" is still the first name in features_list, while the actual feature matrix does not include it

features_list_new = [features_list[i+1] for i in selector.get_support(indices=True)]

features_list = ["poi"] + features_list_new
print("chi2 selected features_list = ")
pprint(features_list)

# I will apply featureFormat to the new features_list with the 10 best members and extract
# new labels/features to use with the same variety of classifiers and compare their scores.

data = featureFormat(my_dataset, features_list)
labels, features = targetFeatureSplit(data)
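An equivalent, leakage-safe way to express the scaling and selection steps is a scikit-learn Pipeline fitted on the raw training split; a minimal sketch (the classifier choice is an assumption, not from the original):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import GaussianNB  # stand-in classifier

pipe = Pipeline([
    ("scale", MinMaxScaler()),           # chi2 needs non-negative inputs
    ("select", SelectKBest(chi2, k=10)),
    ("clf", GaussianNB()),
])
pipe.fit(features_train, labels_train)
print(pipe.score(features_test, labels_test))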
Example #9
def baoxian():  # "baoxian" (保险) = insurance
    #outputfile = u"E://项目需求//爬虫项目//和硕爬虫//tfidf//"
    outputfile = u"D://workspace//python//classify_WeChat//baoxian//bxtfidf"
    X_train, y_train = cPickle.load(open(os.path.join(outputfile, "train.data"), "rb"))
    X_test, y_test = cPickle.load(open(os.path.join(outputfile, "test.data"), "rb"))
    vectorizer = cPickle.load(open(os.path.join(outputfile, "vectorizer.data"), "rb"))
    chi2 = cPickle.load(open(os.path.join(outputfile, "ch2.data"), "rb"))
    clf = cPickle.load(open(os.path.join(outputfile, "SGD_l2.model"), "rb"))
    #inputpath =u"E://项目需求//JDPower//分类//4月份//financeoutput1_final.txt"
    #outputpath =u"E://项目需求//JDPower//分类//4月份//保险.txt"

    inputpath =u"D://workspace//python//classify_WeChat//data//financeoutput1_final//financeoutput1_final.txt"
    outputpath =u"D://workspace//python//classify_WeChat//data//financeoutput1_final//baoxian.txt"
    # inputpath =u"..//data//financeoutput1_final//financeoutput1_final.txt"
    # outputpath =u"..//data//financeoutput1_final//保险.txt"


    label = "保险"  # "insurance"

    forbidkword = {}
    # load the stop-keyword list (words to exclude from the segmentation output)
    # forbidpath = u"..//keyword.txt"
    forbidpath = u"keyword.txt"
    with open(forbidpath, "rb") as f:
        for line in f:
            word = line.strip()
            forbidkword[word] = 0

    outfile = open(outputpath, "wb")
    with open(inputpath, "rb") as f:
        for line in f:
            splits = line.strip().split("\t")
            tag = splits[0]

            if tag.find(label) > -1:
                print(tag)
                train = []
                #print(splits[-1])
                seg = jieba.cut(splits[-1], cut_all=False)
                seglist = []
                for w in seg:
                    w = w.strip().encode("utf-8")
                    # keep words that are neither blacklisted nor purely numeric
                    if w not in forbidkword:
                        if not re.match(r"\d+$", w):
                            seglist.append(w)
                train.append(" ".join(seglist))
                X_test = vectorizer.transform(train)
                X_test = chi2.transform(X_test)
                pred = clf.predict(X_test)
                print(pred)
                lb = str(pred[0])
                if lb == '1':
                    # write(), not writelines(): we are writing single strings
                    outfile.write(line.strip() + "\t")
                    outfile.write(lb + "\n")
    outfile.close()
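For context, a hypothetical sketch of how the pickled artifacts loaded above (vectorizer, chi2 selector, SGD model) might have been produced; every name and parameter here is an assumption, not the original training script:

# Hypothetical training-side sketch (assumed, not from the original project)
import os
import cPickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import SGDClassifier

def build_artifacts(docs, labels, outputfile):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(docs)
    selector = SelectKBest(chi2, k=2000)   # k is an assumption
    X = selector.fit_transform(X, labels)
    clf = SGDClassifier(penalty="l2").fit(X, labels)
    cPickle.dump(vectorizer, open(os.path.join(outputfile, "vectorizer.data"), "wb"))
    cPickle.dump(selector, open(os.path.join(outputfile, "ch2.data"), "wb"))
    cPickle.dump(clf, open(os.path.join(outputfile, "SGD_l2.model"), "wb"))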