Example #1
import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def ex_feature(train_set, test_set, t_train, t_test, hash=False, use_tf=False, K=2000):
    '''
    Extract features from train_set and test_set using term frequency or tf-idf.
    A stop-word file (chinese_stopword.txt) is required.
    :param train_set: iterable of raw text documents
                    Training data
    :param test_set: iterable of raw text documents
                    Test data
    :param t_train: numpy array of shape [n_samples]
                    Training target values
    :param t_test: numpy array of shape [n_samples]
                    Test target values
    :param hash: use a HashingVectorizer
    :param use_tf: use term frequency (CountVectorizer) to reduce dimensionality
    :param K: number of features to keep; max_features when ``use_tf`` is True,
              otherwise the k of SelectKBest with the chi2 score function
    :return: train_set and test_set after feature extraction
    '''
    with open('chinese_stopword.txt', 'r', encoding='utf-8-sig') as f:
        stop_words = f.read().splitlines()

    data_train_size_mb = size_mb(train_set)  # size_mb: MB-size helper defined elsewhere
    data_test_size_mb = size_mb(test_set)
    start_time = time.time()

    print('extracting features......')
    if hash:
        from sklearn.feature_extraction.text import HashingVectorizer
        # alternate_sign=False keeps feature values non-negative
        # (the older non_negative=True flag was removed from scikit-learn)
        vectorizer = HashingVectorizer(alternate_sign=False)
        x_train = vectorizer.fit_transform(train_set)
        # HashingVectorizer is stateless, so transform is all the test set needs
        x_test = vectorizer.transform(test_set)
    else:
        tfidf_transformer = TfidfTransformer()
        if use_tf:
            vectorizer = CountVectorizer(max_features=K, stop_words=stop_words, decode_error='strict')
            x_train_tf_matrix = vectorizer.fit_transform(train_set)
            x_train = tfidf_transformer.fit_transform(x_train_tf_matrix)
            x_test_tf_matrix = vectorizer.transform(test_set)  # reuse the same vectorizer
            # transform, not fit_transform: keep the idf statistics learned on the training set
            x_test = tfidf_transformer.transform(x_test_tf_matrix)
        else:
            from sklearn.feature_selection import SelectKBest
            from sklearn.feature_selection import chi2
            from sklearn.feature_extraction.text import TfidfVectorizer
            vectorizer = TfidfVectorizer(stop_words=stop_words)
            x_train_tfidf_matrix = vectorizer.fit_transform(train_set)
            x_test_tfidf_matrix = vectorizer.transform(test_set)
            # give the selector its own name so it does not shadow the chi2 score function
            selector = SelectKBest(chi2, k=K)
            x_train = selector.fit_transform(x_train_tfidf_matrix, t_train)
            x_test = selector.transform(x_test_tfidf_matrix)

    end_time = time.time()

    print('extracting features took %.2fs at %0.2f MB/s' %
          (end_time - start_time, (data_train_size_mb + data_test_size_mb) / (end_time - start_time)))
    return x_train, x_test
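
# A minimal usage sketch for ex_feature. The toy documents, labels, and the
# size_mb helper below are illustrative assumptions, not part of the original
# example; chinese_stopword.txt must exist in the working directory.

def size_mb(docs):
    # rough total size of a list of text documents, in megabytes
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

train_docs = ['the cat sat on the mat', 'dogs bark very loudly',
              'cats purr softly at home', 'the dog ran away fast']
test_docs = ['a cat purrs at home', 'the dog barks loudly']
x_train, x_test = ex_feature(train_docs, test_docs, [0, 1, 0, 1], [0, 1],
                             use_tf=True, K=5)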
Example #2

### Splitting data into train and test sets using a StratifiedShuffleSplit
### (modern scikit-learn API; versions before 0.18 wrote
### StratifiedShuffleSplit(Y, 10, test_size=0.3, random_state=0) and iterated it directly)
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

### Using the generated indices to create test and train datasets
for train_index, test_index in sss.split(X, Y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]


### Select the K best features based on a chi-squared test
from sklearn.feature_selection import SelectKBest, chi2

feature_chi = 72    ## selecting the best ~2/3 of the features
# bind the selector to its own name so the chi2 score function is not shadowed
selector = SelectKBest(chi2, k=feature_chi)
X_train = selector.fit_transform(X_train, y_train)
X_test  = selector.transform(X_test)
#print(X_train)


### Defining a function to print statistics that help us benchmark classifier performance
def benchmark(clf):
    clf_descr = str(clf).split('(')[0]  ## store the classifier's name
    print(clf_descr)                    ## print the name
    t0 = time()                     ## store the current time in t0
    clf.fit(X_train, y_train)       ## fit the classifier to the training data
    train_time = time() - t0        ## calculate the time taken to train
    print("train time: %0.3fs" % train_time)    ## print the statistic

    t0 = time()                     ## store the current time in t0
    pred = clf.predict(X_test)      ## use the trained classifier to predict classes for the test data
    # the original snippet breaks off here; a typical ending reports test time and accuracy
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    return clf_descr, score, train_time, test_time
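
# A minimal usage sketch for benchmark. MultinomialNB is an illustrative choice,
# not necessarily the classifier used in the original script:
from time import time
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

benchmark(MultinomialNB())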
Example #3

## import data
import pickle

data_path = '/Users/zhangzhaopeng/统计学习/机器学习/Text_Classification/data_preprocessing.pkl'
with open(data_path, 'rb') as fp:
    x_train, x_test, y_train, y_test = pickle.load(fp)

## Select features via a chi-square test
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

vectorizer = CountVectorizer(min_df=2)
x_train_tf = vectorizer.fit_transform(x_train)
x_test_tf = vectorizer.transform(x_test)
# bind the selector to its own name so the chi2 score function is not shadowed
selector = SelectKBest(chi2, k=4000)
x_train_chi2 = selector.fit_transform(x_train_tf, y_train)
x_test_chi2 = selector.transform(x_test_tf)
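
# To inspect which terms survived the chi-squared selection (a side note;
# assumes scikit-learn >= 1.0 for get_feature_names_out):
import numpy as np
terms = np.asarray(vectorizer.get_feature_names_out())
print(terms[selector.get_support()][:20])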

## naive bayes
naive_chi2 = naive_bayes.MultinomialNB().fit(x_train_chi2, y_train)
naive_chi2_preds = naive_chi2.predict(x_test_chi2)
count_accu = 0
for i in range(len(y_test)):
    if y_test[i] == naive_chi2_preds[i]:
        count_accu += 1
naive_accu_chi2 = count_accu / len(y_test)
#naive_accu2 = metrics.accuracy_score(naive_preds, y_test)
print("Test set accuracy: ", naive_accu_chi2)
# confusion_matrix
conf_arr_naive_chi2 = [[0, 0], [0, 0]]
for i in range(len(y_test)):
    # the original snippet breaks off inside this loop; the usual fill counts
    # (true label, predicted label) pairs, assuming binary 0/1 labels
    conf_arr_naive_chi2[y_test[i]][naive_chi2_preds[i]] += 1
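
# The same table via scikit-learn, as a cross-check:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, naive_chi2_preds))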
Example #4

# I make further modifications to the features_list by applying MinMaxScaler and
# SelectKBest with the chi-squared scoring function to choose the 10 best features.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

#print(scaled_features)

from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = \
    train_test_split(scaled_features, labels, test_size=0.1, random_state=42)

# I manually tried several values of k (the number of top features to select); for
# chi-squared, k=10 returned the best results across the different methods and
# classifiers. (An automated way to pick k is sketched after this block.)

from sklearn.feature_selection import SelectKBest, chi2

# k must be passed as a keyword argument in current scikit-learn, and the selector
# gets its own name so the chi2 score function is not shadowed
selector = SelectKBest(chi2, k=10)
features_train = selector.fit_transform(features_train, labels_train)
features_test = selector.transform(features_test)
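
# A sketch of automating that manual k search with a pipeline and a grid search.
# GaussianNB and the k grid are illustrative assumptions, not the project's choices;
# scaled_features stays non-negative after MinMaxScaler, as chi2 requires:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

pipe = Pipeline([('select', SelectKBest(chi2)), ('clf', GaussianNB())])
grid = GridSearchCV(pipe, {'select__k': [5, 8, 10]}, cv=5)
grid.fit(scaled_features, labels)
print("best k:", grid.best_params_['select__k'])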

# keep the selected feature names
# i+1 because "poi" is still the first name in features_list, while the actual
# features matrix does not contain it
from pprint import pprint

features_list_new = [features_list[i+1] for i in selector.get_support(indices=True)]

features_list = ["poi"] + features_list_new
print("chi2 selected features_list = ")
pprint(features_list)

# I will apply featureFormat to the new features_list with the 10 best members and extract
# new labels/features to use with the same variety of classifiers and compare their scores.

data = featureFormat(my_dataset, features_list)
Example #5

# In[42]:

for i in range(0, len(x_feature_names)):
    print(i, x_feature_names[i])


# ## Feature Scaling and Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

print(X.shape)

# bind the selector to its own name so the chi2 score function is not shadowed
selector = SelectKBest(chi2, k=20)
X_new = selector.fit_transform(X, y)
print(X_new.shape)

# collect each feature's chi-squared p-value
p_values_of_features_dict = {}
for i in range(0, len(x_feature_names)):
    p_values_of_features_dict[x_feature_names[i]] = selector.pvalues_[i]

p_values_of_features_dict

# sort features by p-value, smallest (most significant) first
import operator
sorted_p_values_of_features_dict = sorted(p_values_of_features_dict.items(), key=operator.itemgetter(1))
sorted_p_values_of_features_dict
# In[43]:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_new = scaler.fit_transform(X)
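
# A cautionary sketch: the line above overwrites the chi2-selected X_new with a
# scaled copy of the full X, and StandardScaler output can be negative, which the
# chi2 score function rejects. To scale and then chi2-select, something like
# MinMaxScaler keeps values non-negative (illustrative, not from the original):
from sklearn.preprocessing import MinMaxScaler

X_scaled = MinMaxScaler().fit_transform(X)
X_selected = SelectKBest(chi2, k=20).fit_transform(X_scaled, y)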
Example #6
import string

import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# trainset_to_df (used below) is a helper defined elsewhere in the project

def preprocess_dataset(path, remove_punctuation=True,
                       text_representation='tfidf', tfidf_max_features=None,
                       tfidf_min_df=7, tfidf_max_df=0.8,
                       feature_selection=None, labels='sentiment'):
    """
    Preprocess dataset and return features and labels
    Args:
        path: path string to trainset.txt
        remove_punctuation: remove punctuation flag, default True
        text_representation: representation of text, default tfidf
        tfidf_max_features: max_features for tfidf, default None
        tfidf_min_df: min_df for tfidf, default 7
        tfidf_max_df: max_df for tfidf, default 0.8
        feature_selection: type of feature_selection, default None
        labels: return type of labels, default sentiment
    Return:
        (features, y_labels): preprocessed features and labels ([contains
        binarized 'sentiment'] or [label encoding of 'topic' and label
        'topic_labels'])
    """

    reviews = trainset_to_df(path)

    # Removing punctuations
    if remove_punctuation:
        process_text = lambda review: ' '.join(review.translate(str.maketrans('', '', string.punctuation)).split())
        reviews['text'] = reviews['text'].apply(process_text)

    # Removing all stopwords
    stopword_list = stopwords.words('english')
    # append as a one-element list; += 'nt' would extend the list with the characters 'n' and 't'
    stopword_list += ['nt']
    reviews['text'] = reviews['text'].apply(lambda review: ' '.join([word for word in review.split() if word not in stopword_list]))

    # Drop id column
    reviews.drop('id', axis=1, inplace=True)

    # Binarize sentiment label
    reviews['sentiment'] = reviews['sentiment'].apply(lambda sentiment: 0 if sentiment == 'neg' else 1)

    # Label Encode topic label
    le = LabelEncoder()
    reviews['topic'] = le.fit_transform(reviews['topic'])
    reviews['topic_labels'] = le.inverse_transform(reviews['topic'])

    # Vectorize text with Tfidf
    if text_representation == 'tfidf':

        vectorizer = TfidfVectorizer(max_features=tfidf_max_features, min_df=tfidf_min_df, max_df=tfidf_max_df)
        features = vectorizer.fit_transform(reviews['text']).toarray()

    # Get labels
    if labels == 'sentiment':
        y_labels = reviews['sentiment']
    else:
        y_labels = reviews[['topic', 'topic_labels']]

    # Feature selection
    if feature_selection == 'variance_threshold':

        from sklearn.feature_selection import VarianceThreshold
        var_thres = VarianceThreshold(threshold=np.var(features))
        features = var_thres.fit_transform(features)

    elif feature_selection == 'chi_square_test':

        from sklearn.feature_selection import SelectKBest, chi2
        # keep the selector under its own name so the chi2 score function is not
        # shadowed; note that k=features.shape[1] keeps every feature (scores only)
        selector = SelectKBest(chi2, k=features.shape[1])
        if labels == 'topic':
            features = selector.fit_transform(features, y_labels.iloc[:, 0])
        else:
            features = selector.fit_transform(features, y_labels)

    return (features, y_labels)
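
# A minimal usage sketch (the path and keyword choices are assumptions; requires
# the project's trainset.txt and its trainset_to_df helper):
features, y_labels = preprocess_dataset('trainset.txt',
                                        feature_selection='chi_square_test',
                                        labels='sentiment')
print(features.shape, y_labels.shape)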