def MINE_selector(train_x, train_y, k=10):
    import numpy as np
    from sklearn.feature_selection import SelectKBest
    from minepy import MINE

    def mic(x, y):
        m = MINE()
        m.compute_score(x, y)
        # SelectKBest expects (score, p-value) pairs; MIC has no p-value,
        # so a constant 0.5 placeholder is returned.
        return (m.mic(), 0.5)

    # Select the k best features and return the fitted selector plus scores.
    # list(...) is required on Python 3, where map returns an iterator.
    selection = SelectKBest(
        lambda X, Y: np.array(list(map(lambda x: mic(x, Y), X.T))).T, k=k)
    selection.fit(train_x, train_y)
    # scores_ comes back as a (2, n_features) array; keep only the MIC row.
    selection.scores_ = selection.scores_[0]
    print('----------------------------feature importance -------------------------')
    importance = selection.scores_
    print(selection.scores_)
    print('----------------------------- selected feature -------------------------')
    print(selection.get_support(indices=True))
    return selection, importance
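# A minimal usage sketch for MINE_selector (an illustrative example only:
# X_demo / y_demo are made-up toy data, and minepy must be installed).
import numpy as np

X_demo = np.random.rand(100, 20)                    # 100 samples, 20 features
y_demo = X_demo[:, 3] + 0.1 * np.random.rand(100)   # target driven mostly by feature 3

selector, importance = MINE_selector(X_demo, y_demo, k=5)
X_reduced = selector.transform(X_demo)              # keep the 5 highest-MIC columns
print(X_reduced.shape)                              # (100, 5)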
print "Training Naive Bayes model " # load index file y = population[:, 1] X = plpData[population[:, 0], :] print "population loaded- %s rows and %s columns" % (np.shape(population)[0], np.shape(population)[1]) print "Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1]) ########################################################################### print "Applying univariate feature selection to select %s features as naive bayes reqires non-sparse data " % ( featnum) kbest = SelectKBest(chi2, k=2000).fit( X[population[:, population.shape[1] - 1] > 0, :], y[population[:, population.shape[1] - 1] > 0]) kbest.scores_ = np.nan_to_num(kbest.scores_) print "Test kbest length: %s non-zero: %s" % (kbest.scores_.shape[0], np.sum(kbest.scores_ != 0)) threshold = -np.sort(-kbest.scores_)[featnum - 1] print "Threshold varImp set at %s" % (threshold) X = X[population[:, population.shape[1] - 1] > 0, :] print "Test X dim: %s , %s" % (X.shape[0], X.shape[1]) X = X[:, kbest.scores_ >= threshold] print "Test X dim: %s , %s" % (X.shape[0], X.shape[1]) X = X.toarray() y = y[population[:, population.shape[1] - 1] > 0] pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) print "Calculating prediction for train set of size %s" % (pred_size) test_pred = np.zeros( pred_size) # zeros length sum(population[:,population.size[1]] ==i)
# Create a scores array to get the individual categorical column.
# Example:
#     data = [39, 'State-gov', 77516, 'Bachelors', 13, 'Never-married', 'Adm-clerical',
#             'Not-in-family', 'White', 'Male', 2174, 0, 40, 'United-States']
#     scores = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
#
# Returns: [['State-gov']]
scores = []
# Build the scores array.
for j in range(len(COLUMNS[:-1])):
    if i == j:
        # This column is the categorical column we want to extract.
        scores.append(1)  # Set to 1 to select this column
    else:
        # Every other column should be ignored.
        scores.append(0)
skb = SelectKBest(k=1)
skb.scores_ = scores
# Convert the categorical column to a numerical value
lbn = LabelBinarizer()
r = skb.transform(train_features)
lbn.fit(r)
# Create the pipeline to extract the categorical feature
categorical_pipelines.append(
    ('categorical-{}'.format(i), Pipeline([
        ('SKB-{}'.format(i), skb),
        ('LBN-{}'.format(i), lbn)])))

# Create pipeline to extract the numerical features
skb = SelectKBest(k=6)
# From COLUMNS use the features that are numerical
skb.scores_ = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]
categorical_pipelines.append(('numerical', skb))
import os
import timeit

import joblib
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import GaussianNB


def train_naive_bayes(population, plpData, modelOutput, variableNumber, quiet):
    print("Training Naive Bayes model ")
    y = population[:, 1]
    X = plpData[population[:, 0].astype(int), :]
    print("population loaded- %s rows and %s columns" % (np.shape(population)[0], np.shape(population)[1]))
    print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1]))
    ###########################################################################
    featnum = min(variableNumber, X.shape[1])
    print("Applying univariate feature selection to select %s features as naive bayes requires non-sparse data " % (featnum))
    kbest = SelectKBest(chi2, k=featnum).fit(
        X[population[:, population.shape[1] - 1] > 0, :],
        y[population[:, population.shape[1] - 1] > 0])
    kbest.scores_ = np.nan_to_num(kbest.scores_)
    print("Test kbest length: %s non-zero: %s" % (kbest.scores_.shape[0], np.sum(kbest.scores_ != 0)))
    threshold = -np.sort(-kbest.scores_)[featnum - 1]
    print("Threshold varImp set at %s" % (threshold))
    X = X[population[:, population.shape[1] - 1] > 0, :]
    print("Test X dim: %s , %s" % (X.shape[0], X.shape[1]))
    X = X[:, kbest.scores_ >= threshold]
    print("Test X dim: %s , %s" % (X.shape[0], X.shape[1]))
    X = X.toarray()
    y = y[population[:, population.shape[1] - 1] > 0]
    pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0))
    print("Calculating prediction for train set of size %s" % (pred_size))
    test_pred = np.zeros(pred_size)  # zeros of length sum(population[:, population.shape[1]-1] == i)
    for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1):
        testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i
        trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i)
        train_x = X[trainInd, :]
        train_y = y[trainInd]
        test_x = X[testInd, :]
        print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0]))
        print("Train set contains %s outcomes " % (np.sum(train_y)))
        # train on full test data using hyper parameters
        print("Training fold %s" % (i))
        start_time = timeit.default_timer()
        gnb = GaussianNB()
        gnb = gnb.fit(train_x, train_y)
        end_time = timeit.default_timer()
        print("Training fold took: %.2f s" % (end_time - start_time))
        print("Calculating predictions on left out fold set...")
        ind = (population[:, population.shape[1] - 1] > 0)
        ind = population[ind, population.shape[1] - 1] == i
        test_pred[ind] = gnb.predict_proba(test_x)[:, 1]
        print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0]))
        print("Mean: %s prediction value" % (np.mean(test_pred[ind])))
    # cv pred
    test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1)
    predictioncv = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1)
    # train final:
    print("Training final naive bayes model on all train data...")
    start_time = timeit.default_timer()
    gnb = GaussianNB()
    gnb = gnb.fit(X, y)
    end_time = timeit.default_timer()
    print("Training final took: %.2f s" % (end_time - start_time))
    # save the model:
    if not os.path.exists(modelOutput):
        os.makedirs(modelOutput)
    print("Model saved to: %s" % (modelOutput))
    joblib.dump(gnb, os.path.join(modelOutput, "model.pkl"))
    # merge pred with indexes[testInd,:]
    test_pred = gnb.predict_proba(X)[:, 1]
    test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1)
    prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1)
    return prediction, predictioncv, kbest.scores_
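# A rough calling sketch for train_naive_bayes. Everything below is an
# assumption about the expected input layout, inferred from how the function
# indexes its arguments: population column 0 holds row indices into plpData,
# column 1 holds the outcome label, and the last column holds a CV fold
# index (>0 marks rows used for training).
import numpy as np
from scipy.sparse import random as sparse_random

n = 200
plpData_demo = sparse_random(n, 500, density=0.05, format='csr')  # hypothetical covariates
population_demo = np.column_stack([
    np.arange(n),                # column 0: row index into plpData
    np.random.randint(0, 2, n),  # column 1: outcome label
    np.random.randint(1, 4, n),  # last column: CV fold 1-3
]).astype(float)

prediction, predictioncv, var_imp = train_naive_bayes(
    population_demo, plpData_demo, modelOutput='./nb_model',
    variableNumber=50, quiet=False)
print(prediction.shape, predictioncv.shape, var_imp.shape)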
# created by iterating over the COLUMNS and checking if it is a CATEGORICAL_COLUMN.
for i, col in enumerate(COLUMNS[:-1]):
    if col in CATEGORICAL_COLUMNS:
        # Create a scores array to get the individual categorical column.
        # Example:
        #     data = [39, 'State-gov', 77516, 'Bachelors', 13, 'Never-married', 'Adm-clerical',
        #             'Not-in-family', 'White', 'Male', 2174, 0, 40, 'United-States']
        #     scores = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        #
        # Returns: [['State-gov']]

        # Build the scores array.
        scores = [0] * len(COLUMNS[:-1])
        # This column is the categorical column we want to extract.
        scores[i] = 1

        skb = SelectKBest(k=1)
        skb.scores_ = scores
        # Convert the categorical column to a numerical value
        lbn = LabelBinarizer()
        r = skb.transform(train_features)
        lbn.fit(r)

        # Create the pipeline to extract the categorical feature
        categorical_pipelines.append(
            ('categorical-{}'.format(i), Pipeline([
                ('SKB-{}'.format(i), skb),
                ('LBN-{}'.format(i), lbn)])))
# [END categorical-feature-conversion]

# [START create-pipeline]
# Create pipeline to extract the numerical features
skb = SelectKBest(k=6)
# From COLUMNS use the features that are numerical
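# The snippet above stops just before the numerical selector gets its scores.
# A sketch of the likely continuation (the FeatureUnion/classifier wiring is
# an assumption about how the pieces fit together, not quoted code; the
# scores list matches the one used in the earlier snippet).
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import FeatureUnion, Pipeline

# Mark the numerical columns of COLUMNS with 1s.
skb.scores_ = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]
categorical_pipelines.append(('numerical', skb))

# Combine every per-column extractor into one preprocessing step and feed
# the stacked features to a classifier; train_labels is assumed to exist.
preprocess = FeatureUnion(categorical_pipelines)
pipeline = Pipeline([
    ('union', preprocess),
    ('classifier', RandomForestClassifier()),
])
pipeline.fit(train_features, train_labels)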