Code Example #1
File: featureSelect.py Project: saba96/droidcat
def cv(model, features, labels):
    global g_accuracy
    k = 10

    if g_accuracy:
        selector = RFECV(model, step=1, cv=k)
        selector = selector.fit(features, labels)
        score = selector.score(features, labels)
        return score, selector.n_features_, selector.ranking_

    selector_prec = RFECV(model, step=1, cv=k, scoring='precision_weighted')
    selector_prec.fit(features, labels)
    score_prec = selector_prec.score(features, labels)

    selector_rec = RFECV(model, step=1, cv=k, scoring='recall_weighted')
    selector_rec.fit(features, labels)
    score_rec = selector_rec.score(features, labels)

    selector_f1 = RFECV(model, step=1, cv=k, scoring='f1_weighted')
    selector_f1.fit(features, labels)
    score_f1 = selector_f1.score(features, labels)

    return (score_prec, selector_prec.n_features_, selector_prec.ranking_), \
           (score_rec, selector_rec.n_features_, selector_rec.ranking_), \
           (score_f1, selector_f1.n_features_, selector_f1.ranking_)
Code Example #2
def randomforest_rfecv(X, y, X_test, y_test, columns):
    estimator = RandomForestClassifier(**CLASSIFIER_PARAMS)
    selector = RFECV(estimator, step=1, cv=5, verbose=0)
    selector = selector.fit(X, y)
    # selector ranking to column:rank pairs
    rank = {columns[i]: s for i, s in enumerate(selector.ranking_)}
    # Feature importances
    importances = {
        columns[i]: v
        for i, v in enumerate(selector.estimator_.feature_importances_)
    }
    labeled = {
        str(k): v
        for k, v in sorted(importances.items(), key=lambda item: -item[1])
    }

    return {
        # sort rank by values
        'rank': {
            str(k): int(v)
            for k, v in sorted(rank.items(), key=lambda item: item[1])
        },
        # pick selected features names
        'support': [columns[i] for i, s in enumerate(selector.support_) if s],
        'feature_importances': labeled,
        'score': selector.score(X, y),
        'test_score': selector.score(X_test, y_test)
    }
Code Example #3
def decision_tree():
    print "---bc---"
    clf = tree.DecisionTreeClassifier(criterion="gini")

    rfecv = RFECV(clf, cv=10)

    _decision_tree(clf, bc_data_train, bc_data_test, bc_target_train, bc_target_test, "bc_gini")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="gini", max_depth=depth)
        _decision_tree(clf, bc_data_train, bc_data_test, bc_target_train, bc_target_test, "bc_gini" + str(depth))

    clf = tree.DecisionTreeClassifier(criterion="entropy")
    _decision_tree(clf, bc_data_train, bc_data_test, bc_target_train, bc_target_test, "bc_entropy")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=depth)
        _decision_tree(clf, bc_data_train, bc_data_test, bc_target_train, bc_target_test, "bc_entropy" + str(depth))

    rfecv.fit(bc_data_train, bc_target_train)
    print(rfecv.support_)
    print(rfecv.ranking_)
    print(rfecv.score(bc_data_test, bc_target_test))
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()


    print "---v---"
    clf = tree.DecisionTreeClassifier(criterion="gini")

    rfecv = RFECV(clf, cv=10)

    _decision_tree(clf, v_data_train, v_data_test, v_target_train, v_target_test, "v_gini")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="gini", max_depth=depth)
        _decision_tree(clf, v_data_train, v_data_test, v_target_train, v_target_test, "v_gini" + str(depth))

    clf = tree.DecisionTreeClassifier(criterion="entropy")
    _decision_tree(clf, v_data_train, v_data_test, v_target_train, v_target_test, "v_entropy")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=depth)
        _decision_tree(clf, v_data_train, v_data_test, v_target_train, v_target_test, "v_entropy" + str(depth))

    rfecv.fit(v_data_train, v_target_train)
    print(rfecv.support_)
    print(rfecv.ranking_)
    print(rfecv.score(v_data_test, v_target_test))
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
Code Example #4
def main():
    train_df = munge_data('./data/train.csv', False)
    train_df = train_df.drop('PassengerId', axis=1)
    target_df = train_df['Survived']
    train_df = train_df.drop('Survived', axis=1)
    train_df = train_df.sort_index(axis=1)

    test_df = munge_data('./data/test.csv')
    test_ids = test_df.PassengerId.values
    test_df = test_df.drop('PassengerId', axis=1)
    test_df = test_df.sort_index(axis=1)

    train_data = train_df.values
    target_data = target_df.values
    test_data = test_df.values

    clf = svm.SVC(kernel='linear')
    selector = RFECV(clf, step=1, cv=5, scoring='accuracy')

    train_data, cx_data, target_data, cx_target_data = cross_validation.train_test_split(
        train_data, target_data, test_size=0.2)

    selector = selector.fit(train_data, target_data)

    print(selector.score(cx_data, cx_target_data))
    cx_predictions = selector.predict(cx_data)
    print(classification_report(cx_target_data, cx_predictions))
    predictions = selector.predict(test_data)

    with open('output.csv', 'w') as o:
        o.write('PassengerId,Survived\n')
        for passenger, prediction in zip(test_ids, predictions):
            o.write('{},{}\n'.format(passenger, prediction))
Code Example #5
def optimal_features(model, x_train, y_train, x_test, y_test):
    rfecv = RFECV(estimator=model,
                  step=1,
                  cv=StratifiedKFold(2),
                  scoring='accuracy')
    rfecv.fit(x_train, y_train)

    print(rfecv.score(x_train, y_train), rfecv.score(x_test, y_test))
    print("Optimal number of features : %d" % rfecv.n_features_)

    # # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
Code Example #6
def featureSelectAndClassifyRFECV(X_train, X_test, y_train, y_test):

    scaler = MinMaxScaler()
    #scaler = StandardScaler()
    #scaler = RobustScaler()
    X_train_minmax = scaler.fit_transform(X_train)
    X_test_minmax = scaler.transform(X_test)

    #svc =svm.LinearSVC()
    rf = RandomForestClassifier(n_estimators=50, max_depth=20)

    rfecv = RFECV(estimator=rf,
                  step=1,
                  min_features_to_select=5,
                  cv=StratifiedKFold(5),
                  scoring='accuracy')

    X_train_transformed = rfecv.fit_transform(X_train_minmax, y_train)
    #X_train_transformed = rfecv.fit_transform(X_train, y_train)
    X_test_transformed = rfecv.transform(X_test_minmax)
    #X_test_transformed = rfecv.transform(X_test)
    score = rfecv.score(X_test_minmax, y_test)
    #score = rfecv.score(X_test, y_test)

    print('Optimal no. of features are ' + str(rfecv.n_features_))
    print('Score for test set is ' + str(score))
    print(rfecv.ranking_.shape)
    print(X_train_transformed.shape)
    print(X_test_transformed.shape)

    plt.figure()
    plt.xlabel('no. of features')
    plt.ylabel('cv score')
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
Code Example #7
def FeatureSelectGreedy(df, model, in_columns, target, step=100):
    y = df[target]

    selector = RFECV(model, step=1, cv=3)

    keep_columns = list()
    N = len(in_columns)

    for i in range(0, N, step):
        j = min(i + step, N)
        print "\n--\nNumber of test features = %d(/%d)" % (j, N)

        X = df[keep_columns + in_columns[i:j]]

        start_time = timer(None)

        selector = selector.fit(X, y)

        timer(start_time)

        keep_columns = X.columns[selector.support_].tolist()
        score = selector.score(X, y)

        print "Number of keep features =", len(keep_columns)
        print "Score =", score

    return keep_columns
Code Example #9
def test_model(model, xtrain, ytrain, feature_list, prefix):
    """ use train_test_split to create validation train/test samples """
    xTrain, xTest, yTrain, yTest = train_test_split(xtrain, ytrain,
                                                    test_size=0.4)

    if DO_RFECV:
        model.fit(xtrain, ytrain)
        if hasattr(model, 'coef_'):
            model = RFECV(estimator=model, verbose=0, step=1,
                          scoring=score_fn, cv=3)

    model.fit(xTrain, yTrain)
    print('score', model.score(xTest, yTest))
    ypred = model.predict(xTest)
    ### don't allow model to predict negative number of orders
    if any(ypred < 0):
        print(ypred[ypred < 0])
        ypred[ypred < 0] = 0

    print('RMSE', np.sqrt(mean_squared_error(ypred, yTest)))

#    debug_output(model, feature_list)

    debug_plots(model, yTest, ypred, prefix)

    return
Code Example #10
def testModel(_model, _X, _Y):
    if _model == "LogisticRegression":
        model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
    elif _model == "MLPClassifier":
        model = MLPClassifier()
    elif _model == "RandomForestClassifier":
        model = RandomForestClassifier()
    elif _model == "GradientBoostingClassifier":
        model = GradientBoostingClassifier()
    elif _model == "XGBClassifier":
        model = XGBClassifier()

    X_train, X_test, y_train, y_test = __splitData(_X, _Y)

    # Since XGBoost is not part of sklearn
    if _model == "XGBClassifier":
        model.fit(X_train, y_train.values.ravel())
        y_pred = model.predict(X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy of ", _model, " classifier on test set: {:.2f}".format(
            accuracy))

    # For the sklearn stuff
    else:
        selector = RFECV(model)    # Use the RFE wrapper
        selector.fit(X_train, y_train.values.ravel())
        #y_pred = selector.predict(X_test)
        print("Accuracy of ", _model, " classifier on test set: {:.2f}".format(
            selector.score(X_test, y_test)))
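An aside, not part of the original snippet: XGBClassifier does follow the scikit-learn estimator API and exposes feature_importances_, so in many recent versions it can be wrapped in RFECV directly. A minimal, self-contained sketch under that assumption:

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier  # assumes xgboost is installed

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
# RFECV ranks features with the wrapped estimator's feature_importances_
selector = RFECV(XGBClassifier(), step=1, cv=3)
selector.fit(X, y)
print(selector.n_features_, selector.score(X, y))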
Code Example #11
 def rfecv(self):
     rfecv = RFECV(estimator=SVC(kernel = "linear"), step=1, cv=StratifiedKFold(10),
           scoring='accuracy')
     rfecv.fit(self.train_X, self.train_y)
     print "Best number of features:" + str(rfecv.n_features_)
     print "Accuracy on test data:" + str(rfecv.score(self.test_X,self.test_y))
     print "RFECV feature ranking:" 
     print rfecv.ranking_
Code Example #12
def data_prediction():
    train, test = data_preprocessing()
    X = train.drop(columns=['gender'])
    y = train['gender']
    print('[INFO]....trainset shape: ', X.shape)
    print('[INFO]....testset shape: ', test.shape)
    encoding_columns = ['first_item_browsed']

    X, test = category_encoding(encoding_columns, 0.2, X, y, test)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        stratify=y,
                                                        random_state=123)

    ##########################FOR BASE LGBM############################################
    '''model = lgb.LGBMClassifier()
    model.fit(X_train,y_train)
    print('score on validation data: ',model.score(X_test,y_test))
    final_pred = model.predict(test)'''

    ##########################FOR LGBM USING RFECV#########################################
    print('[INFO]....Creating an LGBM model')
    print('[INFO]....Applying RFECV to select 150 features')

    model = lgb.LGBMClassifier()
    model = RFECV(estimator=model,
                  step=10,
                  min_features_to_select=150,
                  scoring='accuracy')
    model.fit(X_train, y_train)

    X_train = model.transform(X_train)
    X_test = model.transform(X_test)
    test = model.transform(test)

    print('[INFO]....After transformation train shape :', X_train.shape)

    model = lgb.LGBMClassifier()
    model.fit(X_train, y_train)
    print('score on validation data: ', model.score(X_test, y_test))
    final_pred = model.predict(test)

    ###########################FOR STACKING PURPOSE ############################################
    '''basemodel_1,basemodel_2,basemodel_3,meta_model = stacking_models(X_train,X_test,y_train,y_test)
    base_pred_test = np.column_stack((basemodel_1.predict_proba(test)[:,1],basemodel_2.predict_proba(test)[:,1],\
                                     basemodel_3.predict_proba(test)[:,1]))
    
    final_pred = meta_model.predict(base_pred_test)'''

    ###########################FOR NEURAL NETWORK PURPOSE#############################################
    #model = neural_net(X_train,y_train,X_train.shape[1])

    #pd.Series(dict(zip(X.columns.tolist(),model.feature_importances_))).sort_values(ascending=False).head(20).plot(kind='bar')
    return (final_pred)
Code Example #13
def train_classfier(X_train, y_train, X_test, y_test):
    svc = LinearSVC()
    clf = RFECV(svc, step=0.1, cv=7, n_jobs=-1)
    t = time.time()
    clf.fit(X_train, y_train)
    t2 = time.time()
    print(round(t2 - t, 2), 'Seconds to train SVC...')
    t = time.time()
    print('Test Accuracy of SVC = ', round(clf.score(X_test, y_test), 4))
    # Check how long scoring the test set takes
    print('time takes: ', time.time() - t)
    return clf
Code Example #14
class rfe_LBC(li_LBC):
    def fit(self, X, Y):
        params = self.get_params()
        model = li_LBC(**params)
        self.rfe = RFECV(model)
        self.rfe.fit(X, Y)
        return self  # follow the scikit-learn convention: fit returns self

    def predict(self, X):
        return self.rfe.predict(X)

    def score(self, X, Y):
        return self.rfe.score(X, Y)
Code Example #15
def recursive_feature_elimination_cv(X_train, y_train, X_test, y_test):
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")
    # classifications
    rfecv = RFECV(estimator=svc,
                  step=1,
                  cv=KFold(10),
                  scoring='accuracy',
                  n_jobs=-1)
    rfecv.fit(X_train, y_train)
    # Determine the accuracy of the SVC model on the test-data, get the used number of features and ranking of the importance of features
    accuracy = rfecv.score(X_test, y_test)
    RankFeatures = rfecv.ranking_
    Nfeatures = rfecv.n_features_
    return [rfecv, accuracy, Nfeatures, RankFeatures]
Code Example #16
def feature_selection_with_cv(features_values_temp, rows_temp, columns_temp, prediction_values_temp, kernel, threshold):

	#kernel: linear, poly, rbf, sigmoid, precomputed

	rows = 0
	while rows_temp > 0:
		rows = rows + 1
		rows_temp = rows_temp - 1

	columns = 0
	while columns_temp > 0:
		columns = columns + 1
		columns_temp = columns_temp - 1

	features_values = [x for x in features_values_temp]
	prediction_values = [y for y in prediction_values_temp]



	rotated = convert_list_to_matrix(features_values, rows, columns)
	scores = np.array(prediction_values)

	threshold = float(threshold)

	estimator = SVR(kernel=kernel) # try to change to the model for which the test is gonna run (lasso, ridge, etc.)

	###############START: PLAYING AROUND WITH RECURSIVE FEATURE WITH CROSS VALIDATION FUNCTION.#####################
	#####Seems to be a bit different. RFE (without cross validation) allows us to choose a number of features we're ###
	#####looking for. It seems that cross valdiation chooses the optimal number? So no threshold? Not positive.#######



	selector = RFECV(estimator, step=1, cv=5)
	selector = selector.fit(rotated, scores)

	print(selector.support_)
	features_used = [i + 1 for i, x in enumerate(selector.support_) if x]  # i+1 b/c matlab starts indexing from 1

	print(features_used)

	features_used = []
	threshold = selector.score(rotated, scores)  # perhaps this "optimal # of features" score could be used as the RFE threshold value.
	print("threshold: ")
	print(threshold)
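On the question raised in the comments above: RFE asks the caller for an explicit n_features_to_select, while RFECV picks the number of features by cross-validation, so no threshold is required. A minimal sketch on synthetic data (not from the original project):

from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVR

X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
estimator = SVR(kernel="linear")

# RFE: the caller fixes how many features survive.
rfe = RFE(estimator, n_features_to_select=5, step=1).fit(X, y)

# RFECV: cross-validation chooses the number of features automatically.
rfecv = RFECV(estimator, step=1, cv=5).fit(X, y)

print(rfe.n_features_, rfecv.n_features_)  # 5 vs. the CV-chosen count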
Code Example #17
File: stacking_boost.py Project: tiffen/Stacking
 def select_feature_wrapping(self, estimator, X, y, scoring):
     estimator_name = (self.get_default_params_and_name(estimator))[0]
     print("using recursive feature elimination to tune features: " +
           estimator_name)
     selector = RFECV(estimator, step=1, cv=3, scoring=scoring, verbose=2)
     selector = selector.fit(X, y)
     sn = selector.n_features_
     sc = selector.score(X, y)
     sr = selector.ranking_
     print("features number and score:", sn, sc)
     print("selected features ranking:", sr)
     with open("tf_log.csv", 'a', newline='') as f:
         writer = csv.writer(f)
         str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
         writer.writerow(
             ["feature selection with rfecv ", estimator_name, str_time])
         writer.writerow([
             "feature selection score: ", sc, "selected feature number:",
             sn, "feature ranking:"
         ])
         writer.writerow(sr)
     return {estimator_name: selector}
Code Example #18
File: poi_id.py Project: lm-bsl/DAND_Intro2ML
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFECV
print "RandomForestClassifier "
# RFECV:  Select the algorithm to train with:
clf_Ranking = RFECV(GradientBoostingClassifier(random_state=0,
                                               learning_rate=0.05,
                                               max_depth=1),
                    scoring='accuracy',
                    n_jobs=-1)
# RFECV: Fit and transform the RFECV function
clf_Ranking.fit_transform(features_train, labels_train)

print(clf_Ranking.score(features_train, labels_train))
print(clf_Ranking.ranking_)
# result of feature selection : [ 1 13  4 14  1 12 11  8  1  9  5  6  1  2 10  7  3  1]
# [1 4 5 1 1 1 1 1 3 1 1 1 6 2 1 1 1 1]
# [14  5  1 11  1 10  4  1  1  1  6  3  2  9  8 12 13  7  1]
#print scores

# GBC : [13 12 11 10  3  1  1  9  1  1  1  8  1  7  6  2  4  5  1  1]

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# SECTION 4:  Classifier Selection
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
Code Example #19
data_USA['target'] = np.where(condition, 0 , 1)
data_USA_target = data_USA['target']
data_USA.drop(['num','id','target'],axis = 1, inplace = True)
data_USA = pd.get_dummies(data_USA, columns= ['cp','restecg','slope','thal','loc'])
data_std = Standardize(data_USA)
data_std['target'] = data_USA_target
print("Data preprocessed...")
data = data_std.values
train_x, test_x, train_y, test_y = train_test_split(data[:, 0:-1], data[:,-1],train_size=0.75)
names = list(data_USA.columns.values)
print("Executing Recursive Feature Elimination in SVM...")
svc = SVC(kernel="linear", C=5)
rfecv  = RFECV(estimator=svc, step=1, cv=StratifiedKFold(10),
              scoring='accuracy')
rfecv.fit(train_x, train_y)
Training_score = rfecv.score(train_x, train_y)
predicted= rfecv.predict(test_x)
accuracy = accuracy_score(test_y, predicted)
print("The support array \n",rfecv.support_)
print("The ranking array \n",rfecv.ranking_)
print(sorted(zip(map(lambda x: round(x, 4), rfecv.ranking_), names)))
print("Training Accuracy is ", Training_score)
print("Test Accuracy is ", accuracy)
print("The Cross-validation score :" ,max(rfecv.grid_scores_))
print("Optimal number of features : {}" .format(rfecv.n_features_))
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
Code Example #20
File: Main.py Project: rattlesnailrick/MM-Tracking
def predictAndPlot(data, header, features, name):
    print "\n%s" % name

    # First reduce the data to relevant features.
    features_plus_date = np.hstack((0, features))
    analyzed_data = data[:, features_plus_date]

    # Remove rows with missing data.
    for i in range(len(analyzed_data[0])):
        analyzed_data = analyzed_data[analyzed_data[:, i] != '']

    # If it is a retention feature, skip the last X entries.
    if "retention" in name:
        if "1d" in name:
            retention_feature_linesSkipped = 3
        elif "3d" in name:
            retention_feature_linesSkipped = 7
        elif "7d" in name:
            retention_feature_linesSkipped = 15
        elif "14d" in name:
            retention_feature_linesSkipped = 29
        elif "28d" in name:
            retention_feature_linesSkipped = 57
        else:
            retention_feature_linesSkipped = 0
        analyzed_data = analyzed_data[:-retention_feature_linesSkipped, :]

        # The second-last line is # votes. If smaller than 50, skip this entry.
        # analyzed_data = analyzed_data[analyzed_data[:, -2].astype(float) >= min_daily_regs]

    # I added the date simply for plotting reasons. Just in case. Could be removed if not needed.
    dates = analyzed_data[:, 0]

    # Set best model and best score default values.
    best_model = ""
    best_score = -100

    # Iterate through all models to obtain the best parameters and features via cross validation
    for model_type in list_of_models:
        # Get training data X and y.
        X = analyzed_data[:, 1:-1].astype(float) # Ignore dates (first column) and "y" (last column)
        y = analyzed_data[:, -1].astype(float)

        model = define_model(model_type) # Set model parameters based on model_type

        # Perform differently depending on which model is used.
        # Random Forest has to be treated differently because it doesn't support RFECV.
        if model_type == "RF":
            to_be_used_threshold = "median"  # Default value. Will be overwritten.
            score = -100.

            # Loop through different thresholds. Use the one with the highest score.
            for model_threshold in ("10.*median", "3.*median", "1*median", "0.3*median", "0.1*median", "0.03*median"):
                try:
                    # Use only the "model_threshold" best features.
                    model.fit(X, y)
                    X_new = model.transform(X, threshold=model_threshold)
                    header_new = model.transform(header[features][:-1], threshold=model_threshold)

                    # Fit the model again with reduced features X_new and return out of bag score.
                    model.fit(X_new, y)
                    rf_score = model.oob_score_

                    # I try to keep the amount of features as small as possible.
                    # The rf_score of a model with more features needs to be 2% better to justify more params.
                    # In some cases the score is negative so it also needs to be better overall.
                    if (rf_score > score * 1.02) and (rf_score > score):
                        score = rf_score
                        to_be_used_threshold = model_threshold
                except:
                    # Just a debug output.
                    print "There was an error at model threshold: %s" % model_threshold

            print "Score is %2.3f with threshold: %s" % (score, to_be_used_threshold)
        elif model_type in ("ElasticCV", "Elastic", "linear", "LassoCV"):
            selector = RFECV(model)
            selector = selector.fit(X, y)
            header_new = header[features][:-1]
            score = selector.score(X, y)
            print "Score is %2.3f with model: %s" % (score, model_type)
        else:
            print "Something went wrong!"

        if score > best_score:
            best_score = score
            best_model = model_type

    print "Best score is %2.3f with model: %s" % (best_score, best_model)


    # Predict using the best model, parameters and features, obtained before.
    model_type = best_model
    model = define_model(model_type)

    if model_type == "RF":
        # In some rare cases the model does not work, because all features were discarded.
        # Therefore try to do it again without a threshold, that should always work (model_threshold).
        try:
            model.fit(X, y)
            X_new = model.transform(X, threshold = to_be_used_threshold)
            header_new = model.transform(header[features][:-1], threshold=to_be_used_threshold)

            model.fit(X_new, y)
            prediction = model.predict(X_new)
            score = model.oob_score_
        except:
            print "Fitting the model didn't work! The prediction might be sub-optimal. \nThreshold: %s" % model_threshold
            model.fit(X, y)
            prediction = model.predict(X)
            #score = model.oob_score_
            score = 0
    elif model_type in ("ElasticCV", "Elastic", "linear", "LassoCV"):
        selector = RFECV(model)
        selector = selector.fit(X, y)
        header_new = header[features][:-1]
        prediction = selector.predict(X)
        score = selector.score(X, y)
    else:
        print "lol!"

    # Now derive the importances respectively feature coefficients.
    try:
        # This only works with "RF"
        importances = model.feature_importances_
        importances_list = np.vstack((importances, header_new))
        importances_list = np.transpose(importances_list)
        importances_list = importances_list[importances_list[:, 0].astype(float).argsort()][::-1]
    except:
        # This should work with all other models.
        try:
            X_new = selector.transform(X)
            header_new = selector.transform(header_new)
            model.fit(X_new, y)
            med_value = np.median(X_new, axis=0)
            med_value[med_value == 0] = np.mean(X_new, axis=0)[med_value == 0]
            importances = model.coef_ * np.median(X_new, axis=0)
            importances_list = np.vstack((importances, header_new))
            importances_list = np.transpose(importances_list)
            importances_list = importances_list[importances_list[:, 0].astype(float).argsort()][::1]
        except:
            # If the above doesn't work, just give a blank output.
            importances_list = np.zeros((10, 2))

    score = "%s, %s\nOOB Score = %2.2f" % (name, model_type, score)

    plot_predictionVsActual(prediction, y, score)
    return prediction, y, dates, importances_list
Code Example #21
# In[ ]:

######################################

# In[ ]:

# Automagic to find the optimal number of features for some algorithm

rfecv = RFECV(estimator=RandomForestClassifier(n_estimators=100),
              step=1,
              cv=StratifiedKFold(2),
              scoring='accuracy')
rfecv.fit(train_X, train_y)

print(rfecv.score(train_X, train_y), rfecv.score(val_X, val_y))
print("Optimal number of features : %d" % rfecv.n_features_)

#Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

# In[ ]:

clf = RandomForestClassifier(n_estimators=100)
##Train the selected model
clf.fit(train_X, train_y)
Code Example #22
def main():
    train = pd.read_csv("../input/train.csv")  #change filepath later
    test = pd.read_csv('../input/test.csv')  #change filepath later

    full = train.append(test, ignore_index=True)
    titanic = full[:891]

    del train, test
    print('Datasets:', 'full:', full.shape, 'titanic:', titanic.shape)

    #Peek at the data to see what it looks like
    titanic.head()
    titanic.describe()

    #Plot correlation heat map
    plot_correlation_map(titanic)

    #Plot distribution of Age of passengers
    plot_distribution(titanic, var='Age', target='Survived', row='Sex')

    #Plot distribution of Fare of passengers
    plot_distribution(titanic, var='Fare', target='Survived', row='Pclass')

    #Plot survival rate by embarked
    plot_categories(titanic, cat='Embarked', target='Survived')

    #Plot survival rate by Sex
    plot_categories(titanic, cat='Sex', target='Survived')

    #Plot survival rate by Pclass
    plot_categories(titanic, cat='Pclass', target='Survived')

    #Plot survival rate by SibSp
    plot_categories(titanic, cat='SibSp', target='Survived')

    #Plot survival rate by Parch
    plot_categories(titanic, cat='Parch', target='Survived')

    #Make sex into binary values 0 & 1 (needs to be numerical data)
    sex = pd.Series(np.where(full.Sex == 'male', 1, 0), name='Sex')

    #Create new variable for every unique embarked variable
    embarked = pd.get_dummies(full.Embarked, prefix='Embarked')
    embarked.head()

    #Create new variable for every unique value of Passenger Class
    pclass = pd.get_dummies(full.Pclass, prefix='Pclass')
    pclass.head()

    #Replace 2 missing embarkation values with the port closest to fare value

    imputed.head()

    #Extracting title
    title = pd.DataFrame()
    title['Title'] = full['Name'].map(
        lambda name: name.split(',')[1].split('.')[0].strip())
    Title_Dictionary = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Royalty",
        "Lady": "Royalty"
    }
    title['Title'] = title.Title.map(Title_Dictionary)
    title_raw = title['Title']  #keep the raw titles before they are one-hot encoded below
    title = pd.get_dummies(title.Title)
    #title pd.concat([title, titles_dummies], axis = 1)
    title.head()

    #Replace 1 missing fare value with the median
    full['Fare'] = full.Fare.fillna(full.Fare.median())

    #Fill missing values of Age
    #Option 1: fill with the average of Age
    #Age['Age'] = full.Age.fillna(full.Age.mean())

    #Option 2: use regression analysis to find likely value of age for missing values
    #will need to get rid of negative ages and other stupid values
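    # A hypothetical sketch of Option 2 (not in the original code): fit a
    # regressor on rows where Age is known and predict the missing ages, e.g.
    #   from sklearn.ensemble import RandomForestRegressor
    #   age_feats = ['Fare', 'Parch', 'SibSp', 'Pclass']
    #   known = full[full.Age.notnull()]
    #   reg = RandomForestRegressor().fit(known[age_feats], known['Age'])
    #   full.loc[full.Age.isnull(), 'Age'] = reg.predict(
    #       full.loc[full.Age.isnull(), age_feats])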

    #Option 3: fill missing ages with medians that are separated by group
    stuff = pd.DataFrame()
    stuff['Title'] = title_raw
    stuff['Sex'] = sex
    stuff['Pclass'] = full.Pclass
    stuff['Age'] = full.Age
    stuff['Age'] = stuff.groupby(
        ['Sex', 'Pclass',
         'Title'])['Age'].transform(lambda x: x.fillna(x.median()))

    Age = pd.DataFrame()
    Age['Age'] = stuff.Age

    del stuff

    #Fill in missing cabin values
    #Use regression from Pclass, ticket, embarkation port, etc...
    cabin = pd.DataFrame()

    #Create family size variable
    family = pd.DataFrame()

    family['FamilySize'] = full['Parch'] + full['SibSp'] + 1

    #Single, small or large family
    family['Family_Single'] = family['FamilySize'].map(lambda s: 1
                                                       if s == 1 else 0)
    family['Family_Small'] = family['FamilySize'].map(lambda s: 1
                                                      if 2 <= s <= 4 else 0)
    family['Family_Large'] = family['FamilySize'].map(lambda s: 1
                                                      if 5 <= s else 0)

    family.head()

    #Create a wealth variable
    wealth = pd.DataFrame()
    money = pd.DataFrame()
    money['Pclass'] = full['Pclass']
    money['Title'] = title['Title']
    money['Fare'] = full['Fare']
    cabin['Cabin'] = full['Cabin']

    wealth['Social_Class']

    #Create Functions to define if Poor, Middle Class or Rich
    wealth['Poor'] = wealth['Social_Class'].map(determine_Poor(money))
    wealth['Middle_Class'] = wealth['Social_Class'].map(
        determine_Middle(money))
    wealth['Rich'] = wealth['Social_Class'].map(determine_Rich(money))

    full_X = pd.concat([Age, embarked, cabin, sex, wealth, family], axis=1)
    full_X.head()

    #Create all datasets necessary to test models
    train_valid_X = full_X[0:891]
    train_valid_Y = titanic.Survived
    test_X = full_X[891:]
    train_X, valid_X, train_Y, valid_Y = train_test_split(train_valid_X,
                                                          train_valid_Y,
                                                          train_size=0.7)
    print(full_X.shape, train_X.shape, valid_X.shape, train_Y.shape,
          valid_Y.shape, test_X.shape)

    plot_variable_importance(train_X, train_Y)

    #Run several different models
    model1 = RandomForestClassifier(n_estimators=100)
    model2 = SVC()
    model3 = GradientBoostingClassifier()
    model4 = GaussianNB()
    model5 = LogisticRegression()

    model1.fit(train_X, train_Y)
    model2.fit(train_X, train_Y)
    model3.fit(train_X, train_Y)
    model4.fit(train_X, train_Y)
    model5.fit(train_X, train_Y)

    train_score1 = model1.score(train_X, train_Y)
    train_score2 = model2.score(train_X, train_Y)
    train_score3 = model3.score(train_X, train_Y)
    train_score4 = model4.score(train_X, train_Y)
    train_score5 = model5.score(train_X, train_Y)

    valid_score1 = model1.score(valid_X, valid_Y)
    valid_score2 = model2.score(valid_X, valid_Y)
    valid_score3 = model3.score(valid_X, valid_Y)
    valid_score4 = model4.score(valid_X, valid_Y)
    valid_score5 = model5.score(valid_X, valid_Y)

    #Print out score comparisons
    print("Train Data Score: Validation Data Score:")
    print(train_score1, valid_score1)
    print(train_score2, valid_score2)
    print(train_score3, valid_score3)
    print(train_score4, valid_score4)
    print(train_score5, valid_score5)

    #Hopefully find the Optimal Features for the model
    plot_model_var_imp(model1, train_X, train_Y)
    rfecv = RFECV(estimator=model1,
                  step=1,
                  cv=StratifiedKFold(2),
                  scoring='accuracy')
    rfecv.fit(train_X, train_Y)
    print(rfecv.score(train_X, train_Y), rfecv.score(valid_X, valid_Y))
    print("Optimal number of features: %d" % refecv.n_features_)

    #Plot number of features vs. cross validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classification")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores)
    plt.show()
Code Example #23
                        rownames=['True'],
                        colnames=['Predicted'],
                        margins=True)
print("confusion matrix")
print(confu_mat)
print("parameters")
statLogitModel = sm.Logit(train_target, pured_data).fit_regularized()
print(statLogitModel.params)
print("P-values")
scores, pvalues = chi2(pured_data, train_target)
for i in range(len(pvalues)):
    print(pured_data.columns[i], pvalues[i])
plt.figure(figsize=(16, 9))
plt.plot(falsePositiveRate, truePositiveRate)
plt.plot([0, 1], [0, 1], linestyle='dotted')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC and AUC')
plt.show()

test_data = feature.loc[feature['train_test'] == 0]
test_data.loc[(test_data['activities'] == 0) | (test_data['activities'] == 1),
              'activities'] = 0
test_data.loc[(test_data['activities'] != 0) & (test_data['activities'] != 1),
              'activities'] = 1
test_target = test_data.iloc[:, -1]
test_data = test_data[pured_data.columns]
accuracy = clf.score(test_data, test_target)

print("accuracy: %.2f" % accuracy)
Code Example #24
File: RFE_score.py Project: joinudit/DollarCrunchers
clf = linear_model.Ridge(alpha=30)
#clf=linear_model.LinearRegression()
rfecv = RFECV(estimator=clf, step=1, cv=KFold(5, shuffle=True))

rfecv.fit(X_train, y_train)

print("Optimal number of features : %d" % rfecv.n_features_)
for i in np.array(features)[rfecv.support_]:
    print(i)

pred_train = rfecv.predict(X_train) 
pred_test = rfecv.predict(X_test)

print ("Train score :%.2f" %rfecv.score(X_train,y_train))
print ("validation score :%.2f" %rfecv.score(X_test,y_test))

sorted_features=[]
sorted_scores=sorted(rfecv.ranking_)
for i in np.argsort(rfecv.ranking_):
    sorted_features.append(features[i])
sorted_scores=np.array(sorted_scores)
sorted_scores=6-sorted_scores
# plot feature scores
pos1 = range(len(sorted_features),len(sorted_features[:len(sorted_features) - 11]), -1) 
pos2 = range(len(sorted_features[:len(sorted_features) - 11]),0, -1)  
pos =  range(len(sorted_features),0, -1)
barh(pos1, sorted_scores[:11] , align='center', color='green')
barh(pos2, sorted_scores[11:] , align='center', color='red')
yticks(pos, sorted_features)
Code Example #25
X.head()
X.columns
XDF.columns
XDF.groupby('redirect')['n_count'].mean()
get_ipython().run_line_magic('matplotlib', '')
import seaborn as sns
X.var(0)
XDF.groupby('related_page')['n_count'].mean()
XX = X[[c for c in X if not c.startswith('M_')]]
XX = XX[[c for c in XX if not c.startswith('S_')]]
XX.columns
XX.var(0).plot(kind='bar')
XX.drop(['lifetime'], axis=1).var(0).plot(kind='bar')
rfecv.fit(XX, y)
rfecv.grid_scores_.max()
rfecv.score(XX, y)
lasso
lasso.fit(XX, y)
lasso.score(XX, y)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
XX.head()
get_ipython().run_line_magic('pinfo', 'SelectKBest')
XX.shape
for i in range(2, 20):
    best = SelectKBest(f_regression, k=i)
    XXX = best.fit_transform(XX.values, y.values)
    lr = lm.LinearRegression().fit(XXX, y.values)
    print(i, lr.score(XXX, y.values))

for i in range(2, 20):
Code Example #26
# print test_X.shape, test_Y.shape

logistic_reg = LogisticRegression()
logistic_reg.fit(train_X, train_Y)
print(logistic_reg.score(test_X_1, test_Y_1))
# test_Y = logistic_reg.predict(test_X)

# result.to_csv('result.csv', encoding='utf-8', index=False)
Svc = SVC()
Svc.fit(train_X, train_Y)
print(Svc.score(test_X_1, test_Y_1))
# test_Y = Svc.predict(test_X)

model = RandomForestClassifier(n_estimators=100)
model.fit(train_X, train_Y)
print(model.score(test_X_1, test_Y_1))
# test_Y = model.predict(test_X)

rfecv = RFECV(estimator=model,
              step=1,
              cv=StratifiedKFold(2),
              scoring='accuracy')
rfecv.fit(train_X, train_Y)
print(rfecv.score(test_X_1, test_Y_1))
test_Y = rfecv.predict(test_X)

passenger_id = full[891:].PassengerId
test = pd.DataFrame({'PassengerId': passenger_id, 'Survived': test_Y})
print(test.shape)
test.to_csv('pred.csv', index=False)
Code Example #27
File: PipeTasks.py Project: Sandy4321/ProFET
    if (FeatSelection_RFE or FeatSelection_RFECV) == True:
        'RFE + - best feats'
        'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
        svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
        # svc = LogisticRegression(class_weight='auto')#,C=1)
        if FeatSelection_RFECV==True:
            rfecv = RFECV(estimator=svc, step=0.1,
                         cv=StratifiedShuffleSplit(y,n_iter=7,test_size=0.33),
                         scoring='f1',verbose=0)
            # " scoring='roc_auc','recall','f1'..."
        else:
            rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.1)
        rfecv.fit(X, y)
        if FeatSelection_RFECV==True:
            print("RFEcv selected %d number of Optimal features : " % (rfecv.n_features_))
        print("RFE (%d Features) scorer : \n" % (rfecv.n_features_),rfecv.score(X, y) )
        print("RFE selected feature names:")
        rfe_featnames = featureNames[rfecv.get_support()]
        featureNames = featureNames[rfecv.get_support()]
        print (rfe_featnames)
        X_RFE = rfecv.fit_transform(X, y)
        print(X_RFE.shape,"X_RFE \n")

        'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
        GetRFEPerf=False


    print("\n X: \n")
    ModelParam_GridSearch(X,y,cv=4)

    if GetRFEPerf==True:
Code Example #28
File: OutPutRes.py Project: MichaelDoron/ProFET
def GetAllPerf (filePaths=None):
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']

    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)


    resDict = pd.DataFrame(index=fileNames,
        columns=['Accuracy','Accuracy_SD',
        'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
        'LargestClassPercent','Classes',
        # 'TopRFE-Features','Best (f1) Model parameters',
         '# Classes',
         'Array-Acc-Scores' ,'Array-f1-Scores'
         ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])


    #redDict holds results for each file/class, for saving to output-file

    i=-1
    for filePath in filePaths:
        i +=1

        'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/'
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i]) #Str added now 14.1

        print("fileName: %s" %(fileName))
        "resDict['Name']= fileName"

        # filePath = str(argv[1])
        # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels
        X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels
        print(X.shape,"= (samples, features)")
        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)

        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)

        KFilt=None
        KFilt=350  #This is just temporary for the outputs - saves computation time. Barely filters compared to the model itself.

        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]

        print("X reduced to K best features: ",X.shape)


        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False

        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
             sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)

        elif FeatSelection_SVM == True:
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)
        # X=X_L1

        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''

        'EG - graph best features; feature selection using RF, ensemble classifiers..'
        'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb'

        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False

        if (FeatSelection_RFE or FeatSelection_RFECV) == True:
            'RFE + - best feats'
            'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
            svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)

            if FeatSelection_RFECV==True:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
                             # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3))
                             #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV==True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            X_RFE = rfecv.fit_transform(X, y)
            print("X_RFE",X_RFE.shape)

            resDict['TopRFE-Features'][fileName]=str(rfe_featnames)

            'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
        GetRFEPerf=False

        # print("lb_encoder.classes_",lb_encoder.classes_)
        'Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb '
        'Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/'
        'http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators'

        "http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html"
        print()

        "Make custom F1 scorer. May not have fixed problem!"
        from sklearn.metrics import make_scorer
        f1_scorer = make_scorer(metrics.f1_score,
                     greater_is_better=True, average="micro") #Maybe another metric? May NOT be fixed!?. #weighted, micro, macro, none

        # print("Dummy classifiers output:")

        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted'))

        dummy_freq_f1_weighted = '{:.3}'.format(f1_scorer(y, y_dummyPred))
        #Get from ALL classes f1..
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        # print("Dummy, most frequent acc:",dummy_freq_acc)

        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()

        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
##        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1 dummy_freq_f1_mean
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean

        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2

        "We can get seperately the best model for Acc, and the best for f1!"
        "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')
        "Temporary workaround until next SKlearn update of F1 metric:"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)

        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc)

        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)

        if GetRFEPerf==True:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        "Modified to get 2 estimators"
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))

        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)

        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst

        print()
        # print(fileName," Done")

    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep=',')
Code Example #29
File: models.py Project: sidharthgurbani/RCC_Proj
    def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
        # Store the original feature list and normalize the data
        list_temp = self.feature_list
        scaler = StandardScaler()
        X_minmax = scaler.fit_transform(X)
        self.X_minmax = copy.deepcopy(X_minmax)
        self.scores = []

        # Determine the number of folds to be used.
        kfold = StratifiedKFold(n_splits=5, shuffle=True)

        for outer in range(self.outer_loop):
            print("\n--------This is outer loop {}---------\n".format(outer + 1))
            # Run the outer loop from here
            for i, (train_o, test_o) in enumerate(kfold.split(X_minmax, y)):
                self.loop_indices.append((train_o, test_o))
                print("This is set {}".format(i + 1))
                X_train_o = X_minmax[train_o]
                y_train_o = y[train_o]
                X_test_o = X_minmax[test_o]
                y_test_o = y[test_o]
                X_train_transformed = copy.deepcopy(X_train_o)
                X_test_transformed = copy.deepcopy(X_test_o)

                # Run the inner loop from here
                for inner in range(self.inner_loop):
                    # If the number of features is very high (>100), we set the minimum number of features needed to be 100.
                    # If the number of features is moderate (15-100), we set the minimum number of features to be 10
                    # less than already present
                    n_feat = min(100, X_train_transformed.shape[1] - 10)

                    # If the number of features is low (<15), then we want it to select at least 5 features to continue the loop
                    n_feat = max(10, n_feat)
                    list_temp_prev = list_temp
                    print("\n\t--------This is inner loop {}---------\n".format(inner + 1))
                    rfecv = RFECV(estimator=self.clf, step=1, min_features_to_select=n_feat, cv=kfold, scoring='accuracy')
                    # rfecv = xgb.XGBClassifier()

                    # Transform the datasets at each loop to keep track of reduced features
                    # rfecv.fit(X_train_transformed, y_train_o)
                    # X_train_transformed = rfecv.transform(X_train_transformed)
                    X_train_transformed = rfecv.fit_transform(X_train_transformed, y_train_o)
                    self.models.append(rfecv)
                    X_test_transformed = rfecv.transform(X_test_transformed)
                    X_minmax = rfecv.transform(X_minmax)
                    features = rfecv.n_features_
                    print("\tShape of transformed train dataset is: {}".format(X_train_transformed.shape))
                    print("\tOptimal no. of features are: {}".format(features))
                    ranking = rfecv.ranking_

                    # Update the feature list here
                    list_temp = self.updateFeatures(list_temp_prev, ranking)

                # This is just used to check the score after inner loop is finished as the test data was already transformed
                # to reduced features. Hence we inverse the transform to check the score
                X_temp = rfecv.inverse_transform(X_test_transformed)
                score = rfecv.score(X_temp, y_test_o)
                self.scores.append(score)
                print("Shape of transformed train dataset is: {}".format(X_train_transformed.shape))
                print("Shape of ranks is: {}\n\n".format(ranking.shape))

        # Print the average scores after finishing the outer loop and save the features in an excel file
        print("After outer loop CV, mean score is: {}".format(mean(self.scores)))
        self.list = list_temp_prev
        self.ranking = ranking
        print(X_train_transformed.shape)
        print(X_test_transformed.shape)
        self.X_transformed = np.vstack((X_train_transformed, X_test_transformed))

        return self
Code Example #30
File: temp.py Project: racharyaUC/TempProj
X_selected = X_perc.transform(X)

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfeLoR = RFE(LogisticRegression(solver='saga', max_iter=1000), 100)
#Sag model works well on large datasets but is sensitive to feature scaling. saga handles sparsity
rfeLoR.fit(X, Y)
rfeLoR.n_features_

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

m_RFERFC = RFECV(RandomForestClassifier(n_estimators=100), scoring='accuracy')
m_RFERFC.fit(X, Y)  # returns model
X_RFERFC = m_RFERFC.transform(X)  # reduce X to the selected features
m_RFERFC.score(X, Y)

from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
m_lasso = SelectFromModel(LassoCV())
m_lasso.fit(X, Y)
m_lasso.transform(X).shape
X_lasso = m_lasso.transform(X)
m_lasso.get_params()
mask = m_lasso.get_support()
print(mask)
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
X.columns[mask]
#Using CV helps reduce selection bias due to the observations in the training set

#X_test_selected = modelfit.transform(X_test)
Code Example #31
rfecv.fit(trainData, trainLabel)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plotting features with cross validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()


# After about an hour, the SVM model has been trained while optimizing the features in the dataset. Using only
# these features reduces training time, so only 373 features are used instead of the 561 in the input.


print('Accuracy of the SVM model on test data is ', rfecv.score(testData,testLabel) )
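
# That speed-up can be checked directly by refitting on the reduced columns.
# A minimal sketch, assuming the underlying estimator was a linear-kernel SVC;
# svc_small and the *_reduced names are illustrative:
from sklearn.svm import SVC

trainData_reduced = rfecv.transform(trainData)  # 561 -> 373 columns
testData_reduced = rfecv.transform(testData)
svc_small = SVC(kernel='linear').fit(trainData_reduced, trainLabel)
print('Reduced-feature accuracy:', svc_small.score(testData_reduced, testLabel))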
# Getting the best features
best_features = []
for ix, val in enumerate(rfecv.support_):
    if val:
        best_features.append(testData[:, ix])


# The above yields an accuracy of approximately 97%. The following helps with visualization.
from pandas.plotting import scatter_matrix
visualize = pd.DataFrame(np.asarray(best_features).T)
print(visualize.shape)
scatter_matrix(visualize.iloc[:,0:5], alpha=0.2, figsize=(6, 6), diagonal='kde')
Code example #32
0
df['SexN']=df['Sex']
df1['SexN']=df1['Sex']


enc=LabelEncoder()



df['SexN']=enc.fit_transform(df['Sex']) 
df1['SexN']=enc.fit_transform(df1['Sex'])



X_train=df[['Pclass','SibSp','Parch','Fare','AgeN','SexN']] 
y_train=df['Survived']
X_test=df1[['Pclass','SibSp','Parch','Fare','AgeN','SexN']]
X_test1=df1[['PassengerId','Pclass','SibSp','Parch','Fare','AgeN','SexN']]
svc=SVC(kernel='linear')
#svc=DecisionTreeClassifier(criterion='entropy')
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(n_splits=5), scoring='accuracy')
rfecv.fit(X_train, y_train)
predictions = rfecv.predict(X_test)
print(rfecv.score(X_train, y_train))
print("Optimal number of features : %d" % rfecv.n_features_)

finlist = zip(X_test1['PassengerId'], predictions)
with open("/Users/prakashchandraprasad/Desktop/datasets/Titanic/Decision_tree_titanic7.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["PassengerId", "Survived"])
    writer.writerows(finlist)
Code example #33
0
# #### Making the train/test data set

# In[37]:

X = train.iloc[:, :73]
Y = train.iloc[:, -1:]

# #### Cross-validate and calculate the train and test scores

# In[46]:

cv = KFold(n_splits=5, random_state=None, shuffle=True)
scores = []
# rfe is assumed to be an RFE/RFECV selector defined in an earlier notebook cell
for (train1, test1), i in zip(cv.split(X, Y), range(5)):
    rfe.fit(X.iloc[train1], Y.iloc[train1])
    train_score = rfe.score(X.iloc[train1], Y.iloc[train1])
    test_score = rfe.score(X.iloc[test1], Y.iloc[test1])
    scores.append((train_score, test_score))

pd.DataFrame(scores, columns=['Train', 'Test'])

# In[47]:

print('Optimal number of features:',
      rfe.n_features_)  # the optimal number of features found by RFE

# #### Plotting the ROC AUC graph to assess the model score

# In[48]:

import matplotlib.pyplot as plt
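
# The ROC cell itself is not included in this snippet; below is a hedged
# reconstruction of what it likely did, assuming binary labels and that rfe wraps
# an estimator exposing decision_function (fold_scores is an illustrative name).
from sklearn.metrics import roc_curve, auc

fold_scores = rfe.decision_function(X.iloc[test1])  # scores on the last CV fold from the loop above
fpr, tpr, _ = roc_curve(Y.iloc[test1].values.ravel(), fold_scores)
plt.plot(fpr, tpr, label='AUC = %.3f' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], linestyle='--')  # chance line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()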
# In[91]:

selector.n_features_

# Which features were retained?

# In[92]:

X_train.columns[selector.support_]

# Score of the underlying LinearSVC on the training set:

# In[93]:

selector.score(X_train, y_train)

# Hopefully there was not too much overfitting.
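
# A quick sanity check, assuming a held-out y_test matching X_test exists in this
# notebook: compare the held-out score; a large gap from the training score above
# would indicate overfitting. X_test still has the full feature set at this point,
# so the selector can score it directly.

selector.score(X_test, y_test)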

# Reduce our data to the retained features:

# In[94]:

X_train = X_train.loc[:, selector.support_]
X_test = X_test.loc[:, selector.support_]

# # 4 Predictive Modeling
# <a id='4'></a>

# In[95]:
Code example #35
0
def ExecuteRFECV(samples, y, featureNames, clusters, clusterNames, clf, kFolds,
                 nSplits, standardization, removedInfo, permutation,
                 nPermutation, currentDateTime, resultDir, debug, verbose):
    # Assumes the source file's module-level imports: numpy as np, pandas,
    # matplotlib.pyplot as plt, RFECV, StratifiedKFold, confusion_matrix,
    # train_test_split as tts, and the project's helpers (Standardize,
    # ApplyStandardization, Permute, ComputePermutationAvgDA, PlotPermHist,
    # plot_confusion_matrix).

    rfecv = RFECV(estimator=clf,
                  cv=StratifiedKFold(kFolds),
                  scoring='accuracy',
                  n_jobs=-1)

    # Create empty Pandas dataframe
    cvResults = pandas.DataFrame()
    decodingAccuracy = pandas.DataFrame()
    permResults = pandas.DataFrame()
    avg_perm_DA = []
    # Execute feature selection nSplits times
    for it in list(range(nSplits)):
        # Randomly create stratified train and test partitions (2/3 train, 1/3 test)
        xTrain, xTest, yTrain, yTest = tts(samples,
                                           y['Cluster'],
                                           test_size=0.33,
                                           stratify=y['Cluster'])
        # Data z-score standardization
        xTrainSet, zPrm = Standardize(xTrain, yTrain, standardization, debug)

        # "accuracy" is proportional to the number of correct classifications
        if verbose:
            print('  Fitting for split #{}'.format(it))
        rfecv.fit(xTrainSet, yTrain)

        # Append the dataframe with the new cross-validation results.
        cvResults['cv_Scores_' + str(it)] = rfecv.grid_scores_
        cvResults['cv_Features_Rank_' + str(it)] = rfecv.ranking_

        if debug:
            print('cvResults for it %d' % it)
            print(cvResults)

        # Plot number of features VS. cross-validation scores
        fig_cv = plt.figure(dpi=300)
        plt.subplot(211)
        plt.title('Best performance = %.2f with %d features' % \
                  (max(rfecv.grid_scores_), rfecv.n_features_))
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross-validation score %")
        plt.plot(range(len(rfecv.grid_scores_)), rfecv.grid_scores_)

        # subplot selected features
        plt.subplot(212)
        plt.title('Features selection')
        plt.xlabel("Features")
        plt.xticks(range(len(rfecv.grid_scores_)),
                   featureNames,
                   rotation='vertical')
        plt.ylabel("Selection")
        plt.scatter(range(len(rfecv.grid_scores_)), rfecv.support_)
        plt.grid()
        plt.tight_layout()
        savedPlotName = resultDir+'RFECV'+'_CV_DA_'+clusters+'_'+str(it+1)+ \
                        '_'+str(nSplits)+'.png'
        plt.savefig(savedPlotName, bbox_inches='tight')
        plt.close(fig_cv)

        if verbose:
            print('\tComplete')

# ********************************** TEST *************************************
# standardize test set using trainset standardization parameters
        xTestSet = ApplyStandardization(xTest, zPrm)

        if verbose:
            print('  Testing')
        # use score() function to calculate DAs
        if debug:
            print('scores' + str(it))
            print(rfecv.score(xTestSet, yTest))
        decodingAccuracy['test_DA_' + str(it)] = [rfecv.score(xTestSet, yTest)]

        # plot confusion matrix
        y_pred = rfecv.predict(xTestSet)
        cm = confusion_matrix(yTest, y_pred)
        fig_CM = plt.figure(dpi=300)
        plot_confusion_matrix(cm, clusterNames, normalize=True, precision=2)
        savedPlotName = resultDir+'RFECV'+'_'+clusters+'_ConfusionMatrix_'+ \
                        str(it+1)+'_'+str(nSplits)+'.png'
        plt.savefig(savedPlotName, bbox_inches='tight')
        plt.close(fig_CM)

        if it == nSplits - 1:
            print('\nTest Decoding accuracy')
            decodingAccuracy['test_Avg_DA'] = decodingAccuracy.iloc[0][:].mean()
            for i in list(range(len(decodingAccuracy.iloc[0]))):
                print('\t'+str(decodingAccuracy.iloc[0].index[i])+'\t'+ \
                      str(decodingAccuracy.iloc[0][i]))

            # formatting test results to save in the Excel file
            fTest = []
            for i in range(len(list(decodingAccuracy)) - 1):
                fTest.append(decodingAccuracy.iloc[0][i])

            testDA = pandas.DataFrame()
            testDA['test_DA_per_epoch'] = fTest
            tmp = pandas.DataFrame(data=[np.mean(testDA['test_DA_per_epoch'])],
                                   columns=['avg_test_DA'])

            testDA = pandas.concat([testDA, tmp], axis=1)
            print('\tComplete\n')

# ****************************** Permutation **********************************
        if permutation:
            if verbose:
                print('  Permutting')
            # Create subset based on selected best features
            xTrain_rfecv = rfecv.transform(xTrainSet)
            xTest_rfecv = rfecv.transform(xTestSet)
            permResults['permutation_DA_' + str(it)] = Permute(clusters,
                                                               xTrain_rfecv,
                                                               xTest_rfecv,
                                                               yTrain,
                                                               yTest,
                                                               nPermutation,
                                                               debug_flag=0)
            avg_perm_DA.append(
                np.mean(permResults['permutation_DA_' + str(it)]))

#            savedHistName = resultDir+'/Permutation_hist_'+str(it)+'.png'
#            PlotPermHist(permResults,testDA.iloc[0][1],
#                         currentDateTime,savedHistName)
    if permutation:
        # compute permutation DA average and keep results in a dataframe
        epochedPermDA = ComputePermutationAvgDA(avg_perm_DA)

        print('Average permutation DA per train epoch')
        for i in epochedPermDA['Avg_Permutation_DA_per_epoch']:
            print('\t' + str(i))

        print('\nAverage permutation DA : {}'.format(
            epochedPermDA['Global_Permutation_DA'][0]))

        savedHistName = resultDir + 'Average_Permutation_hist.png'
        PlotPermHist(permResults, testDA.iloc[0][1], currentDateTime,
                     savedHistName)
        # formatting permutation results to save in the Excel file
        permResults = pandas.concat([permResults, epochedPermDA], axis=1)


# ************************ Select best of best features ***********************
    ranks = cvResults.iloc[:, 1::2]
    if debug:
        print(ranks)

    bestFeatures = ranks[(ranks == 1).all(1)].index.tolist()
    print('\nBest features :')
    tmp = []
    for i in bestFeatures:
        tmp.append(featureNames[i])
        print('\t' + featureNames[i])
    bestFeaturesNames = pandas.DataFrame(data=tmp, columns=['Best_Features'])

    # Count how many times each feature was selected as best
    bestFeaturesHist = ranks[(ranks == 1)].sum(axis=1)
    bestFeaturesHist = bestFeaturesHist.rename('Best_Features_Hist')

    # Build the histogram data structure to save in the Excel file
    hist = pandas.DataFrame(data=featureNames, columns=['Features_Name'])
    hist['Occurence_Best'] = bestFeaturesHist
    nbSubject = pandas.DataFrame(data=[len(samples)],
                                 columns=['Number_Of_Subjects'])
    nbFeature = pandas.DataFrame(data=[samples.shape[1]],
                                 columns=['Number_Of_Features'])
    dataSize = pandas.concat([nbSubject, nbFeature], axis=1)

    # Get the best test DA and corresponding training set of features
    bestDA = testDA['test_DA_per_epoch'].max()
    bestDAepoch = testDA['test_DA_per_epoch'].idxmax()
    colName = 'cv_Features_Rank_' + str(bestDAepoch)
    bTrainFeat = cvResults[colName][(cvResults[colName] == 1)].index.tolist()
    tmp = []
    tmp.append(bestDA)
    for i in bTrainFeat:
        tmp.append(featureNames[i])
    bTrainFeatName = pandas.DataFrame(data=tmp,
                                      columns=['Best_Train_Features_Set'])

    # Build the results structure to be saved in an Excel file
    excelResults = pandas.concat([
        cvResults, testDA, permResults, hist, bestFeaturesNames, removedInfo,
        dataSize, bTrainFeatName
    ], axis=1)
    # excelResults.to_excel(resultDir+'results_RFECV_'+currentDateTime+'.xlsx',
    #                       sheet_name=xlSheetName)

    return excelResults
Code example #36
0
print(featureNames)
print(model.feature_importances_)

importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

print('Variable importance obtained')

# estimator for rfecv
est = linear_model.LogisticRegression()

# fit rfecv
rfecv = RFECV(estimator=est,
              step=1,
              cv=StratifiedKFold(n_splits=10),
              scoring='accuracy')
rfecv.fit(X, y)

print(rfecv.support_)
print(rfecv.ranking_)
print('score = ', rfecv.score(X, y))

print("Optimal number of features : %d" % rfecv.n_features_)
Code example #37
0
        if FeatSelection_RFECV == True:
            # assumed opening: the first lines of this call were cut off in the
            # source snippet; estimator=svc and step are inferred from the RFE
            # branch below
            rfecv = RFECV(estimator=svc,
                          step=0.1,
                          cv=StratifiedShuffleSplit(n_splits=7,
                                                    test_size=0.33),
                          scoring='f1',
                          verbose=0)
            # scoring alternatives: 'roc_auc', 'recall', 'f1', ...
        else:
            rfecv = RFE(estimator=svc,
                        n_features_to_select=RFE_FeatsToKeep,
                        step=0.1)
        rfecv.fit(X, y)
        if FeatSelection_RFECV == True:
            print("RFECV selected %d optimal features:" %
                  (rfecv.n_features_))
        print("RFE (%d Features) scorer : \n" % (rfecv.n_features_),
              rfecv.score(X, y))
        print("RFE selected feature names:")
        featureNames = featureNames[rfecv.get_support()]
        rfe_featnames = featureNames[rfecv.get_support()]
        print(rfe_featnames)
        X_RFE = rfecv.fit_transform(X, y)
        print(X_RFE.shape, "X_RFE \n")

        # Set GetRFEPerf to True here or via the user if performance of the reduced set is wanted
        GetRFEPerf = False

    print("\n X: \n")
    ModelParam_GridSearch(X, y, cv=4)

    if GetRFEPerf:
        print("\n X-RFE: \n")
Code example #38
0
File: selectfeatures.py Project: chezhia/PySci
    Rclf.fit(Xtrain, ytrain)
    print("Residual sum of squares: %.2f"
          % np.mean((Rclf.predict(Xtest) - ytest) ** 2))
    print('Regularization chosen, alpha = %.2f' % Rclf.alpha_)
    print(' Coef values = ', Rclf.coef_)
    print('Variance score: %.2f' % Rclf.score(Xtest, ytest))


selector = RFECV(rf, step=1, cv=ShuffleSplit(n_splits=10, test_size=0.2))
selector = selector.fit(X, y)
print(selector.n_features_)
for i, j in enumerate(selector.support_):
    if j:
        print(features[i])

print('Variance score Train: %.2f' % selector.score(X, y))
print('Variance score Test: %.2f' % selector.score(Xtest, ytest))
# print('Coeff of Test: ', selector.coef_)
print('No of features selected by RFECV = %d' % sum(selector.support_))
plotfit(selector, Xtest, ytest)

# Learning Curve
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes= list(range(3,23,3))):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
Code example #39
0
# print(sub_title)


svc = svm.LinearSVC()

# clf = tree.DecisionTreeClassifier()

rfecv = RFECV(svc, step=1, cv=StratifiedKFold(n_splits=2), scoring='accuracy')
rfecv.fit(selected, target)

rf_support = rfecv.support_

# print(rf_support)

sub2_title = []
for i in range(len(rf_support)):
    if rf_support[i]:
        sub2_title.append(sub_title[i])
print(sub2_title)
# 
# print(array)
print(rfecv.score(selected,target))


print("Optimal number of features : %d" % rfecv.n_features_)
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()