Example #1
le_sex = preprocessing.LabelEncoder()
le_sex.fit(titanic_train['Sex'])
print(le_sex.classes_)
titanic_train['Sex'] = le_sex.transform(titanic_train['Sex'])

le_pclass = preprocessing.LabelEncoder()
le_pclass.fit(titanic_train['Pclass'])
print(le_pclass.classes_)
titanic_train['Pclass'] = le_pclass.transform(titanic_train['Pclass'])

features = ['Pclass', 'Parch' , 'SibSp', 'Age', 'Fare', 'Embarked', 'Sex']
X_train = titanic_train[features]
y_train = titanic_train['Survived']

dt_estimator = tree.DecisionTreeClassifier()
#'estimator' replaced 'base_estimator' in scikit-learn 1.2+
ada_estimator = ensemble.AdaBoostClassifier(estimator=dt_estimator)
ada_grid = {'n_estimators': [10, 50, 100, 200],
            'estimator__max_depth': [3, 4, 5, 6, 7],
            'learning_rate': [0.1, 0.5, 1]}
ada_grid_estimator = model_selection.GridSearchCV(ada_estimator, ada_grid, cv=10, return_train_score=True)
ada_grid_estimator.fit(X_train, y_train)

print(ada_grid_estimator.best_score_)
print(ada_grid_estimator.best_params_)
final_estimator = ada_grid_estimator.best_estimator_
final_estimator.score(X_train, y_train)
print(final_estimator.estimators_)

# try Adaboost for Knn model
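# A sketch of the idea above (names and values here are illustrative
# assumptions). AdaBoost reweights training samples, so its base estimator
# must accept sample_weight in fit(); KNeighborsClassifier does not, and
# scikit-learn raises an error for it. A weight-capable model such as SVC
# is shown as a stand-in.
from sklearn import svm

ada_svc_estimator = ensemble.AdaBoostClassifier(
    estimator=svm.SVC(probability=True))  # SVC.fit accepts sample_weight
ada_svc_grid = {'n_estimators': [10, 50], 'learning_rate': [0.5, 1.0]}
ada_svc_grid_estimator = model_selection.GridSearchCV(ada_svc_estimator,
                                                      ada_svc_grid, cv=10)
ada_svc_grid_estimator.fit(X_train, y_train)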
 
#read test data
titanic_test = pd.read_csv("Titanic_test.csv")
print(titanic_test.info())

titanic_test[imputable_cont_features] = cont_imputer.transform(titanic_test[imputable_cont_features])
Example #2
train_updtd, test_updtd = fileSplit(all_houses_onehot, train.shape[0])

y_train=train_updtd['SalePrice']
filterFeatures(train_updtd, ['SalePrice','log_sale_price'])
X_train=train_updtd
X_train.info()


def rmse(y_orig, y_pred):
    return math.sqrt(metrics.mean_squared_error(y_orig,y_pred))
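# Note: rmse is a loss (lower is better), so when wrapped with
# metrics.make_scorer it needs greater_is_better=False; otherwise
# GridSearchCV would select the candidate with the *largest* RMSE.
# scikit-learn negates such a score internally, so best_score_ is
# reported as a negative number.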

gbm_estimator = ensemble.GradientBoostingRegressor(random_state=2017)
gbm_grid = {'learning_rate':[0.1,0.3,0.5,0.7,0.9], 'n_estimators':[50,100],
            'max_features':[12,13,14,15,16,17,18,19,20]}
grid_gbm_estimator = model_selection.GridSearchCV(
    gbm_estimator, gbm_grid,
    scoring=metrics.make_scorer(rmse, greater_is_better=False),
    cv=10, n_jobs=1)
grid_gbm_estimator.fit(X_train, y_train)
print(grid_gbm_estimator.cv_results_)  #grid_scores_ was removed in scikit-learn 0.20
print(grid_gbm_estimator.best_params_)
print(grid_gbm_estimator.best_score_)
print(grid_gbm_estimator.score(X_train, y_train))
estimator = grid_gbm_estimator.best_estimator_

################## Final Predictions Preparation

#total_missing_test = test_updtd.isnull().sum()
#n_test = test.shape[0]
#to_delete_test = total_missing_test[(total_missing_test/n_test) > 0 ]
#missingDataFeaturestes_test = list(to_delete_test.index)
#test_updtd.info()
Example #3
                         axis=1,
                         inplace=False)
#See how many columns remain after the 3 additional columns, one-hot encoding, and dropping
titanic2.shape

X_train = titanic2[0:titanic_train.shape[0]]
X_train.shape
X_train.info()
y_train = titanic_train['Survived']

#Let's build the model
#If random_state is not set, the system may pick a different seed on each run, so accuracy can vary slightly between runs.
tree_estimator = tree.DecisionTreeClassifier(random_state=2017)
dt_grid = {'criterion': ['gini', 'entropy'], 'max_depth': list(range(3, 10))}
grid_tree_estimator = model_selection.GridSearchCV(tree_estimator,
                                                   dt_grid,
                                                   cv=10)
grid_tree_estimator.fit(X_train, y_train)
print(grid_tree_estimator.best_score_)  #Best score
print(grid_tree_estimator.best_params_)
print(grid_tree_estimator.score(X_train, y_train))

dt_grid2 = {'criterion': ['gini', 'entropy'], 'max_depth': list(range(6, 10))}
grid_tree_estimator2 = model_selection.GridSearchCV(tree_estimator,
                                                    dt_grid2,
                                                    cv=8)
grid_tree_estimator2.fit(X_train, y_train)
print(grid_tree_estimator2.best_score_)  #Best score
print(grid_tree_estimator2.best_params_)
print(grid_tree_estimator2.score(X_train, y_train))
ohe.fit(titanic_train[ohe_features])
print(ohe.categories_)  #n_values_ was removed in scikit-learn 0.24
tmp1 = ohe.transform(titanic_train[ohe_features]).toarray()

features = ['Age', 'Fare', 'Parch', 'SibSp', 'FamilySize']
tmp2 = titanic_train[features].values

X_train = np.concatenate((tmp1, tmp2), axis=1)
y_train = titanic_train['Survived']

#create an estimator
dt_estimator = tree.DecisionTreeClassifier(random_state=100)
dt_grid = {'max_depth': [3, 4, 5, 6, 7], 'criterion': ['entropy', 'gini']}
dt_grid_estimator = model_selection.GridSearchCV(dt_estimator,
                                                 dt_grid,
                                                 scoring='accuracy',
                                                 cv=10,
                                                 refit=True)
dt_grid_estimator.fit(X_train, y_train)

#explore the results of grid_search_cv estimator
print(dt_grid_estimator.cv_results_)
print(dt_grid_estimator.best_estimator_)
print(dt_grid_estimator.best_score_)
print(dt_grid_estimator.best_params_)

#visualize the final model built with the best parameters from the grid
best_dt_estimator = dt_grid_estimator.best_estimator_
print(best_dt_estimator.score(X_train, y_train))

#read test data
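#The example is cut off here. A minimal sketch of the usual continuation,
#assuming a hypothetical "test.csv" preprocessed with the same objects as
#the train data (the file name and column names are illustrative):
titanic_test = pd.read_csv("test.csv")
tmp1_test = ohe.transform(titanic_test[ohe_features]).toarray()
tmp2_test = titanic_test[features].values
X_test = np.concatenate((tmp1_test, tmp2_test), axis=1)
titanic_test['Survived'] = best_dt_estimator.predict(X_test)
titanic_test.to_csv("submission.csv",
                    columns=['PassengerId', 'Survived'],
                    index=False)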
Example #5
        'nthread': [-1]
    }
})

# param_grids.update({
#     'mlp':
#     {'solver':['lbfgs'], 'alpha':[1e-5], 'hidden_layer_sizes':[(15,)], 'random_state':[1] }
# })

#  models wrapped with a hyperparameter grid search
model_grids = {}
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
for name, param in param_grids.items():
    model_grids[name] = model_selection.GridSearchCV(models[name],
                                                     param,
                                                     n_jobs=-1,
                                                     cv=kfold,
                                                     verbose=1,
                                                     scoring='f1')
    # model_grids[name] = models[name]


def read_data():
    data1 = pd.read_csv(os.path.join(data_root, '基础数据.csv'),
                        encoding='GB2312')  # '基础数据' = base data
    data2 = pd.read_csv(os.path.join(data_root, '年数据.csv'),
                        encoding='GB2312')  # '年数据' = yearly data
    # print(data2)
    # reader3 = pd.read_table(
    #     os.path.join(data_root, '日数据.csv'),
    #     encoding='GB2312',
    #     sep=',',
    #     iterator=True)
    # chunks = []
voting_estimator = ensemble.VotingClassifier(estimators=[('dt', dt_estimator),
                                                         ('rf', rf_estimator),
                                                         ('ada', ada_estimator)
                                                         ],
                                             voting='soft',
                                             weights=[4, 4, 5])
voting_grid = {
    'dt__max_depth': [3, 5, 7],
    'rf__n_estimators': [20, 30],
    'rf__max_features': [7, 8],
    'rf__max_depth': [7, 8, 9],
    'ada__n_estimators': [50]
}
voting_grid_estimator = model_selection.GridSearchCV(voting_estimator,
                                                     voting_grid,
                                                     cv=10,
                                                     n_jobs=5)
voting_grid_estimator.fit(X_train, y_train)

print(voting_grid_estimator.cv_results_)  #grid_scores_ was removed in scikit-learn 0.20
print(voting_grid_estimator.best_score_)
print(voting_grid_estimator.best_params_)
print(voting_grid_estimator.score(X_train, y_train))

x_test = train3[train.shape[0]:]
x_test.shape

test['type'] = voting_grid_estimator.predict(x_test)

test.to_csv("Submission.csv", columns=['id', 'type'], index=False)
Example #7
          #svm.SVC(kernel='rbf', gamma=0.7),
          #svm.SVC(kernel='poly', degree=3)      
          ]
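# cached_pipe is built outside this snippet; a minimal sketch of such a
# pipeline, assuming PCA as the 'reduce_dim' step (the labels below suggest
# NMF was also tried) and a linear SVC as 'classify', with joblib caching
# fitted transformers between grid-search candidates. All names and values
# here are assumptions, not from the original:
from joblib import Memory
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
cached_pipe = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())],
                       memory=Memory(location='./cache', verbose=0))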
param_grid = [ 
    {   
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
        #'classify__gamma': gamma_range
    }  
]
reducer_labels = ['PCA', 'NMF']
classifier_labels = ['SVClinear']
#classifier_labels = ['SVClinear', 'SVCrbf', 'SVCpoly']
#classifier_labels = ['SVClinear', 'LinearSVC', 'SVCrbf', 'SVCpoly']

grid = model_selection.GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid)
grid.fit(X_train, y_train)
joblib.dump(grid, 'grid.set3.pkl')


grid = joblib.load('grid.set3.pkl')
y_predictions = grid.predict(X_test)

report = metrics.classification_report( y_test, y_predictions )
print(report)

print("Best parameters set found on development set:")
print()
print(grid.best_params_)

print("With a Best Score of:")
# I use a power-tuned Random Forest classifier, where the tuned parameter is 'n_estimators'.
# To do this, I create a vector "x" of candidate values for 'n_estimators'.

# In[ ]:

x = list(range(1, 101))

# I set the model and the model's parameters to optimize with GridSearchCV

# In[ ]:

model = RandomForestClassifier(oob_score=True)
parameters = {'n_estimators': x}
power_tuning = model_selection.GridSearchCV(model, parameters)
model.fit(data_train, label_train)

# In[ ]:

model_tuned = power_tuning.fit(data_train, label_train.Survived)

# In[ ]:

model_tuned.best_estimator_

# In[ ]:

print("The best parameter for 'n_estimator' is:",
      model_tuned.best_estimator_.n_estimators)
Example #9
os.chdir("E:/")

titanic_train = pd.read_csv("train.csv")

#EDA
titanic_train.shape
titanic_train.info()

titanic_train1 = pd.get_dummies(titanic_train, columns=['Pclass', 'Sex', 'Embarked'])
titanic_train1.shape
titanic_train1.info()
titanic_train1.head(6)

X_train = titanic_train1.drop(['PassengerId', 'Age', 'Cabin', 'Ticket', 'Name', 'Survived'], axis=1)
y_train = titanic_train['Survived']

#automate model tuning process. use grid search method
dt = tree.DecisionTreeClassifier()
param_grid = {'criterion':['entropy'],'max_depth':[3,4,5,6,7,8,9,10], 'min_samples_split':[7,8,9,10,11,12]}
dt_grid = model_selection.GridSearchCV(dt, param_grid, cv=10, n_jobs=5)
dt_grid.fit(X_train, y_train)
dt_grid.cv_results_  #grid_scores_ was removed in scikit-learn 0.20
final_model = dt_grid.best_estimator_
dt_grid.best_score_
dt_grid.score(X_train, y_train)


dot_data = io.StringIO() 
tree.export_graphviz(final_model, out_file = dot_data, feature_names = X_train.columns)
graph = pydot.graph_from_dot_data(dot_data.getvalue())[0] 
graph.write_pdf("decision-tree-tuned1.pdf")
Example #10
def basic_results(clf,
                  classes,
                  training_x,
                  training_y,
                  test_x,
                  test_y,
                  params,
                  clf_type=None,
                  dataset=None,
                  dataset_readable_name=None,
                  balanced_dataset=False,
                  best_params=None,
                  seed=55,
                  threads=1):
    logger.info("Computing basic results for {} ({} thread(s))".format(
        clf_type, threads))

    if clf_type is None or dataset is None:
        raise Exception('clf_type and dataset are required')
    if seed is not None:
        np.random.seed(seed)

    curr_scorer = scorer
    if not balanced_dataset:
        curr_scorer = f1_scorer

    if best_params:
        clf.fit(training_x, training_y)
        test_score = clf.score(test_x, test_y)
        cv = clf
    else:
        cv = ms.GridSearchCV(clf,
                             n_jobs=threads,
                             param_grid=params,
                             refit=True,
                             verbose=10,
                             cv=5,
                             scoring=curr_scorer)
        cv.fit(training_x, training_y)
        reg_table = pd.DataFrame(cv.cv_results_)
        reg_table.to_csv('{}/{}_{}_reg.csv'.format(OUTPUT_DIRECTORY, clf_type,
                                                   dataset),
                         index=False)
        test_score = cv.score(test_x, test_y)

        # TODO: Ensure this is an estimator that can handle this?
        best_estimator = cv.best_estimator_.fit(training_x, training_y)
        final_estimator = best_estimator._final_estimator
        grid_best_params = pd.DataFrame([final_estimator.get_params()])
        grid_best_params.to_csv('{}/{}_{}_best_params.csv'.format(
            OUTPUT_DIRECTORY, clf_type, dataset),
                                index=False)
        logger.info(" - Grid search complete")

        final_estimator.write_visualization('{}/images/{}_{}_LC'.format(
            OUTPUT_DIRECTORY, clf_type, dataset))

        test_y_predicted = cv.predict(test_x)
        cnf_matrix = confusion_matrix(test_y, test_y_predicted)
        np.set_printoptions(precision=2)
        plt = plot_confusion_matrix(cnf_matrix,
                                    classes,
                                    title='Confusion Matrix: {} - {}'.format(
                                        clf_type, dataset_readable_name))
        plt.savefig('{}/images/{}_{}_CM.png'.format(OUTPUT_DIRECTORY, clf_type,
                                                    dataset),
                    format='png',
                    dpi=150,
                    bbox_inches='tight')

        plt = plot_confusion_matrix(
            cnf_matrix,
            classes,
            normalize=True,
            title='Normalized Confusion Matrix: {} - {}'.format(
                clf_type, dataset_readable_name))
        plt.savefig('{}/images/{}_{}_NCM.png'.format(OUTPUT_DIRECTORY,
                                                     clf_type, dataset),
                    format='png',
                    dpi=150,
                    bbox_inches='tight')

        logger.info(" - Visualization complete")

        with open('{}/test results.csv'.format(OUTPUT_DIRECTORY), 'a') as f:
            ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
            f.write('"{}",{},{},{},"{}"\n'.format(ts, clf_type, dataset,
                                                  test_score, cv.best_params_))

    n = training_y.shape[0]

    train_sizes = np.append(np.linspace(0.05, 0.1, 20, endpoint=False),
                            np.linspace(0.1, 1, 20, endpoint=True))
    logger.info(" - n: {}, train_sizes: {}".format(n, train_sizes))
    train_sizes, train_scores, test_scores = ms.learning_curve(
        clf if best_params is not None else cv.best_estimator_,
        training_x,
        training_y,
        cv=5,
        train_sizes=train_sizes,
        verbose=10,
        scoring=curr_scorer,
        n_jobs=threads,
        random_state=seed)
    logger.info(" - n: {}, train_sizes: {}".format(n, train_sizes))
    curve_train_scores = pd.DataFrame(index=train_sizes, data=train_scores)
    curve_test_scores = pd.DataFrame(index=train_sizes, data=test_scores)

    curve_train_scores.to_csv('{}/{}_{}_LC_train.csv'.format(
        OUTPUT_DIRECTORY, clf_type, dataset))
    curve_test_scores.to_csv('{}/{}_{}_LC_test.csv'.format(
        OUTPUT_DIRECTORY, clf_type, dataset))
    plt = plot_learning_curve(
        'Learning Curve: {} - {}'.format(clf_type, dataset_readable_name),
        train_sizes, train_scores, test_scores)
    plt.savefig('{}/images/{}_{}_LC.png'.format(OUTPUT_DIRECTORY, clf_type,
                                                dataset),
                format='png',
                dpi=150)
    logger.info(" - Learning curve complete")

    return cv
Example #11
ddf1.sample(10)

X=ddf1[["SEX","p1","p2","p3","agecat"]]
y=ddf1["dpay"]
#####grid search
X.info()
y.info()
depth=list(range(3,11))
grid = {'max_depth': np.arange(3, 10),
        'criterion': ['gini', 'entropy'],
        'max_leaf_nodes': [5, 10, 20, 100],
        'min_samples_split': [2, 5, 10, 20]}

model=tree.DecisionTreeClassifier()
gmodel=model_selection.GridSearchCV(model,grid,scoring="recall")
gmodel.fit(X,y)
best = gmodel.best_estimator_

best.fit(X,y)
print(dict(zip(X.columns, best.feature_importances_)))



X=ddf1[["SEX","edu","mar","agecat","p1","p2","p3","p4","p5","p6"]]
y = ddf1["dpay"]

model = linear_model.LogisticRegression()
rfe = feature_selection.RFE(model, n_features_to_select=15)
fit = rfe.fit(X, y)
print("Num Features: %d" % fit.n_features_)
Example #12
def iteration_lc(clf,
                 training_x,
                 training_y,
                 test_x,
                 test_y,
                 params,
                 clf_type=None,
                 dataset=None,
                 dataset_readable_name=None,
                 balanced_dataset=False,
                 x_scale='linear',
                 seed=55,
                 threads=1):
    logger.info(
        "Building iteration learning curve for params {} ({} threads)".format(
            params, threads))

    if clf_type is None or dataset is None:
        raise Exception('clf_type and dataset are required')
    if seed is not None:
        np.random.seed(seed)

    curr_scorer = scorer
    acc_method = balanced_accuracy
    if not balanced_dataset:
        curr_scorer = f1_scorer
        acc_method = f1_accuracy

    cv = ms.GridSearchCV(clf,
                         n_jobs=threads,
                         param_grid=params,
                         refit=True,
                         verbose=10,
                         cv=5,
                         scoring=curr_scorer)
    cv.fit(training_x, training_y)
    reg_table = pd.DataFrame(cv.cv_results_)
    reg_table.to_csv('{}/ITER_base_{}_{}.csv'.format(OUTPUT_DIRECTORY,
                                                     clf_type, dataset),
                     index=False)
    d = defaultdict(list)
    name = list(params.keys())[0]
    for value in list(params.values())[0]:
        d['param_{}'.format(name)].append(value)
        clf.set_params(**{name: value})
        clf.fit(training_x, training_y)
        pred = clf.predict(training_x)
        d['train acc'].append(acc_method(training_y, pred))
        clf.fit(training_x, training_y)
        pred = clf.predict(test_x)
        d['test acc'].append(acc_method(test_y, pred))
        logger.info(' - {}'.format(value))
    d = pd.DataFrame(d)
    d.to_csv('{}/ITERtestSET_{}_{}.csv'.format(OUTPUT_DIRECTORY, clf_type,
                                               dataset),
             index=False)
    plt = plot_learning_curve('{} - {} ({})'.format(clf_type,
                                                    dataset_readable_name,
                                                    name),
                              d['param_{}'.format(name)],
                              d['train acc'],
                              d['test acc'],
                              multiple_runs=False,
                              x_scale=x_scale,
                              x_label='Value')
    plt.savefig('{}/images/{}_{}_ITER_LC.png'.format(OUTPUT_DIRECTORY,
                                                     clf_type, dataset),
                format='png',
                dpi=150)

    logger.info(" - Iteration learning curve complete")

    return cv
Example #13
    'feature': X_train.columns,
    'importance': rf.feature_importances_
})
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)
#To display the plot in Spyder: Tools --> Preferences --> IPython Console
#--> Graphics tab --> set the Graphics backend to Automatic
features.plot(kind='barh', figsize=(30, 30))

#threshold controls how many features are selected
#prefit indicates the model was already fit
X_train.shape
fs = feature_selection.SelectFromModel(rf, threshold='median', prefit=True)
X_train1 = fs.transform(X_train)
X_train1.shape
type(X_train1)

#build model using selected features
bagged_tree_estimator = ensemble.RandomForestClassifier(random_state=100,
                                                        oob_score=True)
bagged_tree_grid = {'n_estimators': list(range(10, 11, 10))}
grid_bagged_tree_estimator = model_selection.GridSearchCV(
    bagged_tree_estimator, bagged_tree_grid, cv=10)
grid_bagged_tree_estimator.fit(X_train1, y_train)

final_model = grid_bagged_tree_estimator.best_estimator_
#print(final_model.oob_score_)
print(grid_bagged_tree_estimator.best_score_)
print(grid_bagged_tree_estimator.score(X_train1, y_train))
#note: this may be an over-fitted model
Example #14
def basic_results(clf,
                  classes,
                  training_x,
                  training_y,
                  test_x,
                  test_y,
                  params,
                  clf_type=None,
                  dataset=None,
                  dataset_readable_name=None,
                  seed=55,
                  threads=1):
    print("Computing basic results ({} thread(s))".format(threads))

    if clf_type is None or dataset is None:
        raise Exception('clf_type and dataset are required')
    if seed is not None:
        np.random.seed(seed)

    cv = ms.GridSearchCV(clf,
                         n_jobs=threads,
                         param_grid=params,
                         refit=True,
                         verbose=10,
                         cv=5,
                         scoring=scorer)
    cv.fit(training_x, training_y)
    reg_table = pd.DataFrame(cv.cv_results_)
    reg_table.to_csv('{}/{}_{}_reg.csv'.format(OUTPUT_DIRECTORY, clf_type,
                                               dataset),
                     index=False)
    test_score = cv.score(test_x, test_y)

    # TODO: Ensure this is an estimator that can handle this?
    best_estimator = cv.best_estimator_.fit(training_x, training_y)
    final_estimator = best_estimator._final_estimator
    best_params = pd.DataFrame([final_estimator.get_params()])
    best_params.to_csv('{}/{}_{}_best_params.csv'.format(
        OUTPUT_DIRECTORY, clf_type, dataset),
                       index=False)
    print(" - Grid search complete")

    final_estimator.write_visualization('{}/images/{}_{}_LC'.format(
        OUTPUT_DIRECTORY, clf_type, dataset))

    test_y_predicted = cv.predict(test_x)
    cnf_matrix = confusion_matrix(test_y, test_y_predicted)
    np.set_printoptions(precision=2)
    plt = plot_confusion_matrix(cnf_matrix,
                                classes,
                                title='Confusion Matrix: {} - {}'.format(
                                    clf_type, dataset_readable_name))
    plt.savefig('{}/images/{}_{}_CM.png'.format(OUTPUT_DIRECTORY, clf_type,
                                                dataset),
                format='png',
                dpi=150,
                bbox_inches='tight')

    plt = plot_confusion_matrix(
        cnf_matrix,
        classes,
        normalize=True,
        title='Normalized Confusion Matrix: {} - {}'.format(
            clf_type, dataset_readable_name))
    plt.savefig('{}/images/{}_{}_NCM.png'.format(OUTPUT_DIRECTORY, clf_type,
                                                 dataset),
                format='png',
                dpi=150,
                bbox_inches='tight')

    print(" - Visualization complete")

    with open('{}/test results.csv'.format(OUTPUT_DIRECTORY), 'a') as f:
        f.write('{},{},{},{}\n'.format(clf_type, dataset, test_score,
                                       cv.best_params_))
    n = training_y.shape[0]
    # TODO: Is the range here dependent on the dataset?
    train_sizes = list(
        map(
            int,
            list(
                np.geomspace(max(n * 0.05, 50),
                             n * 0.79,
                             num=20,
                             endpoint=True))))
    train_sizes, train_scores, test_scores = ms.learning_curve(
        cv.best_estimator_,
        training_x,
        training_y,
        cv=5,
        train_sizes=train_sizes,
        verbose=10,
        scoring=scorer,
        n_jobs=threads)
    curve_train_scores = pd.DataFrame(index=train_sizes, data=train_scores)
    curve_test_scores = pd.DataFrame(index=train_sizes, data=test_scores)

    curve_train_scores.to_csv('{}/{}_{}_LC_train.csv'.format(
        OUTPUT_DIRECTORY, clf_type, dataset))
    curve_test_scores.to_csv('{}/{}_{}_LC_test.csv'.format(
        OUTPUT_DIRECTORY, clf_type, dataset))
    plt = plot_learning_curve(
        'Learning Curve: {} - {}'.format(clf_type, dataset_readable_name),
        train_sizes, train_scores, test_scores)
    plt.savefig('{}/images/{}_{}_LC.png'.format(OUTPUT_DIRECTORY, clf_type,
                                                dataset),
                format='png',
                dpi=150)
    print(" - Learning curve complete")

    return cv
            event = spec.flatten()
        data.append(event)
        target.append(datafile[key].attrs['label'])
data = np.array(data)
target = np.array(target)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    data, target, test_size=.3)

begin = time.time()
print('Training...')
spectrogram = int(sys.argv[3])
if grid_search:
    param_grid = {'C': [.1, 1, 10, 100], 'gamma': [1, .1, .01, .0001]}
    grid = model_selection.GridSearchCV(svm.SVC(),
                                        param_grid,
                                        refit=True,
                                        cv=5)  # 'iid' was removed in scikit-learn 0.24
    grid.fit(X_train, y_train)
    print(grid.best_estimator_)
else:
    clf = svm.SVC(kernel='linear')
    clf.fit(X_train, y_train)
end = time.time()
print('Training completed in {}'.format(end - begin))

if grid_search:
    y_pred = grid.predict(X_test)
else:
    y_pred = clf.predict(X_test)
Example #16
# initialize the MLP classifier
nn = neural_network.MLPClassifier(random_state=123)

# save the parameters to tune as a dictionary
params = {
    'hidden_layer_sizes': [(10, ), (20, ), (30, ), (40, ), (50, )],
    'activation': ['logistic', 'tanh', 'relu'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'max_iter': [500]
}

# initiate the tuning procedure, optimising on accuracy
tunenn = model_selection.GridSearchCV(estimator=nn,
                                      param_grid=params,
                                      scoring='accuracy')

# tune the model
tunenn.fit(X_train, y_train)

# extract the best score
tunenn.best_score_

# extract the best estimator
tunenn.best_estimator_

# extract the best parameters
tunenn.best_params_

# explicitly initiate the tuned model
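# a minimal sketch of that step, reusing the parameters found by the
# search above (tunenn.best_params_ includes max_iter from the grid):
best_nn = neural_network.MLPClassifier(random_state=123,
                                       **tunenn.best_params_)
best_nn.fit(X_train, y_train)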
Example #17
    ycols = ['class']
    xcols = list(set(df.columns) - set(ycols))
    X = df.loc[:, xcols].values
    y = np.ravel(df.loc[:, ycols].values)

    # specify cross-validation
    k = 10
    cvsplitter = sm.KFold(n_splits=k, shuffle=True, random_state=0)

    # array of hyperparameter values to test
    alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
    param_grid = {'alpha': alphas}

    # create the model and execute grid search
    model = sl.Ridge()
    search = sm.GridSearchCV(estimator=model, param_grid=param_grid)
    search.fit(X, y)
    print('Grid search score of best hyperparameter value: {0:.4f}'.format(
        search.best_score_))
    print('Grid search best hyperparameter value: {0:.4f}'.format(
        search.best_estimator_.alpha))
    print('Grid search best hyperparameter values:')
    for tpl in search.best_params_.items():
        print('    {0:<10}: {1:.6f}'.format(tpl[0], tpl[1]))
    print('')

    # use randomized search by selecting hyperparameter values randomly from a
    # uniform distribution 100 times
    param_grid = {'alpha': stats.uniform()}
    search = sm.RandomizedSearchCV(estimator=model,
                                   param_distributions=param_grid,
                                   n_iter=100)  # 100 draws, per the comment above
    search.fit(X, y)
Example #18
File: model.py Project: IvanFei/UBI
def model(X, Y):

    #    MLA = [
    #    # ensemble Model
    #    ensemble.AdaBoostRegressor(),
    #    ensemble.GradientBoostingRegressor(),
    #    ensemble.ExtraTreesRegressor(),
    #
    #    #GLM
    #    linear_model.SGDRegressor(),
    #
    #    #SVM
    #    svm.NuSVR(),
    #    svm.SVR(),
    #
    #    #xgboost
    #    XGBRegressor()
    #    ]

    # data split
    split = model_selection.ShuffleSplit(n_splits=10, random_state=0)

    grid_n_estimator = [50, 100, 300, 500, 800, 1000]
    grid_ratio = [.1, .25, .5, .75, 1.0]
    grid_learn = [.01, .03, .05, .1, .25]
    grid_max_depth = [2, 4, 6, 8, 10, None]
    #grid_criterion = ['gini', 'entropy']
    grid_seed = [0]
    n_jobs = 2

    vote_est = [
        #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
        ('ada', ensemble.AdaBoostRegressor()),
        ('bc', ensemble.BaggingRegressor(n_jobs=n_jobs)),
        ('gbc', ensemble.GradientBoostingRegressor()),
        ('rfc', ensemble.RandomForestRegressor(n_jobs=n_jobs)),
        ('etc', ensemble.ExtraTreesRegressor(n_jobs=n_jobs)),

        #SVM: http://scikit-learn.org/stable/modules/svm.html
        ('svc', svm.SVR()),

        #MLPRegressor
        ('mlp',
         neural_network.MLPRegressor(hidden_layer_sizes=(
             50,
             100,
             20,
         ),
                                     max_iter=1000)),

        #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        ('xgb', XGBRegressor())
    ]

    grid_param = [
        [{
            #AdaBoostRegressor - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
            'n_estimators': grid_n_estimator,  #default=50
            'learning_rate': grid_learn,  #default=1
            #'algorithm': ['SAMME', 'SAMME.R'],  #default='SAMME.R'
            'random_state': grid_seed
        }],
        [{
            #BaggingRegressor - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier
            'n_estimators': grid_n_estimator,  #default=10
            'max_samples': grid_ratio,  #default=1.0
            'random_state': grid_seed
        }],
        [{
            #GradientBoostingRegressor - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
            #'loss': ['deviance', 'exponential'], #default=’deviance’
            'learning_rate': [
                .05
            ],  #default=0.1 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
            'n_estimators': [
                300
            ],  #default=100 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
            #'criterion': ['friedman_mse', 'mse', 'mae'], #default=”friedman_mse”
            'max_depth': grid_max_depth,  #default=3   
            'random_state': grid_seed
        }],
        [{
            #RandomForestRegressor - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
            'n_estimators': grid_n_estimator,  #default=10
            #'criterion': grid_criterion, #default=”gini”
            'max_depth': grid_max_depth,  #default=None
            'oob_score': [
                True
            ],  #default=False -- 12/31/17 set to reduce runtime -- The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.
            'random_state': grid_seed
        }],
        [{
            #ExtraTreesRegressor - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
            'n_estimators': grid_n_estimator,  #default=10
            #'criterion': grid_criterion, #default=”gini”
            'max_depth': grid_max_depth,  #default=None
            'random_state': grid_seed
        }],
        [{
            #SVR - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
            #http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
            #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'C': [1, 2, 3, 4, 5],  #default=1.0
            'gamma': grid_ratio,  #default: 'auto'
        }],
        [{
            #MLP regressor
            'activation': ['logistic', 'relu'],
            'random_state': grid_seed,
        }],
        [{
            #XGBRegressor - http://xgboost.readthedocs.io/en/latest/parameter.html
            'learning_rate': grid_learn,  #default: .3
            'max_depth': [1, 2, 4, 6, 8, 10],  #default 2
            'n_estimators': grid_n_estimator,
            'seed': grid_seed
        }]
    ]

    for rlf, param in zip(vote_est, grid_param):

        best_search = model_selection.GridSearchCV(
            estimator=rlf[1],
            param_grid=param,
            cv=split,
            scoring='neg_mean_squared_error',
            return_train_score=True,  # needed for mean_train_score below
            n_jobs=n_jobs)
        best_search.fit(X, Y)

        best_param = best_search.best_params_
        bestIndex = best_search.best_index_
        trainBestScore = best_search.cv_results_['mean_train_score'][bestIndex]
        testBestScore = best_search.best_score_
        print('The best parameter for {} is {}.'.format(
            rlf[1].__class__.__name__, best_param))
        print('The train best score {:.3f}'.format(trainBestScore))
        print('The test best score {:.3f}'.format(testBestScore))

        rlf[1].set_params(**best_param)

    return vote_est
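# A possible use of the returned list (a sketch, not from the original
# project): combine the tuned estimators into a single voting ensemble.
def build_voter(X, Y):
    tuned_est = model(X, Y)
    voter = ensemble.VotingRegressor(estimators=tuned_est, n_jobs=2)
    voter.fit(X, Y)
    return voter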
Example #19
],
                  axis=1,
                  inplace=True)

#split titanic data as train and test
X_train = titanic_all1.iloc[0:891]
X_train.shape
X_train.info()
y_train = titanic_all['Survived'].iloc[0:891]

parameter_grid = dict(n_estimators=[300, 400],
                      criterion=['gini', 'entropy'],
                      max_features=[3, 4, 5, 6, 7, 8])
rf_estimator = ensemble.RandomForestClassifier(random_state=100)
rf_grid_estimator = model_selection.GridSearchCV(estimator=rf_estimator,
                                                 param_grid=parameter_grid,
                                                 cv=10,
                                                 verbose=1,
                                                 n_jobs=10,
                                                 refit=True)
rf_grid_estimator.fit(X_train, y_train)
rf_grid_estimator.cv_results_  #grid_scores_ was removed in scikit-learn 0.20

X_test = titanic_all1.iloc[891:1309]
#note: the number of features in the test data must match the train data
titanic_test['Survived'] = rf_grid_estimator.predict(X_test)
titanic_test['Survived'] = titanic_test['Survived'].astype(int)
titanic_test.to_csv("submission.csv",
                    columns=['PassengerId', 'Survived'],
                    index=False)
def get_top_n_features(X, Y, top_n_features, col):
    # randomforest
    rf_est = RandomForestClassifier(random_state=0)
    rf_param_grid = {
        'n_estimators': [500],
        'min_samples_split': [2, 3],
        'max_depth': [20]
    }
    rf_grid = model_selection.GridSearchCV(rf_est,
                                           rf_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1,
                                           scoring="recall")
    rf_grid.fit(X, Y)
    print('Top N Features Best RF Params:' + str(rf_grid.best_params_))
    print('Top N Features Best RF Score:' + str(rf_grid.best_score_))
    print('Top N Features RF Train Score:' + str(rf_grid.score(X, Y)))
    feature_imp_sorted_rf = pd.DataFrame({
        'feature':
        col,
        'importance':
        rf_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)

    rf_1 = feature_imp_sorted_rf[:10]
    rf_2 = 100 * feature_imp_sorted_rf[:10]['importance']
    #print(rf_2)
    features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
    print('Sample 10 Features from RF Classifier')
    print(str(features_top_n_rf[:10]))
    '''
    pos = np.arange(rf_2.shape[0]) + 0.5

    plt.figure(1, figsize = (18, 8))

    plt.subplot(121)
    plt.barh(pos, rf_1['importance'][::-1])
    plt.yticks(pos, rf_1['feature'][::-1])
    plt.xlabel('Relative Importance')
    plt.title('RandomForest Feature Importance')
    '''

    # AdaBoost
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est,
                                            ada_param_grid,
                                            n_jobs=25,
                                            cv=10,
                                            verbose=1,
                                            scoring="recall")
    ada_grid.fit(X, Y)
    print('Top N Features Best Ada Params:' + str(ada_grid.best_params_))
    print('Top N Features Best Ada Score:' + str(ada_grid.best_score_))
    print('Top N Features Ada Train Score:' + str(ada_grid.score(X, Y)))
    feature_imp_sorted_ada = pd.DataFrame({
        'feature':
        col,
        'importance':
        ada_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    '''
    ada_1=feature_imp_sorted_ada[:10]
    ada_2= 100*feature_imp_sorted_ada[:10]['importance']
    #plt.figure(1, figsize = (18, 8))
    plt.subplot(122)
    plt.barh(pos, ada_1['importance'][::-1])
    plt.yticks(pos, ada_1['feature'][::-1])
    plt.xlabel('Relative Importance')
    plt.title('Adaboost Feature Importance')
    plt.show()'''
    features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)['feature']
    print('Sample 10 Features from Ada Classifier:')
    print(str(features_top_n_ada[:10]))

    # ExtraTree
    et_est = ExtraTreesClassifier(random_state=0)
    et_param_grid = {
        'n_estimators': [500],
        'min_samples_split': [3, 4],
        'max_depth': [20]
    }
    et_grid = model_selection.GridSearchCV(et_est,
                                           et_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1,
                                           scoring="recall")
    et_grid.fit(X, Y)
    print('Top N Features Best ET Params:' + str(et_grid.best_params_))
    print('Top N Features Best ET Score:' + str(et_grid.best_score_))
    print('Top N Features ET Train Score:' + str(et_grid.score(X, Y)))
    feature_imp_sorted_et = pd.DataFrame({
        'feature':
        col,
        'importance':
        et_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    '''
    et_1=feature_imp_sorted_et[:10]
    et_2= 100*feature_imp_sorted_et[:10]['importance']
    
    plt.figure(1, figsize = (18, 8))
    pos = np.arange(et_2.shape[0]) + 0.5
    plt.subplot(121)
    plt.barh(pos, et_1['importance'][::-1])
    plt.yticks(pos, et_1['feature'][::-1])
    plt.xlabel('Relative Importance')
    plt.title('ExtraTrees Feature Importance')
    '''
    features_top_n_et = feature_imp_sorted_et.head(top_n_features)['feature']
    print('Sample 10 Features from ET Classifier:')
    print(str(features_top_n_et[:10]))

    # GradientBoosting
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param_grid = {
        'n_estimators': [500],
        'learning_rate': [0.01, 0.1],
        'max_depth': [20]
    }
    gb_grid = model_selection.GridSearchCV(gb_est,
                                           gb_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1,
                                           scoring="recall")
    gb_grid.fit(X, Y)
    print('Top N Features Best GB Params:' + str(gb_grid.best_params_))
    print('Top N Features Best GB Score:' + str(gb_grid.best_score_))
    print('Top N Features GB Train Score:' + str(gb_grid.score(X, Y)))
    feature_imp_sorted_gb = pd.DataFrame({
        'feature':
        col,
        'importance':
        gb_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    '''
    gb_1=feature_imp_sorted_gb[:10]
    gb_2= 100*feature_imp_sorted_gb[:10]['importance']
    
    #plt.figure(1, figsize = (18, 8))
    
    plt.subplot(122)
    pos = np.arange(gb_2.shape[0]) + 0.5
    plt.barh(pos, gb_1['importance'][::-1])
    plt.yticks(pos, gb_1['feature'][::-1])
    plt.xlabel('Relative Importance')
    plt.title('GradientBoosting Feature Importance')
    plt.show()'''
    features_top_n_gb = feature_imp_sorted_gb.head(top_n_features)['feature']
    print('Sample 10 Features from GB Classifier:')
    print(str(features_top_n_gb[:10]))

    # DecisionTree
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param_grid = {'min_samples_split': [2, 4], 'max_depth': [20]}
    dt_grid = model_selection.GridSearchCV(dt_est,
                                           dt_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1,
                                           scoring="recall")
    dt_grid.fit(X, Y)
    print('Top N Features Best DT Params:' + str(dt_grid.best_params_))
    print('Top N Features Best DT Score:' + str(dt_grid.best_score_))
    print('Top N Features DT Train Score:' + str(dt_grid.score(X, Y)))
    feature_imp_sorted_dt = pd.DataFrame({
        'feature':
        col,
        'importance':
        dt_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)

    dt_1 = feature_imp_sorted_dt[:10]
    dt_2 = 100 * feature_imp_sorted_dt[:10]['importance']

    plt.figure(1, figsize=(18, 8))
    pos = np.arange(dt_2.shape[0]) + 0.5
    #plt.subplot(121)
    plt.barh(pos, dt_1['importance'][::-1])
    plt.yticks(pos, dt_1['feature'][::-1])
    plt.xlabel('Relative Importance')
    plt.title('DecisionTree Feature Importance')
    features_top_n_dt = feature_imp_sorted_dt.head(top_n_features)['feature']
    print('Sample 10 Features from DT Classifier:')
    print(str(features_top_n_dt[:10]))

    # merge the five models
    features_top_n = pd.concat([
        features_top_n_rf, features_top_n_ada, features_top_n_et,
        features_top_n_gb, features_top_n_dt
    ],
                               ignore_index=True).drop_duplicates()
    features_importance = pd.concat([
        feature_imp_sorted_rf, feature_imp_sorted_ada, feature_imp_sorted_et,
        feature_imp_sorted_gb, feature_imp_sorted_dt
    ],
                                    ignore_index=True)

    plt.show()

    return features_top_n, features_importance
# svd = TruncatedSVD(n_components=500, random_state=42)
# x_train = svd.fit_transform(x_train)
# x_test = svd.transform(x_test)

## Naive Bayes classifier
clf_NB = MultinomialNB()

pipeline = Pipeline([('tfidf', tfidf_vect), ('nb', clf_NB)])

parameters = {
    'tfidf__ngram_range': ((1, 1), (1, 2)),
    'tfidf__max_df': (0.1, 0.5, 0.6, 1.0),
    'nb__alpha': (0.01, 0.1, 0.6, 1.0),
}

grid = model_selection.GridSearchCV(pipeline, parameters)

clf = grid
clf.fit(x_train, y_train)
y_hat = clf.predict(x_test)

print("grid search params for %s: " % type(clf_NB).__name__)
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print "\t%s: %r" % (param_name, best_parameters[param_name])
print "NB accuracy: ", accuracy_score(y_test, y_hat)
print "NB recall: ", recall_score(y_test, y_hat, average='weighted')
print "NB precision: ", precision_score(y_test, y_hat, average='weighted')

print "Labels:"
print labels
    )  #Initialize with preprocess variations to test
    for key in model_params.keys():
        if model_name in key:
            filtered_params[key] = model_params[key]

    cv_splitter = model_selection.KFold(n_splits=folds,
                                        shuffle=True,
                                        random_state=FIX_RAND_STATE)

    scorer = metrics.make_scorer(metrics.mean_squared_error,
                                 greater_is_better=False)  # MSE is a loss
    performance_metric_name = scorer.__str__().rstrip(')').split('(')[1].split(
        ',')[0]  # Extract scorer metric name

    gs = model_selection.GridSearchCV(estimator=pipe,
                                      param_grid=filtered_params,
                                      cv=cv_splitter,
                                      n_jobs=-1,
                                      scoring=scorer)
    print(filtered_params)

    gs.fit(X_train, np.log1p(y_train))
    scores[model_name] = None
    scores[model_name] = {'best_score': gs.best_score_}
    print("Best {}: {:.4f} with params: {}: ".format(performance_metric_name,
                                                     gs.best_score_,
                                                     gs.best_params_))

    pipe.steps.pop()  #Pop model in turn from pipe

    # Store best model results to plot i.e  those corresponding to best model params
    cv_results_df = pd.DataFrame(gs.cv_results_)
Example #23
titanic_train1.head(6)

#feature engineering
X_train = titanic_train1.drop(
    ['PassengerId', 'Cabin', 'Ticket', 'Name', 'Survived'], axis=1)
y_train = titanic_train['Survived']

#build the decision tree model
dt = tree.DecisionTreeClassifier(criterion='entropy')

dt_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(3, 15)),
    'min_samples_split': [2, 3, 6, 7, 8]
}
param_grid = model_selection.GridSearchCV(dt, dt_grid,
                                          cv=10)  #evaluation of the tree
param_grid.fit(X_train, y_train)  #building the tree
print(param_grid.best_score_)  #best score
print(param_grid.best_params_)
print(param_grid.score(X_train, y_train))  #train score

#use cross validation to estimate performance of model.
#==============================================================================
# cv_scores = model_selection.cross_val_score(dt, X_train, y_train, cv=5, verbose=3)
# cv_scores.mean()
#==============================================================================

#build the final model on the entire train data, which is used for prediction
#dt.fit(X_train,y_train)

# natively deploy decision tree model(pickle format)
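# a minimal sketch of that deployment step (the file name is illustrative):
import pickle
with open("decision_tree_model.pkl", "wb") as f:
    pickle.dump(param_grid.best_estimator_, f)
# later, in the serving process:
# with open("decision_tree_model.pkl", "rb") as f:
#     model = pickle.load(f)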
Example #24
X_test = scaler.transform(X_test)

'''
Modelling.
    Train, validation separation
    Neural network
    Confusion matrix (target is accuracy)
    Error/accuracy plots
'''
# Split the train and the validation set for the fitting.
X_train, X_val, y_train, y_val = ms.train_test_split(X_train, y_train, test_size=0.2)  

# Tuning model hyperparameters.
param_grid = {"alpha": [0.001, 0.003, 0.01]}
nn_clf = MLPClassifier(hidden_layer_sizes=(200,200,200), max_iter=200)
nn_clf = ms.GridSearchCV(estimator=nn_clf, param_grid=param_grid, cv=4, scoring='accuracy')
nn_clf.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
print('Optimum hyperparameters are,')
print(nn_clf.best_params_)

# Fit the neural net model. Used GridSearchCV to find the optimum alpha.
nn_mod = MLPClassifier(hidden_layer_sizes=(500,500,500), alpha=nn_clf.best_params_['alpha'], max_iter=400)
nn_mod.fit(X_train, y_train)
val_scores = nn_mod.predict(X_val)

# Compute confusion matrix.
conf_mtx = sklm.confusion_matrix(y_val, val_scores)

# Plot the confusion matrix.
plot_confusion_matrix(conf_mtx, classes = nn_mod.classes_) 
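# plot_confusion_matrix is defined elsewhere in this project; a minimal
# sketch of such a helper (an assumption, not the original implementation):
import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix'):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=45)
    plt.yticks(ticks, classes)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return plt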
Example #25
tmp1 = ohe.transform(titanic_train[categorical_feature]).toarray()
tmp1 = pd.DataFrame(tmp1)
continuous_features = ['Fare', 'Age', 'SibSp', 'Parch']
tmp2 = titanic_train[continuous_features]
tmp = pd.concat([tmp1, tmp2], axis=1)
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(tmp)
y_train = titanic_train['Survived']

knn_estimator = neighbors.KNeighborsClassifier()
Knn_grid = {
    'n_neighbors': [5, 7, 8, 10, 20, 25, 30],
    'weights': ['uniform', 'distance']
}
knn_grid_estimator = model_selection.GridSearchCV(knn_estimator,
                                                  Knn_grid,
                                                  cv=10,
                                                  return_train_score=True)
knn_grid_estimator.fit(X_train, y_train)
print(knn_grid_estimator.best_estimator_)
print(knn_grid_estimator.best_score_)
print(knn_grid_estimator.best_params_)
results = knn_grid_estimator.cv_results_
final_estimator = knn_grid_estimator.best_estimator_
print(final_estimator.score(X_train, y_train))

#read test data
titanic_test = pd.read_csv(
    'C:\\Users\\tauseef.ur.rahman\\Desktop\\Python-Docs\\Titanic\\test.csv')
titanic_test[imputable_cont_features] = cont_imputer.transform(
    titanic_test[imputable_cont_features])
titanic_test['Embarked'] = cat_imputer.transform(titanic_test['Embarked'])
#scale all the columns with z-scores
mapper = DataFrameMapper([(titanic_all1.columns,
                           preprocessing.StandardScaler())])
scaled_features = mapper.fit_transform(titanic_all1)
type(scaled_features)
titanic_all2 = pd.DataFrame(scaled_features, columns=titanic_all1.columns)

pca = decomposition.PCA(45)
pca.fit(titanic_all2)
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.cumsum())
titanic_all3 = pd.DataFrame(pca.transform(titanic_all2))

#split titanic data as train and test
X_train = titanic_all3.iloc[0:891]
X_train.shape
X_train.info()
y_train = titanic_all['Survived'].iloc[0:891]

parameter_grid = dict(n_neighbors=[3, 4, 5, 6, 7],
                      weights=['uniform', 'distance'])
knn_estimator = neighbors.KNeighborsClassifier()
knn_grid_estimator = model_selection.GridSearchCV(estimator=knn_estimator,
                                                  param_grid=parameter_grid,
                                                  cv=10,
                                                  verbose=1,
                                                  n_jobs=10)
knn_grid_estimator.fit(X_train, y_train)
knn_grid_estimator.cv_results_  #grid_scores_ was removed in scikit-learn 0.20
Example #27
def get_top_n_features(titanic_train_data_X, titanic_train_data_Y,
                       top_n_features):
    # random forest
    rf_est = RandomForestClassifier(random_state=0)
    rf_param_grid = {
        'n_estimators': [100],
        'min_samples_split': [2, 3],
        'max_depth': [20]
    }
    rf_grid = model_selection.GridSearchCV(rf_est,
                                           rf_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1)
    rf_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best RF Params:' + str(rf_grid.best_params_))
    print('Top N Features Best RF Score:' + str(rf_grid.best_score_))
    print('Top N Features RF Train Score:' +
          str(rf_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_rf = pd.DataFrame({
        'feature':
        list(titanic_train_data_X),
        'importance':
        rf_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_rf = feature_imp_sorted_rf.head(
        top_n_features)['feature'][:top_n_features]
    print('Sample 10 Features from RF Classifier')
    print(str(features_top_n_rf[:top_n_features]))

    # AdaBoost
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param_grid = {'n_estimators': [100], 'learning_rate': [0.01, 0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est,
                                            ada_param_grid,
                                            n_jobs=25,
                                            cv=10,
                                            verbose=1)
    ada_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best Ada Params:' + str(ada_grid.best_params_))
    print('Top N Features Best Ada Score:' + str(ada_grid.best_score_))
    print('Top N Features Ada Train Score:' +
          str(ada_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_ada = pd.DataFrame({
        'feature':
        list(titanic_train_data_X),
        'importance':
        ada_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_ada = feature_imp_sorted_ada.head(
        top_n_features)['feature'][:top_n_features]
    print('Sample 10 Features from Ada Classifier:')
    print(str(features_top_n_ada[:top_n_features]))

    # ExtraTree
    et_est = ExtraTreesClassifier(random_state=0)
    et_param_grid = {
        'n_estimators': [100],
        'min_samples_split': [3, 4],
        'max_depth': [20]
    }
    et_grid = model_selection.GridSearchCV(et_est,
                                           et_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1)
    et_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best ET Params:' + str(et_grid.best_params_))
    print('Top N Features Best ET Score:' + str(et_grid.best_score_))
    print('Top N Features ET Train Score:' +
          str(et_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_et = pd.DataFrame({
        'feature':
        list(titanic_train_data_X),
        'importance':
        et_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_et = feature_imp_sorted_et.head(
        top_n_features)['feature'][:top_n_features]
    print('Sample 10 Features from ET Classifier:')
    print(str(features_top_n_et[:top_n_features]))

    # GradientBoosting
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param_grid = {
        'n_estimators': [100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [20]
    }
    gb_grid = model_selection.GridSearchCV(gb_est,
                                           gb_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1)
    gb_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best GB Params:' + str(gb_grid.best_params_))
    print('Top N Features Best GB Score:' + str(gb_grid.best_score_))
    print('Top N Features GB Train Score:' +
          str(gb_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_gb = pd.DataFrame({
        'feature':
        list(titanic_train_data_X),
        'importance':
        gb_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_gb = feature_imp_sorted_gb.head(
        top_n_features)['feature'][:top_n_features]
    print('Sample 10 Features from GB Classifier:')
    print(str(features_top_n_gb[:top_n_features]))

    # DecisionTree
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param_grid = {'min_samples_split': [2, 4], 'max_depth': [20]}
    dt_grid = model_selection.GridSearchCV(dt_est,
                                           dt_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1)
    dt_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best DT Params:' + str(dt_grid.best_params_))
    print('Top N Features Best DT Score:' + str(dt_grid.best_score_))
    print('Top N Features DT Train Score:' +
          str(dt_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_dt = pd.DataFrame({
        'feature':
        list(titanic_train_data_X),
        'importance':
        dt_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_dt = feature_imp_sorted_dt.head(
        top_n_features)['feature'][:top_n_features]
    print('Sample 10 Features from DT Classifier:')
    print(str(features_top_n_dt[:top_n_features]))

    # merge the five models
    features_top_n = pd.concat([
        features_top_n_rf, features_top_n_ada, features_top_n_et,
        features_top_n_gb, features_top_n_dt
    ],
                               ignore_index=True).drop_duplicates()

    features_importance = pd.concat([
        feature_imp_sorted_rf, feature_imp_sorted_ada, feature_imp_sorted_et,
        feature_imp_sorted_gb, feature_imp_sorted_dt
    ],
                                    ignore_index=True)

    return features_top_n, features_importance
Example #28
def run():
    df = buildDf('qqq')
    #df = buildDf('atvi')
    #This drops the first column which is an extra index
    df = df.drop(columns=df.columns[0])
    df['seq'] = df.index
    import seaborn as sns
    fig, ax = plt.subplots()
    sns.regplot(x='seq', y='Close', data=df, lowess=True)

    #print(df.tail(50).to_string())
    df['Volatility'] = (df['Close'] - df['Open']) / df['Volume']
    fig, ax = plt.subplots()
    sns.heatmap(df.corr(), cmap='Blues')
    #Slice off the first 50 rows: the MA50 column is NaN until row 50, so the
    #MA-cross features only become valid from there.
    #Stop at len(df)-1: computing UpDown reads the next day's close, and the
    #final row has no next day, so excluding it avoids a null.
    #X holds the feature columns; y is the UpDown target, which labels whether
    #the next day's close is up or down relative to the current close.
    X = df[[
        'Open', 'High', 'Low', 'Close', 'Volume', 'MA10', 'MA50', 'Volatility',
        'CrossUpCrossDown'
    ]][50:len(df) - 1].values
    y = df[['UpDown']][50:len(df) - 1].values.astype(int).ravel()

    #Split into train and test sets. Note this split is random, not
    #stratified, so class balance is not guaranteed (see the sketch below).
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.1)

    print("y_test:" + str(collections.Counter(y_test)))
    print("y_train" + str(collections.Counter(y_test)))
    ###Feature selection
    print("Feature selection LDA")
    selection = RFECV(LinearDiscriminantAnalysis(), scoring='accuracy')
    selection.fit_transform(X_train, y_train)
    print(selection.support_)

    print("Feature selection CART")
    selection = RFECV(DecisionTreeClassifier(), scoring='accuracy')
    selection.fit_transform(X_train, y_train)
    print(selection.support_)
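    #support_ is a boolean mask over the feature columns; a small sketch
    #(assuming the same column order as X above) prints the kept names:
    #cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'MA10', 'MA50',
    #        'Volatility', 'CrossUpCrossDown']
    #print([c for c, keep in zip(cols, selection.support_) if keep])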

    #I make a list of models to try out (set random_state on the stochastic
    #ones if you want identical results across runs)
    models = []
    models.append(('LR', LogisticRegression()))
    models.append(('RF', RandomForestClassifier()))
    models.append(('LDA', LinearDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier()))
    #My own classifier is commented out because it takes ~15 minutes to run;
    #feel free to try it. Without a random seed it differs slightly from KNN,
    #but with the seed set the two match exactly.
    #models.append(('AdricsKnn',AdricsKNNClassifier()))
    models.append(('CART', DecisionTreeClassifier()))
    models.append(('NB', GaussianNB()))
    #models.append(('SVM', SVC()))
    parameters = {
        'n_components': [None, 1, 2],
        'n_neighbors': [6, 7, 8, 9],
        'K': [6, 7, 8],
        'C': [.0001, .001, .01, .1, 1, 10, 100],
        'max_depth': [3, 4, 5, 6, 7, 8],
        'gamma': [.0001, .001, .01, .1]
    }
    results = []
    names = []

    #Grid-search each model and collect its cross-validated scores
    for name, model in models:
        param_grid = {}

        #Keep only the pool keys this model actually accepts, so no invalid
        #parameter reaches GridSearchCV (a standalone sketch of this trick
        #follows run())
        for k in parameters.keys():
            if k in model.get_params().keys():
                param_grid[k] = parameters[k]

        #Grid search to find the best parameters from the filtered grid
        gs = model_selection.GridSearchCV(model,
                                          param_grid,
                                          cv=5,
                                          scoring='accuracy')
        gs.fit(X_train, y_train)

        #Cross-validate the tuned search (nested CV: the grid search reruns
        #inside each fold) so the score distributions can be plotted
        cv_results = model_selection.cross_val_score(gs,
                                                     X_train,
                                                     y_train,
                                                     cv=5,
                                                     scoring='accuracy')
        names.append(name)
        msg = "Model:\n%s  \n%s: %f (%f)" % (
            gs.best_estimator_, name, cv_results.mean(), cv_results.std())
        results.append(cv_results)
        print(msg)

    #Compare the cross-validation score distributions on a boxplot
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()

    #Fit individual models and test them on the held-out set so their
    #predictions can be inspected directly
    print("LinearDiscriminantAnalysis")
    model = LinearDiscriminantAnalysis(n_components=None)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    print("PREDICTION LDR IS:::::::::::::::::::::::::\n")
    print(predicted)
    print("LDR actual:::::::::::::::::::")
    print(y_test.reshape((1, len(y_test))))
    print(collections.Counter(y_test))
    error = np.mean(predicted != y_test)
    print("Accuracy: " + str(1 - error))

    print("Cart")
    model = DecisionTreeClassifier(max_depth=3)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    print("PREDICTION LDR IS:::::::::::::::::::::::::\n")
    print(predicted)
    print("LDR actual:::::::::::::::::::")
    print(y_test.reshape((1, len(y_test))))
    print(collections.Counter(y_test))
    error = np.mean(predicted != y_test)
    print("Accuracy: " + str(1 - error))

    #prints tuples lined up:
    #print(list(zip(a[175:190], b[175:190])))

    neuro1(X_train, X_test, y_train, y_test)
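
#The loop above filters a shared parameter pool through each model's
#get_params() so GridSearchCV never sees an invalid key. A standalone sketch
#of the same trick (values here are illustrative, not from the original):
from sklearn.linear_model import LogisticRegression

pool = {'C': [0.1, 1, 10], 'max_depth': [3, 5]}
lr = LogisticRegression()
lr_grid = {k: v for k, v in pool.items() if k in lr.get_params()}
print(lr_grid)  #only {'C': [0.1, 1, 10]} survives; max_depth is dropped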
Example #29
0
    gram_train = np.loadtxt(
        'D:/Study/Bioinformatics/补实验/QSP/kernels/K_train_' + name + '.csv',
        delimiter=',')
    #gram_test = np.loadtxt('D:/Study/Bioinformatics/补实验/QSP/kernels/K_test_'+name+'.csv', delimiter = ',')

    y_train = np.loadtxt(
        'D:/Study/Bioinformatics/补实验/QSP/features/train_label.csv',
        delimiter=',')
    #y_test = np.loadtxt('D:/Study/Bioinformatics/补实验/QSP/features/test_label.csv', delimiter = ',')

    cv = model_selection.StratifiedKFold(n_splits=10,
                                         shuffle=True,
                                         random_state=0)

    parameters = {'C': np.logspace(-15, 10, base=2, num=52)}
    grid = model_selection.GridSearchCV(svm.SVC(kernel='precomputed',
                                                probability=True),
                                        parameters,
                                        n_jobs=-1,
                                        cv=cv,
                                        verbose=2)
    grid.fit(gram_train, y_train)
    C = grid.best_params_['C']
    print('C =', C)

    clf = svm.SVC(C=C, kernel='precomputed', probability=True)

    scorerMCC = metrics.make_scorer(metrics.matthews_corrcoef)
    scorerSP = metrics.make_scorer(specificity_score)
    scorerPR = metrics.make_scorer(metrics.precision_score)
    scorerSE = metrics.make_scorer(metrics.recall_score)

    scorer = {
        'ACC': 'accuracy',
        'MCC': scorerMCC,
        'SP': scorerSP,
        'PR': scorerPR,
        'SE': scorerSE
    }
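    #The snippet ends here; a plausible continuation (an assumption, not the
    #original code) feeds the scorer dict to cross_validate:
    #results = model_selection.cross_validate(clf, gram_train, y_train,
    #                                         cv=cv, scoring=scorer)
    #print(results['test_ACC'].mean())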
Example #30
0
X_train = titanic2[0:titanic_train.shape[0]]
X_train.shape
X_train.info()
y_train = titanic_train['Survived']

#boost depth-3 decision trees with AdaBoost and tune via grid search
dt_estimator = tree.DecisionTreeClassifier()
ada_estimator = ensemble.AdaBoostClassifier(base_estimator=dt_estimator,
                                            random_state=2017)
ada_grid = {
    'n_estimators': [50],
    'learning_rate': [0.01, 0.02, 1.0],
    'base_estimator__max_depth': [3]
}
grid_ada_estimator = model_selection.GridSearchCV(ada_estimator,
                                                  ada_grid,
                                                  cv=10,
                                                  n_jobs=1)
grid_ada_estimator.fit(X_train, y_train)
print(grid_ada_estimator.cv_results_['mean_test_score'])
print(grid_ada_estimator.best_score_)
print(grid_ada_estimator.best_params_)
print(grid_ada_estimator.score(X_train, y_train))
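
#AdaBoost exposes staged_predict, so you can watch training accuracy evolve
#per boosting round (a small addition, not in the original snippet):
import numpy as np
for i, staged in enumerate(grid_ada_estimator.best_estimator_.staged_predict(X_train)):
    if (i + 1) % 10 == 0:
        print(i + 1, np.mean(staged == y_train))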

#explore feature importances calculated by the boosted tree ensemble
features = X_train.columns
importances = grid_ada_estimator.best_estimator_.feature_importances_
fe_df = pd.DataFrame({'feature': features, 'importance': importances})
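#A quick look at the ranking (a small addition, not in the original snippet):
print(fe_df.sort_values('importance', ascending=False).head(10))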

X_test = titanic2[titanic_train.shape[0]:]
X_test.shape
X_test.info()