os.environ['PATH'].find(dir)
os.environ['PATH'] = dir + ';' + os.environ['PATH']
"""
y_pred = sc.loadmat("conf.mat")["at"]
cm = y_pred.astype('float') / y_pred.sum(axis=1)[:, np.newaxis]
import seaborn as sns
import matplotlib.pyplot as plt

# Sample figsize in inches
mask = np.zeros_like(cm)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    fig, ax = plt.subplots(figsize=(10, 10))
    ax = sns.heatmap(cm, mask=mask, vmax=.3, square=True)

fscore = clf.booster().get_fscore()
import operator
sorted_fscore = sorted(fscore.items(),
                       key=operator.itemgetter(1),
                       reverse=True)
top_20 = []
top_20_names = []
for i in range(20):
    top_20.append(sorted_fscore[i][1])
    top_20_names.append(sorted_fscore[i][0][1:])

fig, ax = plt.subplots(figsize=(10, 10))

y_pos = np.arange(len(top_20_names))

ax.barh(y_pos, top_20, align='center')
class model_tuning_params():

    # xgboost regressor
    def __init__(self, model_name, random_seed=None, params_list=None):
        self.model_name = model_name
        if model_name == 'xgb':
            self.model = xgb.XGBClassifier(learning_rate=0.1,
                                           n_estimators=1000,
                                           max_depth=7,
                                           min_child_weight=1,
                                           gamma=0,
                                           subsample=0.8,
                                           colsample_bytree=0.8,
                                           objective='reg:linear',
                                           nthread=4,
                                           scale_pos_weight=1,
                                           seed=26)
            xgb_params_test1 = {
                "max_depth": [3, 5, 7, 9],
                "min_child_weight": [1, 3, 5]
            }
            xgb_params_test2 = {
                "max_depth": [4, 5, 6],
                "min_child_weight": [4, 5, 6]
            }
            xgb_params_test3 = {"gamma": [i / 10.0 for i in range(0, 5)]}
            xgb_params_test4 = {
                'subsample': [i / 10.0 for i in range(6, 10)],
                'colsample_bytree': [i / 10.0 for i in range(6, 10)]
            }
            xgb_params_test5 = {
                'subsample': [i / 100.0 for i in range(75, 90, 5)]
            }
            xgb_params_test6 = {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}
            xgb_params_test7 = {'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]}
            self.params_list = [
                xgb_params_test1, xgb_params_test2, xgb_params_test3,
                xgb_params_test4, xgb_params_test5, xgb_params_test6,
                xgb_params_test7
            ]

        elif model_name == 'rf':
            self.model = RandomForestClassifier(n_estimators=100,
                                                criterion='mse',
                                                max_features='sqrt',
                                                max_depth=None,
                                                n_jobs=-1,
                                                verbose=3,
                                                random_state=26)
            rf_params_test1 = {'criterion': ['mase', 'mae']}
            rf_params_test2 = {'max_depth': [i for i in range(10, 50, 5)]}
            rf_params_test3 = {
                'min_samples_split': [2, 3, 4, 5],
                'min_samples_leaf': [1, 2, 10, 100]
            }
            rf_params_test4 = {'max_features': ['log2', 'sqrt', 0.2]}
            rf_params_test5 = {
                'max_features': [i / 100.0 for i in range(5, 15)]
            }
            self.params_list = [
                rf_params_test1, rf_params_test2, rf_params_test3,
                rf_params_test4, rf_params_test5
            ]

        elif model_name == 'regress':
            self.model = LogisticRegression()
            regress_params_test1 = {'penalty': ['l1', 'l2']}
            regress_params_test2 = {'multi_class': ['ovr', 'multinomial']}
            regress_params_test3 = {'solver': ['newton-cg', 'sag', 'lbfgs']}
            regress_params_test4 = {'class_weight': ['balanced', 'None']}
            regress_params_test5 = {'C': [0.01, 0.1, 1.0, 10]}

            self.params_list = [
                regress_params_test1, regress_params_test2,
                regress_params_test3, regress_params_test4,
                regress_params_test5
            ]

        elif model_name == 'svm':
            self.model = svm.SVC(C=1.0, kernel='rbf', gamma='auto', verbose=3)

            svm_params_test1 = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
            svm_params_test2 = {
                'kernel': ['poly'],
                'degress': [i for i in range(1, 6)]
            }
            svm_params_test3 = {'C': [0.01, 0.1, 1.0, 10], 'gamma': ['auto']}
            svm_params_test4 = {
                'C': [1.0],
                'gamma': [2**k for k in range(-2, 3, 1)]
            }
            self.params_list = [
                svm_params_test1, svm_params_test2, svm_params_test3,
                svm_params_test4
            ]

        else:
            raise ValueError(
                "not a valid model to tunning parameter \nplease try one of the followings: \n"
                + '-' * 20 + "\n regress \n svm \n xgb \n rf \n" + '-' * 20)

        self.random_seed = np.random.seed(26)

    # search for best n_estimators and return the updated model
    def modelfit_xgb(self,
                     dtrain,
                     useTrainCV=True,
                     cv_folds=5,
                     early_stopping_rounds=50,
                     metric='rmse',
                     obt='reg:linear'):
        predictors = [
            col for col in dtrain.columns.values
            if col not in ['Response', 'Id']
        ]
        target = "Response"
        if useTrainCV:
            xgb_param = self.params_list
            xgb_param['objective'] = obt
            if xgb_param['objective'] == 'multi:softmax':
                xgb_param['num_class'] = 8
                metric = 'merror'
                xgtrain = xgb.DMatrix(dtrain[predictors].values,
                                      label=(dtrain[target] - 1).values)
            xgtrain = xgb.DMatrix(dtrain[predictors].values,
                                  label=dtrain[target].values)
            cvresult = xgb.cv(
                xgb_param,
                xgtrain,
                num_boost_round=self.model.get_params()['n_estimators'],
                nfold=cv_folds,
                metrics=metric,
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=3)
            self.model.set_params(n_estimators=cvresult.shape[0])

        # Fit the algorithm on the data
        self.model.fit(dtrain[predictors], dtrain[target], eval_metric=metric)

        # Predict training set:
        dtrain_predictions = self.model.predict(dtrain[predictors])

        if self.model._estimator_type == 'regressor':
            dtrain_prediction = np.clip(dtrain_predictions, 1, 8)
            dtrain_predictions = np.round(dtrain_prediction).astype(int)

        # print model report:
        print("\nModel Report")
        print(
            "Accuracy : %.4g" %
            metrics.accuracy_score(dtrain[target].values, dtrain_predictions))

        importance = self.model.booster().get_fscore()
        importance = sorted(importance.items(), key=operator.itemgetter(1))
        # plt.figure()
        df = pd.DataFrame(importance, columns=['feature', 'score'])
        df['score'] = df['score'] / df['score'].sum()
        # df.plot()
        df.plot(kind='barh',
                x='feature',
                y='score',
                legend=False,
                figsize=(6, 10))
        plt.title('XGBoost Feature Importance')
        plt.xlabel('importance value')
        # return model which has optimal n_estimators for a specific learning_rate

        return self.model

    def grid_search(self, data):
        predictors = [
            col for col in data.columns.values
            if col not in ['Response', 'Id']
        ]
        myscorer = make_scorer(quadratic_weighted_kappa,
                               greater_is_better=True)
        print('Grid search for')
        print(self.model_name)
        print('parameters going to be tuned with %s ' % self.params_list)
        print('Could take a long time to go through grid search')

        while True:
            user_enter = input('Continue[y/n]: ')
            if user_enter == 'y':
                break
            elif user_enter == 'n':
                return ('exit parameter grid search for ' + self.model_name +
                        ' model')
            else:
                print('not a valid input, please enter [y/n]')
        target = 'Response'
        gsearch = GridSearchCV(estimator=self.model,
                               param_grid=self.params_list,
                               iid=False,
                               cv=5,
                               scoring=myscorer,
                               n_jobs=-1,
                               verbose=3)
        gsearch.fit(data[predictors], data[target])
        print('\n grid_scores: ', gsearch.grid_scores_)
        print('\n best parameters: ', gsearch.best_params_)
        print('\n best score: ', gsearch.best_score_)

        # update parameters
        for index, value in gsearch.best_params_.items():
            self.model.set_params(**{index: value})

        print('store updated model for reproducible usage')
        joblib.dump(self.model, 'models/%s.pkl' % self.model_name)

        return self.model