Example #1
    def objective(**params):
        # `name_algo`, `algorithm`, `X`, `y` and `cv` are captured from the
        # enclosing scope; the cross-validated AUC is negated so that a
        # minimizer effectively maximizes it.
        if name_algo != 'oct':
            model = algorithm()
            model.set_params(**params)
            score = np.mean(
                cross_val_score(model,
                                X,
                                y,
                                cv=cv,
                                n_jobs=-1,
                                scoring="roc_auc"))

        else:
            from julia.api import Julia
            jl = Julia(compiled_modules=False)
            from interpretableai import iai

            params["max_depth"] = int(params["max_depth"])
            grid = iai.GridSearch(iai.OptimalTreeClassifier(random_seed=1),
                                  **params)

            grid.fit_cv(X, y, n_folds=cv, validation_criterion='auc')
            score = float(grid.get_grid_results()[[
                'split' + str(i) + '_valid_score' for i in range(1, cv + 1)
            ]].T.mean())

        return -score
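
    # --- Plausible continuation (not shown in the original snippet) --------
    # One way to drive `objective` is scikit-optimize's gp_minimize, whose
    # use_named_args helper turns the sampled parameter list into the
    # **params the function expects.  The search space below is illustrative
    # only (it matches the OCT branch above), not taken from the source.
    from skopt import gp_minimize
    from skopt.space import Integer, Real
    from skopt.utils import use_named_args

    space = [Integer(3, 10, name='max_depth'),
             Real(1e-6, 0.4, name='minbucket')]
    objective_named = use_named_args(space)(objective)  # list -> **params
    result = gp_minimize(objective_named, space, n_calls=30, random_state=1)
    print('best CV AUC:', -result.fun)  # objective returns the negated AUC
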
def train_oct(X_train, y_train, X_test, y_test, output_path, seed=1):
    """Cross-validate an Optimal Classification Tree, save its artifacts under
    output_path, and report in- and out-of-sample AUC and misclassification."""
    from julia.api import Julia
    jl = Julia(compiled_modules=False)
    from interpretableai import iai

    X_train = impute_missing(X_train)
    X_test = impute_missing(X_test)

    oct_grid = iai.GridSearch(
        iai.OptimalTreeClassifier(random_seed=seed),
        max_depth=range(1, 10),
        # minbucket=[5, 10, 15, 20, 25, 30, 35],
        criterion=['gini', 'entropy', 'misclassification'],
        ls_num_tree_restarts=200,
    )
    oct_grid.fit_cv(X_train, y_train, n_folds=5, validation_criterion='auc')
    best_learner = oct_grid.get_learner()
    best_learner.write_json('%s/learner.json' % output_path)
    best_learner.write_questionnaire('%s/app.html' % output_path)
    best_learner.write_html('%s/tree.html' % output_path)
    best_learner.write_png('%s/tree.png' % output_path)
    in_auc = oct_grid.score(X_train, y_train, criterion='auc')
    out_auc = oct_grid.score(X_test, y_test, criterion='auc')
    in_mis = oct_grid.score(X_train, y_train, criterion='misclassification')
    out_mis = oct_grid.score(X_test, y_test, criterion='misclassification')
    print('In Sample AUC', in_auc)
    print('Out of Sample AUC', out_auc)
    print('In Sample Misclassification', in_mis)
    print('Out of Sample Misclassification', out_mis)
    return best_learner, in_auc, out_auc, in_mis, out_mis
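
# --- Hedged usage sketch (not part of the original example) -----------------
# Illustrative call to train_oct on a public dataset; the real project supplies
# its own data and the impute_missing helper that train_oct relies on.
if __name__ == '__main__':
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    data = load_breast_cancer(as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.3, random_state=1)
    learner, in_auc, out_auc, in_mis, out_mis = train_oct(
        X_train, y_train, X_test, y_test, output_path='results', seed=1)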
Example #3
    def _tune_best_tree(self):
        """
        Tune the tree and then output it as a json file
        :return:
        """
        df_X, df_Y = self._getX_Y()

        # split the data into train and test sets
        (train_X, train_y), (test_X, test_y) = iai.split_data('classification',
                                                              df_X,
                                                              df_Y,
                                                              seed=1)
        other_columns = list(train_X.columns).copy()
        other_columns.remove('iteration')
        weight_train_X = np.array(train_X['iteration'])
        train_X = train_X[other_columns]
        weight_test_X = np.array(test_X['iteration'])
        test_X = test_X[other_columns]

        grid = iai.GridSearch(
            iai.OptimalTreeClassifier(random_seed=1),
            max_depth=[8],
            # max_depth=[3],
            minbucket=[10])
        # grid.get_learner().set_params(show_progress=False)
        grid.fit(train_X, train_y)

        score_train = grid.score(train_X,
                                 train_y,
                                 criterion='misclassification')
        score_test = grid.score(test_X, test_y, criterion='misclassification')

        learner = grid.get_learner()
        learner.write_json(useful_paths.TREE_JSON_CVRPTW)
        if 'JULIE' in local_path.NAME_MACHINE:
            learner.write_png(useful_paths.PATH_TO_CONFIGURATION +
                              'saved_trees/tree_cvrptw.png')

        self.score_test = score_test
        self.score_train = score_train
        print("Accuracy ", score_test, score_train)

        # output the dispersion features corresponding to the df
        with open(useful_paths.DISPERSION_FEATURES_JSON, 'w') as f:
            json.dump(self.dict_dispersion, f)
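
# --- Hedged follow-up sketch (not part of the original class) ---------------
# The learner serialized above can be reloaded later without re-fitting,
# assuming the interpretableai package's read_json helper; the function name
# below is hypothetical.
def reload_tuned_tree(json_path):
    from julia.api import Julia
    jl = Julia(compiled_modules=False)
    from interpretableai import iai

    return iai.read_json(json_path)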
Example #4
train_X, Z, y = u.load_data(data_path,
                            training_set_name,
                            split=data_version,
                            matched=matched,
                            prediction=outcome,
                            other_tx=False,
                            replace_na='NO_' + treatment)

df_result = pd.read_csv(save_path + data_version + '_' + match_status +
                        '_bypatient_allmethods_benefit.csv')
benefit = df_result.groupby('ID').agg({'Benefit': 'mean'})['Benefit']

### Run Model
grid = iai.GridSearch(
    iai.OptimalTreeRegressor(random_seed=1),
    max_depth=range(3, 8),
)

grid.fit(train_X, benefit)

lnr = grid.get_learner()
lnr.write_html(save_path + data_version + '_benefit_tree.html')

grid.score(train_X, benefit, criterion='mse')

for data_version in ['train', 'test', 'validation_all']:
    X, Z, y = u.load_data(data_path,
                          training_set_name,
                          split=data_version,
                          matched=matched,
                          prediction=outcome,
Example #5
    def objective(trial):
        # `name_algo`, `algorithm`, `X`, `y` and `cv` are captured from the
        # enclosing scope; Optuna maximizes the cross-validated AUC returned
        # by this function.
        if name_algo == 'xgboost':
            params = {
                "n_estimators":
                trial.suggest_int("n_estimators", 10, 900),
                "learning_rate":
                trial.suggest_loguniform("learning_rate", 1e-8, 1.0),
                "max_depth":
                trial.suggest_int("max_depth", 3, 10),
                "min_child_weight":
                trial.suggest_uniform("min_child_weight", 1e-8, 1.0),
                "gamma":
                trial.suggest_uniform("gamma", 1e-8, 5),
                "colsample_bytree":
                trial.suggest_uniform("colsample_bytree", 1e-2, 1),
                "lambda":
                trial.suggest_uniform("lambda", 1e-8, 5),
                "alpha":
                trial.suggest_uniform("alpha", 1e-8, 5),
                "eval_metric":
                "auc"
            }

            dtrain = xgb.DMatrix(X, label=y)
            pruning_callback = optuna.integration.XGBoostPruningCallback(
                trial, "test-auc")
            history = xgb.cv(params,
                             dtrain,
                             num_boost_round=100,
                             callbacks=[pruning_callback],
                             nfold=cv,
                             stratified=True)
            score = history["test-auc-mean"].values[-1]
            return score

        elif name_algo == 'rf':
            params = {
                "n_estimators":
                trial.suggest_int("n_estimators", 10, 900),
                "max_depth":
                trial.suggest_int("max_depth", 3, 10),
                "min_samples_leaf":
                trial.suggest_uniform("min_samples_leaf", 1e-5, 0.5),
                "min_samples_split":
                trial.suggest_uniform("min_samples_split", 1e-5, 0.5),
                "max_features":
                trial.suggest_categorical("max_features", ['sqrt', 'log2'])
            }

        elif name_algo == 'cart':
            params = {
                "max_depth":
                trial.suggest_int("max_depth", 3, 10),
                "min_weight_fraction_leaf":
                trial.suggest_uniform("min_weight_fraction_leaf", 0, 0.5),
                "min_samples_leaf":
                trial.suggest_uniform("min_samples_leaf", 1e-5, 0.5),
                "min_samples_split":
                trial.suggest_uniform("min_samples_split", 1e-5, 0.5),
                "min_impurity_decrease":
                trial.suggest_uniform("min_impurity_decrease", 0, 1),
                "criterion":
                trial.suggest_categorical("criterion", ['gini', 'entropy'])
            }

        elif name_algo == 'lr':
            params = {
                "penalty": trial.suggest_categorical("penalty", ['l2']),
                "tol": trial.suggest_uniform("tol", 1e-10, 1),
                "C": trial.suggest_uniform("C", 1e-10, 10),
                "solver": trial.suggest_categorical("solver", ['lbfgs']),
                "max_iter": 1000
            }

        elif name_algo == 'oct':

            from interpretableai import iai

            params = {
                "max_depth":
                trial.suggest_int("max_depth", 3, 10),
                "criterion":
                trial.suggest_categorical(
                    "criterion", ['gini', 'entropy', 'misclassification']),
                "minbucket":
                trial.suggest_uniform("minbucket", 10**-6, 0.4),
                "cp":
                trial.suggest_uniform("cp", 10**-12, 0.7)
            }

            params["max_depth"] = int(params["max_depth"])
            grid = iai.GridSearch(iai.OptimalTreeClassifier(random_seed=1),
                                  **params)

            grid.fit_cv(X, y, n_folds=cv, validation_criterion='auc')
            score = float(grid.get_grid_results()[[
                'split' + str(i) + '_valid_score' for i in range(1, cv + 1)
            ]].T.mean())
            return score

        elif name_algo == 'kn':
            params = {
                "n_neighbors":
                trial.suggest_int("n_neighbors", 1, 80),
                "weights":
                trial.suggest_categorical("weights", ['uniform', 'distance']),
                "algorithm":
                trial.suggest_categorical("algorithm",
                                          ['ball_tree', 'kd_tree']),
                "leaf_size":
                trial.suggest_int("leaf_size", 10, 100),
                "p":
                trial.suggest_int("p", 1, 10)
            }

        elif name_algo == 'svm':
            params = {
                "C": trial.suggest_uniform("C", 1e-10, 25),
                "kernel": trial.suggest_categorical("kernel", ['poly', 'rbf']),
                "degree": trial.suggest_int("degree", 1, 5),
                "probability": trial.suggest_int("probability", 1, 1),
                "coef0": trial.suggest_uniform("coef0", -5, 5)
            }

        elif name_algo == 'gb':
            params = {
                "var_smoothing":
                trial.suggest_uniform("var_smoothing", 1e-13, 1e-5)
            }

        elif name_algo == 'mlp':
            params = {
                "activation":
                trial.suggest_categorical("activation", ['tanh', 'relu']),
                "solver":
                trial.suggest_categorical("solver", ['lbfgs', 'adam']),
                "alpha":
                trial.suggest_uniform("alpha", 0, 10),
                "learning_rate":
                trial.suggest_categorical("learning_rate",
                                          ['constant', 'adaptive']),
                "tol":
                trial.suggest_uniform("tol", 1e-10, 1),
                "max_iter":
                trial.suggest_int("max_iter", 1000, 1000)
            }

        elif name_algo == 'qda':
            params = {
                "reg_param": trial.suggest_uniform("reg_param", 1e-10, 1),
                "tol": trial.suggest_uniform("tol", 1e-10, 1)
            }

        # For the remaining scikit-learn algorithms, evaluate the sampled
        # parameters with cross-validated AUC.
        model = algorithm()
        model.set_params(**params)
        score = np.mean(
            cross_val_score(model, X, y, cv=cv, n_jobs=-1, scoring="roc_auc"))
        #score = np.quantile(cross_val_score(model, X, y, cv = cv, n_jobs = -1, scoring="roc_auc"), 0.25)

        return score
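
    # --- Plausible continuation (not shown in the original snippet) --------
    # The objective is typically handed to an Optuna study in the enclosing
    # scope; the direction, pruner and trial count here are illustrative,
    # not taken from the source.
    import optuna  # already imported at module level in the original code

    study = optuna.create_study(direction="maximize",
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=100)
    print("best CV AUC:", study.best_value)
    print("best params:", study.best_params)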