Example #1
def train_heart_disease(**kwargs):
    # excerpt: train(), const and randint() are assumed to come from the
    # enclosing module (randint presumably from hypernets.core)
    from hypernets.tabular.datasets import dsutils
    from sklearn.model_selection import train_test_split

    X = dsutils.load_heart_disease_uci()
    y = X.pop('target')

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.3, random_state=randint())
    X_train, X_eval, y_train, y_eval = \
        train_test_split(X_train, y_train, test_size=0.3, random_state=randint())

    kwargs = {'reward_metric': 'auc', 'max_trials': 10, **kwargs}
    hm, model = train(X_train, y_train, X_eval, y_eval, const.TASK_BINARY, **kwargs)

    print('-' * 50)
    scores = model.evaluate(X_test, y_test, metrics=['auc', 'accuracy', 'f1', 'recall', 'precision'])
    print('scores:', scores)

    trials = hm.get_top_trials(10)
    models = [hm.load_estimator(t.model_file) for t in trials]

    msgs = [f'{t.trial_no},{t.reward},{m.cls.__name__} {m.model_args}' for t, m in zip(trials, models)]
    print('top trials:')
    print('\n'.join(msgs))
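Every example on this page seeds scikit-learn splitters and estimators through a shared randint() helper. Below is a minimal sketch of what such a helper can look like, assuming a process-wide numpy RandomState so whole runs can be made reproducible; the names and bounds are illustrative, not the library's exact code:

import numpy as np

_random_state = np.random.RandomState()  # process-wide source of seeds

def randint():
    # draw a fresh seed suitable for any scikit-learn random_state argument
    return _random_state.randint(0, 65535)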
Example #2
    def detect(self, X, method=None):
        # excerpt: relies on scipy.cluster.hierarchy, collections.defaultdict,
        # and the enclosing module's cfg, logger and randint helpers
        X_shape = X.shape
        sample_limit = cfg.multi_collinearity_sample_limit
        if X_shape[0] > sample_limit:
            logger.info(
                f'{X_shape[0]} rows data found, sample to {sample_limit}')
            frac = sample_limit / X_shape[0]
            from . import get_tool_box
            X, _ = get_tool_box(X).train_test_split(X,
                                                    train_size=frac,
                                                    random_state=randint())

        # drop constant columns: a per-column value_counts of length <= 1
        # means the column holds a single distinct value
        n_values = self._value_counts(X)
        one_values = [n.name for n in n_values if len(n) <= 1]
        if len(one_values) > 0:
            X = X[[c for c in X.columns if c not in one_values]]

        logger.info('computing correlation')
        corr = self._corr(X, method)

        logger.info('computing cluster')
        corr_linkage = hierarchy.ward(corr)
        cluster_ids = hierarchy.fcluster(corr_linkage, 1, criterion='distance')
        cluster_id_to_feature_ids = defaultdict(list)
        for idx, cluster_id in enumerate(cluster_ids):
            cluster_id_to_feature_ids[cluster_id].append(idx)
        selected = [
            X.columns[v[0]] for v in cluster_id_to_feature_ids.values()
        ]
        unselected = list(set(X.columns.to_list()) -
                          set(selected)) + one_values
        feature_clusters = [[X.columns[i] for i in v]
                            for v in cluster_id_to_feature_ids.values()]

        return feature_clusters, selected, unselected
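The selection step keeps one representative column per correlation cluster. A self-contained sketch of the same idea on a plain DataFrame, following the scipy pattern the method mirrors (the names here are mine, not the library's):

import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.cluster import hierarchy

def cluster_select(df, threshold=1.0):
    corr = df.corr().values                       # pairwise feature correlation
    linkage = hierarchy.ward(corr)                # hierarchical clustering of the correlation rows
    cluster_ids = hierarchy.fcluster(linkage, threshold, criterion='distance')
    clusters = defaultdict(list)
    for idx, cid in enumerate(cluster_ids):
        clusters[cid].append(idx)
    # keep the first column of each cluster; the rest are treated as collinear
    return [df.columns[v[0]] for v in clusters.values()]

df = pd.DataFrame(np.random.rand(100, 6), columns=list('abcdef'))
print(cluster_select(df))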
Example #3
    def dtr(self):
        return dict(
            cls=DecisionTreeRegressor,
            splitter=Choice(["best", "random"]),
            max_depth=Choice([None, 3, 5, 10, 20, 50]),
            random_state=randint(),
        )
Example #4
    def dt(self):
        return dict(
            cls=DecisionTreeClassifier,
            criterion=Choice(["gini", "entropy"]),
            splitter=Choice(["best", "random"]),
            max_depth=Choice([None, 3, 5, 10, 20, 50]),
            random_state=randint(),
        )
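Examples #3 and #4 return search-space dicts built from Choice parameters. A toy sketch of how such a dict can be turned into a concrete estimator; the Choice class and sample_estimator helper below are hypothetical stand-ins, not the library's API:

import random
from sklearn.tree import DecisionTreeClassifier

class Choice:  # hypothetical stand-in for the search-space primitive
    def __init__(self, options):
        self.options = options
    def sample(self):
        return random.choice(self.options)

def sample_estimator(space):
    space = dict(space)
    cls = space.pop('cls')
    params = {k: v.sample() if isinstance(v, Choice) else v
              for k, v in space.items()}
    return cls(**params)

est = sample_estimator(dict(cls=DecisionTreeClassifier,
                            criterion=Choice(["gini", "entropy"]),
                            max_depth=Choice([None, 3, 5]),
                            random_state=0))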
Example #5
    def __init__(self, preprocessor=None, estimator=None, random_state=None):
        self.preprocessor = preprocessor
        self.estimator_ = estimator
        self.random_state = random_state if random_state is not None else randint()
        self.auc_ = None
        self.feature_names_ = None
        self.feature_importances_ = None
        self.fitted = False
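Examples #5 and #10 share the same idiom: materialize a concrete seed at construction time instead of leaving random_state as None, so a finished run can be logged and replayed. In isolation, and using the randint() sketch from the top of the page:

from sklearn.ensemble import RandomForestClassifier

user_seed = None                                          # hypothetical caller-supplied value
seed = user_seed if user_seed is not None else randint()  # fall back to a fresh seed
model = RandomForestClassifier(random_state=seed)         # the exact seed can now be logged and replayed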
Example #6
    def prepare_data(self):
        if self.task == const.TASK_BINARY:
            X, y = make_classification(n_samples=self.n_samples,
                                       n_features=self.n_features,
                                       n_classes=2,
                                       random_state=randint())
        elif self.task == const.TASK_MULTICLASS:
            # sklearn requires n_classes * n_clusters_per_class <= 2**n_informative,
            # so n_informative must be raised above its default of 2 for 5 classes
            X, y = make_classification(n_samples=self.n_samples,
                                       n_features=self.n_features,
                                       n_classes=5,
                                       n_informative=4,
                                       random_state=randint())
        else:
            X, y = make_regression(n_samples=self.n_samples,
                                   n_features=self.n_features,
                                   random_state=randint())
        X = pd.DataFrame(X, columns=[f'c{i}' for i in range(X.shape[1])])

        return X, y
Example #7
    def nn(self):
        solver = Choice(['lbfgs', 'sgd', 'adam'])
        return dict(
            cls=MLPClassifier,
            max_iter=Int(500, 5000, step=500),
            activation=Choice(['identity', 'logistic', 'tanh', 'relu']),
            solver=solver,
            learning_rate=Choice(['constant', 'invscaling', 'adaptive']),
            learning_rate_init_stub=Cascade(partial(self._cascade, self._nn_learning_rate_init, 'slvr'), slvr=solver),
            random_state=randint(),
        )
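Cascade makes one hyperparameter's space depend on another's sampled value: learning_rate_init should only exist when the solver actually uses it (lbfgs is a full-batch optimizer and ignores it). A plain-Python sketch of that conditional idea, with hypothetical names in place of the library's _cascade machinery:

import random

def nn_learning_rate_init(solver):
    # MLPClassifier only uses learning_rate_init for sgd/adam
    if solver == 'lbfgs':
        return {}
    return {'learning_rate_init': random.choice([1e-4, 1e-3, 1e-2])}

solver = random.choice(['lbfgs', 'sgd', 'adam'])
params = {'solver': solver, **nn_learning_rate_init(solver)}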
Example #8
def default_gbm(task_):
    est_cls = lightgbm.LGBMRegressor if task_ == const.TASK_REGRESSION else lightgbm.LGBMClassifier
    return est_cls(n_estimators=50,
                   num_leaves=15,
                   max_depth=5,
                   subsample=0.5,
                   subsample_freq=1,
                   colsample_bytree=0.8,
                   reg_alpha=1,
                   reg_lambda=1,
                   importance_type='gain',
                   random_state=randint(),
                   verbose=-1)
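A hypothetical smoke test for this factory, assuming lightgbm is installed, const.TASK_REGRESSION holds the regression task name, and randint() resolves as sketched above:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
gbm = default_gbm('binary')        # any non-regression task yields LGBMClassifier
gbm.fit(X, y)
print(gbm.predict_proba(X[:5]))    # class probabilities for the first five rows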
Example #9
    def lr(self):
        # grow the max_iter options ~25% per step (rounded to hundreds),
        # yielding roughly 1000, 1200, 1500, ... up to just past 9000
        iters = [1000]
        while iters[-1] < 9000:
            iters.append(int(round(iters[-1] * 1.25, -2)))

        solver = Choice(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
        penalty = Cascade(partial(self._cascade, self._lr_penalty_fn, 'slvr'), slvr=solver)
        l1_ratio = Cascade(partial(self._cascade, self._lr_l1_ratio, 'penalty'), penalty=penalty)

        return dict(
            cls=LogisticRegression,
            max_iter=Choice(iters),
            solver=solver,
            penalty_stub=penalty,
            l1_ratio_stub=l1_ratio,
            random_state=randint(),
        )
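The two Cascade parameters encode scikit-learn's solver/penalty compatibility: newton-cg, lbfgs and sag only support l2; liblinear supports l1 and l2; only saga supports elasticnet, the sole penalty that uses l1_ratio. A hypothetical reconstruction of what _lr_penalty_fn must encode (the real helper's signature may differ):

def lr_penalty_fn(solver):
    # mirror sklearn's LogisticRegression solver/penalty compatibility matrix
    if solver in ('newton-cg', 'lbfgs', 'sag'):
        return ['l2']
    if solver == 'liblinear':
        return ['l1', 'l2']
    return ['l1', 'l2', 'elasticnet']  # saga; elasticnet is what enables l1_ratio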
Example #10
    def __init__(self,
                 remove_shift_variable=True,
                 variable_shift_threshold=0.7,
                 variable_shift_scorer=None,
                 auc_threshold=0.55,
                 min_features=10,
                 remove_size=0.1,
                 sample_balance=True,
                 max_test_samples=None,
                 cv=5,
                 random_state=None,
                 callbacks=None):
        self.remove_shift_variable = remove_shift_variable
        self.variable_shift_threshold = variable_shift_threshold
        self.variable_shift_scorer = variable_shift_scorer
        self.auc_threshold = auc_threshold
        self.min_features = min_features
        self.remove_size = remove_size
        self.sample_balance = sample_balance
        self.max_test_samples = max_test_samples
        self.cv = cv
        self.random_state = random_state if random_state is not None else randint()
        self.callbacks = callbacks
Example #11
def default_rf(task_):
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    est_cls = RandomForestRegressor if task_ == const.TASK_REGRESSION else RandomForestClassifier
    return est_cls(min_samples_leaf=20, min_impurity_decrease=0.01, random_state=randint())
Example #12
def default_dt(task_):
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    est_cls = DecisionTreeRegressor if task_ == const.TASK_REGRESSION else DecisionTreeClassifier
    return est_cls(min_samples_leaf=20, min_impurity_decrease=0.01, random_state=randint())
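As with default_gbm, a quick hypothetical check of these two factories, assuming const.TASK_REGRESSION == 'regression' and the randint() sketch from the top of the page:

from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
rf = default_rf('regression').fit(X, y)    # RandomForestRegressor
dt = default_dt('regression').fit(X, y)    # DecisionTreeRegressor
print(rf.score(X, y), dt.score(X, y))      # in-sample R^2 as a sanity check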