from sklearn import metrics
from fastai.tabular import *  # tabular_learner, accuracy, DatasetType, ClassificationInterpretation
from fastai.callbacks import OverSamplingCallback


def fully_connected_learner(data, rounds=20):
    """Train a fastai tabular neural-network classifier `rounds` times and
    report the averaged metrics."""
    auca = []
    acca = []
    precs = []
    sens = []

    for _ in range(rounds):
        # train
        learn = tabular_learner(data,
                                layers=[200, 100, 100],
                                metrics=accuracy,
                                callback_fns=[OverSamplingCallback])
        learn.fit_one_cycle(10, max_lr=1e-3)
        interp = ClassificationInterpretation.from_learner(
            learn, DatasetType.Test)
        y_test = interp.y_true

        # Get accuracy
        p_pred = interp.preds[:, 1]
        loc_auc = metrics.roc_auc_score(y_test, p_pred)
        loc_acc = get_acc(interp)
        cm = interp.confusion_matrix()  # rows = actual class, cols = predicted class
        # precision = TP / (TP + FP); sensitivity (recall) = TP / (TP + FN)
        precision = cm[1][1] / (cm[0][1] + cm[1][1])
        sensitivity = cm[1][1] / (cm[1][0] + cm[1][1])

        acca.append(loc_acc)
        auca.append(loc_auc)
        precs.append(precision)
        sens.append(sensitivity)

    print_results(auca, acca, sens, precs)
    interp = ClassificationInterpretation.from_learner(learn, DatasetType.Test)
    return interp
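
# get_acc and print_results are not defined in this snippet. A minimal sketch of
# what they might look like, assuming fastai v1's ClassificationInterpretation
# API (interp.pred_class / interp.y_true are tensors); the names and formatting
# below are illustrative only:
import numpy as np

def get_acc(interp):
    # fraction of predictions that match the true labels
    return (interp.pred_class == interp.y_true).float().mean().item()

def print_results(auca, acca, sens, precs):
    # report mean and standard deviation across the training rounds
    for name, vals in [("AUC", auca), ("accuracy", acca),
                       ("sensitivity", sens), ("precision", precs)]:
        print(f"{name}: {np.mean(vals):.3f} +/- {np.std(vals):.3f}")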
Example #2
import pandas as pd
from sklearn import datasets
from fastai.tabular import *  # TabularList, tabular_learner, accuracy


def fastai_model():
    iris = datasets.load_iris()
    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
    y = pd.Series(iris.target, name="label")
    data = (TabularList.from_df(
        pd.concat([X, y], axis=1),
        cont_names=list(X.columns)).split_by_rand_pct(
            valid_pct=0.1, seed=42).label_from_df(cols="label").databunch())
    model = tabular_learner(data, metrics=accuracy, layers=[3])
    model.fit(1)
    return ModelWithData(model=model, inference_dataframe=X)
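
# A possible way to exercise the returned object: ModelWithData appears to be a
# simple container pairing the trained learner with the dataframe used for
# inference. Assuming fastai v1's Learner.predict, which accepts a single pandas
# row for tabular models, a quick check might look like this (sketch only):
bundle = fastai_model()
pred_class, pred_idx, probs = bundle.model.predict(bundle.inference_dataframe.iloc[0])
print(pred_class, probs)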
def fastai_model(data, **kwargs):
    return tabular_learner(data, metrics=accuracy, layers=[5, 3, 2], **kwargs)
    def _fit(self, X_train, y_train, X_val, y_val, time_limit=None, **kwargs):
        try_import_fastai_v1()
        from fastai.layers import LabelSmoothingCrossEntropy
        from fastai.tabular import tabular_learner
        from fastai.utils.mod_display import progress_disabled_ctx
        from .callbacks import EarlyStoppingCallbackWithTimeLimit, SaveModelCallback

        start_time = time.time()

        self.y_scaler = self.params.get('y_scaler', None)
        if self.y_scaler is not None:
            self.y_scaler = copy.deepcopy(self.y_scaler)

        logger.log(15,
                   f'Fitting Neural Network with parameters {self.params}...')
        data = self.preprocess_train(X_train, y_train, X_val, y_val)

        nn_metric, objective_func_name = self.__get_objective_func_name()
        objective_func_name_to_monitor = self.__get_objective_func_to_monitor(
            objective_func_name)
        objective_optim_mode = 'min' if objective_func_name in [
            'root_mean_squared_error',
            'mean_squared_error',
            'mean_absolute_error',
            'r2'  # Regression objectives
        ] else 'auto'

        # TODO: calculate max emb concat layer size and use 1st layer as that value and 2nd in between number of classes and the value
        if self.params.get('layers', None) is not None:
            layers = self.params['layers']
        elif self.problem_type in [REGRESSION, BINARY]:
            layers = [200, 100]
        else:
            base_size = max(len(data.classes) * 2, 100)
            layers = [base_size * 2, base_size]
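            # For example, a 10-class problem gives base_size = max(20, 100) = 100,
            # i.e. layers = [200, 100]; only problems with more than 50 classes grow
            # the default architecture beyond that.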

        loss_func = None
        if (self.problem_type in [BINARY, MULTICLASS]
                and self.params.get('smoothing', 0.0) > 0.0):
            loss_func = LabelSmoothingCrossEntropy(self.params['smoothing'])

        ps = self.params['ps']
        if not isinstance(ps, list):
            ps = [ps]

        if time_limit:
            time_elapsed = time.time() - start_time
            time_left = time_limit - time_elapsed
        else:
            time_left = None

        early_stopping_fn = partial(
            EarlyStoppingCallbackWithTimeLimit,
            monitor=objective_func_name_to_monitor,
            mode=objective_optim_mode,
            min_delta=self.params['early.stopping.min_delta'],
            patience=self.params['early.stopping.patience'],
            time_limit=time_left)

        self.model = tabular_learner(data,
                                     layers=layers,
                                     ps=ps,
                                     emb_drop=self.params['emb_drop'],
                                     metrics=nn_metric,
                                     loss_func=loss_func,
                                     callback_fns=[early_stopping_fn])
        logger.log(15, self.model.model)

        with make_temp_directory() as temp_dir:
            save_callback = SaveModelCallback(
                self.model,
                monitor=objective_func_name_to_monitor,
                mode=objective_optim_mode,
                name=self.name)
            with progress_disabled_ctx(self.model) as model:
                original_path = model.path
                model.path = Path(temp_dir)
                model.fit_one_cycle(self.params['epochs'],
                                    self.params['lr'],
                                    callbacks=save_callback)

                # Load the best one and export it
                model.load(self.name)

                if objective_func_name == 'log_loss':
                    eval_result = model.validate()[0]
                else:
                    eval_result = model.validate()[1].numpy().reshape(-1)[0]

                logger.log(15, f'Model validation metrics: {eval_result}')
                model.path = original_path
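
# In fastai v1 the two callback hooks differ: `callback_fns` takes callables that
# receive the Learner and are instantiated for every fit (hence the partial around
# EarlyStoppingCallbackWithTimeLimit, a project-specific subclass), while
# `callbacks=` in fit_one_cycle takes already-constructed Callback objects (hence
# the SaveModelCallback instance). A minimal sketch of the same pattern using only
# fastai's built-in trackers, assuming `data` is a TabularDataBunch like the one
# built above:
from functools import partial
from fastai.tabular import tabular_learner
from fastai.metrics import accuracy
from fastai.callbacks import EarlyStoppingCallback, SaveModelCallback

learn = tabular_learner(
    data,
    layers=[200, 100],
    metrics=accuracy,
    callback_fns=[partial(EarlyStoppingCallback, monitor='valid_loss',
                          min_delta=0.001, patience=3)])
learn.fit_one_cycle(10, 1e-3,
                    callbacks=[SaveModelCallback(learn, monitor='valid_loss',
                                                 every='improvement', name='best')])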
Example #5
    def _fit(self,
             X,
             y,
             X_val=None,
             y_val=None,
             time_limit=None,
             num_cpus=None,
             num_gpus=0,
             sample_weight=None,
             **kwargs):
        try_import_fastai_v1()
        import torch
        from fastai.layers import LabelSmoothingCrossEntropy
        from fastai.tabular import tabular_learner
        from fastai.utils.mod_display import progress_disabled_ctx
        from fastai.core import defaults
        from .callbacks import EarlyStoppingCallbackWithTimeLimit, SaveModelCallback

        start_time = time.time()
        if sample_weight is not None:  # TODO: support
            logger.log(
                15,
                "sample_weight not yet supported for NNFastAiTabularModel, this model will ignore them in training."
            )

        params = self.params.copy()

        self.y_scaler = params.get('y_scaler', None)
        if self.y_scaler is not None:
            self.y_scaler = copy.deepcopy(self.y_scaler)

        if num_cpus is None:
            num_cpus = defaults.cpus
        # additional workers help only when fork is enabled; with other
        # multiprocessing start methods, communication overhead reduces performance
        num_workers = int(num_cpus / 2)
        if not is_fork_enabled():
            num_workers = 0
        if num_gpus is not None:
            if num_gpus == 0:
                # TODO: Does not obviously impact inference speed
                defaults.device = torch.device('cpu')
            else:
                defaults.device = torch.device('cuda')

        logger.log(15, f'Fitting Neural Network with parameters {params}...')
        data = self._preprocess_train(X,
                                      y,
                                      X_val,
                                      y_val,
                                      num_workers=num_workers)

        nn_metric, objective_func_name = self.__get_objective_func_name()
        objective_func_name_to_monitor = self.__get_objective_func_to_monitor(
            objective_func_name)
        objective_optim_mode = 'min' if objective_func_name in [
            'root_mean_squared_error',
            'mean_squared_error',
            'mean_absolute_error',
            'r2'  # Regression objectives
        ] else 'auto'

        # TODO: calculate max emb concat layer size and use 1st layer as that value and 2nd in between number of classes and the value
        if params.get('layers', None) is not None:
            layers = params['layers']
        elif self.problem_type in [REGRESSION, BINARY]:
            layers = [200, 100]
        else:
            base_size = max(len(data.classes) * 2, 100)
            layers = [base_size * 2, base_size]

        loss_func = None
        if (self.problem_type in [BINARY, MULTICLASS]
                and params.get('smoothing', 0.0) > 0.0):
            loss_func = LabelSmoothingCrossEntropy(params['smoothing'])

        ps = params['ps']
        if not isinstance(ps, list):
            ps = [ps]

        if time_limit:
            time_elapsed = time.time() - start_time
            time_left = time_limit - time_elapsed
        else:
            time_left = None

        best_epoch_stop = params.get("best_epoch",
                                     None)  # Use best epoch for refit_full.
        early_stopping_fn = partial(
            EarlyStoppingCallbackWithTimeLimit,
            monitor=objective_func_name_to_monitor,
            mode=objective_optim_mode,
            min_delta=params['early.stopping.min_delta'],
            patience=params['early.stopping.patience'],
            time_limit=time_left,
            best_epoch_stop=best_epoch_stop)

        self.model = tabular_learner(data,
                                     layers=layers,
                                     ps=ps,
                                     emb_drop=params['emb_drop'],
                                     metrics=nn_metric,
                                     loss_func=loss_func,
                                     callback_fns=[early_stopping_fn])
        logger.log(15, self.model.model)

        with make_temp_directory() as temp_dir:
            save_callback = SaveModelCallback(
                self.model,
                monitor=objective_func_name_to_monitor,
                mode=objective_optim_mode,
                name=self.name,
                best_epoch_stop=best_epoch_stop)
            with progress_disabled_ctx(self.model) as model:
                original_path = model.path
                model.path = Path(temp_dir)
                model.fit_one_cycle(params['epochs'],
                                    params['lr'],
                                    callbacks=save_callback)

                # Load the best one and export it
                model.load(self.name)

                if objective_func_name == 'log_loss':
                    eval_result = model.validate()[0]
                else:
                    eval_result = model.validate()[1].numpy().reshape(-1)[0]

                logger.log(15, f'Model validation metrics: {eval_result}')
                model.path = original_path
            self.params_trained['best_epoch'] = save_callback.best_epoch
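
# _preprocess_train is not shown here; for regression problems the deep-copied
# y_scaler is typically fit on the training target and then inverted on the
# model's predictions. A minimal sketch of that pattern, assuming a scikit-learn
# scaler (hypothetical helpers, not the actual AutoGluon implementation):
import numpy as np
from sklearn.preprocessing import StandardScaler

def scale_target(y_train, y_scaler=None):
    # fit the scaler on the training target and return the scaled values
    y_scaler = y_scaler if y_scaler is not None else StandardScaler()
    y = np.asarray(y_train, dtype=np.float64).reshape(-1, 1)
    return y_scaler.fit_transform(y).ravel(), y_scaler

def unscale_predictions(preds, y_scaler):
    # map model predictions back to the original target scale
    p = np.asarray(preds, dtype=np.float64).reshape(-1, 1)
    return y_scaler.inverse_transform(p).ravel()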
Example #6
def get_new_model_and_pred(train: pd.DataFrame,
                           valid: pd.DataFrame,
                           path: Path = MODELS_PATH) -> Tuple[Learner, float]:
    """Take new train and validation dataframes, re-run the model, and return
    the model and its root mean squared percentage error.

    Input: the train dataframe, the validation dataframe, and the path for the
    models to be saved.
    Output: the model (ready to save if better than the old one) and its rmspe.
    """

    # Sort the train/valid sets and concatenate them
    train.sort_index(inplace=True)
    valid.sort_index(inplace=True)
    df = pd.concat([train, valid]).copy()

    # We'll need to know how many items are in our validation set later
    n_valid = len(valid[valid.sales != 0])

    # Preprocessing
    df = preprocess.preprocess(df)
    inner_args = preprocess.gather_args(df)

    # Create a databunch by starting with a TabularList and applying the usual
    # transformations
    data = (TabularList.from_df(df,
                                path=path,
                                cat_names=inner_args['cat_names'],
                                cont_names=inner_args['cont_names'],
                                procs=inner_args['procs']))

    n_items = len(data.items)

    # Since we sorted by index and appended, our validation set is just the
    # n_valid highest items in our list
    data = data.split_by_valid_func(lambda i: i >= n_items - n_valid)
    data = data.label_from_df(cols=inner_args['dep_var'],
                              label_cls=FloatList,
                              log=True)
    data = data.databunch()

    # Create a learner
    # Let's construct the learner from scratch here, in case we want to change
    # the architecture later (we can and should - this is very basic)
    learn = tabular_learner(
        data,
        layers=[100, 100],
        ps=[0.001, 0.01],
        emb_drop=0.01,
        metrics=exp_rmspe,
        y_range=None,
        callback_fns=[
            partial(callbacks.tracker.TrackerCallback, monitor='exp_rmspe'),
            partial(callbacks.tracker.EarlyStoppingCallback,
                    mode='min',
                    monitor='exp_rmspe',
                    min_delta=0.01,
                    patience=0),
            partial(callbacks.tracker.SaveModelCallback,
                    monitor='exp_rmspe',
                    mode='min',
                    every='improvement',
                    name=datetime.now().strftime("%Y-%m-%d-%X"))
        ])

    # Since repeated model runs showed us that 1e-3 was a good maximum learning
    # rate for this model and since we're doing a no-human-intervention run,
    # we'll use 1e-3 for this model. While this model is in place, we can run
    # some offline tests as needed to see whether the maximum learning rate
    # should be changed, but in most cases the 1e-3 is probably good, even if
    # the model changes (again, we can test offline and update if needed).

    # Also, since we have the early-stopping callback with the save-model
    # callback set to 'every=improvement', we'll run 10 cycles even though we
    # probably won't need nearly that many
    learn.fit_one_cycle(cyc_len=10, max_lr=1e-3)

    # Get our predictions from the model and calculate rmspe
    log_preds, log_reals = learn.get_preds(ds_type=DatasetType.Valid)
    preds = np.exp(log_preds).flatten()
    reals = np.exp(log_reals)
    new_rmspe = rmspe(preds, reals)
    return (learn, new_rmspe)
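
# rmspe() is used above but not defined in this snippet. The usual root mean
# squared percentage error, which fastai v1's exp_rmspe metric computes after
# exponentiating the log-scale predictions, can be sketched as:
import numpy as np

def rmspe(preds, reals):
    preds, reals = np.asarray(preds), np.asarray(reals)
    # error of each prediction, expressed relative to the true value
    pct_err = (reals - preds) / reals
    return float(np.sqrt(np.mean(pct_err ** 2)))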