コード例 #1
0
def parse_data_args(dtrain,
                    X_train,
                    y_train,
                    dtrain_2,
                    X_train_2,
                    y_train_2,
                    split_size=0.1,
                    random_state=0):
    """Normalize the (dataset | X/y-array) argument pairs into two datasets.

    Exactly one representation must be supplied per layer: either ``dtrain``
    or the ``X_train``/``y_train`` pair (likewise for the ``_2`` arguments).
    When no second-layer data is given at all, it is carved out of the
    first-layer data with ``train_test_split``.

    Returns:
        tuple: (dtrain, dtrain_2) as XYCDataset instances.

    Raises:
        RuntimeError: if no first-layer data is given, or both
            representations of the same layer are given.
    """
    if dtrain is not None:
        # The dataset object wins; raw arrays must not come alongside it.
        if X_train is not None:
            raise RuntimeError('both dtrain and X_train given')
        if y_train is not None:
            raise RuntimeError('both dtrain and y_train given')
        X_train, y_train, cat_cols = dtrain.X, dtrain.y, dtrain.cat_cols
    else:
        if X_train is None or y_train is None:
            raise RuntimeError('no train data given')
        cat_cols = []

    # No usable second-layer data at all -> split the first-layer data.
    if dtrain_2 is None and (X_train_2 is None or y_train_2 is None):
        X_train, X_train_2, y_train, y_train_2 = train_test_split(
            X_train, y_train, test_size=split_size, random_state=random_state)
    dtrain = XYCDataset(X_train, y_train, cat_cols)

    if dtrain_2 is not None:
        if X_train_2 is not None:
            raise RuntimeError('both dtrain_2 and X_train_2 given')
        if y_train_2 is not None:
            raise RuntimeError('both dtrain_2 and y_train_2 given')
    else:
        dtrain_2 = XYCDataset(X_train_2, y_train_2, cat_cols)

    return dtrain, dtrain_2
コード例 #2
0
def test_cat_preprocess_cv():
    """One-hot preprocessing of CV folds should leave no categorical columns."""
    features, labels = make_classification(n_samples=200,
                                           n_features=20,
                                           n_informative=10,
                                           n_classes=2)
    folds = XYCDataset(features, labels).cv_split(4)
    folds = cat_preprocess_cv(folds, one_hot_max_size=5)

    assert len(folds) == 4
    train_fold, test_fold = folds[0]
    assert len(train_fold.cat_cols) == 0
    assert len(test_fold.cat_cols) == 0
コード例 #3
0
def test_advanced_pipeline_biclass(trainer_class):
    """End-to-end biclass optimization with a LocalTracker.

    On any failure the tracker directory is removed (best effort) before
    the exception is re-raised; on success it is left for inspection.
    """
    try:
        features, labels = make_classification(n_samples=200,
                                               n_features=20,
                                               n_informative=10,
                                               n_classes=2)
        directory = '/tmp/local_dir'
        tracker = LocalTracker(directory)

        ctb_space = ModelSpace(
            CtBClassifier, {
                'learning_rate': hp.loguniform('learning_rate', -5, -1),
                'iterations': 10
            })

        candidates = [
            XGBClassifier, LGBMClassifier, RFClassifier, ctb_space,
            EnsembleClassifier
        ]
        trainer = trainer_class(candidates, tracker=tracker)
        trainer.crossval_optimize_params(Accuracy(),
                                         XYCDataset(features, labels),
                                         opt_evals=3,
                                         metrics=[RocAuc()])
        trainer.get_best_results()

        # The tracker must have written something into its directory.
        assert os.listdir(directory)
    except Exception as err:
        # Best-effort cleanup; ignore any cleanup error and re-raise the
        # original failure.
        try:
            shutil.rmtree(directory)
        except Exception:
            pass
        raise err
コード例 #4
0
def test_basic_pipeline_regression(trainer_class):
    """train_ensemble_model over regressors should yield an EnsembleRegressor."""
    x, y = make_regression(n_samples=200,
                           n_features=20,
                           n_informative=10,
                           n_targets=1)

    xgb_model_space = ModelSpace(XGBRegressor, {'n_estimators': 15},
                                 name='XGB')
    ctb_model_space = ModelSpace(
        CtBRegressor, {
            'learning_rate': hp.loguniform('learning_rate', -5, -1),
            'iterations': 10
        })

    # BUG FIX: the base-model list previously contained LGBMClassifier —
    # a classifier — inside this regression pipeline; use LGBMRegressor,
    # matching the other regression tests in this suite.
    results = train_ensemble_model(
        [xgb_model_space, LGBMRegressor, ctb_model_space],
        EnsembleRegressor,
        metric=Mse(),
        dtrain=XYCDataset(x, y),
        split_size=0.2,
        random_state=0,
        base_trainer=trainer_class,
        ensemble_trainer=trainer_class,
        base_trainer_kwargs={'opt_evals': 3},
        ensemble_trainer_kwargs={'opt_evals': 3})
    assert 'final_model' in results
    assert 'base_training' in results
    assert 'ensemble_training' in results
    assert isinstance(results['final_model'], EnsembleRegressor)
コード例 #5
0
def assess_model(metrics, model, train_objects, train_target):
    """Evaluate each metric for a trained model on the given data.

    Args:
        metrics: iterable of metric identifiers accepted by get_metric_function.
        model: trained model exposing ``predict(XYCDataset)``.
        train_objects: feature matrix to predict on.
        train_target: ground-truth targets aligned with train_objects.

    Returns:
        list of float: one Python scalar score per metric, in input order.
    """
    model_results = []
    prediction = model.predict(XYCDataset(train_objects))

    for metric in metrics:
        metric_function = get_metric_function(metric)
        score = metric_function(train_target, prediction)
        # BUG FIX: np.asscalar() was removed in NumPy 1.23; .item() is its
        # documented, behavior-identical replacement.
        model_results.append(np.asarray(score).item())

    return model_results
コード例 #6
0
def test_basic_pipeline_biclass_with_cat_preprocess_mask(trainer_class):
    """A per-model cat_preprocess mask is accepted by crossval_optimize_params."""
    features, labels = make_classification(n_samples=200,
                                           n_features=20,
                                           n_informative=10,
                                           n_classes=2)
    trainer = trainer_class([XGBClassifier, LGBMClassifier, RFClassifier])
    trainer.crossval_optimize_params(Accuracy(),
                                     XYCDataset(features, labels),
                                     opt_evals=3,
                                     cat_preprocess=[False, False, True])
    trainer.get_best_results()
コード例 #7
0
def test_split():
    """split(k) yields k chunks whose sizes differ by at most one, each
    holding an independent copy of cat_cols."""
    n_objects, n_features = 20, 5
    X = np.arange(n_objects * n_features, dtype=float).reshape(
        n_objects, n_features)
    y = np.arange(n_objects)
    cat_cols = [1, 2]
    chunks = XYCDataset(X, y, cat_cols).split(6)
    assert len(chunks) == 6

    for idx, chunk in enumerate(chunks):
        # 20 objects across 6 chunks -> sizes 4, 4, 3, 3, 3, 3.
        expected_size = 4 if idx < 2 else 3
        assert len(chunk.X) == expected_size
        assert len(chunk.y) == expected_size

        # Each chunk must own a *copy* of cat_cols, not share the list.
        assert chunk.cat_cols == cat_cols
        chunk.cat_cols[0] = 3
        assert chunk.cat_cols != cat_cols
コード例 #8
0
def test_cv_split():
    """cv_split(k) yields k (train, test) pairs with independent cat_cols."""
    n_objects, n_features = 20, 5
    X = np.arange(n_objects * n_features, dtype=float).reshape(
        n_objects, n_features)
    y = np.arange(n_objects)
    cat_cols = [1, 2]
    folds = XYCDataset(X, y, cat_cols).cv_split(4)
    assert len(folds) == 4

    for fold in folds:
        assert len(fold) == 2
        train_part, test_part = fold
        assert train_part.cat_cols == cat_cols
        assert test_part.cat_cols == cat_cols
        # 4-fold CV over 20 objects: 15 train / 5 test per fold.
        assert len(train_part.X) == 15 and len(train_part.y) == 15
        assert len(test_part.X) == 5 and len(test_part.y) == 5

    # Mutating one side's cat_cols must not leak into the other side
    # or into the source list.
    for train_part, test_part in folds:
        train_part.cat_cols[:] = []
        assert test_part.cat_cols != train_part.cat_cols
        assert train_part.cat_cols != cat_cols
コード例 #9
0
def test_basic_pipeline_regression(trainer_class):
    """A fixed hyperparameter ('n_estimators': 15) must survive optimization."""
    features, targets = make_regression(n_samples=200,
                                        n_features=20,
                                        n_informative=10,
                                        n_targets=1)
    xgb_space = ModelSpace(XGBRegressor, {'n_estimators': 15}, name='XGB')
    ctb_space = ModelSpace(
        CtBRegressor, {
            'learning_rate': hp.loguniform('learning_rate', -5, -1),
            'iterations': 10
        })
    trainer = trainer_class(
        [LGBMRegressor, xgb_space, ctb_space, EnsembleRegressor])
    trainer.crossval_optimize_params(Mse(), XYCDataset(features, targets),
                                     opt_evals=3)
    best = trainer.get_best_results()
    assert best['XGB']['result']['params']['n_estimators'] == 15
コード例 #10
0
def test_basic_pipeline_biclass(trainer_class):
    """Smoke test: optimize a mixed list of classifiers plus an ensemble."""
    features, labels = make_classification(n_samples=200,
                                           n_features=20,
                                           n_informative=10,
                                           n_classes=2)

    ctb_space = ModelSpace(
        CtBClassifier, {
            'learning_rate': hp.loguniform('learning_rate', -5, -1),
            'iterations': 10
        })

    candidates = [
        XGBClassifier, LGBMClassifier, RFClassifier, ctb_space,
        EnsembleClassifier
    ]
    trainer = trainer_class(candidates)
    trainer.crossval_optimize_params(Accuracy(),
                                     XYCDataset(features, labels),
                                     opt_evals=3)
    trainer.get_best_results()
コード例 #11
0
 def _convert_to_dataset(self, data, label=None, cat_cols=None):
     """Wrap raw data (plus optional labels / categorical columns) into an XYCDataset."""
     return XYCDataset(data, label, cat_cols)
コード例 #12
0
def process_job(job):
    """Run one training job end to end.

    Downloads the data and model specification referenced by ``job.input``
    (JSON with ``data_path`` and ``model_path``), cross-validates and fits
    each requested model, uploads the fitted model and the evaluation
    results, and marks the job COMPLETED or FAILED via the gRPC stub.

    Args:
        job: job message carrying ``input``, ``output`` and ``status`` fields.
    """
    out_folder = None  # known only after job.input parses; guarded in except
    try:
        # Renamed from `input`, which shadowed the builtin.
        job_input = json.loads(job.input)
        out_folder = Path(job_input["model_path"]).parent
        data_path = afs_download(job_input["data_path"])
        model_path = afs_download(job_input["model_path"])
        train_target, train_objects = load_data(data_path)
        dataset = XYCDataset(train_objects, train_target)
        with open(model_path, "rb") as f:
            models = json.load(f)
        cv = dataset.cv_split(models['cv'])

        for model in models['models']:
            if not MODEL_CLASSES.get(model['type']):
                # BUG FIX: the '%s' placeholder was never substituted.
                raise NotImplementedError('Classifier %s is not supported' %
                                          model['type'])
            metrics = [
                PREDEFINED_METRICS.get(metric)()
                for metric in models['metrics']
            ]
            # Reuse `metrics` instead of rebuilding the identical list inline.
            result = crossval_fit_eval(
                model_type=MODEL_CLASSES[model['type']],
                params=model.get('params'),
                cv=cv,
                metrics=metrics,
                verbose=False)

            # Refit on the full dataset for the uploaded artifact.
            res_model = MODEL_CLASSES[model['type']](
                params=model.get('params')).fit(dataset)

        # NOTE(review): only the *last* model's result/res_model survive the
        # loop, and an empty models['models'] leaves them unbound — this
        # mirrors the original behavior; confirm whether it is intended.
        result["status"] = STATUS_OK
        losses = [
            cv_result[metrics[-1].name]
            for cv_result in result["metric_cv_results"]
        ]
        result["loss_variance"] = np.std(losses)

        if model['type'] in (MODEL_CLASSES['CtBClassifier'],
                             MODEL_CLASSES['CtBRegressor']):
            cleanup_catboost()

        afs_upload(pickle.dumps(res_model), out_folder / 'model.pickle')
        afs_upload(json.dumps(result).encode(), out_folder / 'output.json')

        job.output = json.dumps({
            'output':
            str(out_folder / 'output.json'),
            'result_model_path':
            str(out_folder / 'model.pickle')
        })

        job.status = Job.COMPLETED
        CONFIG["stub"].ModifyJob(job)
    except Exception as exc:
        logging.warning(str(exc))
        traceback.print_exc()
        log = "Error:\n" + str(exc) + "\n\n\nTrace:\n" + traceback.format_exc()
        # BUG FIX: out_folder was unbound here whenever job.input failed to
        # parse, so the original error was masked by a NameError.
        if out_folder is not None:
            afs_upload(str(log).encode(), out_folder / 'error.log')
            job.output = json.dumps({'error': str(out_folder / 'error.log')})
        else:
            job.output = json.dumps({'error': log})
        job.status = Job.FAILED
        CONFIG["stub"].ModifyJob(job)
        return

    return
コード例 #13
0
        CLASSIFIERS,
        ['CtBClassifier', 'XGBClassifier', 'RFClassifier'],
    ))


class Case(object):
    """Value object pairing a dataset with the accuracy expected on it."""

    def __init__(self, dataset, expected_accuracy):
        # expected_accuracy: the accuracy the model is expected to reach
        # on `dataset`.
        self.expected_accuracy = expected_accuracy
        self.dataset = dataset


# Small synthetic binary-classification problem shared by the tests below,
# exposed both as an XYCDataset and as a pandas DataFrame.
basic_X, basic_y = make_classification(n_samples=20,
                                       n_features=4,
                                       n_informative=2,
                                       n_classes=2)
basic_XYCDataset = XYCDataset(basic_X, basic_y)
# Same data as a DataFrame with the target appended as column 'y'
# (consumed by the basic_dataset_path fixture).
basic_DataFrame = DataFrame(data=basic_X)
basic_DataFrame['y'] = basic_y
path = 'basic_dataset.csv'


@pytest.fixture(scope='session')
def basic_dataset_path(tmpdir_factory):
    """Write the shared basic dataset to a session-scoped temp CSV; return its path."""
    csv_path = str(tmpdir_factory.mktemp('data').join(path))
    basic_DataFrame.to_csv(csv_path)
    return csv_path


def models(classifier):
    return ModelSpace(
        classifier,
コード例 #14
0
def train_ensemble_model(model_spaces,
                         ensemble_model,
                         metric,
                         dtrain=None,
                         X_train=None,
                         y_train=None,
                         dtrain_2=None,
                         X_train_2=None,
                         y_train_2=None,
                         split_size=0.2,
                         random_state=0,
                         base_trainer=None,
                         base_trainer_kwargs=None,
                         ensemble_trainer=None,
                         ensemble_trainer_kwargs=None,
                         save_dir=None,
                         base_tracker=None,
                         ensemble_tracker=None,
                         add_meta_features=False,
                         ensemble_model_params=None):
    """
    Args:
        model_spaces (list of modelgym.models.Model or modelgym.utils.ModelSpaces): list of model spaces
                (model classes and parameter spaces to look in). If some list item is Model, it is
                converted in ModelSpace with default space and name equal to model class __name__
        ensemble_model: one of modelgym.models.Model
        metric (modelgym.metrics.Metric): metric to optimize
        dtrain (modelgym.utils.XYCDataset or None): dataset
        X_train (np.array(n_samples, n_features)): instead of dtrain
        y_train (np.array(n_samples)): labels
        # if no *_2 given - split train given above
        dtrain_2 (modelgym.utils.XYCDataset or None): dataset to fit second layer
        X_train_2 (np.array(n_samples, n_features)): instead of dtrain_2
        y_train_2 (np.array(n_samples)): labels to train second layer
        split_size (float 0..1): split train if no train_2 given
        random_state (int): random state to split
        base_trainer (one of modelgym.trainers or None): trainer to train base models
            RandomTrainer uses if None
        base_trainer_kwargs (dict or None): kwargs to pass to base_trainer.crossval_optimize_params
            (None means no extra kwargs)
        ensemble_trainer (one of modelgym.trainers or None): trainer to train second layer model
            RandomTrainer uses if None
        ensemble_trainer_kwargs (dict or None): kwargs to pass to ensemble_trainer.crossval_optimize_params
            (None means no extra kwargs)
        save_dir (str or None): directory to track
        base_tracker (modelgym.trackers or None): tracker for base models
        ensemble_tracker (modelgym.trackers or None): tracker for second layer model
        add_meta_features (bool, default=False): add predictions of base models to train_2
            e.x. make stacking (otherwise - weighted ensemble)
        ensemble_model_params (dict or None): params for last layer model
    Return:
        dict: {
        'base_training': base_results,  # results of base models trainer
        'ensemble_training': ensemble_results,  # results of ensemble model trainer
        # model instance with all params
        'final_model': ensemble_model(ensemble_results[ensemble_model.__name__]['result']['params'])
    }
    """
    # BUG FIX: the kwargs/params defaults were mutable dicts ({}), shared
    # across calls; use the None-sentinel idiom instead (backward compatible).
    base_trainer_kwargs = base_trainer_kwargs or {}
    ensemble_trainer_kwargs = ensemble_trainer_kwargs or {}
    ensemble_model_params = ensemble_model_params or {}

    dtrain, dtrain_2 = parse_data_args(dtrain,
                                       X_train,
                                       y_train,
                                       dtrain_2,
                                       X_train_2,
                                       y_train_2,
                                       split_size=split_size,
                                       random_state=random_state)

    # Default to local tracking under save_dir when no trackers are supplied.
    if save_dir is not None:
        if base_tracker is None:
            base_tracker = LocalTracker(save_dir, 'base_models')
        if ensemble_tracker is None:
            ensemble_tracker = LocalTracker(save_dir, 'ensemble_model')

    if base_trainer is None:
        base_trainer = RandomTrainer(model_spaces, tracker=base_tracker)
    else:
        base_trainer = base_trainer(model_spaces, tracker=base_tracker)

    # First layer: optimize every base model on the first-layer data.
    base_trainer.crossval_optimize_params(metric, dtrain,
                                          **base_trainer_kwargs)
    base_results = base_trainer.get_best_results()

    # Instantiate each base model with its best found hyperparameters.
    trained_models = []
    for model, space in base_trainer.model_spaces.items():
        trained_models.append(
            space.model_class(base_results[model]['result']['params']))

    if add_meta_features:
        # Stacking: append base-model predictions on the second-layer data
        # as extra feature columns.
        X_train_2, y_train_2, cat_cols = dtrain_2.X, dtrain_2.y, dtrain_2.cat_cols
        meta_features = np.zeros((len(X_train_2), len(model_spaces)))
        if not isinstance(X_train_2, np.ndarray):
            X_train_2 = np.array(X_train_2)
        for i, model in enumerate(trained_models):
            model.fit(dtrain)
            meta_features[:, i] = model.predict(dtrain_2)
        X_train_2 = np.concatenate([X_train_2, meta_features], axis=1)
        dtrain_2 = XYCDataset(X_train_2, y_train_2, cat_cols)

        params = ensemble_model_params
    else:
        # Weighted ensemble: one uniform weight hyperparameter per base model.
        params = {
            'weight_{}'.format(i): hp.uniform('weight_{}'.format(i), 0, 1)
            for i, _ in enumerate(model_spaces)
        }
        params['models'] = trained_models

    ensemble_model_space = ModelSpace(ensemble_model, params)

    if ensemble_trainer is None:
        ensemble_trainer = RandomTrainer(ensemble_model_space,
                                         tracker=ensemble_tracker)
    else:
        ensemble_trainer = ensemble_trainer(ensemble_model_space,
                                            tracker=ensemble_tracker)

    # Second layer: optimize the ensemble model on the second-layer data.
    ensemble_trainer.crossval_optimize_params(metric, dtrain_2,
                                              **ensemble_trainer_kwargs)
    ensemble_results = ensemble_trainer.get_best_results()

    return {
        'base_training':
        base_results,
        'ensemble_training':
        ensemble_results,
        'final_model':
        ensemble_model(
            ensemble_results[ensemble_model.__name__]['result']['params'])
    }