def parse_data_args(dtrain, X_train, y_train,
                    dtrain_2, X_train_2, y_train_2,
                    split_size=0.1, random_state=0):
    if dtrain is None:
        if X_train is None or y_train is None:
            raise RuntimeError('no train data given')
        cat_cols = []
    else:
        if X_train is not None:
            raise RuntimeError('both dtrain and X_train given')
        if y_train is not None:
            raise RuntimeError('both dtrain and y_train given')
        X_train, y_train, cat_cols = dtrain.X, dtrain.y, dtrain.cat_cols

    if dtrain_2 is None and (X_train_2 is None or y_train_2 is None):
        X_train, X_train_2, y_train, y_train_2 = train_test_split(
            X_train, y_train, test_size=split_size, random_state=random_state)

    dtrain = XYCDataset(X_train, y_train, cat_cols)
    if dtrain_2 is None:
        dtrain_2 = XYCDataset(X_train_2, y_train_2, cat_cols)
    else:
        if X_train_2 is not None:
            raise RuntimeError('both dtrain_2 and X_train_2 given')
        if y_train_2 is not None:
            raise RuntimeError('both dtrain_2 and y_train_2 given')
    return dtrain, dtrain_2
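# Usage sketch (illustrative; sklearn's make_classification stands in for real
# data): with only X_train/y_train given, a second-layer dataset of
# `split_size` fraction is split off; passing both dtrain and X_train raises.
#
#     from sklearn.datasets import make_classification
#     X, y = make_classification(n_samples=100, n_features=5)
#     dtrain, dtrain_2 = parse_data_args(None, X, y, None, None, None,
#                                        split_size=0.2, random_state=42)
#     assert len(dtrain.X) == 80 and len(dtrain_2.X) == 20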
def test_cat_preprocess_cv():
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=10, n_classes=2)
    dataset = XYCDataset(X, y)
    cv = dataset.cv_split(4)
    cv = cat_preprocess_cv(cv, one_hot_max_size=5)
    assert len(cv) == 4
    assert len(cv[0][0].cat_cols) == 0 and len(cv[0][1].cat_cols) == 0
def test_advanced_pipeline_biclass(trainer_class):
    # Define the tracker directory before the try block so the cleanup in the
    # except clause cannot hit a NameError.
    directory = '/tmp/local_dir'
    try:
        x, y = make_classification(n_samples=200, n_features=20,
                                   n_informative=10, n_classes=2)
        tracker = LocalTracker(directory)
        ctb_model_space = ModelSpace(
            CtBClassifier, {
                'learning_rate': hp.loguniform('learning_rate', -5, -1),
                'iterations': 10
            })
        trainer = trainer_class([
            XGBClassifier, LGBMClassifier, RFClassifier, ctb_model_space,
            EnsembleClassifier
        ], tracker=tracker)
        dataset = XYCDataset(x, y)
        trainer.crossval_optimize_params(Accuracy(), dataset, opt_evals=3,
                                         metrics=[RocAuc()])
        trainer.get_best_results()
        assert os.listdir(directory)
    except Exception:
        # Clean up the tracker directory on failure, then re-raise.
        shutil.rmtree(directory, ignore_errors=True)
        raise
def test_basic_pipeline_regression(trainer_class):
    x, y = make_regression(n_samples=200, n_features=20,
                           n_informative=10, n_targets=1)
    xgb_model_space = ModelSpace(XGBRegressor, {'n_estimators': 15},
                                 name='XGB')
    ctb_model_space = ModelSpace(
        CtBRegressor, {
            'learning_rate': hp.loguniform('learning_rate', -5, -1),
            'iterations': 10
        })
    results = train_ensemble_model(
        [xgb_model_space, LGBMRegressor, ctb_model_space],
        EnsembleRegressor,
        metric=Mse(),
        dtrain=XYCDataset(x, y),
        split_size=0.2,
        random_state=0,
        base_trainer=trainer_class,
        ensemble_trainer=trainer_class,
        base_trainer_kwargs={'opt_evals': 3},
        ensemble_trainer_kwargs={'opt_evals': 3})
    assert 'final_model' in results
    assert 'base_training' in results
    assert 'ensemble_training' in results
    assert isinstance(results['final_model'], EnsembleRegressor)
def assess_model(metrics, model, train_objects, train_target):
    """Evaluates the provided metrics for a trained model."""
    model_results = []
    prediction = model.predict(XYCDataset(train_objects))
    for metric in metrics:
        metric_function = get_metric_function(metric)
        score = metric_function(train_target, prediction)
        # np.asscalar was removed in NumPy 1.23; float() is the portable spelling.
        model_results.append(float(score))
    return model_results
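# Illustrative call (a sketch; assumes a modelgym model fitted on an
# XYCDataset, the same way process_job fits models elsewhere in this codebase):
#
#     model = XGBClassifier(params=None)
#     model.fit(XYCDataset(X_train, y_train))
#     scores = assess_model([Accuracy(), RocAuc()], model, X_val, y_val)
#     # scores is a list of plain Python floats, one per metric, in order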
def test_basic_pipeline_biclass_with_cat_preprocess_mask(trainer_class):
    x, y = make_classification(n_samples=200, n_features=20,
                               n_informative=10, n_classes=2)
    trainer = trainer_class([XGBClassifier, LGBMClassifier, RFClassifier])
    dataset = XYCDataset(x, y)
    trainer.crossval_optimize_params(Accuracy(), dataset, opt_evals=3,
                                     cat_preprocess=[False, False, True])
    trainer.get_best_results()
def test_split():
    objects = 20
    features = 5
    X = np.arange(objects * features).reshape(
        (objects, features)).astype(float)
    y = np.arange(objects)
    cat_cols = [1, 2]
    dataset = XYCDataset(X, y, cat_cols)
    split_datasets = dataset.split(6)
    assert len(split_datasets) == 6
    for i, ds in enumerate(split_datasets):
        if i < 2:
            assert len(ds.X) == 4
            assert len(ds.y) == 4
        else:
            assert len(ds.X) == 3
            assert len(ds.y) == 3
        assert ds.cat_cols == cat_cols
        # Each chunk must own a copy of cat_cols, not share the original list.
        ds.cat_cols[0] = 3
        assert ds.cat_cols != cat_cols
def test_cv_split():
    objects = 20
    features = 5
    X = np.arange(objects * features).reshape(
        (objects, features)).astype(float)
    y = np.arange(objects)
    cat_cols = [1, 2]
    dataset = XYCDataset(X, y, cat_cols)
    cv_splits = dataset.cv_split(4)
    assert len(cv_splits) == 4
    for split in cv_splits:
        assert len(split) == 2
        assert split[0].cat_cols == cat_cols
        assert split[1].cat_cols == cat_cols
        assert len(split[0].X) == 15
        assert len(split[0].y) == 15
        assert len(split[1].X) == 5
        assert len(split[1].y) == 5
    for split in cv_splits:
        # The train and test folds must hold independent copies of cat_cols.
        split[0].cat_cols[:] = []
        assert split[1].cat_cols != split[0].cat_cols
        assert split[0].cat_cols != cat_cols
def test_basic_pipeline_regression(trainer_class):
    x, y = make_regression(n_samples=200, n_features=20,
                           n_informative=10, n_targets=1)
    xgb_model_space = ModelSpace(XGBRegressor, {'n_estimators': 15},
                                 name='XGB')
    ctb_model_space = ModelSpace(
        CtBRegressor, {
            'learning_rate': hp.loguniform('learning_rate', -5, -1),
            'iterations': 10
        })
    trainer = trainer_class(
        [LGBMRegressor, xgb_model_space, ctb_model_space, EnsembleRegressor])
    dataset = XYCDataset(x, y)
    trainer.crossval_optimize_params(Mse(), dataset, opt_evals=3)
    results = trainer.get_best_results()
    assert results['XGB']['result']['params']['n_estimators'] == 15
def test_basic_pipeline_biclass(trainer_class):
    x, y = make_classification(n_samples=200, n_features=20,
                               n_informative=10, n_classes=2)
    ctb_model_space = ModelSpace(
        CtBClassifier, {
            'learning_rate': hp.loguniform('learning_rate', -5, -1),
            'iterations': 10
        })
    trainer = trainer_class([
        XGBClassifier, LGBMClassifier, RFClassifier, ctb_model_space,
        EnsembleClassifier
    ])
    dataset = XYCDataset(x, y)
    trainer.crossval_optimize_params(Accuracy(), dataset, opt_evals=3)
    trainer.get_best_results()
def _convert_to_dataset(self, data, label=None, cat_cols=None):
    return XYCDataset(data, label, cat_cols)
def process_job(job):
    try:
        job_input = json.loads(job.input)
        out_folder = Path(job_input["model_path"]).parent
        data_path = afs_download(job_input["data_path"])
        model_path = afs_download(job_input["model_path"])
        train_target, train_objects = load_data(data_path)
        dataset = XYCDataset(train_objects, train_target)
        with open(model_path, "rb") as f:
            models = json.load(f)
        cv = dataset.cv_split(models['cv'])
        for model in models['models']:
            if MODEL_CLASSES.get(model['type']):
                metrics = [
                    PREDEFINED_METRICS.get(metric)()
                    for metric in models['metrics']
                ]
                result = crossval_fit_eval(
                    model_type=MODEL_CLASSES[model['type']],
                    params=model.get('params'),
                    cv=cv,
                    metrics=metrics,
                    verbose=False)
                res_model = MODEL_CLASSES[model['type']](
                    params=model.get('params')).fit(dataset)
            else:
                raise NotImplementedError(
                    'Classifier %s is not supported' % model['type'])
            result["status"] = STATUS_OK
            losses = [
                cv_result[metrics[-1].name]
                for cv_result in result["metric_cv_results"]
            ]
            result["loss_variance"] = np.std(losses)
            # CatBoost leaves temporary files behind; remove them per model.
            if model['type'] in ('CtBClassifier', 'CtBRegressor'):
                cleanup_catboost()
        afs_upload(pickle.dumps(res_model), out_folder / 'model.pickle')
        afs_upload(json.dumps(result).encode(), out_folder / 'output.json')
        job.output = json.dumps({
            'output': str(out_folder / 'output.json'),
            'result_model_path': str(out_folder / 'model.pickle')
        })
        job.status = Job.COMPLETED
        CONFIG["stub"].ModifyJob(job)
    except Exception as exc:
        logging.warning(str(exc))
        traceback.print_exc()
        log = "Error:\n" + str(exc) + "\n\n\nTrace:\n" + traceback.format_exc()
        afs_upload(str(log).encode(), out_folder / 'error.log')
        job.output = json.dumps({'error': str(out_folder / 'error.log')})
        job.status = Job.FAILED
        CONFIG["stub"].ModifyJob(job)
        return
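# Shape of the inputs the worker above consumes (field values illustrative;
# only the key names are taken from process_job itself):
#
#     job.input:   {"data_path": "<afs path to training data>",
#                   "model_path": "<afs path to models.json>"}
#
#     models.json: {"cv": 4,
#                   "metrics": ["<a PREDEFINED_METRICS key>"],
#                   "models": [{"type": "XGBClassifier",
#                               "params": {"n_estimators": 15}}]}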
        CLASSIFIERS,
        ['CtBClassifier', 'XGBClassifier', 'RFClassifier'],
    ))
class Case(object):
    def __init__(self, dataset, expected_accuracy):
        self.dataset = dataset
        self.expected_accuracy = expected_accuracy


basic_X, basic_y = make_classification(n_samples=20, n_features=4,
                                       n_informative=2, n_classes=2)
basic_XYCDataset = XYCDataset(basic_X, basic_y)
basic_DataFrame = DataFrame(data=basic_X)
basic_DataFrame['y'] = basic_y
path = 'basic_dataset.csv'


@pytest.fixture(scope='session')
def basic_dataset_path(tmpdir_factory):
    filename = str(tmpdir_factory.mktemp('data').join(path))
    basic_DataFrame.to_csv(filename)
    return filename


def models(classifier):
    return ModelSpace(
        classifier,
def train_ensemble_model(model_spaces,
                         ensemble_model,
                         metric,
                         dtrain=None,
                         X_train=None,
                         y_train=None,
                         dtrain_2=None,
                         X_train_2=None,
                         y_train_2=None,
                         split_size=0.2,
                         random_state=0,
                         base_trainer=None,
                         base_trainer_kwargs={},
                         ensemble_trainer=None,
                         ensemble_trainer_kwargs={},
                         save_dir=None,
                         base_tracker=None,
                         ensemble_tracker=None,
                         add_meta_features=False,
                         ensemble_model_params={}):
    """Trains a two-layer ensemble: base models first, then an ensemble model on top.

    Args:
        model_spaces (list of modelgym.models.Model or modelgym.utils.ModelSpace):
            model spaces (model classes and parameter spaces to search in).
            If a list item is a Model, it is converted to a ModelSpace with the
            default space and a name equal to the model class __name__.
        ensemble_model: one of modelgym.models.Model
        metric (modelgym.metrics.Metric): metric to optimize
        dtrain (modelgym.utils.XYCDataset or None): dataset
        X_train (np.array(n_samples, n_features)): alternative to dtrain
        y_train (np.array(n_samples)): labels, alternative to dtrain

        If no *_2 data is given, the train data above is split.

        dtrain_2 (modelgym.utils.XYCDataset or None): dataset to fit the second layer
        X_train_2 (np.array(n_samples, n_features)): alternative to dtrain_2
        y_train_2 (np.array(n_samples)): labels to train the second layer
        split_size (float in (0, 1)): fraction split off if no *_2 data is given
        random_state (int): random state for the split
        base_trainer (one of modelgym.trainers or None): trainer for the base
            models; RandomTrainer is used if None
        base_trainer_kwargs (dict): kwargs passed to
            base_trainer.crossval_optimize_params
        ensemble_trainer (one of modelgym.trainers or None): trainer for the
            second-layer model; RandomTrainer is used if None
        ensemble_trainer_kwargs (dict): kwargs passed to
            ensemble_trainer.crossval_optimize_params
        save_dir (str or None): directory for tracking
        base_tracker (modelgym.trackers or None): tracker for the base models
        ensemble_tracker (modelgym.trackers or None): tracker for the
            second-layer model
        add_meta_features (bool, default=False): add the base models'
            predictions to train_2, i.e. build a stacking ensemble
            (otherwise a weighted ensemble is built)
        ensemble_model_params (dict): params for the last-layer model

    Returns:
        dict: {
            'base_training': base_results,          # results of the base models trainer
            'ensemble_training': ensemble_results,  # results of the ensemble model trainer
            # model instance configured with the best found params
            'final_model': ensemble_model(
                ensemble_results[ensemble_model.__name__]['result']['params'])
        }
    """
    dtrain, dtrain_2 = parse_data_args(dtrain, X_train, y_train,
                                       dtrain_2, X_train_2, y_train_2,
                                       split_size=split_size,
                                       random_state=random_state)
    if save_dir is not None:
        if base_tracker is None:
            base_tracker = LocalTracker(save_dir, 'base_models')
        if ensemble_tracker is None:
            ensemble_tracker = LocalTracker(save_dir, 'ensemble_model')

    if base_trainer is None:
        base_trainer = RandomTrainer(model_spaces, tracker=base_tracker)
    else:
        base_trainer = base_trainer(model_spaces, tracker=base_tracker)
    base_trainer.crossval_optimize_params(metric, dtrain,
                                          **base_trainer_kwargs)
    base_results = base_trainer.get_best_results()

    trained_models = []
    for model, space in base_trainer.model_spaces.items():
        trained_models.append(
            space.model_class(base_results[model]['result']['params']))

    if add_meta_features:
        # Stacking: append each base model's predictions as extra features.
        X_train_2, y_train_2, cat_cols = dtrain_2.X, dtrain_2.y, dtrain_2.cat_cols
        meta_features = np.zeros((len(X_train_2), len(model_spaces)))
        if not isinstance(X_train_2, np.ndarray):
            X_train_2 = np.array(X_train_2)
        for i, model in enumerate(trained_models):
            model.fit(dtrain)
            meta_features[:, i] = model.predict(dtrain_2)
        X_train_2 = np.concatenate([X_train_2, meta_features], axis=1)
        dtrain_2 = XYCDataset(X_train_2, y_train_2, cat_cols)
        params = ensemble_model_params
    else:
        # Weighted ensemble: optimize one weight per base model.
        params = {
            'weight_{}'.format(i): hp.uniform('weight_{}'.format(i), 0, 1)
            for i, _ in enumerate(model_spaces)
        }
    params['models'] = trained_models
    ensemble_model_space = ModelSpace(ensemble_model, params)

    if ensemble_trainer is None:
        ensemble_trainer = RandomTrainer(ensemble_model_space,
                                         tracker=ensemble_tracker)
    else:
        ensemble_trainer = ensemble_trainer(ensemble_model_space,
                                            tracker=ensemble_tracker)
    ensemble_trainer.crossval_optimize_params(metric, dtrain_2,
                                              **ensemble_trainer_kwargs)
    ensemble_results = ensemble_trainer.get_best_results()

    return {
        'base_training': base_results,
        'ensemble_training': ensemble_results,
        'final_model': ensemble_model(
            ensemble_results[ensemble_model.__name__]['result']['params'])
    }
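# Minimal usage sketch (mirrors test_basic_pipeline_regression above; the
# model list and trainer kwargs are illustrative):
#
#     results = train_ensemble_model(
#         [ModelSpace(XGBRegressor, {'n_estimators': 15}), LGBMRegressor],
#         EnsembleRegressor,
#         metric=Mse(),
#         dtrain=XYCDataset(x, y),
#         split_size=0.2,
#         base_trainer_kwargs={'opt_evals': 3},
#         ensemble_trainer_kwargs={'opt_evals': 3})
#     # 'final_model' is an EnsembleRegressor configured with the best params
#     # found; 'base_training'/'ensemble_training' hold the trainer results.
#     final_model = results['final_model']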