def test_fit_multichannel():
    """Fit a classifier on multi-channel feature data via the build helper.

    A freshly constructed ``RandomForestClassifier`` is passed to
    ``build_model.build_model_from_featureset`` together with a sample
    feature set; the object that comes back must still be a
    ``RandomForestClassifier``.
    """
    feature_names = ['amplitude', 'maximum', 'minimum', 'median']
    class_labels = ['class1', 'class2']
    # NOTE(review): the second positional argument (3) is presumably the
    # number of channels -- confirm against ``sample_featureset``.
    multichannel_fset = sample_featureset(10, 3, feature_names, class_labels)

    classifier_cls = build_model.MODELS_TYPE_DICT['RandomForestClassifier']
    fitted = build_model.build_model_from_featureset(multichannel_fset,
                                                     classifier_cls())

    assert isinstance(fitted, RandomForestClassifier)
def test_fit_multichannel():
    """Check the model building helper on multi-channel feature data.

    Builds a sample feature set, fits an existing ``RandomForestClassifier``
    instance through ``build_model.build_model_from_featureset`` and verifies
    the type of the returned model.
    """
    features = ['amplitude', 'maximum', 'minimum', 'median']
    targets = ['class1', 'class2']
    # NOTE(review): 3 is presumably the channel count of the generated
    # feature set -- verify against ``sample_featureset``.
    sample_fset = sample_featureset(10, 3, features, targets)

    unfitted = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    result = build_model.build_model_from_featureset(sample_fset, unfitted)

    assert isinstance(result, RandomForestClassifier)
def test_model_regression():
    """Check ``predict.model_predictions`` for a regression model.

    A ``RandomForestRegressor`` is built from a sample feature set with
    numeric targets; the predictions must be indexed by the same names as
    the feature set and hold floating point values.
    """
    numeric_targets = [0.1, 0.5]
    fset = sample_featureset(10, 1, ["amplitude"], numeric_targets)
    regressor = build_model.build_model_from_featureset(
        fset, model_type="RandomForestRegressor")

    predictions = predict.model_predictions(fset, regressor)

    names_agree = predictions.name == fset.name
    assert all(names_agree)
    predicted_dtype = predictions.prediction.values.dtype
    assert predicted_dtype == np.dtype("float")
def test_invalid_feature_values():
    """Check that invalid feature values are reported through ``ValueError``.

    One ``inf`` and one ``nan`` are written into two of the three features.
    Building a model must raise ``ValueError`` whose message names the two
    corrupted features but not the valid one.  After dropping the corrupted
    features the build must succeed.
    """
    bad_features = ['x_inf', 'x_nan']
    fset = sample_featureset(10, 1, ['x_valid'] + bad_features,
                             ['class1', 'class2'])
    # Corrupt a single entry of each "bad" feature.
    fset.x_inf.values[0, 0] = np.inf
    fset.x_nan.values[0, 0] = np.nan

    model = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    try:
        model = build_model.build_model_from_featureset(fset, model)
    except ValueError as err:
        message = str(err)
        assert 'x_valid' not in message
        assert 'x_inf' in message
        assert 'x_nan' in message
    else:
        raise AssertionError("Exception should have been raised for invalid data.")

    cleaned_fset = fset.drop(bad_features)
    model = build_model.build_model_from_featureset(cleaned_fset, model)
    assert isinstance(model, RandomForestClassifier)
def test_model_regression():
    """Exercise the model prediction function with a regressor.

    Builds a ``RandomForestRegressor`` from a sample feature set whose
    targets are floats, then checks that the predictions carry the feature
    set's names and a float dtype.
    """
    sample_fset = sample_featureset(10, 1, ['amplitude'], [0.1, 0.5])
    fitted_regressor = build_model.build_model_from_featureset(
        sample_fset, model_type='RandomForestRegressor')

    result = predict.model_predictions(sample_fset, fitted_regressor)

    same_names = result.name == sample_fset.name
    assert all(same_names)
    float_dtype = np.dtype('float')
    assert result.prediction.values.dtype == float_dtype
def test_score_model():
    """Check that ``build_model.score_model`` returns a float.

    The model is fitted on a sample feature set and then scored against
    that same feature set.
    """
    feature_names = ['amplitude', 'maximum', 'minimum', 'median']
    training_fset = sample_featureset(10, 1, feature_names,
                                      ['class1', 'class2'])

    classifier_cls = build_model.MODELS_TYPE_DICT['RandomForestClassifier']
    fitted = build_model.build_model_from_featureset(training_fset,
                                                     classifier_cls())

    training_score = build_model.score_model(fitted, training_fset)
    assert isinstance(training_score, float)
def test_invalid_feature_values():
    """Verify exception handling when features contain ``inf`` or ``nan``.

    The build helper is expected to raise ``ValueError`` mentioning exactly
    the features that hold invalid values; once those features are dropped
    the same model instance must fit without error.
    """
    all_features = ['x_valid', 'x_inf', 'x_nan']
    fset = sample_featureset(10, 1, all_features, ['class1', 'class2'])
    # Plant one invalid value in each of the two features under test.
    fset.x_inf.values[0, 0] = np.inf
    fset.x_nan.values[0, 0] = np.nan

    model = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    try:
        model = build_model.build_model_from_featureset(fset, model)
    except ValueError as exc:
        error_text = str(exc)
        assert 'x_valid' not in error_text
        assert 'x_inf' in error_text
        assert 'x_nan' in error_text
    else:
        raise AssertionError("Exception should have been raised for invalid data.")

    valid_only = fset.drop(['x_inf', 'x_nan'])
    model = build_model.build_model_from_featureset(valid_only, model)
    assert isinstance(model, RandomForestClassifier)
def test_model_predictions():
    """Test inner model prediction function.

    Builds a ``RandomForestClassifier`` from the stored test feature set and
    checks that the predictions have one row per sample, one column per
    distinct target value, and a float dtype.
    """
    # Open the dataset as a context manager so the NetCDF file handle is
    # released even when one of the assertions fails.
    with xr.open_dataset(pjoin(DATA_PATH, "test_featureset.nc")) as fset:
        model = build_model.build_model_from_featureset(
            fset, model_type='RandomForestClassifier')
        preds = predict.model_predictions(fset, model)

        assert preds.shape[0] == len(fset.name)
        assert preds.shape[1] == len(np.unique(fset.target.values))
        assert preds.values.dtype == np.dtype('float')
def test_score_model():
    """Test calculation of the model training score.

    Fits a ``RandomForestClassifier`` on a sample feature set and asserts
    that scoring it on the same data yields a ``float``.
    """
    fset = sample_featureset(
        10, 1,
        ['amplitude', 'maximum', 'minimum', 'median'],
        ['class1', 'class2'],
    )
    unfitted = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    trained = build_model.build_model_from_featureset(fset, unfitted)

    assert isinstance(build_model.score_model(trained, fset), float)
def test_model_regression():
    """Test model prediction function: regression.

    The sample feature set is created with class labels, after which its
    targets are overwritten with random floats so that a
    ``RandomForestRegressor`` can be fitted.  Predictions must carry the
    feature set's names and a float dtype.
    """
    fset = sample_featureset(10, 1, ['amplitude'], ['class1', 'class2'])
    # Replace the categorical targets with continuous ones of equal length.
    n_targets = len(fset.target.values)
    fset.target.values = np.random.random(n_targets)

    regressor = build_model.build_model_from_featureset(
        fset, model_type='RandomForestRegressor')
    predicted = predict.model_predictions(fset, regressor)

    assert all(predicted.name == fset.name)
    assert predicted.prediction.values.dtype == np.dtype('float')
def test_predict_optimized_model():
    """Check classification predictions from a hyperparameter-optimized model.

    The classifier is built with ``params_to_optimize`` and ``cv=2``; the
    predictions must be indexed by the feature set's names, have one column
    per distinct target, and hold floats.
    """
    fset = sample_featureset(10, 1, ["amplitude"], ["class1", "class2"])
    search_grid = {"n_estimators": [10, 50, 100]}
    optimized = build_model.build_model_from_featureset(
        fset,
        model_type="RandomForestClassifier",
        params_to_optimize=search_grid,
        cv=2,
    )

    predictions = predict.model_predictions(fset, optimized)

    assert all(predictions.name == fset.name)
    expected_shape = (len(fset.name), len(np.unique(fset.target)))
    assert predictions.prediction.values.shape == expected_shape
    assert predictions.prediction.values.dtype == np.dtype("float")
def build_model_task(model_type, model_params, fset_path, output_path=None,
                     params_to_optimize=None):
    """Build a model from a stored feature set and optionally serialize it.

    Parameters
    ----------
    model_type : str
        Forwarded to `build_model.build_model_from_featureset` as
        `model_type`.
    model_params : dict
        Forwarded to `build_model.build_model_from_featureset` as
        `model_options`.
    fset_path : str
        Path of the feature set file; opened with `xr.open_dataset`.
    output_path : str, optional
        If truthy, the built model is written there with `joblib.dump`
        (``compress=3``).  Defaults to None (nothing is written).
    params_to_optimize : dict, optional
        Forwarded unchanged to `build_model.build_model_from_featureset`.

    Returns
    -------
    The model object returned by `build_model.build_model_from_featureset`.
    """
    build_kwargs = dict(model_type=model_type,
                        model_options=model_params,
                        params_to_optimize=params_to_optimize)
    with xr.open_dataset(fset_path) as fset:
        built = build_model.build_model_from_featureset(fset, **build_kwargs)
        if output_path:
            joblib.dump(built, output_path, compress=3)
        return built
def create_test_model(fset, model_type='RandomForestClassifier'):
    """Build, store and yield a throwaway model record; delete it afterwards.

    The model is fitted on the data behind ``fset.file.uri``, pickled under
    ``cfg['paths']['models_folder']`` with a random UUID file name, and
    registered as an ``m.Model`` row named ``'test_model'``.  The row is
    removed with ``delete_instance`` once the consumer is done with it.

    Parameters
    ----------
    fset : `models.Featureset` instance
        The (labeled) feature set from which to build the model.
    model_type : str, optional
        String indicating type of model to build. Defaults to
        'RandomForestClassifier'.

    Yields
    ------
    The `m.Model` instance that was created.
    """
    # Parameters recorded on the database row for each supported type.
    model_params = {
        "RandomForestClassifier": {
            "bootstrap": True,
            "criterion": "gini",
            "oob_score": False,
            "max_features": "auto",
            "n_estimators": 10,
        },
        "RandomForestRegressor": {
            "bootstrap": True,
            "criterion": "mse",
            "oob_score": False,
            "max_features": "auto",
            "n_estimators": 10,
        },
        "LinearSGDClassifier": {"loss": "hinge"},
        "LinearRegressor": {"fit_intercept": True},
    }

    with featureset.from_netcdf(fset.file.uri) as fset_data:
        fitted = build_model.build_model_from_featureset(
            fset_data, model_type=model_type)
        pickle_name = '{}.pkl'.format(str(uuid.uuid4()))
        model_path = pjoin(cfg['paths']['models_folder'], pickle_name)
        joblib.dump(fitted, model_path)

    file_record, _ = m.File.create_or_get(uri=model_path)
    model = m.Model.create(
        name='test_model',
        file=file_record,
        featureset=fset,
        project=fset.project,
        params=model_params[model_type],
        type=model_type,
        finished=datetime.datetime.now(),
    )
    model.save()

    try:
        yield model
    finally:
        model.delete_instance()
def test_model_classification():
    """Check ``predict.model_predictions`` for a classification model.

    Two calls are exercised: the default one, whose result must be a float
    array with one column per distinct target, and one with
    ``return_probs=False``, whose result must be one ``str``/``bytes`` value
    per sample.
    """
    fset = sample_featureset(10, 1, ["amplitude"], ["class1", "class2"])
    classifier = build_model.build_model_from_featureset(
        fset, model_type="RandomForestClassifier")

    # Default call.
    scores = predict.model_predictions(fset, classifier)
    assert all(scores.name == fset.name)
    score_shape = (len(fset.name), len(np.unique(fset.target)))
    assert scores.prediction.values.shape == score_shape
    assert scores.prediction.values.dtype == np.dtype("float")

    # Label-only call.
    labels = predict.model_predictions(fset, classifier, return_probs=False)
    assert all(labels.name == fset.name)
    assert labels.prediction.values.shape == (len(fset.name),)
    first_label = labels.prediction.values[0]
    assert isinstance(first_label, (str, bytes))
def test_predict_optimized_model():
    """Test the predict function (classification) with an optimized model.

    Builds a ``RandomForestClassifier`` with a grid of ``n_estimators``
    values and two-fold cross-validation (``cv=2``), then checks names,
    shape and dtype of the predictions.
    """
    sample_fset = sample_featureset(10, 1, ['amplitude'],
                                    ['class1', 'class2'])
    build_options = {
        'model_type': 'RandomForestClassifier',
        'params_to_optimize': {"n_estimators": [10, 50, 100]},
        'cv': 2,
    }
    tuned_model = build_model.build_model_from_featureset(sample_fset,
                                                          **build_options)

    result = predict.model_predictions(sample_fset, tuned_model)

    assert all(result.name == sample_fset.name)
    n_rows = len(sample_fset.name)
    n_cols = len(np.unique(sample_fset.target))
    assert result.prediction.values.shape == (n_rows, n_cols)
    assert result.prediction.values.dtype == np.dtype('float')
def test_predict_prefeaturized():
    """Test prediction on an already featurized data set.

    A ``RandomForestClassifier`` is built from the stored test feature set
    and pickled to ``TEMP_DIR``; ``predict_prefeaturized_task`` is then run
    on the same feature set file and the pickled model.  The predictions
    must carry the feature set's names, have one column per distinct target
    and hold floats.
    """
    featureset_path = pjoin(DATA_PATH, "test_featureset.nc")
    # Load the data into memory and close the file right away, so no handle
    # is left open while the task below reads the same file.
    with xr.open_dataset(featureset_path) as dataset:
        fset = dataset.load()

    model = build_model.build_model_from_featureset(
        fset, model_type='RandomForestClassifier')
    model_path = pjoin(TEMP_DIR, "test.pkl")
    joblib.dump(model, model_path, compress=3)

    preds = predict_prefeaturized_task(featureset_path, model_path)()

    assert all(preds.name == fset.name)
    assert preds.prediction.values.shape == (len(fset.name),
                                             len(np.unique(fset.target)))
    assert preds.prediction.values.dtype == np.dtype('float')
def test_predict_optimized_model():
    """Test main predict function (classification) w/ optimized model.

    Builds a grid-searched ``RandomForestClassifier`` from the stored ASAS
    training feature set, predicts on ``TS_TARGET_PATHS`` and checks that
    every returned element is (or starts with) one of the expected labels.
    """
    expected_classes = ['Mira', 'W_Ursae_Maj', 'Classical_Cepheid']
    # Context manager guarantees the NetCDF file is closed even on failure.
    with xr.open_dataset(pjoin(DATA_PATH,
                               "asas_training_subset_featureset.nc")) as fset:
        model = build_model.build_model_from_featureset(
            fset, model_type="RandomForestClassifier",
            params_to_optimize={"n_estimators": [10, 50, 100]}, cv=2)
        pred_results_dict = predict.predict_data_files(
            TS_TARGET_PATHS, list(fset.data_vars), model,
            custom_features_script=None)

    for results in pred_results_dict.values():
        for el in results['pred_results']:
            # An element is either a sequence led by the label or the
            # bare label itself.
            assert el[0] in expected_classes or el in expected_classes
def test_predict_optimized_model():
    """Test main predict function (classification) w/ optimized model.

    Builds a grid-searched ``RandomForestClassifier`` from the stored ASAS
    training feature set, pickles it to ``TEMP_DIR`` and runs
    ``prediction_task`` on ``TS_TARGET_PATHS`` with that pickle.  Checks the
    class labels and the shape of the resulting prediction array.
    """
    model_path = pjoin(TEMP_DIR, "test.pkl")
    # Context manager guarantees the NetCDF file is closed even on failure.
    with xr.open_dataset(pjoin(DATA_PATH,
                               "asas_training_subset_featureset.nc")) as fset:
        model = build_model.build_model_from_featureset(
            fset, model_type="RandomForestClassifier",
            params_to_optimize={"n_estimators": [10, 50, 100]}, cv=2)
        joblib.dump(model, model_path, compress=3)

        preds = prediction_task(TS_TARGET_PATHS, list(fset.data_vars),
                                model_path,
                                custom_features_script=None)().get()

        assert all(preds.prediction.class_label ==
                   ['Classical_Cepheid', 'Mira', 'W_Ursae_Maj'])
        # NOTE(review): predictions are made for TS_TARGET_PATHS but the
        # expected row count uses TS_CLASS_PATHS -- presumably both lists
        # have the same length; confirm.
        assert preds.prediction.values.shape == (len(TS_CLASS_PATHS),
                                                 len(np.unique(fset.target)))
def _build_model_compute_statistics(fset_path, model_type, model_params,
                                    params_to_optimize, model_path):
    '''Build model and return summary statistics.

    The fitted model is also serialized to `model_path` with `joblib.dump`.

    Parameters
    ----------
    fset_path : str
        Path to feature set NetCDF file.
    model_type : str
        Type of model to be built, e.g. 'RandomForestClassifier'.
    model_params : dict
        Dictionary with hyperparameter values to be used in model building.
        Keys are parameter names, values are the associated parameter values.
        These hyperparameters will be passed to the model constructor as-is
        (for hyperparameter optimization, see `params_to_optimize`).
    params_to_optimize : dict or list of dict
        During hyperparameter optimization, various model parameters
        are adjusted to give an optimal fit. This dictionary gives the
        different values that should be explored for each parameter. E.g.,
        `{'alpha': [1, 2], 'beta': [4, 5, 6]}` would fit models on all
        6 combinations of alpha and beta and compare the resulting models'
        goodness-of-fit. If None, only those hyperparameters specified in
        `model_params` will be used (passed to model constructor as-is).
    model_path : str
        Path indicating where serialized model will be saved.

    Returns
    -------
    score : float
        The model's training score.
    best_params : dict
        Dictionary of best hyperparameter values (keys are parameter names,
        values are the corresponding best values) determined by
        `scikit-learn`'s `GridSearchCV`. If no hyperparameter optimization is
        performed (i.e. `params_to_optimize` is None or is an empty dict),
        this will be an empty dict.
    '''
    # The context manager closes the feature set even if building, scoring
    # or serializing the model raises.
    with featureset.from_netcdf(fset_path, engine=cfg['xr_engine']) as fset:
        computed_model = build_model.build_model_from_featureset(
            featureset=fset, model_type=model_type,
            model_parameters=model_params,
            params_to_optimize=params_to_optimize)
        score = build_model.score_model(computed_model, fset)
        # `best_params_` is only read when a parameter search was requested.
        best_params = computed_model.best_params_ if params_to_optimize else {}
        joblib.dump(computed_model, model_path)

    return score, best_params
def test_model_classification():
    """Test model prediction function: classification.

    Verifies the default output (float array, one column per distinct
    target) and the ``return_probs=False`` output (one ``str``/``bytes``
    label per sample) of ``predict.model_predictions``.
    """
    sample_fset = sample_featureset(10, 1, ['amplitude'],
                                    ['class1', 'class2'])
    fitted = build_model.build_model_from_featureset(
        sample_fset, model_type='RandomForestClassifier')

    # Default output.
    default_preds = predict.model_predictions(sample_fset, fitted)
    assert all(default_preds.name == sample_fset.name)
    default_values = default_preds.prediction.values
    assert default_values.shape == (len(sample_fset.name),
                                    len(np.unique(sample_fset.target)))
    assert default_values.dtype == np.dtype('float')

    # Output with probabilities switched off.
    class_preds = predict.model_predictions(sample_fset, fitted,
                                            return_probs=False)
    assert all(class_preds.name == sample_fset.name)
    class_values = class_preds.prediction.values
    assert class_values.shape == (len(sample_fset.name), )
    assert isinstance(class_values[0], (str, bytes))
def test_predict_regression():
    """Test main predict function on multiple files (regression).

    For every entry of ``build_model.MODELS_TYPE_DICT`` that is a
    ``sklearn.base.RegressorMixin`` subclass, a model is built from the
    stored regression feature set, pickled to ``TEMP_DIR`` and passed to
    ``prediction_task``.  The predictions must be one-dimensional floats.
    """
    regressor_types = [model_type for model_type, model_class
                       in build_model.MODELS_TYPE_DICT.items()
                       if issubclass(model_class, sklearn.base.RegressorMixin)]
    # Same pickle path for every model type; each iteration overwrites it.
    model_path = pjoin(TEMP_DIR, "test.pkl")

    # Context manager guarantees the NetCDF file is closed even on failure.
    with xr.open_dataset(pjoin(DATA_PATH, "test_reg_featureset.nc")) as fset:
        for model_type in regressor_types:
            model = build_model.build_model_from_featureset(
                fset, model_type=model_type)
            joblib.dump(model, model_path, compress=3)

            preds = prediction_task(TS_TARGET_PATHS, list(fset.data_vars),
                                    model_path,
                                    custom_features_script=None)().get()

            # NOTE(review): predictions are made for TS_TARGET_PATHS but the
            # expected length uses TS_CLASS_PATHS -- presumably both lists
            # have the same length; confirm.
            assert preds.prediction.values.shape == (len(TS_CLASS_PATHS),)
            # Asserting a bare generator expression is always true; wrap it
            # in all() so the dtype of every prediction is really checked.
            assert all(p.dtype == np.dtype('float')
                       for p in preds.prediction)
def test_fit_existing_model_optimize():
    """Check the model building helper with hyperparameter optimization.

    An existing ``RandomForestClassifier`` instance is passed in together
    with fixed options and a grid of values to search.  The result must be
    a ``GridSearchCV`` exposing ``best_params_`` and ``predict_proba`` whose
    best estimator is a ``RandomForestClassifier``.
    """
    feature_names = ['amplitude', 'maximum', 'minimum', 'median']
    fset = sample_featureset(10, 1, feature_names, ['class1', 'class2'])

    base_model = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    fixed_options = {"criterion": "gini", "bootstrap": True}
    search_grid = {
        "n_estimators": [10, 50, 100],
        "min_samples_split": [2, 5],
        "max_features": ["auto", 3],
    }
    searched = build_model.build_model_from_featureset(
        fset, base_model, None, fixed_options, search_grid)

    for attribute in ("best_params_", "predict_proba"):
        assert hasattr(searched, attribute)
    assert isinstance(searched, GridSearchCV)
    assert isinstance(searched.best_estimator_, RandomForestClassifier)
def test_fit_existing_model_optimize():
    """Test model building helper function - with param. optimization.

    Passing ``params_to_optimize`` positionally must yield a fitted
    ``GridSearchCV`` wrapping a ``RandomForestClassifier``.
    """
    sample_fset = sample_featureset(
        10, 1,
        ['amplitude', 'maximum', 'minimum', 'median'],
        ['class1', 'class2'],
    )
    classifier_cls = build_model.MODELS_TYPE_DICT['RandomForestClassifier']
    constructor_options = {"criterion": "gini", "bootstrap": True}
    grid = {"n_estimators": [10, 50, 100],
            "min_samples_split": [2, 5],
            "max_features": ["auto", 3]}

    # Arguments are positional on purpose; the third slot is left as None.
    result = build_model.build_model_from_featureset(
        sample_fset, classifier_cls(), None, constructor_options, grid)

    assert hasattr(result, "best_params_")
    assert hasattr(result, "predict_proba")
    assert isinstance(result, GridSearchCV)
    best = result.best_estimator_
    assert isinstance(best, RandomForestClassifier)
def test_predict_regression():
    """Test main predict function on multiple files (regression).

    For every entry of ``build_model.MODELS_TYPE_DICT`` that is a
    ``sklearn.base.RegressorMixin`` subclass, a model is built from the
    stored regression feature set and used by ``predict.predict_data_files``
    on ``TS_TARGET_PATHS``.  Every predicted element must be a ``float``.
    """
    regressor_types = [model_type for model_type, model_class
                       in build_model.MODELS_TYPE_DICT.items()
                       if issubclass(model_class, sklearn.base.RegressorMixin)]

    # Context manager guarantees the NetCDF file is closed even on failure.
    with xr.open_dataset(pjoin(DATA_PATH, "test_reg_featureset.nc")) as fset:
        for model_type in regressor_types:
            model = build_model.build_model_from_featureset(
                fset, model_type=model_type)
            pred_results_dict = predict.predict_data_files(
                TS_TARGET_PATHS, list(fset.data_vars), model,
                custom_features_script=None)

            # Only the per-file results matter here, not the file names.
            for results in pred_results_dict.values():
                for el in results['pred_results']:
                    assert isinstance(el, float)
def test_fit_existing_model_optimize():
    """Test model building helper function - with param. optimization.

    An existing ``RandomForestClassifier`` instance is fitted on the stored
    ASAS training feature set with a grid of hyperparameters.  The result
    must be a ``GridSearchCV`` exposing ``best_params_`` and
    ``predict_proba`` whose best estimator is a ``RandomForestClassifier``.
    """
    # The classifier is instantiated once; a second, identical instantiation
    # in the previous version was redundant.
    model = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    model_options = {"criterion": "gini", "bootstrap": True}
    params_to_optimize = {"n_estimators": [10, 50, 100],
                          "min_samples_split": [2, 5],
                          "max_features": ["auto", 3]}

    # Context manager guarantees the NetCDF file is closed even on failure.
    with xr.open_dataset(pjoin(DATA_PATH,
                               "asas_training_subset_featureset.nc")) as fset:
        model = build_model.build_model_from_featureset(
            fset, model, None, model_options, params_to_optimize)

    assert hasattr(model, "best_params_")
    assert hasattr(model, "predict_proba")
    assert isinstance(model, GridSearchCV)
    assert isinstance(model.best_estimator_, RandomForestClassifier)
def test_predict_classification():
    """Test main predict function on multiple files (classification).

    For every entry of ``build_model.MODELS_TYPE_DICT`` that is a
    ``sklearn.base.ClassifierMixin`` subclass, a model is built from the
    stored test feature set and used by ``predict.predict_data_files`` on
    ``TS_CLASS_PATHS``.  Every returned element must be (or start with) one
    of the expected byte-string labels.
    """
    classifier_types = [model_type for model_type, model_class
                        in build_model.MODELS_TYPE_DICT.items()
                        if issubclass(model_class,
                                      sklearn.base.ClassifierMixin)]
    expected_classes = [b'class1', b'class2', b'class3']

    # Context manager guarantees the NetCDF file is closed even on failure.
    with xr.open_dataset(pjoin(DATA_PATH, "test_featureset.nc")) as fset:
        for model_type in classifier_types:
            model = build_model.build_model_from_featureset(
                fset, model_type=model_type)
            pred_results_dict = predict.predict_data_files(
                TS_CLASS_PATHS, list(fset.data_vars), model,
                custom_features_script=None)

            # Only the per-file results matter here, not the file names.
            for results in pred_results_dict.values():
                for el in results['pred_results']:
                    # An element is either a sequence led by the label or
                    # the bare label itself.
                    assert (el[0] in expected_classes
                            or el in expected_classes)
def create_test_model(fset, model_type='RandomForestClassifier'):
    """Create and yield a test model record, then delete it.

    The feature set behind ``fset.file.uri`` is opened with the engine named
    in ``cfg['xr_engine']``, a model of `model_type` is fitted on it and
    pickled into ``cfg['paths']['models_folder']`` under a random UUID file
    name.  An ``m.Model`` row named ``'test_model'`` pointing at that pickle
    is yielded and removed with ``delete_instance`` afterwards.

    Parameters
    ----------
    fset : `models.Featureset` instance
        The (labeled) feature set from which to build the model.
    model_type : str, optional
        String indicating type of model to build. Defaults to
        'RandomForestClassifier'.

    Yields
    ------
    The `m.Model` instance that was created.
    """
    # Parameters stored on the database row, keyed by model type.
    model_params = {
        "RandomForestClassifier": {
            "bootstrap": True,
            "criterion": "gini",
            "oob_score": False,
            "max_features": "auto",
            "n_estimators": 10,
        },
        "RandomForestRegressor": {
            "bootstrap": True,
            "criterion": "mse",
            "oob_score": False,
            "max_features": "auto",
            "n_estimators": 10,
        },
        "LinearSGDClassifier": {"loss": "hinge"},
        "LinearRegressor": {"fit_intercept": True},
    }

    netcdf_uri = fset.file.uri
    with featureset.from_netcdf(netcdf_uri,
                                engine=cfg['xr_engine']) as fset_data:
        trained = build_model.build_model_from_featureset(
            fset_data, model_type=model_type)
        pickle_name = '{}.pkl'.format(str(uuid.uuid4()))
        model_path = pjoin(cfg['paths']['models_folder'], pickle_name)
        joblib.dump(trained, model_path)

    model_file, _ = m.File.create_or_get(uri=model_path)
    model = m.Model.create(
        name='test_model',
        file=model_file,
        featureset=fset,
        project=fset.project,
        params=model_params[model_type],
        type=model_type,
        finished=datetime.datetime.now(),
    )
    model.save()

    try:
        yield model
    finally:
        model.delete_instance()
def test_predict_classification():
    """Test main predict function on multiple files (classification).

    For every entry of ``build_model.MODELS_TYPE_DICT`` that is a
    ``sklearn.base.ClassifierMixin`` subclass, a model is built from the
    stored test feature set, pickled to ``TEMP_DIR`` and passed to
    ``prediction_task`` on ``TS_CLASS_PATHS``.  Float output is checked for
    class labels and shape; any other output is checked element by element
    against the expected labels.
    """
    classifier_types = [model_type for model_type, model_class
                        in build_model.MODELS_TYPE_DICT.items()
                        if issubclass(model_class,
                                      sklearn.base.ClassifierMixin)]
    expected_classes = [b'class1', b'class2', b'class3']
    # Same pickle path for every model type; each iteration overwrites it.
    model_path = pjoin(TEMP_DIR, "test.pkl")

    # Context manager guarantees the NetCDF file is closed even on failure.
    with xr.open_dataset(pjoin(DATA_PATH, "test_featureset.nc")) as fset:
        for model_type in classifier_types:
            model = build_model.build_model_from_featureset(
                fset, model_type=model_type)
            joblib.dump(model, model_path, compress=3)

            preds = prediction_task(TS_CLASS_PATHS, list(fset.data_vars),
                                    model_path,
                                    custom_features_script=None)().get()

            # The dtype of the first value decides which checks apply.
            if preds.prediction.values.ravel()[0].dtype == np.dtype('float'):
                assert all(preds.prediction.class_label == expected_classes)
                assert preds.prediction.values.shape == (
                    len(TS_CLASS_PATHS), len(np.unique(fset.target)))
            else:
                assert all(p in expected_classes for p in preds.prediction)
def test_fit_existing_model():
    """Test model building helper function.

    An existing ``RandomForestClassifier`` instance is fitted on the stored
    test feature set; the returned object must still be a
    ``RandomForestClassifier``.
    """
    model = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    # Context manager guarantees the NetCDF file is closed even on failure.
    with xr.open_dataset(pjoin(DATA_PATH, "test_featureset.nc")) as fset:
        model = build_model.build_model_from_featureset(fset, model)
    assert isinstance(model, RandomForestClassifier)
#
# For this example, we'll test a random forest classifier for the built-in
# ``cesium`` features, and a 3-nearest neighbors classifier for the others, as
# suggested by
# `Guo et al. (2012) <http://linkinghub.elsevier.com/retrieve/pii/S0957417411003253>`_.

from cesium.build_model import build_model_from_featureset
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # ``sklearn.cross_validation`` was removed in scikit-learn 0.20; it is
    # kept only as a fallback for releases that predate ``model_selection``.
    from sklearn.cross_validation import train_test_split

# Split sample indices rather than the data, so the same partition can be
# applied to every feature set through ``isel``.
train, test = train_test_split(np.arange(len(eeg["classes"])), random_state=0)

rfc_param_grid = {'n_estimators': [8, 16, 32, 64, 128, 256, 512, 1024]}
model_cesium = build_model_from_featureset(fset_cesium.isel(name=train),
                                           RandomForestClassifier(max_features='auto',
                                                                  random_state=0),
                                           params_to_optimize=rfc_param_grid)
# Both nearest-neighbor models share one grid of ``n_neighbors`` candidates.
knn_param_grid = {'n_neighbors': [1, 2, 3, 4]}
model_guo = build_model_from_featureset(fset_guo.isel(name=train),
                                        KNeighborsClassifier(),
                                        params_to_optimize=knn_param_grid)
model_dwt = build_model_from_featureset(fset_dwt.isel(name=train),
                                        KNeighborsClassifier(),
                                        params_to_optimize=knn_param_grid)

###############################################################################
# Prediction
# ----------
# Making predictions for new time series based on these models follows the same
# pattern: first the time series are featurized using ``featurize_time_series``,
# and then predictions are made based on these features using
# For this example, we'll test a random forest classifier for the built-in # ``cesium`` features, and a 3-nearest neighbors classifier for the others, as # suggested by # `Guo et al. (2012) <http://linkinghub.elsevier.com/retrieve/pii/S0957417411003253>`_. from cesium.build_model import build_model_from_featureset from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import train_test_split train, test = train_test_split(np.arange(len(eeg["classes"])), random_state=0) rfc_param_grid = {"n_estimators": [8, 16, 32, 64, 128, 256, 512, 1024]} model_cesium = build_model_from_featureset( fset_cesium.isel(name=train), RandomForestClassifier(max_features="auto", random_state=0), params_to_optimize=rfc_param_grid, ) knn_param_grid = {"n_neighbors": [1, 2, 3, 4]} model_guo = build_model_from_featureset( fset_guo.isel(name=train), KNeighborsClassifier(), params_to_optimize=knn_param_grid ) model_dwt = build_model_from_featureset( fset_dwt.isel(name=train), KNeighborsClassifier(), params_to_optimize=knn_param_grid ) ############################################################################### # Prediction # ---------- # Making predictions for new time series based on these models follows the same # pattern: first the time series are featurized using ``featurize_time_series``,