Example #1
0
def test_fit_multichannel():
    """Build a random forest classifier from multi-channel feature data.

    The object returned by `build_model_from_featureset` must be a
    `RandomForestClassifier`.
    """
    feature_names = ['amplitude', 'maximum', 'minimum', 'median']
    class_labels = ['class1', 'class2']
    # NOTE(review): the second argument (3) is presumably the channel count --
    # confirm against `sample_featureset`.
    fset = sample_featureset(10, 3, feature_names, class_labels)
    classifier = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    built = build_model.build_model_from_featureset(fset, classifier)
    assert isinstance(built, RandomForestClassifier)
Example #2
0
def test_fit_multichannel():
    """Build a random forest classifier from multi-channel feature data.

    The object returned by `build_model_from_featureset` must be a
    `RandomForestClassifier`.
    """
    feature_names = ['amplitude', 'maximum', 'minimum', 'median']
    class_labels = ['class1', 'class2']
    # NOTE(review): the second argument (3) is presumably the channel count --
    # confirm against `sample_featureset`.
    fset = sample_featureset(10, 3, feature_names, class_labels)
    classifier = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    built = build_model.build_model_from_featureset(fset, classifier)
    assert isinstance(built, RandomForestClassifier)
Example #3
0
def test_model_regression():
    """Check regression output of `predict.model_predictions`.

    Predictions must carry the same names as the input featureset and hold
    float values.
    """
    fset = sample_featureset(10, 1, ["amplitude"], [0.1, 0.5])
    regressor = build_model.build_model_from_featureset(
        fset, model_type="RandomForestRegressor"
    )
    predictions = predict.model_predictions(fset, regressor)
    names_match = predictions.name == fset.name
    assert all(names_match)
    predicted_dtype = predictions.prediction.values.dtype
    assert predicted_dtype == np.dtype("float")
Example #4
0
def test_invalid_feature_values():
    """Check that inf/NaN feature values are rejected with a useful error.

    The `ValueError` message must name the offending features (`x_inf`,
    `x_nan`) but not the valid one (`x_valid`).  Once the bad features are
    dropped, model building must succeed.
    """
    fset = sample_featureset(10, 1, ['x_valid', 'x_inf', 'x_nan'], ['class1', 'class2'])
    # Poison one entry of each "bad" feature
    fset.x_inf.values[0, 0] = np.inf
    fset.x_nan.values[0, 0] = np.nan
    model = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()

    message = None
    try:
        build_model.build_model_from_featureset(fset, model)
    except ValueError as err:
        message = str(err)
    if message is None:
        raise AssertionError("Exception should have been raised for invalid data.")
    assert 'x_valid' not in message
    assert 'x_inf' in message
    assert 'x_nan' in message

    cleaned = fset.drop(['x_inf', 'x_nan'])
    model = build_model.build_model_from_featureset(cleaned, model)
    assert isinstance(model, RandomForestClassifier)
Example #5
0
def test_model_regression():
    """Check regression output of `predict.model_predictions`.

    Predictions must carry the same names as the input featureset and hold
    float values.
    """
    fset = sample_featureset(10, 1, ['amplitude'], [0.1, 0.5])
    regressor = build_model.build_model_from_featureset(
        fset, model_type='RandomForestRegressor')
    predictions = predict.model_predictions(fset, regressor)
    names_match = predictions.name == fset.name
    assert all(names_match)
    predicted_dtype = predictions.prediction.values.dtype
    assert predicted_dtype == np.dtype('float')
Example #6
0
def test_score_model():
    """Check that `build_model.score_model` returns a float score."""
    feature_names = ['amplitude', 'maximum', 'minimum', 'median']
    fset = sample_featureset(10, 1, feature_names, ['class1', 'class2'])
    classifier = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    built = build_model.build_model_from_featureset(fset, classifier)
    # Score the model on the same featureset it was built from
    training_score = build_model.score_model(built, fset)
    assert isinstance(training_score, float)
Example #7
0
def test_invalid_feature_values():
    """Check that inf/NaN feature values are rejected with a useful error.

    The `ValueError` message must name the offending features (`x_inf`,
    `x_nan`) but not the valid one (`x_valid`).  Once the bad features are
    dropped, model building must succeed.
    """
    fset = sample_featureset(10, 1, ['x_valid', 'x_inf', 'x_nan'], ['class1', 'class2'])
    # Poison one entry of each "bad" feature
    fset.x_inf.values[0, 0] = np.inf
    fset.x_nan.values[0, 0] = np.nan
    model = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()

    message = None
    try:
        build_model.build_model_from_featureset(fset, model)
    except ValueError as err:
        message = str(err)
    if message is None:
        raise AssertionError("Exception should have been raised for invalid data.")
    assert 'x_valid' not in message
    assert 'x_inf' in message
    assert 'x_nan' in message

    cleaned = fset.drop(['x_inf', 'x_nan'])
    model = build_model.build_model_from_featureset(cleaned, model)
    assert isinstance(model, RandomForestClassifier)
Example #8
0
def test_model_predictions():
    """Check shape and dtype of the output of `predict.model_predictions`.

    The result must have one row per featureset name, one column per distinct
    target value, and float entries.
    """
    fset_file = pjoin(DATA_PATH, "test_featureset.nc")
    fset = xr.open_dataset(fset_file)
    classifier = build_model.build_model_from_featureset(
        fset, model_type='RandomForestClassifier')
    preds = predict.model_predictions(fset, classifier)
    assert preds.shape[0] == len(fset.name)
    distinct_targets = np.unique(fset.target.values)
    assert preds.shape[1] == len(distinct_targets)
    assert preds.values.dtype == np.dtype('float')
Example #9
0
def test_score_model():
    """Check that `build_model.score_model` returns a float score."""
    feature_names = ['amplitude', 'maximum', 'minimum', 'median']
    fset = sample_featureset(10, 1, feature_names, ['class1', 'class2'])
    classifier = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    built = build_model.build_model_from_featureset(fset, classifier)
    # Score the model on the same featureset it was built from
    training_score = build_model.score_model(built, fset)
    assert isinstance(training_score, float)
Example #10
0
def test_model_regression():
    """Check regression predictions when targets are random floats.

    The featureset is created with class labels, which are then overwritten
    with random continuous values before a regressor is built.
    """
    fset = sample_featureset(10, 1, ['amplitude'], ['class1', 'class2'])
    n_targets = len(fset.target.values)
    fset.target.values = np.random.random(n_targets)
    regressor = build_model.build_model_from_featureset(
        fset, model_type='RandomForestRegressor')
    predictions = predict.model_predictions(fset, regressor)
    names_match = predictions.name == fset.name
    assert all(names_match)
    predicted_dtype = predictions.prediction.values.dtype
    assert predicted_dtype == np.dtype('float')
Example #11
0
def test_predict_optimized_model():
    """Check classification predictions from a hyperparameter-optimized model.

    The prediction array must have one row per featureset name, one column
    per distinct target, and float entries.
    """
    fset = sample_featureset(10, 1, ["amplitude"], ["class1", "class2"])
    param_grid = {"n_estimators": [10, 50, 100]}
    optimized = build_model.build_model_from_featureset(
        fset,
        model_type="RandomForestClassifier",
        params_to_optimize=param_grid,
        cv=2,
    )
    preds = predict.model_predictions(fset, optimized)
    assert all(preds.name == fset.name)
    predicted = preds.prediction.values
    expected_shape = (len(fset.name), len(np.unique(fset.target)))
    assert predicted.shape == expected_shape
    assert preds.prediction.values.dtype == np.dtype("float")
Example #12
0
def build_model_task(model_type, model_params, fset_path, output_path=None,
                     params_to_optimize=None):
    """Build a model from a featureset file and optionally save it to disk.

    Parameters
    ----------
    model_type
        Forwarded to `build_model_from_featureset` as `model_type`.
    model_params
        Forwarded to `build_model_from_featureset` as `model_options`.
    fset_path
        Path of the featureset file; opened with `xr.open_dataset`.
    output_path : optional
        If truthy, the built model is written to this path with
        `joblib.dump` (compress=3).
    params_to_optimize : optional
        Forwarded to `build_model_from_featureset` as `params_to_optimize`.

    Returns
    -------
    The object returned by `build_model_from_featureset`.
    """
    with xr.open_dataset(fset_path) as dataset:
        built = build_model.build_model_from_featureset(
            dataset,
            model_type=model_type,
            model_options=model_params,
            params_to_optimize=params_to_optimize,
        )
        # Persist only when the caller asked for it
        if output_path:
            joblib.dump(built, output_path, compress=3)

        return built
Example #13
0
def create_test_model(fset, model_type='RandomForestClassifier'):
    """Create a test model record, yield it, and delete it afterwards.

    Params
    ------
    fset : `models.Featureset` instance
        The (labeled) feature set from which to build the model.
    model_type  : str, optional
        String indicating type of model to build; must be a key of the
        parameter table defined below. Defaults to 'RandomForestClassifier'.

    Yields
    ------
    The `m.Model` record created for the built model. The record is removed
    with `delete_instance()` when the generator is finalized.

    """
    # Parameters stored on the model record, keyed by model type
    model_params = {
        "RandomForestClassifier": {
            "bootstrap": True, "criterion": "gini",
            "oob_score": False, "max_features": "auto",
            "n_estimators": 10},
        "RandomForestRegressor": {
            "bootstrap": True, "criterion": "mse",
            "oob_score": False, "max_features": "auto",
            "n_estimators": 10},
        "LinearSGDClassifier": {"loss": "hinge"},
        "LinearRegressor": {"fit_intercept": True}}

    # Build the model and pickle it under a unique file name
    with featureset.from_netcdf(fset.file.uri) as fset_data:
        built = build_model.build_model_from_featureset(fset_data,
                                                        model_type=model_type)
        pickle_name = '{}.pkl'.format(str(uuid.uuid4()))
        model_path = pjoin(cfg['paths']['models_folder'], pickle_name)
        joblib.dump(built, model_path)

    model_file, _ = m.File.create_or_get(uri=model_path)
    model = m.Model.create(name='test_model', file=model_file,
                           featureset=fset, project=fset.project,
                           params=model_params[model_type], type=model_type,
                           finished=datetime.datetime.now())
    model.save()
    try:
        yield model
    finally:
        model.delete_instance()
Example #14
0
def test_model_classification():
    """Check classification output of `predict.model_predictions`.

    By default the predictions form a float array with one row per featureset
    name and one column per distinct target. With `return_probs=False` one
    label (str or bytes) is returned per name.
    """
    fset = sample_featureset(10, 1, ["amplitude"], ["class1", "class2"])
    classifier = build_model.build_model_from_featureset(
        fset, model_type="RandomForestClassifier"
    )

    # Default call
    probs = predict.model_predictions(fset, classifier)
    assert all(probs.name == fset.name)
    expected_shape = (len(fset.name), len(np.unique(fset.target)))
    assert probs.prediction.values.shape == expected_shape
    assert probs.prediction.values.dtype == np.dtype("float")

    # Labels only
    labels = predict.model_predictions(fset, classifier, return_probs=False)
    assert all(labels.name == fset.name)
    assert labels.prediction.values.shape == (len(fset.name),)
    first_label = labels.prediction.values[0]
    assert isinstance(first_label, (str, bytes))
Example #15
0
def test_predict_optimized_model():
    """Check classification predictions from a hyperparameter-optimized model.

    The prediction array must have one row per featureset name, one column
    per distinct target, and float entries.
    """
    fset = sample_featureset(10, 1, ['amplitude'], ['class1', 'class2'])
    param_grid = {"n_estimators": [10, 50, 100]}
    optimized = build_model.build_model_from_featureset(
        fset, model_type='RandomForestClassifier',
        params_to_optimize=param_grid, cv=2)
    preds = predict.model_predictions(fset, optimized)
    assert all(preds.name == fset.name)
    predicted = preds.prediction.values
    expected_shape = (len(fset.name), len(np.unique(fset.target)))
    assert predicted.shape == expected_shape
    assert preds.prediction.values.dtype == np.dtype('float')
Example #16
0
def test_predict_prefeaturized():
    """Check `predict_prefeaturized_task` on an existing featureset file.

    A classifier is built from the featureset, pickled to the temp directory,
    and then used by the task to predict on the same featureset file.
    """
    featureset_path = pjoin(DATA_PATH, "test_featureset.nc")
    fset = xr.open_dataset(featureset_path).load()
    classifier = build_model.build_model_from_featureset(
        fset, model_type='RandomForestClassifier')
    model_path = pjoin(TEMP_DIR, "test.pkl")
    joblib.dump(classifier, model_path, compress=3)

    # The task factory returns a callable; invoking it yields the predictions
    task = predict_prefeaturized_task(featureset_path, model_path)
    preds = task()

    assert all(preds.name == fset.name)
    expected_shape = (len(fset.name), len(np.unique(fset.target)))
    assert preds.prediction.values.shape == expected_shape
    assert preds.prediction.values.dtype == np.dtype('float')
Example #17
0
def test_predict_optimized_model():
    """Check file-based prediction with a hyperparameter-optimized classifier.

    Every prediction result (or its first element) must be one of the known
    class labels.
    """
    known_classes = ['Mira', 'W_Ursae_Maj', 'Classical_Cepheid']
    fset = xr.open_dataset(pjoin(DATA_PATH, "asas_training_subset_featureset.nc"))
    param_grid = {"n_estimators": [10, 50, 100]}
    model = build_model.build_model_from_featureset(
        fset, model_type="RandomForestClassifier",
        params_to_optimize=param_grid, cv=2)
    pred_results_dict = predict.predict_data_files(
        TS_TARGET_PATHS, list(fset.data_vars), model,
        custom_features_script=None)
    # File names (the dict keys) are not needed for the check
    for results in pred_results_dict.values():
        for el in results['pred_results']:
            print(el)
            assert el[0] in known_classes or el in known_classes
Example #18
0
def test_predict_optimized_model():
    """Check `prediction_task` with a hyperparameter-optimized classifier.

    The optimized model is pickled to the temp directory and loaded by the
    task from that path; the resulting class labels and prediction shape are
    then checked.
    """
    fset = xr.open_dataset(pjoin(DATA_PATH, "asas_training_subset_featureset.nc"))
    param_grid = {"n_estimators": [10, 50, 100]}
    optimized = build_model.build_model_from_featureset(
        fset, model_type="RandomForestClassifier",
        params_to_optimize=param_grid, cv=2)
    model_path = pjoin(TEMP_DIR, "test.pkl")
    joblib.dump(optimized, model_path, compress=3)

    task = prediction_task(TS_TARGET_PATHS, list(fset.data_vars), model_path,
                           custom_features_script=None)
    preds = task().get()

    expected_labels = ['Classical_Cepheid', 'Mira', 'W_Ursae_Maj']
    assert all(preds.prediction.class_label == expected_labels)
    expected_shape = (len(TS_CLASS_PATHS), len(np.unique(fset.target)))
    assert preds.prediction.values.shape == expected_shape
Example #19
0
def _build_model_compute_statistics(fset_path, model_type, model_params,
                                    params_to_optimize, model_path):
    '''Build model, save it to disk, and return summary statistics.

    Parameters
    ----------
    fset_path : str
        Path to feature set NetCDF file.
    model_type : str
        Type of model to be built, e.g. 'RandomForestClassifier'.
    model_params : dict
        Dictionary with hyperparameter values to be used in model building.
        Keys are parameter names, values are the associated parameter values.
        These hyperparameters will be passed to the model constructor as-is
        (for hyperparameter optimization, see `params_to_optimize`).
    params_to_optimize : dict or list of dict
        During hyperparameter optimization, various model parameters
        are adjusted to give an optimal fit. This dictionary gives the
        different values that should be explored for each parameter. E.g.,
        `{'alpha': [1, 2], 'beta': [4, 5, 6]}` would fit models on all
        6 combinations of alpha and beta and compare the resulting models'
        goodness-of-fit. If None, only those hyperparameters specified in
        `model_params` will be used (passed to model constructor as-is).
    model_path : str
        Path indicating where serialized model will be saved.

    Returns
    -------
    score : float
        The model's training score.
    best_params : dict
        Dictionary of best hyperparameter values (keys are parameter names,
        values are the corresponding best values) determined by `scikit-learn`'s
        `GridSearchCV`. If no hyperparameter optimization is performed (i.e.
        `params_to_optimize` is None or is an empty dict), this will be an
        empty dict.
    '''
    fset = featureset.from_netcdf(fset_path, engine=cfg['xr_engine'])
    try:
        computed_model = build_model.build_model_from_featureset(
            featureset=fset,
            model_type=model_type,
            model_parameters=model_params,
            params_to_optimize=params_to_optimize)
        score = build_model.score_model(computed_model, fset)
        # `best_params_` is only read when an optimization was requested
        best_params = computed_model.best_params_ if params_to_optimize else {}
        joblib.dump(computed_model, model_path)
    finally:
        # Always release the feature set, even if building, scoring or
        # serializing the model raises.
        fset.close()

    return score, best_params
Example #20
0
def test_model_classification():
    """Check classification output of `predict.model_predictions`.

    By default the predictions form a float array with one row per featureset
    name and one column per distinct target. With `return_probs=False` one
    label (str or bytes) is returned per name.
    """
    fset = sample_featureset(10, 1, ['amplitude'], ['class1', 'class2'])
    classifier = build_model.build_model_from_featureset(
        fset, model_type='RandomForestClassifier')

    # Default call
    probs = predict.model_predictions(fset, classifier)
    assert all(probs.name == fset.name)
    expected_shape = (len(fset.name), len(np.unique(fset.target)))
    assert probs.prediction.values.shape == expected_shape
    assert probs.prediction.values.dtype == np.dtype('float')

    # Labels only
    labels = predict.model_predictions(fset, classifier, return_probs=False)
    assert all(labels.name == fset.name)
    assert labels.prediction.values.shape == (len(fset.name), )
    first_label = labels.prediction.values[0]
    assert isinstance(first_label, (str, bytes))
Example #21
0
def test_predict_regression():
    """Test main predict function on multiple files (regression).

    Every model type in `build_model.MODELS_TYPE_DICT` whose class derives
    from `sklearn.base.RegressorMixin` is built, pickled to the temp
    directory, and run through `prediction_task`. The predictions must be a
    one-dimensional array of floats.
    """
    regressor_types = [model_type for model_type, model_class
                       in build_model.MODELS_TYPE_DICT.items()
                       if issubclass(model_class, sklearn.base.RegressorMixin)]
    fset = xr.open_dataset(pjoin(DATA_PATH, "test_reg_featureset.nc"))
    for model_type in regressor_types:
        model = build_model.build_model_from_featureset(fset,
                                                        model_type=model_type)
        model_path = pjoin(TEMP_DIR, "test.pkl")
        joblib.dump(model, model_path, compress=3)
        preds = prediction_task(TS_TARGET_PATHS, list(fset.data_vars),
                                model_path,
                                custom_features_script=None)().get()
        assert preds.prediction.values.shape == (len(TS_CLASS_PATHS),)
        # `all(...)` is required here: asserting a bare generator expression
        # is always true, so the dtype check could never fail.
        assert all(p.dtype == np.dtype('float') for p in preds.prediction)
Example #22
0
def test_fit_existing_model_optimize():
    """Build a model from an existing estimator with parameter optimization.

    The result must be a `GridSearchCV` exposing `best_params_` and
    `predict_proba`, whose best estimator is a `RandomForestClassifier`.
    """
    feature_names = ['amplitude', 'maximum', 'minimum', 'median']
    fset = sample_featureset(10, 1, feature_names, ['class1', 'class2'])
    estimator = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    fixed_options = {"criterion": "gini", "bootstrap": True}
    param_grid = {"n_estimators": [10, 50, 100],
                  "min_samples_split": [2, 5],
                  "max_features": ["auto", 3]}
    # Third positional argument is passed as None, as in every caller here
    searched = build_model.build_model_from_featureset(
        fset, estimator, None, fixed_options, param_grid)
    for required_attr in ("best_params_", "predict_proba"):
        assert hasattr(searched, required_attr)
    assert isinstance(searched, GridSearchCV)
    assert isinstance(searched.best_estimator_, RandomForestClassifier)
Example #23
0
def test_fit_existing_model_optimize():
    """Build a model from an existing estimator with parameter optimization.

    The result must be a `GridSearchCV` exposing `best_params_` and
    `predict_proba`, whose best estimator is a `RandomForestClassifier`.
    """
    feature_names = ['amplitude', 'maximum', 'minimum', 'median']
    fset = sample_featureset(10, 1, feature_names, ['class1', 'class2'])
    estimator = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    fixed_options = {"criterion": "gini", "bootstrap": True}
    param_grid = {"n_estimators": [10, 50, 100],
                  "min_samples_split": [2, 5],
                  "max_features": ["auto", 3]}
    # Third positional argument is passed as None, as in every caller here
    searched = build_model.build_model_from_featureset(
        fset, estimator, None, fixed_options, param_grid)
    for required_attr in ("best_params_", "predict_proba"):
        assert hasattr(searched, required_attr)
    assert isinstance(searched, GridSearchCV)
    assert isinstance(searched.best_estimator_, RandomForestClassifier)
Example #24
0
def test_predict_regression():
    """Check file-based prediction for every available regressor type.

    Every model type in `build_model.MODELS_TYPE_DICT` whose class derives
    from `sklearn.base.RegressorMixin` is built and used to predict on the
    target files; each prediction result must be a float.
    """
    regressor_types = [
        name for name, cls in build_model.MODELS_TYPE_DICT.items()
        if issubclass(cls, sklearn.base.RegressorMixin)
    ]
    fset = xr.open_dataset(pjoin(DATA_PATH, "test_reg_featureset.nc"))
    for model_type in regressor_types:
        regressor = build_model.build_model_from_featureset(
            fset, model_type=model_type)
        pred_results_dict = predict.predict_data_files(
            TS_TARGET_PATHS, list(fset.data_vars), regressor,
            custom_features_script=None)
        # File names (the dict keys) are not needed for the check
        for results in pred_results_dict.values():
            for el in results['pred_results']:
                assert isinstance(el, float)
Example #25
0
def test_fit_existing_model_optimize():
    """Test model building helper function - with param. optimization.

    The result must be a `GridSearchCV` exposing `best_params_` and
    `predict_proba`, whose best estimator is a `RandomForestClassifier`.
    """
    fset = xr.open_dataset(pjoin(DATA_PATH, "asas_training_subset_featureset.nc"))
    # Instantiate the estimator once (the original built it twice and threw
    # the first instance away).
    model = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    model_options = {"criterion": "gini",
                     "bootstrap": True}
    params_to_optimize = {"n_estimators": [10, 50, 100],
                          "min_samples_split": [2, 5],
                          "max_features": ["auto", 3]}
    model = build_model.build_model_from_featureset(fset, model, None,
                                                    model_options,
                                                    params_to_optimize)
    assert hasattr(model, "best_params_")
    assert hasattr(model, "predict_proba")
    assert isinstance(model, GridSearchCV)
    assert isinstance(model.best_estimator_, RandomForestClassifier)
Example #26
0
def test_predict_classification():
    """Check file-based prediction for every available classifier type.

    Every model type in `build_model.MODELS_TYPE_DICT` whose class derives
    from `sklearn.base.ClassifierMixin` is built and used to predict on the
    class files; each result (or its first element) must be a known label.
    """
    known_classes = [b'class1', b'class2', b'class3']
    classifier_types = [
        name for name, cls in build_model.MODELS_TYPE_DICT.items()
        if issubclass(cls, sklearn.base.ClassifierMixin)
    ]
    fset = xr.open_dataset(pjoin(DATA_PATH, "test_featureset.nc"))
    for model_type in classifier_types:
        classifier = build_model.build_model_from_featureset(
            fset, model_type=model_type)
        pred_results_dict = predict.predict_data_files(
            TS_CLASS_PATHS, list(fset.data_vars), classifier,
            custom_features_script=None)
        # File names (the dict keys) are not needed for the check
        for results in pred_results_dict.values():
            for el in results['pred_results']:
                assert el[0] in known_classes or el in known_classes
Example #27
0
def create_test_model(fset, model_type='RandomForestClassifier'):
    """Create a test model record, yield it, and delete it afterwards.

    Params
    ------
    fset : `models.Featureset` instance
        The (labeled) feature set from which to build the model.
    model_type  : str, optional
        String indicating type of model to build; must be a key of the
        parameter table defined below. Defaults to 'RandomForestClassifier'.

    Yields
    ------
    The `m.Model` record created for the built model. The record is removed
    with `delete_instance()` when the generator is finalized.

    """
    # Parameters stored on the model record, keyed by model type
    model_params = {
        "RandomForestClassifier": {
            "bootstrap": True,
            "criterion": "gini",
            "oob_score": False,
            "max_features": "auto",
            "n_estimators": 10,
        },
        "RandomForestRegressor": {
            "bootstrap": True,
            "criterion": "mse",
            "oob_score": False,
            "max_features": "auto",
            "n_estimators": 10,
        },
        "LinearSGDClassifier": {
            "loss": "hinge",
        },
        "LinearRegressor": {
            "fit_intercept": True,
        },
    }

    # Build the model and pickle it under a unique file name
    with featureset.from_netcdf(fset.file.uri, engine=cfg['xr_engine']) as fset_data:
        built = build_model.build_model_from_featureset(
            fset_data, model_type=model_type)
        pickle_name = '{}.pkl'.format(str(uuid.uuid4()))
        model_path = pjoin(cfg['paths']['models_folder'], pickle_name)
        joblib.dump(built, model_path)

    model_file, _ = m.File.create_or_get(uri=model_path)
    model = m.Model.create(
        name='test_model',
        file=model_file,
        featureset=fset,
        project=fset.project,
        params=model_params[model_type],
        type=model_type,
        finished=datetime.datetime.now())
    model.save()
    try:
        yield model
    finally:
        model.delete_instance()
Example #28
0
def test_predict_classification():
    """Check `prediction_task` for every available classifier type.

    Every model type in `build_model.MODELS_TYPE_DICT` whose class derives
    from `sklearn.base.ClassifierMixin` is built, pickled to the temp
    directory, and run through `prediction_task`. Float predictions are
    checked for class labels and shape; otherwise every prediction must be
    one of the known labels.
    """
    known_classes = [b'class1', b'class2', b'class3']
    classifier_types = [
        name for name, cls in build_model.MODELS_TYPE_DICT.items()
        if issubclass(cls, sklearn.base.ClassifierMixin)
    ]
    fset = xr.open_dataset(pjoin(DATA_PATH, "test_featureset.nc"))
    for model_type in classifier_types:
        classifier = build_model.build_model_from_featureset(
            fset, model_type=model_type)
        model_path = pjoin(TEMP_DIR, "test.pkl")
        joblib.dump(classifier, model_path, compress=3)
        task = prediction_task(TS_CLASS_PATHS, list(fset.data_vars),
                               model_path, custom_features_script=None)
        preds = task().get()

        # The dtype of the first value decides which checks apply
        first_value = preds.prediction.values.ravel()[0]
        if first_value.dtype == np.dtype('float'):
            assert all(preds.prediction.class_label == known_classes)
            expected_shape = (len(TS_CLASS_PATHS), len(np.unique(fset.target)))
            assert preds.prediction.values.shape == expected_shape
        else:
            assert all(p in known_classes for p in preds.prediction)
Example #29
0
def test_fit_existing_model():
    """Build a model from an already-instantiated estimator.

    The object returned by `build_model_from_featureset` must be a
    `RandomForestClassifier`.
    """
    fset_file = pjoin(DATA_PATH, "test_featureset.nc")
    fset = xr.open_dataset(fset_file)
    estimator = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    built = build_model.build_model_from_featureset(fset, estimator)
    assert isinstance(built, RandomForestClassifier)
Example #30
0
#
# For this example, we'll test a random forest classifier for the built-in
# ``cesium`` features, and a 3-nearest neighbors classifier for the others, as
# suggested by
# `Guo et al. (2012) <http://linkinghub.elsevier.com/retrieve/pii/S0957417411003253>`_.

from cesium.build_model import build_model_from_featureset
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Very old scikit-learn releases only provide the legacy module, which
    # was later removed in favour of `sklearn.model_selection`.
    from sklearn.cross_validation import train_test_split

# Split the sample indices into training and test subsets (fixed seed)
train, test = train_test_split(np.arange(len(eeg["classes"])), random_state=0)

# Random forest on the cesium features, grid-searching the number of trees
rfc_param_grid = {'n_estimators': [8, 16, 32, 64, 128, 256, 512, 1024]}
model_cesium = build_model_from_featureset(fset_cesium.isel(name=train),
                                          RandomForestClassifier(max_features='auto',
                                                                 random_state=0),
                                          params_to_optimize=rfc_param_grid)
# Nearest-neighbors classifiers on the other two feature sets, grid-searching
# the number of neighbors
knn_param_grid = {'n_neighbors': [1, 2, 3, 4]}
model_guo = build_model_from_featureset(fset_guo.isel(name=train),
                                        KNeighborsClassifier(),
                                        params_to_optimize=knn_param_grid)
model_dwt = build_model_from_featureset(fset_dwt.isel(name=train),
                                        KNeighborsClassifier(),
                                        params_to_optimize=knn_param_grid)

###############################################################################
# Prediction
# ----------
# Making predictions for new time series based on these models follows the same
# pattern: first the time series are featurized using ``featurize_time_series``,
# and then predictions are made based on these features using
Example #31
0
# For this example, we'll test a random forest classifier for the built-in
# ``cesium`` features, and a 3-nearest neighbors classifier for the others, as
# suggested by
# `Guo et al. (2012) <http://linkinghub.elsevier.com/retrieve/pii/S0957417411003253>`_.

from cesium.build_model import build_model_from_featureset
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Split the sample indices into training and test subsets (fixed seed)
train, test = train_test_split(np.arange(len(eeg["classes"])), random_state=0)

# Random forest on the cesium features; the grid of tree counts is the powers
# of two from 8 to 1024
rfc_param_grid = {"n_estimators": [2 ** exponent for exponent in range(3, 11)]}
model_cesium = build_model_from_featureset(
    fset_cesium.isel(name=train),
    RandomForestClassifier(random_state=0, max_features="auto"),
    params_to_optimize=rfc_param_grid,
)

# Nearest-neighbors classifiers on the other two feature sets, grid-searching
# the number of neighbors
knn_param_grid = {"n_neighbors": [1, 2, 3, 4]}
model_guo = build_model_from_featureset(
    fset_guo.isel(name=train),
    KNeighborsClassifier(),
    params_to_optimize=knn_param_grid,
)
model_dwt = build_model_from_featureset(
    fset_dwt.isel(name=train),
    KNeighborsClassifier(),
    params_to_optimize=knn_param_grid,
)

###############################################################################
# Prediction
# ----------
# Making predictions for new time series based on these models follows the same
# pattern: first the time series are featurized using ``featurize_time_series``,