Example #1
def create_test_prediction(dataset, model):
    """Create and yield test prediction, then delete.

    Parameters
    ----------
    dataset : `models.Dataset` instance
        The dataset on which prediction will be performed.
    model : `models.Model` instance
        The model used to generate the prediction.

    """
    with featureset.from_netcdf(model.featureset.file.uri, engine=cfg['xr_engine']) as fset_data:
        model_data = joblib.load(model.file.uri)
        pred_data = predict.model_predictions(fset_data.load(), model_data)
    pred_path = pjoin(cfg['paths']['predictions_folder'],
                      '{}.nc'.format(str(uuid.uuid4())))
    pred_data.to_netcdf(pred_path, engine=cfg['xr_engine'])
    f, created = m.File.create_or_get(uri=pred_path)
    pred = m.Prediction.create(file=f, dataset=dataset, project=dataset.project,
                               model=model, finished=datetime.datetime.now())
    pred.save()
    try:
        yield pred
    finally:
        pred.delete_instance()
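Because create_test_prediction yields inside a try/finally, it is meant to be consumed as a context manager, as the `with` statements in Examples #5 and #7 show. A minimal sketch, assuming the function is wrapped with contextlib.contextmanager:

from contextlib import contextmanager

create_test_prediction = contextmanager(create_test_prediction)

with create_test_prediction(dataset, model) as pred:
    assert pred.dataset == dataset  # use the prediction inside the block
# on exit, the finally clause has run and the Prediction row is deleted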
Example #2
def feature_scatterplot(fset_path, features_to_plot):
    """Create scatter plot of feature set.

    Parameters
    ----------
    fset_path : str
        Path to feature set to be plotted.
    features_to_plot : list of str
        List of feature names to be plotted.

    Returns
    -------
    (fig.data, fig.layout) : tuple
        The data and layout attributes of the `plotly` figure built by
        `FF.create_scatterplotmatrix`.
    """
    with featureset.from_netcdf(fset_path, engine=cfg['xr_engine']) as fset:
        feat_df = fset.to_dataframe()
        feat_df = feat_df[features_to_plot]

        if 'target' in fset:
            feat_df['target'] = fset.target.values
            index = 'target'
        else:
            index = None

    # TODO replace 'trace {i}' with class labels
    fig = FF.create_scatterplotmatrix(feat_df, diag='box', index=index,
                                      height=800, width=800)

    py.plot(fig, auto_open=False, output_type='div')

    return fig.data, fig.layout
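A hedged usage sketch (the path and feature names below are placeholders, not values from the source): the returned (data, layout) pair can be re-embedded in a figure dict and rendered with plotly's offline plot, mirroring the py.plot call inside the function.

from plotly.offline import plot

data, layout = feature_scatterplot('fset.nc', ['amplitude', 'skew'])
div = plot({'data': data, 'layout': layout}, output_type='div',
           auto_open=False)  # an HTML <div> string, ready for embedding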
Example #3
def test_from_netcdf():
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'])
    data_dir = tempfile.mkdtemp()
    fset.to_netcdf(pjoin(data_dir, 'test.nc'))
    loaded = featureset.from_netcdf(pjoin(data_dir, 'test.nc'))
    assert isinstance(loaded, Featureset)
    assert set(fset.data_vars) == set(loaded.data_vars)
    assert set(fset.coords) == set(loaded.coords)
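As written, the test never removes the directory created by tempfile.mkdtemp. A sketch of the same test using tempfile.TemporaryDirectory for automatic cleanup (assuming the loaded Featureset can be closed before the directory disappears):

import tempfile

def test_from_netcdf_cleanup():
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'])
    with tempfile.TemporaryDirectory() as data_dir:
        fset.to_netcdf(pjoin(data_dir, 'test.nc'))
        loaded = featureset.from_netcdf(pjoin(data_dir, 'test.nc'))
        assert isinstance(loaded, Featureset)
        loaded.close()  # release the file handle before the dir is removed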
Example #4
def create_test_model(fset, model_type='RandomForestClassifier'):
    """Create and yield test model, then delete.

    Parameters
    ----------
    fset : `models.Featureset` instance
        The (labeled) feature set from which to build the model.
    model_type : str, optional
        Type of model to build. Defaults to 'RandomForestClassifier'.

    """
    model_params = {
        "RandomForestClassifier": {
            "bootstrap": True,
            "criterion": "gini",
            "oob_score": False,
            "max_features": "auto",
            "n_estimators": 10
        },
        "RandomForestRegressor": {
            "bootstrap": True,
            "criterion": "mse",
            "oob_score": False,
            "max_features": "auto",
            "n_estimators": 10
        },
        "LinearSGDClassifier": {
            "loss": "hinge"
        },
        "LinearRegressor": {
            "fit_intercept": True
        }
    }
    with featureset.from_netcdf(fset.file.uri) as fset_data:
        model_data = build_model.build_model_from_featureset(
            fset_data, model_type=model_type)
        model_path = pjoin(cfg['paths']['models_folder'],
                           '{}.pkl'.format(str(uuid.uuid4())))
        joblib.dump(model_data, model_path)
    f, created = m.File.create_or_get(uri=model_path)
    model = m.Model.create(name='test_model',
                           file=f,
                           featureset=fset,
                           project=fset.project,
                           params=model_params[model_type],
                           type=model_type,
                           finished=datetime.datetime.now())
    model.save()
    try:
        yield model
    finally:
        model.delete_instance()
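Each entry in model_params maps directly onto the keyword arguments of the corresponding scikit-learn constructor. A minimal sketch (note that 'auto' for max_features, like 'mse' for the regressor's criterion, is only valid in older scikit-learn releases):

from sklearn.ensemble import RandomForestClassifier

# equivalent to passing model_params['RandomForestClassifier'] through
clf = RandomForestClassifier(bootstrap=True, criterion='gini',
                             oob_score=False, max_features='auto',
                             n_estimators=10)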
Example #5
def test_prediction_to_csv_class():
    """Test util.prediction_to_csv"""
    with create_test_project() as p, create_test_dataset(p) as ds,\
         create_test_featureset(p) as fs,\
         create_test_model(fs, model_type='LinearSGDClassifier') as m,\
         create_test_prediction(ds, m) as pred:
        pred = featureset.from_netcdf(pred.file.uri)
        assert util.prediction_to_csv(pred) ==\
            [['ts_name', 'true_target', 'prediction'],
             ['0', 'Mira', 'Mira'],
             ['1', 'Classical_Cepheid', 'Classical_Cepheid'],
             ['2', 'Mira', 'Mira'],
             ['3', 'Classical_Cepheid', 'Classical_Cepheid'],
             ['4', 'Mira', 'Mira']]
Example #6
def _build_model_compute_statistics(fset_path, model_type, model_params,
                                    params_to_optimize, model_path):
    '''Build model and return summary statistics.

    Parameters
    ----------
    fset_path : str
        Path to feature set NetCDF file.
    model_type : str
        Type of model to be built, e.g. 'RandomForestClassifier'.
    model_params : dict
        Dictionary with hyperparameter values to be used in model building.
        Keys are parameter names, values are the associated parameter values.
        These hyperparameters will be passed to the model constructor as-is
        (for hyperparameter optimization, see `params_to_optimize`).
    params_to_optimize : dict or list of dict
        During hyperparameter optimization, various model parameters
        are adjusted to give an optimal fit. This dictionary gives the
        different values that should be explored for each parameter. E.g.,
        `{'alpha': [1, 2], 'beta': [4, 5, 6]}` would fit models on all
        6 combinations of alpha and beta and compare the resulting models'
        goodness-of-fit. If None, only the hyperparameters specified in
        `model_params` are used (passed to the model constructor as-is).
    model_path : str
        Path indicating where serialized model will be saved.

    Returns
    -------
    score : float
        The model's training score.
    best_params : dict
        Dictionary of best hyperparameter values (keys are parameter names,
        values are the corresponding best values) determined by `scikit-learn`'s
        `GridSearchCV`. If no hyperparameter optimization is performed (i.e.,
        `params_to_optimize` is None or an empty dict), this will be an empty
        dict.
    '''
    fset = featureset.from_netcdf(fset_path, engine=cfg['xr_engine'])
    computed_model = build_model.build_model_from_featureset(
        featureset=fset,
        model_type=model_type,
        model_parameters=model_params,
        params_to_optimize=params_to_optimize)
    score = build_model.score_model(computed_model, fset)
    best_params = computed_model.best_params_ if params_to_optimize else {}
    joblib.dump(computed_model, model_path)
    fset.close()

    return score, best_params
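The six-combination claim in the docstring can be verified with scikit-learn's ParameterGrid, the same expansion GridSearchCV performs internally:

from sklearn.model_selection import ParameterGrid

grid = list(ParameterGrid({'alpha': [1, 2], 'beta': [4, 5, 6]}))
print(len(grid))  # 6: the Cartesian product of 2 alpha and 3 beta values
print(grid[0])    # {'alpha': 1, 'beta': 4}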
Example #7
def test_prediction_to_csv_regr():
    """Test util.prediction_to_csv"""
    with create_test_project() as p, create_test_dataset(p, label_type='regr') as ds,\
         create_test_featureset(p, label_type='regr') as fs,\
         create_test_model(fs, model_type='LinearRegressor') as m,\
         create_test_prediction(ds, m) as pred:

        pred = featureset.from_netcdf(pred.file.uri)
        results = util.prediction_to_csv(pred)

        assert results[0] == ['ts_name', 'true_target', 'prediction']

        npt.assert_array_almost_equal(
            [[float(e) for e in row] for row in results[1:]],
            [[0, 2.2, 2.2],
             [1, 3.4, 3.4],
             [2, 4.4, 4.4],
             [3, 2.2, 2.2],
             [4, 3.1, 3.1]])
Example #8
def create_test_model(fset, model_type='RandomForestClassifier'):
    """Create and yield test model, then delete.

    Parameters
    ----------
    fset : `models.Featureset` instance
        The (labeled) feature set from which to build the model.
    model_type : str, optional
        Type of model to build. Defaults to 'RandomForestClassifier'.

    """
    model_params = {
        "RandomForestClassifier": {
            "bootstrap": True, "criterion": "gini",
            "oob_score": False, "max_features": "auto",
            "n_estimators": 10},
        "RandomForestRegressor": {
            "bootstrap": True, "criterion": "mse",
            "oob_score": False, "max_features": "auto",
            "n_estimators": 10},
        "LinearSGDClassifier": {
            "loss": "hinge"},
        "LinearRegressor": {
            "fit_intercept": True}}
    with featureset.from_netcdf(fset.file.uri, engine=cfg['xr_engine']) as fset_data:
        model_data = build_model.build_model_from_featureset(fset_data,
                                                             model_type=model_type)
        model_path = pjoin(cfg['paths']['models_folder'],
                           '{}.pkl'.format(str(uuid.uuid4())))
        joblib.dump(model_data, model_path)
    f, created = m.File.create_or_get(uri=model_path)
    model = m.Model.create(name='test_model',
                           file=f, featureset=fset, project=fset.project,
                           params=model_params[model_type], type=model_type,
                           finished=datetime.datetime.now())
    model.save()
    try:
        yield model
    finally:
        model.delete_instance()
Example #9
    def post(self):
        data = self.get_json()

        model_name = data.pop('modelName')
        featureset_id = data.pop('featureSet')
        # TODO remove cast once this is passed properly from the front end
        model_type = sklearn_model_descriptions[int(
            data.pop('modelType'))]['name']
        project_id = data.pop('project')

        fset = Featureset.get(Featureset.id == featureset_id)
        if not fset.is_owned_by(self.get_username()):
            return self.error('No access to featureset')

        if fset.finished is None:
            return self.error('Cannot build model for in-progress feature set')

        model_params = data
        model_params = {
            k: robust_literal_eval(v)
            for k, v in model_params.items()
        }

        model_params, params_to_optimize = check_model_param_types(
            model_type, model_params)
        model_type = model_type.split()[0]
        model_path = pjoin(cfg['paths']['models_folder'],
                           '{}_model.pkl'.format(uuid.uuid4()))

        model_file = File.create(uri=model_path)
        model = Model.create(name=model_name,
                             file=model_file,
                             featureset=fset,
                             project=fset.project,
                             params=model_params,
                             type=model_type)

        executor = yield self._get_executor()

        fset = executor.submit(
            lambda path: featureset.from_netcdf(path, engine=cfg['xr_engine']),
            fset.file.uri)
        imputed_fset = executor.submit(featureset.Featureset.impute, fset)
        computed_model = executor.submit(
            build_model.build_model_from_featureset,
            featureset=imputed_fset,
            model_type=model_type,
            model_parameters=model_params,
            params_to_optimize=params_to_optimize)
        score_future = executor.submit(build_model.score_model, computed_model,
                                       imputed_fset)
        save_future = executor.submit(joblib.dump, computed_model,
                                      model_file.uri)

        @tornado.gen.coroutine
        def _wait_and_call(callback, *args, futures=[]):
            yield _wait(futures)
            return callback(*args)

        model.task_id = save_future.key
        model.save()

        loop = tornado.ioloop.IOLoop.current()
        loop.add_callback(_wait_and_call,
                          xr.Dataset.close,
                          imputed_fset,
                          futures=[computed_model, score_future, save_future])
        loop.spawn_callback(self._await_model, score_future, save_future,
                            model)

        return self.success(data={'message': "Model training begun."},
                            action='cesium/FETCH_MODELS')