def create_test_prediction(dataset, model):
    """Create and yield test prediction, then delete.

    Params
    ------
    dataset : `models.Dataset` instance
        The dataset on which prediction will be performed.
    model : `models.Model` instance
        The model to use to create prediction.
    """
    # Load the model's feature set and run predictions while the netCDF
    # file is open; everything else happens after it is closed.
    with featureset.from_netcdf(model.featureset.file.uri,
                                engine=cfg['xr_engine']) as fset_data:
        sklearn_model = joblib.load(model.file.uri)
        pred_data = predict.model_predictions(fset_data.load(), sklearn_model)

    # Serialize the prediction results under a fresh unique filename.
    pred_path = pjoin(cfg['paths']['predictions_folder'],
                      '{}.nc'.format(str(uuid.uuid4())))
    pred_data.to_netcdf(pred_path, engine=cfg['xr_engine'])

    pred_file, _created = m.File.create_or_get(uri=pred_path)
    pred = m.Prediction.create(file=pred_file, dataset=dataset,
                               project=dataset.project, model=model,
                               finished=datetime.datetime.now())
    pred.save()
    try:
        yield pred
    finally:
        # Always remove the DB record, even if the test body raised.
        pred.delete_instance()
def feature_scatterplot(fset_path, features_to_plot):
    """Create scatter plot of feature set.

    Parameters
    ----------
    fset_path : str
        Path to feature set to be plotted.
    features_to_plot : list of str
        List of feature names to be plotted.

    Returns
    -------
    (fig.data, fig.layout)
        Returns (fig.data, fig.layout) where `fig` is an instance of
        `plotly.tools.FigureFactory`.
    """
    with featureset.from_netcdf(fset_path, engine=cfg['xr_engine']) as fset:
        feat_df = fset.to_dataframe()[features_to_plot]

        # Color points by class label when the feature set is labeled.
        if 'target' in fset:
            feat_df['target'] = fset.target.values
            index = 'target'
        else:
            index = None

        # TODO replace 'trace {i}' with class labels
        fig = FF.create_scatterplotmatrix(feat_df, diag='box', index=index,
                                          height=800, width=800)

        py.plot(fig, auto_open=False, output_type='div')

    return fig.data, fig.layout
def test_from_netcdf():
    """Round-trip a feature set through to_netcdf/from_netcdf.

    Fixes: the original leaked both the `mkdtemp` directory (never
    removed) and the loaded featureset's netCDF file handle (never
    closed); use `TemporaryDirectory` and the context-manager form of
    `from_netcdf` so both are released even on assertion failure.
    """
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'])
    with tempfile.TemporaryDirectory() as data_dir:
        fset.to_netcdf(pjoin(data_dir, 'test.nc'))
        with featureset.from_netcdf(pjoin(data_dir, 'test.nc')) as loaded:
            assert isinstance(loaded, Featureset)
            assert set(fset.data_vars) == set(loaded.data_vars)
            assert set(fset.coords) == set(loaded.coords)
def create_test_model(fset, model_type='RandomForestClassifier'):
    """Create and yield test model, then delete.

    Fixes: `featureset.from_netcdf` was called without
    `engine=cfg['xr_engine']`, unlike every other `from_netcdf` call in
    this codebase; pass it explicitly for consistency.

    Params
    ------
    fset : `models.Featureset` instance
        The (labeled) feature set from which to build the model.
    model_type : str, optional
        String indicating type of model to build. Defaults to
        'RandomForestClassifier'.
    """
    # Fixed hyperparameters for each supported model type.
    model_params = {
        "RandomForestClassifier": {
            "bootstrap": True, "criterion": "gini",
            "oob_score": False, "max_features": "auto",
            "n_estimators": 10},
        "RandomForestRegressor": {
            "bootstrap": True, "criterion": "mse",
            "oob_score": False, "max_features": "auto",
            "n_estimators": 10},
        "LinearSGDClassifier": {
            "loss": "hinge"},
        "LinearRegressor": {
            "fit_intercept": True}}
    with featureset.from_netcdf(fset.file.uri,
                                engine=cfg['xr_engine']) as fset_data:
        model_data = build_model.build_model_from_featureset(
            fset_data, model_type=model_type)

    # Persist the fitted model under a fresh unique filename.
    model_path = pjoin(cfg['paths']['models_folder'],
                       '{}.pkl'.format(str(uuid.uuid4())))
    joblib.dump(model_data, model_path)
    f, created = m.File.create_or_get(uri=model_path)
    model = m.Model.create(name='test_model', file=f, featureset=fset,
                           project=fset.project,
                           params=model_params[model_type], type=model_type,
                           finished=datetime.datetime.now())
    model.save()
    try:
        yield model
    finally:
        # Always remove the DB record, even if the test body raised.
        model.delete_instance()
def test_prediction_to_csv_class():
    """Test util.prediction_to_csv for classification output.

    Fixes: the original rebound `pred` to the result of
    `featureset.from_netcdf` and never closed it, leaking the netCDF
    file handle (and shadowing the Prediction instance); use the
    context-manager form instead.
    """
    with create_test_project() as p, create_test_dataset(p) as ds,\
         create_test_featureset(p) as fs,\
         create_test_model(fs, model_type='LinearSGDClassifier') as m,\
         create_test_prediction(ds, m) as pred:
        with featureset.from_netcdf(pred.file.uri) as pred_data:
            assert util.prediction_to_csv(pred_data) ==\
                [['ts_name', 'true_target', 'prediction'],
                 ['0', 'Mira', 'Mira'],
                 ['1', 'Classical_Cepheid', 'Classical_Cepheid'],
                 ['2', 'Mira', 'Mira'],
                 ['3', 'Classical_Cepheid', 'Classical_Cepheid'],
                 ['4', 'Mira', 'Mira']]
def _build_model_compute_statistics(fset_path, model_type, model_params,
                                    params_to_optimize, model_path):
    '''Build model and return summary statistics.

    Fixes: `fset.close()` was only reached on the success path, so any
    exception during model building, scoring, or serialization leaked
    the open netCDF handle; the feature set is now opened with a `with`
    block so it is always closed.

    Parameters
    ----------
    fset_path : str
        Path to feature set NetCDF file.
    model_type : str
        Type of model to be built, e.g. 'RandomForestClassifier'.
    model_params : dict
        Dictionary with hyperparameter values to be used in model building.
        Keys are parameter names, values are the associated parameter values.
        These hyperparameters will be passed to the model constructor as-is
        (for hyperparameter optimization, see `params_to_optimize`).
    params_to_optimize : dict or list of dict
        During hyperparameter optimization, various model parameters
        are adjusted to give an optimal fit. This dictionary gives the
        different values that should be explored for each parameter. E.g.,
        `{'alpha': [1, 2], 'beta': [4, 5, 6]}` would fit models on all
        6 combinations of alpha and beta and compare the resulting models'
        goodness-of-fit. If None, only those hyperparameters specified in
        `model_parameters` will be used (passed to model constructor as-is).
    model_path : str
        Path indicating where serialized model will be saved.

    Returns
    -------
    score : float
        The model's training score.
    best_params : dict
        Dictionary of best hyperparameter values (keys are parameter names,
        values are the corresponding best values) determined by
        `scikit-learn`'s `GridSearchCV`. If no hyperparameter optimization
        is performed (i.e. `params_to_optimize` is None or is an empty
        dict, this will be an empty dict.
    '''
    with featureset.from_netcdf(fset_path, engine=cfg['xr_engine']) as fset:
        computed_model = build_model.build_model_from_featureset(
            featureset=fset, model_type=model_type,
            model_parameters=model_params,
            params_to_optimize=params_to_optimize)
        score = build_model.score_model(computed_model, fset)
        best_params = computed_model.best_params_ if params_to_optimize else {}
        joblib.dump(computed_model, model_path)
    return score, best_params
def test_prediction_to_csv_regr():
    """Test util.prediction_to_csv for regression output.

    Fixes: the original rebound `pred` to the result of
    `featureset.from_netcdf` and never closed it, leaking the netCDF
    file handle (and shadowing the Prediction instance); use the
    context-manager form instead.
    """
    with create_test_project() as p,\
         create_test_dataset(p, label_type='regr') as ds,\
         create_test_featureset(p, label_type='regr') as fs,\
         create_test_model(fs, model_type='LinearRegressor') as m,\
         create_test_prediction(ds, m) as pred:
        with featureset.from_netcdf(pred.file.uri) as pred_data:
            results = util.prediction_to_csv(pred_data)
        assert results[0] == ['ts_name', 'true_target', 'prediction']
        npt.assert_array_almost_equal(
            [[float(e) for e in row] for row in results[1:]],
            [[0, 2.2, 2.2],
             [1, 3.4, 3.4],
             [2, 4.4, 4.4],
             [3, 2.2, 2.2],
             [4, 3.1, 3.1]])
def create_test_model(fset, model_type='RandomForestClassifier'):
    """Create and yield test model, then delete.

    Params
    ------
    fset : `models.Featureset` instance
        The (labeled) feature set from which to build the model.
    model_type : str, optional
        String indicating type of model to build. Defaults to
        'RandomForestClassifier'.
    """
    # Fixed hyperparameters for each supported model type.
    model_params = {
        "RandomForestClassifier": {"bootstrap": True,
                                   "criterion": "gini",
                                   "oob_score": False,
                                   "max_features": "auto",
                                   "n_estimators": 10},
        "RandomForestRegressor": {"bootstrap": True,
                                  "criterion": "mse",
                                  "oob_score": False,
                                  "max_features": "auto",
                                  "n_estimators": 10},
        "LinearSGDClassifier": {"loss": "hinge"},
        "LinearRegressor": {"fit_intercept": True}}

    # Fit the model while the feature-set file is open, then close it.
    with featureset.from_netcdf(fset.file.uri,
                                engine=cfg['xr_engine']) as fset_data:
        fitted = build_model.build_model_from_featureset(
            fset_data, model_type=model_type)

    # Persist the fitted model under a fresh unique filename.
    model_path = pjoin(cfg['paths']['models_folder'],
                       '{}.pkl'.format(str(uuid.uuid4())))
    joblib.dump(fitted, model_path)

    model_file, _created = m.File.create_or_get(uri=model_path)
    model = m.Model.create(name='test_model', file=model_file,
                           featureset=fset, project=fset.project,
                           params=model_params[model_type],
                           type=model_type,
                           finished=datetime.datetime.now())
    model.save()
    try:
        yield model
    finally:
        # Always remove the DB record, even if the test body raised.
        model.delete_instance()
def post(self):
    """Handle a model-creation POST request.

    Validates access to the requested feature set, parses model
    parameters, records the new Model, and schedules asynchronous
    building/scoring/serialization on the executor.

    Fixes: the inner coroutine `_wait_and_call` awaited the undefined
    name `futures_list` (its parameter is `futures`), which raised
    NameError when the cleanup callback ran and left the imputed
    feature set unclosed.
    """
    data = self.get_json()
    model_name = data.pop('modelName')
    featureset_id = data.pop('featureSet')
    # TODO remove cast once this is passed properly from the front end
    model_type = sklearn_model_descriptions[
        int(data.pop('modelType'))]['name']
    project_id = data.pop('project')

    fset = Featureset.get(Featureset.id == featureset_id)
    if not fset.is_owned_by(self.get_username()):
        return self.error('No access to featureset')
    if fset.finished is None:
        return self.error('Cannot build model for in-progress feature set')

    # Remaining request fields are model hyperparameters; values arrive
    # as strings and must be parsed/validated.
    model_params = data
    model_params = {k: robust_literal_eval(v)
                    for k, v in model_params.items()}
    model_params, params_to_optimize = check_model_param_types(
        model_type, model_params)
    model_type = model_type.split()[0]

    model_path = pjoin(cfg['paths']['models_folder'],
                       '{}_model.pkl'.format(uuid.uuid4()))
    model_file = File.create(uri=model_path)
    model = Model.create(name=model_name, file=model_file, featureset=fset,
                         project=fset.project, params=model_params,
                         type=model_type)

    executor = yield self._get_executor()

    # Chain of futures: load -> impute -> build -> (score, save).
    fset = executor.submit(
        lambda path: featureset.from_netcdf(path, engine=cfg['xr_engine']),
        fset.file.uri)
    imputed_fset = executor.submit(featureset.Featureset.impute, fset)
    computed_model = executor.submit(
        build_model.build_model_from_featureset,
        featureset=imputed_fset, model_type=model_type,
        model_parameters=model_params,
        params_to_optimize=params_to_optimize)
    score_future = executor.submit(build_model.score_model, computed_model,
                                   imputed_fset)
    save_future = executor.submit(joblib.dump, computed_model,
                                  model_file.uri)

    @tornado.gen.coroutine
    def _wait_and_call(callback, *args, futures=()):
        # Was `yield _wait(futures_list)` — an undefined name (NameError).
        yield _wait(futures)
        return callback(*args)

    model.task_id = save_future.key
    model.save()

    loop = tornado.ioloop.IOLoop.current()
    # Close the imputed feature set only once every consumer is done.
    loop.add_callback(_wait_and_call, xr.Dataset.close, imputed_fset,
                      futures=[computed_model, score_future, save_future])
    loop.spawn_callback(self._await_model, score_future, save_future, model)

    return self.success(data={'message': "Model training begun."},
                        action='cesium/FETCH_MODELS')