def test_get_metadata():
    """Metadata must mirror the dataset configuration, including a resolution override."""
    config = _get_default_dataset_config()
    ds = dataset._get_dataset(config)
    meta = ds.get_metadata()

    # Default configuration: dates, tags and the implicit 10-minute resolution.
    expected = {
        "train_start_date": config["from_ts"],
        "train_end_date": config["to_ts"],
        "tag_list": config["tag_list"],
        "resolution": "10T",
    }
    for key, value in expected.items():
        assert meta[key] == value

    # An explicit resolution in the config must be propagated into the metadata.
    config["resolution"] = "10M"
    meta = dataset._get_dataset(config).get_metadata()
    assert meta["resolution"] == config["resolution"]
def test_get_data_serviceauth_fail(caplog):
    """Fetching data with bogus service-auth credentials must raise an AdalError."""
    config = _get_default_dataset_config()
    config["from_ts"] = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    config["to_ts"] = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")
    config["data_provider"] = DataLakeProvider(
        dl_service_auth_str="TENTANT_UNKNOWN:BOGUS:PASSWORD"
    )
    ds = dataset._get_dataset(config)
    # Raise the log threshold so the expected auth failure does not spam the output.
    with pytest.raises(adal.adal_error.AdalError), caplog.at_level(logging.CRITICAL):
        ds.get_data()
def test_get_data_serviceauth_in_config():
    """Data fetched with credentials from TEST_SERVICE_AUTH is complete and NaN-free."""
    config = _get_default_dataset_config()
    config["data_provider"] = DataLakeProvider(
        dl_service_auth_str=os.getenv("TEST_SERVICE_AUTH")
    )
    config["resolution"] = "10T"
    ds = dataset._get_dataset(config)
    data, _ = ds.get_data()

    # Columns must correspond exactly to the configured tags.
    assert config["tag_list"] == list(data.columns.values)

    expected_rows = 7
    assert (
        len(data) == expected_rows
    ), f"Default resolution 10 minutes should give {expected_rows} rows"
    assert (
        not data.isnull().values.any()
    ), "Resulting dataframe should not have any NaNs"
def test_influx_dataset_attrs(influxdb):
    """
    Test expected attributes of an Influx-backed TimeSeriesDataset.
    """
    from_ts = dateutil.parser.isoparse("2016-01-01T09:11:00+00:00")
    to_ts = dateutil.parser.isoparse("2016-01-01T10:30:00+00:00")
    tag_list = tu.SENSORTAG_LIST
    config = {
        "type": "TimeSeriesDataset",
        "from_ts": from_ts,
        "to_ts": to_ts,
        "tag_list": tag_list,
    }
    config["data_provider"] = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=tu.INFLUXDB_URI, dataframe_client=True),
    )
    # Named `influx_dataset` (not `dataset`) so the module-level `dataset`
    # module used by the other tests in this file is not shadowed here.
    influx_dataset = _get_dataset(config)
    assert hasattr(influx_dataset, "get_metadata")
    metadata = influx_dataset.get_metadata()
    assert isinstance(metadata, dict)
def test_get_data_interactive():
    """
    Smoke test: the interactive-auth data provider can fetch data end-to-end.
    """
    dataset_config = _get_default_dataset_config()
    dataset_config["data_provider"] = DataLakeProvider(interactive=True)
    dl_backed = dataset._get_dataset(dataset_config)
    # get_data() returns an (X, y) pair (see the serviceauth test above); the
    # old code bound the whole tuple and asserted `len(...) >= 0`, which is
    # always true — unpack the frame and make the assertion meaningful.
    data, _ = dl_backed.get_data()
    assert data is not None
    assert len(data) >= 0
def test_init():
    """The default dataset configuration must yield a usable dataset object."""
    config = _get_default_dataset_config()
    ds = dataset._get_dataset(config)
    assert (
        ds is not None
    ), f"Failed to create dataset object of type {config['type']}"
def build_model(
    name: str,
    model_config: dict,
    data_config: Union[GordoBaseDataset, dict],
    metadata: dict,
):
    """
    Build a model and serialize to a directory for later serving.

    Parameters
    ----------
    name: str
        Name of model to be built
    model_config: dict
        Mapping of Model to initialize and any additional kwargs which are
        to be used in it's initialization.
        Example::

          {'type': 'KerasAutoEncoder',
           'kind': 'feedforward_hourglass'}

    data_config: dict
        Mapping of the Dataset to initialize, following the same logic as
        model_config.
    metadata: dict
        Mapping of arbitrary metadata data; nested under the "user-defined"
        key of the returned metadata.

    Returns
    -------
    Tuple[sklearn.base.BaseEstimator, dict]
    """
    # Get the dataset from config
    logger.debug(f"Initializing Dataset with config {data_config}")
    dataset = (
        data_config
        if isinstance(data_config, GordoBaseDataset)
        else _get_dataset(data_config)
    )

    logger.debug("Fetching training data")
    start = time.time()
    X, y = dataset.get_data()
    time_elapsed_data = time.time() - start

    # Get the model and dataset
    logger.debug(f"Initializing Model with config: {model_config}")
    model = serializer.pipeline_from_definition(model_config)

    # Cross validate (fixed: was an f-string with no placeholders)
    logger.debug("Starting to do cross validation")
    start = time.time()
    scores: Dict[str, Any]
    if hasattr(model, "score"):
        cv_scores = cross_val_score(model, X, y, cv=TimeSeriesSplit(n_splits=3))
        scores = {
            "explained-variance": {
                "mean": cv_scores.mean(),
                "std": cv_scores.std(),
                "max": cv_scores.max(),
                "min": cv_scores.min(),
                "raw-scores": cv_scores.tolist(),
            }
        }
    else:
        # Models without a `score` method get an empty score mapping.
        logger.debug("Unable to score model, has no attribute 'score'.")
        scores = dict()
    cv_duration_sec = time.time() - start

    # Train
    logger.debug("Starting to train model.")
    start = time.time()
    model.fit(X, y)
    time_elapsed_model = time.time() - start

    # Assemble the metadata; the caller-supplied mapping is preserved intact
    # under "user-defined" rather than merged at the top level.
    metadata = {"user-defined": metadata}
    metadata["name"] = name
    metadata["dataset"] = dataset.get_metadata()
    utc_dt = datetime.datetime.now(datetime.timezone.utc)
    metadata["model"] = {
        "model-creation-date": str(utc_dt.astimezone()),
        "model-builder-version": __version__,
        "model-config": model_config,
        "data-query-duration-sec": time_elapsed_data,
        "model-training-duration-sec": time_elapsed_model,
        "cross-validation": {"cv-duration-sec": cv_duration_sec, "scores": scores},
    }
    # If the pipeline ends in a GordoBase step, merge its own metadata in.
    gordobase_final_step = _get_final_gordo_base_step(model)
    if gordobase_final_step:
        metadata["model"].update(gordobase_final_step.get_metadata())
    return model, metadata
def build_model(
    name: str,
    model_config: dict,
    data_config: Union[GordoBaseDataset, dict],
    metadata: dict,
    evaluation_config: dict = None,
) -> Tuple[Union[BaseEstimator, None], dict]:
    """
    Build a model and serialize to a directory for later serving.

    Parameters
    ----------
    name: str
        Name of model to be built
    model_config: dict
        Mapping of Model to initialize and any additional kwargs which are
        to be used in it's initialization.
        Example::

          {'type': 'KerasAutoEncoder',
           'kind': 'feedforward_hourglass'}

    data_config: dict
        Mapping of the Dataset to initialize, following the same logic as
        model_config.
    metadata: dict
        Mapping of arbitrary metadata data.
    evaluation_config: dict
        Dict of parameters which are exposed to build_model. Defaults to
        ``{"cv_mode": "full_build"}`` when not supplied (None).
            - cv_mode: str
                String which enables three different modes, represented as a
                key value in evaluation_config:
                * cross_val_only: Only perform cross validation
                * build_only: Skip cross validation and only build the model
                * full_build: Cross validation and full build of the model,
                  default value
                Example::

                    {"cv_mode": "cross_val_only"}

    Returns
    -------
    Tuple[Optional[sklearn.base.BaseEstimator], dict]
    """
    # Mutable default arguments are shared across calls; use None as sentinel.
    if evaluation_config is None:
        evaluation_config = {"cv_mode": "full_build"}
    # Normalize once so every comparison below is case-insensitive.
    # (Previously "Cross_Val_Only" passed the .lower() check that runs CV but
    # failed the exact-match early-return check — and so trained anyway.)
    cv_mode = evaluation_config["cv_mode"].lower()

    # Get the dataset from config
    logger.debug(f"Initializing Dataset with config {data_config}")
    dataset = (
        data_config
        if isinstance(data_config, GordoBaseDataset)
        else _get_dataset(data_config)
    )

    logger.debug("Fetching training data")
    start = time.time()
    X, y = dataset.get_data()
    time_elapsed_data = time.time() - start

    # Get the model and dataset
    logger.debug(f"Initializing Model with config: {model_config}")
    model = serializer.pipeline_from_definition(model_config)

    cv_duration_sec = None

    if cv_mode in ("cross_val_only", "full_build"):
        metrics_list = [
            explained_variance_score,
            r2_score,
            mean_squared_error,
            mean_absolute_error,
        ]

        # Cross validate
        logger.debug("Starting cross validation")
        start = time.time()
        scores: Dict[str, Any] = dict()
        if hasattr(model, "predict"):
            metrics_dict = get_metrics_dict(metrics_list, y)

            cv = cross_validate(
                model,
                X,
                y,
                scoring=metrics_dict,
                return_estimator=True,
                cv=TimeSeriesSplit(n_splits=3),
            )
            # Per metric: aggregate fold statistics plus the raw per-fold values.
            for metric, test_metric in map(lambda k: (k, f"test_{k}"), metrics_dict):
                val = {
                    "fold-mean": cv[test_metric].mean(),
                    "fold-std": cv[test_metric].std(),
                    "fold-max": cv[test_metric].max(),
                    "fold-min": cv[test_metric].min(),
                }
                val.update(
                    {
                        f"fold-{i + 1}": raw_value
                        for i, raw_value in enumerate(cv[test_metric].tolist())
                    }
                )
                scores.update({metric: val})
        else:
            logger.debug("Unable to score model, has no attribute 'predict'.")
            scores = dict()
        cv_duration_sec = time.time() - start

        # If cross_val_only, return the cv_scores and empty model.
        # NOTE(review): in this branch the caller's metadata is NOT nested
        # under "user-defined" as it is for a full build — confirm intended.
        if cv_mode == "cross_val_only":
            metadata["model"] = {
                "cross-validation": {
                    "cv-duration-sec": cv_duration_sec,
                    "scores": scores,
                }
            }
            return None, metadata
    else:
        # Setting cv scores to zero when not used.
        scores = dict()

    # Train
    logger.debug("Starting to train model.")
    start = time.time()
    model.fit(X, y)
    time_elapsed_model = time.time() - start

    metadata = {"user-defined": metadata}
    metadata["name"] = name
    metadata["dataset"] = dataset.get_metadata()
    utc_dt = datetime.datetime.now(datetime.timezone.utc)
    metadata["model"] = {
        "model-offset": _determine_offset(model, X),
        "model-creation-date": str(utc_dt.astimezone()),
        "model-builder-version": __version__,
        "model-config": model_config,
        "data-query-duration-sec": time_elapsed_data,
        "model-training-duration-sec": time_elapsed_model,
        "cross-validation": {"cv-duration-sec": cv_duration_sec, "scores": scores},
    }
    metadata["model"].update(_get_metadata(model))
    return model, metadata