Example #1
0
def test_get_metadata():
    """Metadata reported by a dataset must round-trip the config it was built from."""
    config = _get_default_dataset_config()
    meta = dataset._get_dataset(config).get_metadata()

    # Dates and tag list are copied straight from the config into metadata.
    expected = {
        "train_start_date": config["from_ts"],
        "train_end_date": config["to_ts"],
        "tag_list": config["tag_list"],
    }
    for key, value in expected.items():
        assert meta[key] == value
    # Default resolution when the config does not specify one.
    assert meta["resolution"] == "10T"

    # An explicit resolution must be reflected verbatim.
    config["resolution"] = "10M"
    meta = dataset._get_dataset(config).get_metadata()
    assert meta["resolution"] == config["resolution"]
Example #2
0
def test_get_data_serviceauth_fail(caplog):
    """A bogus service-auth string must make get_data raise an AdalError."""
    config = _get_default_dataset_config()
    config["from_ts"] = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    config["to_ts"] = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")
    config["data_provider"] = DataLakeProvider(
        dl_service_auth_str="TENTANT_UNKNOWN:BOGUS:PASSWORD"
    )

    dl_backed = dataset._get_dataset(config)

    # Suppress the expected auth-failure log noise while asserting the raise.
    with caplog.at_level(logging.CRITICAL), pytest.raises(adal.adal_error.AdalError):
        dl_backed.get_data()
Example #3
0
def test_get_data_serviceauth_in_config():
    """Service auth taken from the environment yields the expected dataframe."""
    config = _get_default_dataset_config()
    config["data_provider"] = DataLakeProvider(
        dl_service_auth_str=os.getenv("TEST_SERVICE_AUTH")
    )
    config["resolution"] = "10T"
    frame, _ = dataset._get_dataset(config).get_data()

    # Columns must line up exactly with the configured tag list.
    assert list(frame.columns.values) == config["tag_list"]

    expected_rows = 7
    assert (
        len(frame) == expected_rows
    ), f"Default resolution 10 minutes should give {expected_rows} rows"

    assert (
        not frame.isnull().values.any()
    ), "Resulting dataframe should not have any NaNs"
Example #4
0
def test_influx_dataset_attrs(influxdb):
    """
    An influx-backed TimeSeriesDataset exposes get_metadata and returns a dict.
    """
    config = {
        "type": "TimeSeriesDataset",
        "from_ts": dateutil.parser.isoparse("2016-01-01T09:11:00+00:00"),
        "to_ts": dateutil.parser.isoparse("2016-01-01T10:30:00+00:00"),
        "tag_list": tu.SENSORTAG_LIST,
        "data_provider": InfluxDataProvider(
            measurement="sensors",
            value_name="Value",
            client=influx_client_from_uri(uri=tu.INFLUXDB_URI, dataframe_client=True),
        ),
    }
    ds = _get_dataset(config)
    assert hasattr(ds, "get_metadata")

    # Metadata must at minimum be a mapping.
    assert isinstance(ds.get_metadata(), dict)
Example #5
0
def test_get_data_interactive():
    """Smoke test: interactive data-lake auth lets get_data complete."""
    config = _get_default_dataset_config()
    config["data_provider"] = DataLakeProvider(interactive=True)
    result = dataset._get_dataset(config).get_data()
    # Trivially-true size check: we only care that get_data did not raise.
    assert len(result) >= 0
Example #6
0
def test_init():
    """A dataset object can be constructed from the default config."""
    config = _get_default_dataset_config()
    instance = dataset._get_dataset(config)
    assert (
        instance is not None
    ), f"Failed to create dataset object of type {config['type']}"
Example #7
0
def build_model(
    name: str,
    model_config: dict,
    data_config: Union[GordoBaseDataset, dict],
    metadata: dict,
):
    """
    Build a model and serialize to a directory for later serving.

    Parameters
    ----------
    name: str
        Name of model to be built
    model_config: dict
        Mapping of Model to initialize and any additional kwargs which are to be used in it's initialization.
        Example::

          {'type': 'KerasAutoEncoder',
           'kind': 'feedforward_hourglass'}

    data_config: dict
        Mapping of the Dataset to initialize, following the same logic as model_config.
    metadata: dict
        Mapping of arbitrary metadata data.

    Returns
    -------
        Tuple[sklearn.base.BaseEstimator, dict]
    """
    # Resolve the dataset: either already-constructed, or built from config.
    logger.debug(f"Initializing Dataset with config {data_config}")
    if isinstance(data_config, GordoBaseDataset):
        dataset = data_config
    else:
        dataset = _get_dataset(data_config)

    logger.debug("Fetching training data")
    fetch_started = time.time()
    X, y = dataset.get_data()
    time_elapsed_data = time.time() - fetch_started

    # Instantiate the model pipeline from its declarative config.
    logger.debug(f"Initializing Model with config: {model_config}")
    model = serializer.pipeline_from_definition(model_config)

    # Cross validate (only models exposing `score` can be scored).
    logger.debug("Starting to do cross validation")
    cv_started = time.time()

    scores: Dict[str, Any]
    if not hasattr(model, "score"):
        logger.debug("Unable to score model, has no attribute 'score'.")
        scores = dict()
    else:
        cv_scores = cross_val_score(model, X, y, cv=TimeSeriesSplit(n_splits=3))
        scores = {
            "explained-variance": {
                "mean": cv_scores.mean(),
                "std": cv_scores.std(),
                "max": cv_scores.max(),
                "min": cv_scores.min(),
                "raw-scores": cv_scores.tolist(),
            }
        }

    cv_duration_sec = time.time() - cv_started

    # Final fit on the full training window.
    logger.debug("Starting to train model.")
    train_started = time.time()
    model.fit(X, y)
    time_elapsed_model = time.time() - train_started

    # Assemble the metadata payload; caller-supplied metadata is nested
    # under "user-defined" so it cannot collide with builder-owned keys.
    metadata = {"user-defined": metadata}
    metadata["name"] = name
    metadata["dataset"] = dataset.get_metadata()
    utc_dt = datetime.datetime.now(datetime.timezone.utc)
    metadata["model"] = {
        "model-creation-date": str(utc_dt.astimezone()),
        "model-builder-version": __version__,
        "model-config": model_config,
        "data-query-duration-sec": time_elapsed_data,
        "model-training-duration-sec": time_elapsed_model,
        "cross-validation": {
            "cv-duration-sec": cv_duration_sec,
            "scores": scores
        },
    }

    # Merge in metadata from a GordoBase final pipeline step, if present.
    final_step = _get_final_gordo_base_step(model)
    if final_step:
        metadata["model"].update(final_step.get_metadata())

    return model, metadata
def build_model(
    name: str,
    model_config: dict,
    data_config: Union[GordoBaseDataset, dict],
    metadata: dict,
    evaluation_config: Union[dict, None] = None,
) -> Tuple[Union[BaseEstimator, None], dict]:
    """
    Build a model and serialize to a directory for later serving.

    Parameters
    ----------
    name: str
        Name of model to be built
    model_config: dict
        Mapping of Model to initialize and any additional kwargs which are to be used in it's initialization.
        Example::

          {'type': 'KerasAutoEncoder',
           'kind': 'feedforward_hourglass'}

    data_config: dict
        Mapping of the Dataset to initialize, following the same logic as model_config.
    metadata: dict
        Mapping of arbitrary metadata data.
    evaluation_config: Optional[dict]
        Dict of parameters which are exposed to build_model. Defaults to
        ``{"cv_mode": "full_build"}`` when ``None`` is supplied.
            - cv_mode: str (case-insensitive)
                String which enables three different modes, represented as a key value in evaluation_config:
                * cross_val_only: Only perform cross validation
                * build_only: Skip cross validation and only build the model
                * full_build: Cross validation and full build of the model, default value
                Example::

                    {"cv_mode": "cross_val_only"}


    Returns
    -------
        Tuple[Optional[sklearn.base.BaseEstimator], dict]
    """
    # Avoid a mutable default argument; fall back to the documented default.
    if evaluation_config is None:
        evaluation_config = {"cv_mode": "full_build"}
    # Normalize once so every comparison below is case-insensitive; the
    # original compared the raw value for cross_val_only, so mixed-case
    # input would cross-validate AND build.
    cv_mode = evaluation_config.get("cv_mode", "full_build").lower()

    # Get the dataset from config
    logger.debug(f"Initializing Dataset with config {data_config}")

    dataset = (data_config if isinstance(data_config, GordoBaseDataset) else
               _get_dataset(data_config))

    logger.debug("Fetching training data")
    start = time.time()

    X, y = dataset.get_data()

    time_elapsed_data = time.time() - start

    # Get the model and dataset
    logger.debug(f"Initializing Model with config: {model_config}")
    model = serializer.pipeline_from_definition(model_config)

    cv_duration_sec = None

    if cv_mode in ("cross_val_only", "full_build"):
        metrics_list = [
            explained_variance_score,
            r2_score,
            mean_squared_error,
            mean_absolute_error,
        ]
        # Cross validate (only models exposing `predict` can be scored).
        logger.debug("Starting cross validation")
        start = time.time()
        scores: Dict[str, Any] = dict()
        if hasattr(model, "predict"):

            metrics_dict = get_metrics_dict(metrics_list, y)

            cv = cross_validate(
                model,
                X,
                y,
                scoring=metrics_dict,
                return_estimator=True,
                cv=TimeSeriesSplit(n_splits=3),
            )
            # Summarize each metric across folds plus the per-fold values.
            for metric, test_metric in map(lambda k: (k, f"test_{k}"),
                                           metrics_dict):
                val = {
                    "fold-mean": cv[test_metric].mean(),
                    "fold-std": cv[test_metric].std(),
                    "fold-max": cv[test_metric].max(),
                    "fold-min": cv[test_metric].min(),
                }
                val.update({
                    f"fold-{i + 1}": raw_value
                    for i, raw_value in enumerate(cv[test_metric].tolist())
                })
                scores.update({metric: val})

        else:
            logger.debug("Unable to score model, has no attribute 'predict'.")
            scores = dict()

        cv_duration_sec = time.time() - start

        # If cross_val_only, return the cv_scores and no model.
        if cv_mode == "cross_val_only":
            metadata["model"] = {
                "cross-validation": {
                    "cv-duration-sec": cv_duration_sec,
                    "scores": scores,
                }
            }
            return None, metadata
    else:
        # build_only: no cross validation was performed.
        scores = dict()
    # Train
    logger.debug("Starting to train model.")
    start = time.time()
    model.fit(X, y)
    time_elapsed_model = time.time() - start

    # Caller-supplied metadata is nested under "user-defined" so it cannot
    # collide with builder-owned keys.
    metadata = {"user-defined": metadata}
    metadata["name"] = name
    metadata["dataset"] = dataset.get_metadata()
    utc_dt = datetime.datetime.now(datetime.timezone.utc)
    metadata["model"] = {
        "model-offset": _determine_offset(model, X),
        "model-creation-date": str(utc_dt.astimezone()),
        "model-builder-version": __version__,
        "model-config": model_config,
        "data-query-duration-sec": time_elapsed_data,
        "model-training-duration-sec": time_elapsed_model,
        "cross-validation": {
            "cv-duration-sec": cv_duration_sec,
            "scores": scores
        },
    }

    metadata["model"].update(_get_metadata(model))
    return model, metadata