Example 1
    def load_derived(db_root, db, key):  # pragma: io
        """Load a dataset under a key name in the database as derived data. This is the inverse of :func:`.save_derived`.

        Parameters
        ----------
        db_root : str
            Absolute path to the database.
        db : str
            The name of the database.
        key : str
            The variable name in the database for the data.

        Returns
        -------
        data : :class:`xarray:xarray.Dataset`
            An :class:`xarray:xarray.Dataset` variable for the derived data from experiments.
        meta : json-serializable
            Meta-data associated with the experiments; this can be anything json-serializable.
        """
        XRSerializer._validate(db_root, keys=[key], db=db)

        fname = XRSerializer._key_to_fname(key)
        path = (db_root, db, _DERIVED_DIR, fname)
        with open(join_safe_r(*path), "r") as f:
            data, meta = _load_xr(f)
        return data, meta
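
A minimal usage sketch for the above (the path, database name, and key are hypothetical, and ``XRSerializer`` with its helpers is assumed to be importable from the surrounding module):

    # Hypothetical names; a real database would be laid out by save_derived
    data, meta = XRSerializer.load_derived("/tmp/db_root", db="demo_db", key="mean_score")
    print(data)  # an xarray.Dataset holding the derived results
    print(meta)  # whatever json-serializable meta-data was saved alongside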
Example 2
    def load(db_root, db, key, uuid_):  # pragma: io
        """Load a dataset under a key name in the database. This is the inverse of :func:`.save`.

        Parameters
        ----------
        db_root : str
            Absolute path to the database.
        db : str
            The name of the database.
        key : str
            The variable name in the database for the data.
        uuid_ : uuid.UUID
            The UUID to represent the version of this variable we want to load.

        Returns
        -------
        data : :class:`xarray:xarray.Dataset`
            An :class:`xarray:xarray.Dataset` variable for the non-derived data from an experiment.
        meta : json-serializable
            Meta-data associated with the experiment; this can be anything json-serializable.
        """
        XRSerializer._validate(db_root, keys=[key], db=db)

        fname = XRSerializer._uuid_to_fname(uuid_)
        path = (db_root, db, key, fname)
        with open(join_safe_r(*path), "r") as f:
            ds, meta = _load_xr(f)
        return ds, meta
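
Likewise, a usage sketch for this loader; the UUID would normally be one recorded when :func:`.save` wrote the data (all names here are hypothetical):

    import uuid

    # Hypothetical version identifier of the saved variable
    run_id = uuid.UUID("12345678-1234-5678-1234-567812345678")
    ds, meta = XRSerializer.load("/tmp/db_root", db="demo_db", key="eval_log", uuid_=run_id)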
Example 3
def _csv_loader(dataset_name, return_X_y, data_root, clip_x=100):  # pragma: io
    """Load custom csv files for use in the benchmark.

    This function assumes ``dataset_name + ".csv"`` is a csv file found in the `data_root` path.  It also assumes the
    last column of the csv file is the target and the other columns are features.

    The target column should be `int` for classification and `float` for regression. Column names ending in ``"_cat"``
    are assumed to be categorical and will be one-hot encoded.

    The features (and target for regression) are robust standardized. The features are also clipped to be in
    ``[-clip_x, clip_x]`` *after* standardization.
    """
    assert return_X_y, "Only returning (X,y) tuple supported right now."
    assert clip_x >= 0

    # Quantile range for robust standardization. The 86% range is the most efficient for Gaussians. See:
    # https://github.com/scikit-learn/scikit-learn/issues/10139#issuecomment-344705040
    q_level = 0.86
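    # (the central 86% of the data corresponds to the 7%-93% quantile range)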

    path = join_safe_r(data_root, dataset_name + ".csv")

    # For now, use the convention that the problem type can be inferred from the dataset name
    problem_type = get_problem_type(dataset_name)

    # We assume there is no missing data in the source csv files at the
    # moment; any missing values will result in an error.
    df = pd.read_csv(path,
                     header=0,
                     index_col=False,
                     engine="c",
                     na_filter=False,
                     true_values=["true"],
                     false_values=["false"])

    label = df.columns[-1]  # Assume last col is target

    target = df.pop(label).values
    if problem_type == ProblemType.clf:
        # all classification problems seen so far are binary
        assert target.dtype == np.bool_
        target = target.astype(np.int_)  # convert to int for skl
    if problem_type == ProblemType.reg:
        assert target.dtype == np.float_
        # 86% range is the most efficient (at least for Gaussians)
        target = robust_standardize(target, q_level=q_level)

    # One-hot encode any categorical variables (object dtype or column names ending in "_cat")
    cat_cols = sorted(cc for cc in df.columns
                      if cc.endswith("_cat") or df[cc].dtype.kind == "O")
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=np.float_)
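    # e.g., a "color_cat" column with values {"b", "g", "r"} becomes two 0/1
    # columns "color_cat_g" and "color_cat_r"; the first level "b" is dropped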
    # Could also sort all columns to be sure the result is reproducible

    # Everything should now be in float
    assert (df.dtypes == np.float_).all()

    data = df.values
    data = robust_standardize(data, q_level=q_level)
    # Debatable if we should include this, but there are a lot of outliers
    data = np.clip(data, -clip_x, clip_x)

    # We should probably do some logging or something to wrap up
    return data, target, problem_type
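
The ``robust_standardize`` helper is defined elsewhere in the benchmark; below is a minimal illustrative sketch (not the benchmark's implementation) of what robust standardization with ``q_level=0.86`` does: center each column on its median and rescale by the central 86% quantile range, normalized so Gaussian-distributed columns come out with roughly unit variance:

    import numpy as np
    from scipy.stats import norm

    def robust_standardize_sketch(X, q_level=0.86):
        """Illustrative stand-in for the robust_standardize helper used above."""
        lo, hi = 0.5 * (1.0 - q_level), 0.5 * (1.0 + q_level)
        q_lo, q_hi = np.quantile(X, [lo, hi], axis=0)
        # Width of the same central quantile range under a standard Gaussian,
        # so Gaussian-distributed columns end up with ~unit variance
        gauss_width = norm.ppf(hi) - norm.ppf(lo)
        return (X - np.median(X, axis=0)) * (gauss_width / (q_hi - q_lo))

    X_std = robust_standardize_sketch(np.random.randn(100, 3))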