Beispiel #1
0
def test_is_nested_dataframe(n_instances, n_columns, n_timepoints):
    """Check that is_nested_dataframe accepts nested frames and rejects others.

    A 3D NumPy array and a multi-index DataFrame must be rejected; a nested
    DataFrame and a frame that concatenates zero-valued columns with nested
    columns must be accepted.
    """
    # Build the candidates (random draws happen in this order).
    X_3d = np.random.normal(size=(n_instances, n_columns, n_timepoints))
    X_nested, _ = make_classification_problem(n_instances, n_columns, n_timepoints)
    zeros = pd.DataFrame(np.zeros_like(X_nested))
    X_mixed = pd.concat([zeros, X_nested], axis=1)
    X_mi = make_multi_index_dataframe(
        n_instances=n_instances, n_timepoints=n_timepoints, n_columns=n_columns
    )

    # Containers that are not nested DataFrames.
    for rejected in (X_3d, X_mi):
        assert not is_nested_dataframe(rejected)

    # Fully nested and mixed nested/primitive frames.
    for accepted in (X_nested, X_mixed):
        assert is_nested_dataframe(accepted)
Beispiel #2
0
def test_from_2d_array_to_nested(n_instances, n_columns, n_timepoints):
    """Check that from_2d_array_to_nested yields a single-column nested frame."""
    # One row per instance, one value per timepoint.
    shape_2d = (n_instances, n_timepoints)
    X_2d = np.random.default_rng().standard_normal(shape_2d)

    converted = from_2d_array_to_nested(X_2d)

    assert is_nested_dataframe(converted)
    # The whole 2D array ends up in exactly one nested column.
    assert converted.shape == (n_instances, 1)
def check_panel_to_panel_transform_univariate(Estimator):
    """Check the output container of a panel-to-panel transform.

    The result of ``_construct_fit_transform`` must be a pandas DataFrame or
    a NumPy array with one entry per instance along the first axis. Arrays
    must be 3-dimensional and DataFrames must be nested.
    """
    n_instances = 5
    result = _construct_fit_transform(Estimator, n_instances=n_instances)

    assert isinstance(result, (pd.DataFrame, np.ndarray))
    assert result.shape[0] == n_instances

    # Container-specific requirements.
    if isinstance(result, np.ndarray):
        assert result.ndim == 3
    if isinstance(result, pd.DataFrame):
        assert is_nested_dataframe(result)
Beispiel #4
0
def test_from_long_to_nested(n_instances, n_columns, n_timepoints):
    """Check that from_long_to_nested returns a nested frame of expected shape."""
    long_table = generate_example_long_table(
        num_cases=n_instances, series_len=n_timepoints, num_dims=n_columns
    )

    converted = from_long_to_nested(long_table)
    expected_shape = (n_instances, n_columns)

    assert is_nested_dataframe(converted)
    assert converted.shape == expected_shape
Beispiel #5
0
def test_from_3d_numpy_to_nested(n_instances, n_columns, n_timepoints):
    """Check container type, shape and values of from_3d_numpy_to_nested."""
    X_3d = np.random.normal(size=(n_instances, n_columns, n_timepoints))
    X_nested = from_3d_numpy_to_nested(X_3d)

    # Container type and outer/inner shapes.
    assert is_nested_dataframe(X_nested)
    assert X_nested.shape == (n_instances, n_columns)
    first_cell = X_nested.iloc[0, 0]
    assert first_cell.shape[0] == n_timepoints

    # Spot-check one series against the matching slice of the source array.
    np.testing.assert_array_equal(X_nested.iloc[1, 0], X_3d[1, 0, :])
Beispiel #6
0
def test_from_multi_index_to_nested(n_instances, n_columns, n_timepoints):
    """Check that from_multi_index_to_nested keeps shape and column labels."""
    X_mi = make_multi_index_dataframe(
        n_instances=n_instances, n_timepoints=n_timepoints, n_columns=n_columns
    )

    X_nested = from_multi_index_to_nested(
        X_mi, instance_index="case_id", cells_as_numpy=False
    )

    assert is_nested_dataframe(X_nested)
    assert X_nested.shape == (n_instances, n_columns)
    # Column labels must carry over unchanged from the multi-index frame.
    same_columns = X_nested.columns == X_mi.columns
    assert same_columns.all()
Beispiel #7
0
def check_panel_to_panel_transform_multivariate(Estimator):
    """Check a panel-to-panel transform on input with three columns.

    Estimators whose "univariate-only" class tag is truthy are handed to
    ``_check_raises_error`` (presumably asserting that they reject
    multivariate input -- confirm against that helper). All other estimators
    must return a pandas DataFrame or NumPy array with one entry per
    instance; arrays must be 3-dimensional and DataFrames must be nested.
    """
    n_instances = 5
    n_columns = 3

    if Estimator.get_class_tag("univariate-only", False):
        _check_raises_error(
            Estimator, n_instances=n_instances, n_columns=n_columns
        )
        return

    result = _construct_fit_transform(
        Estimator, n_instances=n_instances, n_columns=n_columns
    )
    assert isinstance(result, (pd.DataFrame, np.ndarray))
    assert result.shape[0] == n_instances

    # Container-specific requirements.
    if isinstance(result, np.ndarray):
        assert result.ndim == 3
    if isinstance(result, pd.DataFrame):
        assert is_nested_dataframe(result)
Beispiel #8
0
def from_nested_to_3d_numpy(X):
    """Convert a nested pandas Panel into a 3D NumPy Panel.

    Takes a nested pandas DataFrame (time series stored inside the cells)
    and returns a NumPy ndarray with shape
    (n_instances, n_columns, n_timepoints).

    Parameters
    ----------
    X : pd.DataFrame
        Nested pandas DataFrame.

    Returns
    -------
    X_3d : np.ndarray
        3-dimensional NumPy array.

    Raises
    ------
    ValueError
        If ``is_nested_dataframe(X)`` is False.
    """
    if not is_nested_dataframe(X):
        raise ValueError("Input DataFrame is not a nested DataFrame")

    column_is_nested = [*are_columns_nested(X)]

    # Mixed frame: at least one column is primitive (non-nested). Go through
    # the multi-index representation, in which the primitive value is
    # repeated for each timepoint, and convert that to a 3D array.
    if column_is_nested.count(True) != len(column_is_nested):
        as_multi_index = from_nested_to_multi_index(X)
        return from_multi_index_to_3d_numpy(
            as_multi_index, instance_index="instance", time_index="timepoints"
        )

    # Every column is nested: turn each cell into a NumPy array, stack the
    # cells of each row, then stack the rows.
    cells_as_numpy = X.applymap(_convert_series_cell_to_numpy)
    stacked_rows = cells_as_numpy.apply(lambda row: np.stack(row), axis=1)
    return np.stack(stacked_rows.to_numpy())
Beispiel #9
0
def _compare_nested_frame(func, x, y, **kwargs):
    """Compare two nested pd.DataFrames.

    Parameters
    ----------
    func : function
        Function from np.testing for comparing arrays.
    x : pd.DataFrame
    y : pd.DataFrame
    kwargs : dict
        Keyword argument for function

    Raises
    ------
    AssertionError
        If x and y are not equal
    """
    # We iterate over columns and rows to make cell-wise comparisons.
    # Tabularizing the data first would simplify this, but does not
    # work for unequal length data.

    # In rare cases, x and y may be empty (e.g. TSFreshRelevantFeatureExtractor) and
    # we cannot compare individual cells, so we simply check if everything else is
    # equal here.
    assert isinstance(x, pd.DataFrame)
    if x.empty:
        assert_frame_equal(x, y)

    elif is_nested_dataframe(x):
        # Check if both inputs have the same shape
        if not x.shape == y.shape:
            raise ValueError("Found inputs with different shapes")

        # Iterate over columns
        n_columns = x.shape[1]
        for i in range(n_columns):
            xc = x.iloc[:, i].tolist()
            yc = y.iloc[:, i].tolist()

            # Iterate over rows, checking if individual cells are equal
            for xci, yci in zip(xc, yc):
                func(xci, yci, **kwargs)
Beispiel #10
0
def from_nested_to_multi_index(X, instance_index=None, time_index=None):
    """Convert nested pandas Panel to multi-index pandas Panel.

    Converts nested pandas DataFrame (with time series as pandas Series
    or NumPy array in cells) into multi-indexed pandas DataFrame.

    Can convert mixed nested and primitive DataFrame to multi-index DataFrame.

    Parameters
    ----------
    X : pd.DataFrame
        The nested DataFrame to convert to a multi-indexed pandas DataFrame.
        The Series stored in its cells are not modified.

    instance_index : str, optional (default=None)
        Name of the multi-index level corresponding to the DataFrame's
        instances. If it matches a level name of ``X.index``, the instances
        are read from that level; otherwise the last index level is used.
        If None, the output level is named "instance".

    time_index : str, optional (default=None)
        Name of multi-index level corresponding to DataFrame's timepoints.
        If None, the level is named "timepoints".

    Returns
    -------
    X_mi : pd.DataFrame
        The multi-indexed pandas DataFrame

    Raises
    ------
    ValueError
        If ``is_nested_dataframe(X)`` is False.
    """
    if not is_nested_dataframe(X):
        raise ValueError("Input DataFrame is not a nested DataFrame")

    time_index_name = "timepoints" if time_index is None else time_index
    instance_index_name = "instance" if instance_index is None else instance_index

    nested_col_mask = [*are_columns_nested(X)]

    # Use the requested index level when it exists, else the last level.
    if instance_index is not None and instance_index in X.index.names:
        instance_idxs = X.index.get_level_values(instance_index).unique()
    else:
        instance_idxs = X.index.get_level_values(-1).unique()

    instances = []
    for instance_idx in instance_idxs:
        # Look the row up once; ``Series.items`` replaces ``iteritems``,
        # which was removed in pandas 2.0.
        row = X.loc[instance_idx, :]

        column_frames = []
        for colname, cell in row.items():
            if isinstance(cell, pd.Series):
                # A named Series would be matched against ``columns`` by the
                # DataFrame constructor, so the name has to be cleared. Do it
                # on a shallow copy so the caller's cell is left untouched.
                cell = cell.copy(deep=False)
                cell.name = None
            column_frames.append(pd.DataFrame(cell, columns=[colname]))

        instance = pd.concat(column_frames, axis=1)

        # For primitive (non-nested column) assume the same
        # primitive value applies to every timepoint of the instance
        for col_idx, is_nested in enumerate(nested_col_mask):
            if not is_nested:
                instance.iloc[:, col_idx] = instance.iloc[:, col_idx].ffill()

        # Correctly assign multi-index
        instance.index = pd.MultiIndex.from_product(
            [[instance_idx], instance.index],
            names=[instance_index_name, time_index_name],
        )
        instances.append(instance)

    X_mi = pd.concat(instances)
    X_mi.columns = X.columns

    return X_mi
Beispiel #11
0
def check_X(
    X,
    enforce_univariate=False,
    enforce_min_instances=1,
    enforce_min_columns=1,
    coerce_to_numpy=False,
    coerce_to_pandas=False,
):
    """Validate panel input data and optionally convert its container.

    Parameters
    ----------
    X : pd.DataFrame or np.array
        Input data
    enforce_univariate : bool, optional (default=False)
        If True, raise when X has more than one column.
    enforce_min_instances : int, optional (default=1)
        Minimum number of instances; only checked when greater than 0.
    enforce_min_columns : int, optional (default=1)
        Minimum number of columns (or time-series variables).
    coerce_to_numpy : bool, optional (default=False)
        If True, a nested pandas DataFrame is converted to a 3-dimensional
        numpy array.
    coerce_to_pandas : bool, optional (default=False)
        If True, a 3-dimensional numpy array is converted to a nested
        pandas DataFrame.

    Returns
    -------
    X : pd.DataFrame or np.array
        Checked and possibly converted input data

    Raises
    ------
    ValueError
        If both coercion flags are set, or if X is invalid input data
    """
    # The two coercion targets are mutually exclusive.
    if coerce_to_numpy and coerce_to_pandas:
        raise ValueError(
            "`coerce_to_pandas` and `coerce_to_numpy` cannot both be set to True"
        )

    if not isinstance(X, VALID_X_TYPES):
        raise ValueError(
            f"X must be a pd.DataFrame or a np.array, but found: {type(X)}"
        )

    # Arrays: verify the number of dimensions first, since the column count
    # below is read from the second dimension.
    if isinstance(X, np.ndarray):
        if X.ndim != 3:
            raise ValueError(
                "If passed as a np.array, X must be a 3-dimensional "
                f"array, but found shape: {X.shape}"
            )
        if coerce_to_pandas:
            X = from_3d_numpy_to_nested(X)

    # Column-count constraints.
    n_columns = X.shape[1]
    if n_columns < enforce_min_columns:
        raise ValueError(
            f"X must contain at least: {enforce_min_columns} columns, "
            f"but found only: {n_columns}."
        )
    if enforce_univariate and n_columns > 1:
        raise ValueError(
            "X must be univariate with X.shape[1] == 1, but found: "
            f"X.shape[1] == {n_columns}."
        )

    # Instance-count constraint.
    if enforce_min_instances > 0:
        _enforce_min_instances(X, min_instances=enforce_min_instances)

    # DataFrames (including arrays coerced above) must be nested.
    if isinstance(X, pd.DataFrame):
        if not is_nested_dataframe(X):
            raise ValueError(
                "If passed as a pd.DataFrame, X must be a nested "
                "pd.DataFrame, with pd.Series or np.arrays inside cells."
            )
        if coerce_to_numpy:
            X = from_nested_to_3d_numpy(X)

    return X