def _select_columns(data, observation, variable_names=None, selected_variables=None):
    """
    Restrict the data and the reference observation to the selected columns.

    Columns are located by position: the position of each selected variable
    within `variable_names` is used to index both `data` and `observation`.
    When the selection cannot be resolved (no `selected_variables`, no
    `variable_names`, or a selected name that is missing from
    `variable_names`) all columns are returned and, in the latter two cases,
    a warning is logged.

    :param data: array or DataFrame with observations
    :param observation: reference observation for neighbours selection
    :param variable_names: names of all variables, in column order
    :param selected_variables: names of selected variables
    :return: DataFrame with observations and pandas Series with referenced observation, with selected columns
    """
    if selected_variables is None:
        return pd.DataFrame(data), transform_into_Series(observation)

    # Materialise the names as a list so that numpy arrays and pandas Index
    # objects (which have no list-style ``.index`` method) are accepted too.
    # A missing ``variable_names`` becomes an empty list and therefore takes
    # the same warning path as an unknown variable instead of raising
    # AttributeError.
    known_names = list(variable_names) if variable_names is not None else []
    try:
        indices = [known_names.index(var) for var in selected_variables]
    except ValueError:
        logging.warning("Selected variables: {} is not a subset of variables: {}".format(
            selected_variables, variable_names))
        return pd.DataFrame(data), transform_into_Series(observation)

    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)

    subset_data = data.iloc[:, indices].reset_index(drop=True)
    observation = transform_into_Series(observation)
    # ``indices`` are positions, so select positionally; plain ``[]`` performs
    # a label lookup, which only coincides with positions for a default index.
    # Assumes transform_into_Series returns a pandas Series, as the docstring
    # of this function states.
    return subset_data, observation.iloc[indices]
def select_neighbours(data,
                      observation,
                      y=None,
                      variable_names=None,
                      selected_variables=None,
                      dist_fun='gower',
                      n=20):
    """
    Select observations from dataset, that are similar to a given observation

    Distances are computed on the selected variables only, but the returned
    observations contain all variables. The `n` closest observations are
    returned in no particular order.

    :param data: array or DataFrame with observations
    :param observation: reference observation for neighbours selection
    :param y: labels for observations
    :param variable_names: names of variables
    :param selected_variables: selected variables - require supplying variable names along with data
    :param dist_fun: 'gower' or distance function, as pairwise distances in sklearn, gower works with missing data
    :param n: size of the sample; values larger than the data size are clipped with a warning
    :return: DataFrame with selected observations and pandas Series with corresponding labels if provided
    :raises ValueError: if `n` is negative or `dist_fun` is neither 'gower' nor a callable
    """
    # Convert first, so that the size checks below also work for inputs that
    # have no ``shape`` attribute (e.g. nested lists).
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)

    if n < 0:
        # A negative n would otherwise silently produce a wrong-sized,
        # arbitrary selection through negative slicing.
        raise ValueError("n has to be non-negative, got {}".format(n))

    if n > data.shape[0]:
        logging.warning("Given n ({}) is larger than data size ({})".format(
            n, data.shape[0]))
        n = data.shape[0]

    observation = transform_into_Series(observation)

    # columns are selected for the purpose of distance calculation
    selected_data, observation = _select_columns(data, observation,
                                                 variable_names,
                                                 selected_variables)

    if dist_fun == 'gower':
        distances = gower_distances(selected_data, observation)
    else:
        if not callable(dist_fun):
            raise ValueError('Distance has to be "gower" or a custom function')
        # pairwise-style call: one reference row against all data rows
        distances = dist_fun([observation], selected_data)[0]

    if n == 0:
        # Nothing to select; argpartition cannot be used here because
        # kth = -1 is out of bounds for an empty distance vector.
        indices = np.array([], dtype=int)
    else:
        # argpartition moves the n smallest distances to the front without
        # fully sorting them, so the neighbours are not ordered by distance.
        indices = np.argpartition(distances, n - 1)[:n]

    # selected points have all variables
    selected_points = data.iloc[indices].reset_index(drop=True)

    if y is None:
        return selected_points

    y = transform_into_Series(y)
    return selected_points, y.iloc[indices].reset_index(drop=True)
def select_sample(data, y=None, n=15, seed=42):
    """
    Select a random sample of rows (without replacement) from dataset.

    :param data: array or dataframe with observations
    :param y: labels for observations
    :param n: size of the sample; values larger than the data size are clipped with a warning
    :param seed: seed for random number generator
    :return: selected observations and corresponding labels if provided
    """
    # NOTE: this reseeds numpy's global random state on every call; it is kept
    # as is because callers may observe that side effect.
    np.random.seed(seed)

    n_rows = data.shape[0]
    if n > n_rows:
        logging.warning("Given n ({}) is larger than data size ({})".format(
            n, n_rows))
        n = n_rows
    indices = np.random.choice(n_rows, n, replace=False)

    if isinstance(data, pd.DataFrame):
        # non-mutating reset_index avoids modifying a slice in place
        sampled_x = data.iloc[indices].reset_index(drop=True)
    else:
        sampled_x = data[indices, :]

    if y is None:
        return sampled_x

    y = transform_into_Series(y)
    # ``indices`` are row positions, so the labels have to be taken
    # positionally; plain ``y[indices]`` would look them up by index label
    # and pick the wrong rows (or fail) for a non-default index.
    return sampled_x, y.iloc[indices].reset_index(drop=True)
# Example #4
# 0
def individual_variable_profile(explainer,
                                new_observation,
                                y=None,
                                variables=None,
                                grid_points=101,
                                variable_splits=None):
    """
    Calculate ceteris paribus profile

    :param explainer: a model to be explained
    :param new_observation: a new observation for which the profiles are calculated
    :param y: y true labels for `new_observation`. If specified then will be added to ceteris paribus plots
    :param variables: collection of variables selected for calculating profiles
    :param grid_points: number of points for profile
    :param variable_splits: dictionary of splits for variables, in most cases created with `_calculate_variable_splits()`. If None then it will be calculated based on validation data available in the `explainer`.
    :return: instance of CeterisParibus class
    :raises ValueError: if a DataFrame `new_observation` has a different number of columns than `explainer.var_names`
    """
    variables = _get_variables(variables, explainer)
    if not isinstance(new_observation, pd.DataFrame):
        new_observation = np.array(new_observation)
        if new_observation.ndim == 1:
            # make 1D array 2D
            new_observation = new_observation.reshape((1, -1))
        new_observation = pd.DataFrame(new_observation,
                                       columns=explainer.var_names)
    else:
        # Rename the columns on a copy so that the caller's DataFrame keeps
        # its own column labels.
        new_observation = new_observation.copy()
        try:
            new_observation.columns = explainer.var_names
        except ValueError as e:
            # keep the pandas error as the cause for easier debugging
            raise ValueError(
                "Mismatched number of variables {} instead of {}".format(
                    len(new_observation.columns),
                    len(explainer.var_names))) from e

    if y is not None:
        y = transform_into_Series(y)

    return CeterisParibus(explainer, new_observation, y, variables,
                          grid_points, variable_splits)
 def test_transform_into_Series_3(self):
     # A two-column DataFrame is expected to convert into something that
     # compares equal to the values of its first column ('a').
     columns = OrderedDict(zip(['a', 'b'], [[1, 2, 3], [4, 2, 1]]))
     frame = pd.DataFrame(columns)
     converted = transform_into_Series(frame)
     np.testing.assert_array_equal(converted, [1, 2, 3])
 def test_transform_into_Series_2(self):
     # A 1-D numpy array must keep its values and order after conversion.
     source = np.array([4, 1, 6])
     converted = transform_into_Series(source)
     np.testing.assert_array_equal(source, converted)
 def test_transform_into_Series_1(self):
     # A plain Python list must keep its values and order after conversion.
     source = [1, 4, 2]
     converted = transform_into_Series(source)
     np.testing.assert_array_equal(source, converted)