def convert_remaining_groups_to_rules(data):
    """Convert every partition variable still present in `data` into rule form.

    Each remaining partition is converted via `convert_group_to_rules`,
    using that variable's modal value as the baseline label.

    Returns the updated data dict (validated with `check_data`).
    """
    for group_name in list(data['partitions']):
        # the most common label for this group becomes the baseline
        baseline = get_variable_mode(data, group_name)
        data = convert_group_to_rules(data, group_name, baseline_labels = [baseline])
    assert check_data(data)
    return data
def convert_group_to_rules(data, name, baseline_labels = None):
    """Convert the categorical variable `name` into one-rule-per-label form.

    Builds a conversion dict mapping each distinct label of the column to
    itself, drops any labels listed in `baseline_labels` (so no rule is
    created for them), and hands off to `convert_categorical_to_rules`.

    Returns the updated data dict (validated with `check_data`).
    """
    assert data['variable_types'][name] == 'categorical'

    # distinct labels observed in the column for this variable
    column = data['variable_names'].index(name)
    distinct_labels = np.unique(data['X'][:, column])
    conversion_dict = {label: [label] for label in distinct_labels}

    if baseline_labels is not None:
        # accept a scalar baseline as well as a list of baselines
        if type(baseline_labels) is not list:
            baseline_labels = [baseline_labels]
        # baseline labels get no rule; KeyError flags an unknown label
        for label in baseline_labels:
            conversion_dict.pop(label)

    data = convert_categorical_to_rules(data, name, conversion_dict, prepend_name = True)
    assert check_data(data)
    return data
def check_groups(groups, data = None):
    """Validate the structure and internal consistency of a `groups` dict.

    Asserts that every group's info passes `check_group_info` and that all
    groups agree on the number of training / validation / test samples.
    When `data` is supplied, also asserts those counts match the data
    matrices and that no group name collides with an existing variable.

    Returns True (all failures raise via assert).
    """
    assert type(groups) is dict
    if not groups:
        return True

    names = list(groups.keys())
    first = groups[names[0]]
    n_train = len(first['indices'])
    n_valid = len(first['indices_validation'])
    n_test = len(first['indices_test'])

    # every group must report the same sample counts in each split
    for info in groups.values():
        assert check_group_info(info)
        assert len(info['indices']) == n_train
        assert len(info['indices_test']) == n_test
        assert len(info['indices_validation']) == n_valid

    if data is not None:
        assert check_data(data)
        assert data['X'].shape[0] == n_train
        if n_test > 0:
            assert has_test_set(data)
            assert data['X_test'].shape[0] == n_test
        if n_valid > 0:
            assert has_validation_set(data)
            assert data['X_validation'].shape[0] == n_valid
        # group names must not shadow existing variables or partitions
        for name in names:
            assert name not in data['variable_names']
            assert name not in data['variable_types']
            assert name not in data['variable_orderings']
            assert name not in data['partitions']

    return True
def __init__(self, data, groups, pooled_model, decoupled_models, groups_to_models):
    """Initialize from training data, group metadata, and model assignments.

    Parameters
    ----------
    data : dict with 'X', 'Y', 'variable_names' (validated by `check_data`)
    groups : group metadata dict (validated by `check_groups`)
    pooled_model : model fit on all samples
    decoupled_models : per-group models; `len(self)` reflects their count
    groups_to_models : dict mapping each training split — a tuple of
        (group_name, group_value) pairs — to a model index. Every split in
        the training data must be mapped, and every model index must cover
        at least one split.
    """
    # check inputs
    assert check_data(data, ready_for_training = True)
    assert check_groups(groups, data)

    # initialize data (copied so later mutation of inputs cannot leak in)
    self._data = {
        'X': np.array(data['X']),
        'Y': np.array(data['Y']),
        'variable_names': list(data['variable_names']),
        }
    self._groups = deepcopy(groups)
    self._pooled_model = pooled_model
    self._decoupled_models = decoupled_models

    # enumerate the distinct group-value combinations seen in training
    group_names, group_values = groups_to_group_data(groups)
    training_values = np.unique(group_values, axis = 0).tolist()
    training_splits = [tuple(zip(group_names, v)) for v in training_values]

    assert isinstance(groups_to_models, dict)
    # fixed garbled message ("should include map every group")
    assert set(training_splits) == set(groups_to_models.keys()), \
        'mapper should map every group in the training data'
    assignment_idx = np.array(list(groups_to_models.values()))
    assert np.array_equal(np.unique(assignment_idx), np.arange(len(self))), \
        'every model should cover at least one group'

    # build the inverse map: model index -> list of group-value vectors
    models_to_groups = {k: [] for k in range(len(self))}
    for group_tuple, model_index in groups_to_models.items():
        group_value = [s[1] for s in group_tuple]
        assert len(group_value) == len(group_names)
        models_to_groups[model_index].append(group_value)

    self._splits = training_splits
    self.groups_to_models = groups_to_models
    self.models_to_groups = models_to_groups
def oversample_by_group(data, **kwargs):
    """Oversample labels within each intersectional group profile.

    Splits group columns out of `data`, assigns each distinct combination
    of group attributes a profile id, and runs `RandomOverSampler` on each
    profile that contains both +1 and -1 labels (profiles missing a class
    are kept as-is with a warning). The oversampled groups are then added
    back to `data` as categorical partition variables and sample weights
    are reset to 1.

    Parameters
    ----------
    data : data dict; 'Y' is assumed to use +/-1 labels (asserted below)
    **kwargs : forwarded to `imblearn.over_sampling.RandomOverSampler`

    Returns the updated data dict (validated with `check_data`).
    """
    data, groups = split_groups_from_data(data)

    # get names / labels / values for each group attribute
    group_names = []
    group_labels = []
    group_values = []
    for n, g in groups.items():
        group_names.append(n)
        group_values.append(g['indices'])
        group_labels.append(g['labels'][g['indices']])
    group_values = np.transpose(np.vstack(group_values))
    group_labels = np.transpose(np.vstack(group_labels))

    # unique profile id for each combination of group attributes
    _, profile_idx = np.unique(group_values, axis = 0, return_inverse = True)
    profile_labels = range(0, np.max(profile_idx) + 1)

    # oversample labels within each profile
    ros = RandomOverSampler(**kwargs)
    X = np.array(data['X'])
    Y = np.array(data['Y'])
    X_res = []
    Y_res = []
    G_res = []
    assert np.isin((-1, 1), Y).all()
    for i in profile_labels:
        row_idx = np.isin(profile_idx, i)
        profile_values = group_labels[row_idx, :][0]
        Xg = X[row_idx, :]
        Yg = Y[row_idx]
        if np.isin((-1, 1), Yg).all():
            # bug fix: fit_sample was removed in imbalanced-learn 0.8;
            # the supported API is fit_resample (available since 0.4)
            Xs, Ys = ros.fit_resample(Xg, Yg)
            X_res.append(Xs)
            Y_res.append(Ys)
            G_res.append(np.tile(profile_values, (len(Ys), 1)))
        else:
            # cannot oversample a one-class profile; keep it unchanged
            profile_name = ''.join(['%s' % s for s in profile_values])
            warnings.warn('missing + and - labels for group %s' % profile_name)
            X_res.append(Xg)
            Y_res.append(Yg)
            G_res.append(np.tile(profile_values, (len(Yg), 1)))

    G_res = np.vstack(G_res)
    X_res = np.vstack(X_res)
    Y_res = np.concatenate(Y_res)
    data['X'] = X_res
    data['Y'] = Y_res
    # oversampling invalidates any prior weighting; reset to uniform
    data['sample_weights'] = np.ones_like(data['Y'], dtype = float)

    # reattach the (now oversampled) group columns as partition variables
    for j, name in enumerate(group_names):
        data = add_variable(data, name = name, variable_type = 'categorical',
                            is_partition = True, values = G_res[:, j])

    assert check_data(data)
    return data