def test_get_cover_array_and_size(self):
    """Check the size reported by ``ps.get_cover_array_and_size``.

    Four kinds of subgroup argument are exercised: a selector evaluated
    against ``self.data``, an unbounded slice, a bounded slice, and an
    integer index array passed on its own.
    """
    n_rows = len(self.data)

    # Selector: no explicit length is given, only the data.
    selector = ps.EqualitySelector('checking_status', b'no checking')
    _cover, n_covered = ps.get_cover_array_and_size(selector, None, self.data)
    self.assertEqual(n_covered, 394)

    # Unbounded slice: only the length is given, no data.
    _cover, n_covered = ps.get_cover_array_and_size(slice(None), n_rows, None)
    self.assertEqual(n_covered, n_rows)

    # Bounded slice covering the first ten positions.
    _cover, n_covered = ps.get_cover_array_and_size(slice(0, 10), n_rows)
    self.assertEqual(n_covered, 10)

    # Integer index array: neither length nor data is passed.
    index_array = np.array([1, 3, 5, 7, 11], dtype=int)
    _cover, n_covered = ps.get_cover_array_and_size(index_array)
    self.assertEqual(n_covered, 5)
def calculate_statistics(self, subgroup, data, cached_statistics=None):
    """Compute size, label-count and metric statistics for a subgroup.

    Args:
        subgroup: Subgroup representation accepted by
            ``ps.get_cover_array_and_size``.
        data: Dataset; must support ``len()`` and expose ``.shape``.
        cached_statistics: Optional dict of earlier results. If it already
            holds every key in ``PredictionTarget.statistic_types`` it is
            returned untouched; if it is a dict with keys missing it is
            filled in place; any non-dict value is ignored.

    Returns:
        dict: The statistics, keyed by ``size_*``, ``pos_*``, ``neg_*``,
        ``metric_*`` and one ``<key>_sg`` / ``<key>_dataset`` pair per entry
        of ``self.eval_dict`` (when that is not ``None``).
    """
    # ``None`` is not a dict, so a single isinstance check covers both cases.
    if not isinstance(cached_statistics, dict):
        statistics = {}
    elif all(k in cached_statistics for k in PredictionTarget.statistic_types):
        return cached_statistics
    else:
        statistics = cached_statistics

    cover_arr, size_sg = ps.get_cover_array_and_size(subgroup, len(data), data)

    # Restrict to the subgroup once and reuse the result for every metric
    # instead of re-indexing for each statistic.
    sg_target = self.target_variable[cover_arr]
    sg_estimate = self.target_estimate[cover_arr]

    statistics['size_sg'] = size_sg
    statistics['size_dataset'] = data.shape[0]
    # pos/neg are sums of the target and of (1 - target);
    # presumably the target is 0/1 encoded -- TODO confirm.
    statistics['pos_sg'] = sg_target.sum()
    statistics['pos_dataset'] = self.target_variable.sum()
    statistics['neg_sg'] = (1 - sg_target).sum()
    statistics['neg_dataset'] = (1 - self.target_variable).sum()
    statistics['metric_sg'] = self.evaluation_metric(sg_target, sg_estimate)
    statistics['metric_dataset'] = self.evaluation_metric(
        self.target_variable, self.target_estimate)

    if self.eval_dict is not None:
        for key, metric in self.eval_dict.items():
            statistics[key + "_sg"] = metric(sg_target, sg_estimate)
            statistics[key + "_dataset"] = metric(
                self.target_variable, self.target_estimate)
    return statistics
def calculate_statistics(self, subgroup, data, cached_statistics=None):
    """Compute descriptive statistics of the numeric target for a subgroup.

    Args:
        subgroup: Subgroup representation accepted by
            ``ps.get_cover_array_and_size``.
        data: Dataset indexable by ``self.target_variable``; the selected
            column must provide ``.to_numpy()``.
        cached_statistics: Optional dict of earlier results. If it already
            holds every key in ``NumericTarget.statistic_types`` it is
            returned untouched; if it is a dict with keys missing it is
            filled in place; any non-dict value is ignored.

    Returns:
        dict: size, mean, std, median, max and min for the subgroup and the
        whole dataset, plus ``mean_lift`` and ``median_lift`` ratios.
    """
    # ``None`` is not a dict, so a single isinstance check covers both cases.
    if not isinstance(cached_statistics, dict):
        statistics = {}
    elif all(k in cached_statistics for k in NumericTarget.statistic_types):
        return cached_statistics
    else:
        statistics = cached_statistics

    def _reduce_or_nan(reducer, values):
        # np.max / np.min raise ValueError on zero-size input, whereas
        # np.mean / np.std / np.median yield NaN. Report NaN here too so an
        # empty subgroup does not abort the whole computation.
        return reducer(values) if len(values) else np.nan

    cover_arr, _ = ps.get_cover_array_and_size(subgroup, len(data), data)
    all_target_values = data[self.target_variable].to_numpy()
    sg_target_values = all_target_values[cover_arr]

    statistics['size_sg'] = len(sg_target_values)
    statistics['size_dataset'] = len(data)
    statistics['mean_sg'] = np.mean(sg_target_values)
    statistics['mean_dataset'] = np.mean(all_target_values)
    statistics['std_sg'] = np.std(sg_target_values)
    statistics['std_dataset'] = np.std(all_target_values)
    statistics['median_sg'] = np.median(sg_target_values)
    statistics['median_dataset'] = np.median(all_target_values)
    statistics['max_sg'] = _reduce_or_nan(np.max, sg_target_values)
    statistics['max_dataset'] = _reduce_or_nan(np.max, all_target_values)
    statistics['min_sg'] = _reduce_or_nan(np.min, sg_target_values)
    statistics['min_dataset'] = _reduce_or_nan(np.min, all_target_values)
    statistics['mean_lift'] = statistics['mean_sg'] / statistics['mean_dataset']
    statistics['median_lift'] = statistics['median_sg'] / statistics['median_dataset']
    return statistics
def calculate_statistics(self, subgroup, data):
    """Return the subgroup size and the dataset size.

    Args:
        subgroup: Subgroup representation accepted by
            ``ps.get_cover_array_and_size``.
        data: Dataset supporting ``len()``.

    Returns:
        dict: ``{'size_sg': ..., 'size_dataset': ...}``.
    """
    n_total = len(data)
    _cover, n_covered = ps.get_cover_array_and_size(subgroup, n_total, data)
    return {'size_sg': n_covered, 'size_dataset': n_total}
def calculate_statistics(self, subgroup_description, target, data, statistics=None):
    """Wrap the number of covered instances in a ``SimpleCountQF.tpl``.

    ``target`` and ``statistics`` are accepted but not used here.
    """
    _cover, n_covered = ps.get_cover_array_and_size(
        subgroup_description, self.size_dataset, data)
    count_stats = SimpleCountQF.tpl(n_covered)
    return count_stats
def get_base_statistics(self, subgroup, data):
    """Return sizes and evaluation-metric values for a subgroup and the dataset.

    Args:
        subgroup: Subgroup representation accepted by
            ``ps.get_cover_array_and_size``.
        data: Dataset; must support ``len()`` and expose ``.shape``.

    Returns:
        tuple: ``(size_sg, size_dataset, metric_sg, metric_dataset)``.
    """
    # Pass the dataset length and the data itself: without them the cover
    # cannot be resolved for subgroups given as slices or as selectors,
    # only for ready-made index arrays.
    cover_arr, size_sg = ps.get_cover_array_and_size(subgroup, len(data), data)
    size_dataset = data.shape[0]
    metric_sg = self.evaluation_metric(
        self.target_variable[cover_arr], self.target_estimate[cover_arr])
    metric_dataset = self.evaluation_metric(
        self.target_variable, self.target_estimate)
    return (size_sg, size_dataset, metric_sg, metric_dataset)
def fit(self, subgroup, data=None):
    """Fit a polynomial of degree ``self.degree`` on the covered points.

    Returns:
        A ``beta_tuple`` of the coefficient array and the subgroup size.
        When the subgroup has at most ``self.degree + 1`` instances the
        coefficients are all NaN and ``np.polyfit`` is not called.
    """
    cover_arr, size = ps.get_cover_array_and_size(subgroup, len(self.x), data)
    n_coefficients = self.degree + 1
    if size > n_coefficients:
        coefficients = np.polyfit(
            self.x[cover_arr], self.y[cover_arr], deg=self.degree)
    else:
        coefficients = np.full(n_coefficients, np.nan)
    return beta_tuple(coefficients, size)
def get_base_statistics(self, subgroup, data):
    """Count instances and positives for the dataset and for a subgroup.

    Positives are taken from ``self.covers(data)``.

    Returns:
        tuple: ``(instances_dataset, positives_dataset,
        instances_subgroup, positives_subgroup)``.
    """
    cover_arr, n_subgroup = ps.get_cover_array_and_size(
        subgroup, len(data), data)
    positive_mask = self.covers(data)
    n_positive_total = np.sum(positive_mask)
    n_total = len(data)
    n_positive_subgroup = np.sum(positive_mask[cover_arr])
    return n_total, n_positive_total, n_subgroup, n_positive_subgroup
def covers(self, instance):
    """Return a boolean mask marking what ``self.subgroup_description`` covers.

    Args:
        instance: Data passed on to ``ps.get_cover_array_and_size``; must
            support ``len()``.

    Returns:
        numpy.ndarray: Boolean array. If the cover is already a boolean
        ``np.ndarray`` it is returned as-is (no copy); any other cover
        (slice, integer index array, ...) is expanded into a fresh mask of
        length ``len(instance)``.
    """
    cover_arr, _ = ps.get_cover_array_and_size(
        self.subgroup_description, len(instance), instance)
    # The previous check used ``type(np.array)``, which is the type of the
    # ``np.array`` factory *function*, so it never matched an array and the
    # pass-through branch was unreachable. Test for a boolean ndarray
    # explicitly; integer index arrays must still be converted below.
    if isinstance(cover_arr, np.ndarray) and cover_arr.dtype == np.bool_:
        return cover_arr
    mask = np.zeros(len(instance), dtype=bool)
    mask[cover_arr] = True
    return mask
def get_base_statistics(self, subgroup, data):
    """Return dataset/subgroup sizes together with the target means.

    Returns:
        tuple: ``(instances_dataset, mean_dataset, instances_subgroup,
        mean_sg)``.
    """
    cover_arr, n_subgroup = ps.get_cover_array_and_size(
        subgroup, len(data), data)
    target_column = data[self.target_variable]
    # NOTE(review): the column is indexed directly with the cover; if it is
    # a pandas Series and the cover is an integer array, lookup would be
    # label-based rather than positional -- confirm against callers.
    covered_targets = target_column[cover_arr]
    n_total = len(data)
    subgroup_mean = np.mean(covered_targets)
    overall_mean = np.mean(target_column)
    return (n_total, overall_mean, n_subgroup, subgroup_mean)
def calculate_statistics(self, subgroup, target, data, statistics=None):
    """Compute size, median and optimistic estimate for a subgroup.

    ``target`` and ``statistics`` are accepted but not used here.

    Returns:
        ``StandardQFNumericMedian.tpl(size, median, estimate)``. For a
        subgroup with no instances the median is ``np.array([0])`` and the
        estimate is ``-inf``.
    """
    cover_arr, sg_size = ps.get_cover_array_and_size(
        subgroup, len(self.all_target_values), data)

    if sg_size > 0:
        covered_values = self.all_target_values[cover_arr]
        median = np.median(covered_values)
        estimate = self.estimator.get_estimate(
            subgroup, sg_size, median, cover_arr, covered_values)
        return StandardQFNumericMedian.tpl(sg_size, median, estimate)

    # Nothing covered: fall back to the placeholder median and -inf estimate.
    return StandardQFNumericMedian.tpl(sg_size, np.array([0]), float('-inf'))
def calculate_statistics(self, subgroup, target, data, statistics=None):
    """Compute size, metric value and optimistic estimate for a subgroup.

    ``statistics`` is accepted but not used here.

    Returns:
        ``PredictionQFNumeric.tpl(size, metric, estimate)``. When the
        subgroup is empty or its target values have zero standard
        deviation, the metric is ``0`` and the estimate is ``-inf``.
    """
    cover_arr, sg_size = ps.get_cover_array_and_size(
        subgroup, len(self.all_target_variable), data)

    is_evaluable = (
        sg_size > 0 and np.std(self.all_target_variable[cover_arr]) != 0
    )
    if not is_evaluable:
        # The metric is reported as 0 (not -inf) in the degenerate case.
        return PredictionQFNumeric.tpl(sg_size, 0, float('-inf'))

    covered_variable = self.all_target_variable[cover_arr]
    covered_estimate = self.all_target_estimate[cover_arr]
    estimate = self.estimator.get_estimate(sg_size, self.a)
    metric_value = target.evaluation_metric(covered_variable, covered_estimate)
    return PredictionQFNumeric.tpl(sg_size, metric_value, estimate)
def calculate_statistics(self, subgroup_description, data, cached_statistics=None):
    """Return subgroup and dataset sizes, reusing cached values if complete.

    Args:
        subgroup_description: Subgroup representation accepted by
            ``ps.get_cover_array_and_size``.
        data: Dataset supporting ``len()``.
        cached_statistics: Optional dict of earlier results. Returned
            untouched when it already holds every key in
            ``FITarget.statistic_types``; otherwise, if it is a dict, it is
            filled in place. ``None`` and non-dict values are ignored.

    Returns:
        dict: Contains at least ``size_sg`` and ``size_dataset``.
    """
    if isinstance(cached_statistics, dict):
        if all(name in cached_statistics for name in FITarget.statistic_types):
            return cached_statistics
        result = cached_statistics
    else:
        result = dict()

    n_total = len(data)
    _cover, n_covered = ps.get_cover_array_and_size(
        subgroup_description, n_total, data)
    result['size_sg'] = n_covered
    result['size_dataset'] = n_total
    return result
def calculate_statistics(self, subgroup, data):
    """Compute descriptive statistics of the numeric target for a subgroup.

    Args:
        subgroup: Subgroup representation accepted by
            ``ps.get_cover_array_and_size``.
        data: Dataset indexable by ``self.target_variable``; the selected
            column must provide ``.to_numpy()``.

    Returns:
        dict: size, mean, std, median, max and min for the subgroup and the
        whole dataset, plus ``mean_lift`` and ``median_lift`` ratios.
    """
    cover_arr, _ = ps.get_cover_array_and_size(subgroup, len(data), data)
    all_target_values = data[self.target_variable].to_numpy()
    sg_target_values = all_target_values[cover_arr]

    # np.max / np.min raise ValueError on zero-size input, while the mean,
    # std and median reductions return NaN. Use NaN for the extremes as
    # well so that a subgroup covering nothing still yields a result.
    sg_empty = len(sg_target_values) == 0
    dataset_empty = len(all_target_values) == 0

    statistics = {}
    statistics['size_sg'] = len(sg_target_values)
    statistics['size_dataset'] = len(data)
    statistics['mean_sg'] = np.mean(sg_target_values)
    statistics['mean_dataset'] = np.mean(all_target_values)
    statistics['std_sg'] = np.std(sg_target_values)
    statistics['std_dataset'] = np.std(all_target_values)
    statistics['median_sg'] = np.median(sg_target_values)
    statistics['median_dataset'] = np.median(all_target_values)
    statistics['max_sg'] = np.nan if sg_empty else np.max(sg_target_values)
    statistics['max_dataset'] = (
        np.nan if dataset_empty else np.max(all_target_values))
    statistics['min_sg'] = np.nan if sg_empty else np.min(sg_target_values)
    statistics['min_dataset'] = (
        np.nan if dataset_empty else np.min(all_target_values))
    statistics['mean_lift'] = (
        statistics['mean_sg'] / statistics['mean_dataset'])
    statistics['median_lift'] = (
        statistics['median_sg'] / statistics['median_dataset'])
    return statistics
def calculate_statistics(self, subgroup, data=None):
    """Fit ``self.model`` on the subgroup's cover and package the result.

    Returns:
        Whatever ``self.get_tuple`` builds from the subgroup size, the
        fitted parameters and the cover array.
    """
    cover_arr, n_covered = ps.get_cover_array_and_size(
        subgroup, self.data_size, data)
    fitted_params = self.model.fit(cover_arr, data)
    return self.get_tuple(n_covered, fitted_params, cover_arr)
def get_base_statistics(self, subgroup, data):
    """Return the number of instances covered by ``subgroup`` in ``data``."""
    _cover, n_covered = ps.get_cover_array_and_size(subgroup, len(data), data)
    return n_covered
def calculate_statistics(self, subgroup, data=None):
    """Return the subgroup size and its count of non-zero positives.

    Returns:
        ``SimplePositivesQF.tpl(size, positives)`` where ``positives`` is
        the number of non-zero entries of ``self.positives`` within the
        subgroup's cover.
    """
    cover_arr, n_covered = ps.get_cover_array_and_size(
        subgroup, len(self.positives), data)
    n_positive = np.count_nonzero(self.positives[cover_arr])
    return SimplePositivesQF.tpl(n_covered, n_positive)