from typing import List

import numpy as np
from numpy import inf
from mdlp.discretization import MDLP  # mdlp-discretization package


def discretizer2json(discretizer: MDLP, data=None) -> List[dict]:
    """Convert a fitted MDLP discretizer into a JSON-serializable list of dicts."""
    cut_points = discretizer.cut_points_  # type: list
    category_intervals = [None] * len(cut_points)
    # Convert numpy arrays to plain lists so the result can be serialized as JSON.
    cut_points = [
        None if cut_point is None else list(cut_point)
        for cut_point in cut_points
    ]
    maxs = discretizer.maxs_
    mins = discretizer.mins_
    for i, _cut_points in enumerate(cut_points):
        if _cut_points is None:
            continue
        # One category per interval: len(_cut_points) boundaries yield len + 1 bins.
        cats = np.arange(len(_cut_points) + 1)
        # Represent unbounded interval ends as None instead of +/-inf.
        intervals = [
            [None if low == -inf else low, None if high == inf else high]
            for low, high in discretizer.cat2intervals(cats, i)
        ]
        category_intervals[i] = intervals
    return [
        {
            'cutPoints': cut_points[i],
            'intervals': category_intervals[i],
            'max': maxs[i],
            'min': mins[i],
        }
        for i in range(len(cut_points))
    ]
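
# Usage sketch for discretizer2json: a toy, hypothetical example (the data is
# made up) assuming the mdlp-discretization package, whose MDLP follows the
# scikit-learn convention that fit() returns the fitted transformer.
import json

X = np.array([[1.0], [2.0], [3.0], [10.0], [11.0], [12.0]])
y = np.array([0, 0, 0, 1, 1, 1])
fitted = MDLP().fit(X, y)
print(json.dumps(discretizer2json(fitted), indent=2))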

import numpy as np
from lime.discretize import BaseDiscretizer  # LIME's discretizer base class
from mdlp.discretization import MDLP


class MDLPDiscretizer(BaseDiscretizer):
    def __init__(self, data, categorical_features, feature_names,
                 labels=None, random_state=None):
        if labels is None:
            raise ValueError('labels must not be None when using MDLPDiscretizer')
        BaseDiscretizer.__init__(self, data, categorical_features, feature_names,
                                 labels=labels, random_state=random_state)

    def bins(self, data, labels):
        # Fit MDLP, then recover the finite bin boundaries of each feature column.
        self.transformer = MDLP()
        discretized_data = self.transformer.fit_transform(data, labels)
        bins = []
        for feature in range(discretized_data.shape[1]):
            intervals = set(self.transformer.cat2intervals(discretized_data, feature))
            boundaries = set()
            for low, high in intervals:
                boundaries.add(low)
                boundaries.add(high)
            # Open-ended intervals contribute +/-inf, which are not real cut points.
            boundaries.discard(float('inf'))
            boundaries.discard(float('-inf'))
            bins.append(np.array(sorted(boundaries)))
        return bins
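
# Usage sketch (toy data, hypothetical feature names): LIME's BaseDiscretizer
# is assumed to call self.bins(data, labels) from its constructor, so simply
# instantiating the class runs the MDLP discretization.
rng = np.random.RandomState(0)
data = rng.rand(200, 3)
labels = (data[:, 0] > 0.5).astype(int)
discretizer = MDLPDiscretizer(data, categorical_features=[],
                              feature_names=['f0', 'f1', 'f2'], labels=labels)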

# Method of a tree-growing class (fragment); relies on numpy as np, pandas
# DataFrames, and MDLP from the mdlp-discretization package.
def grow(self, data, t_id, level, cur_performance):
    """
    Recursively grow tree `t_id` by one more level on the remaining data.

    :param data: current data for further tree growth
    :param t_id: tree id
    :param level: current depth in the tree
    :param cur_performance: metrics accumulated on the path so far
    :return: None
    """
    if level >= self.max_depth:
        return
    if len(data) == 0:
        print("Warning: no data left; stopping tree growth early.")
        return
    self.tree_depths[t_id] = level
    decision = self.structures[t_id][level]
    structure = tuple(self.structures[t_id][:level + 1])
    cur_selected = self.computed_cache.get(structure, None)
    Y = data[[self.target]].values
    if not cur_selected:
        for cue in list(data):
            if cue in self.ignore or cue == self.target:
                continue
            if self.split_method == "MDLP":
                mdlp = MDLP()
                X = data[[cue]].values
                X_disc = mdlp.fit_transform(X, Y)
                X_interval = np.asarray(mdlp.cat2intervals(X_disc, 0))
                bins = np.unique(X_disc, axis=0)
                if len(bins) <= 1:
                    # MDLP returned the whole range as one bin; fall back to the median.
                    threshold = data[cue].median()
                    for direction in "><":
                        cur_selected = self.eval_point_split(
                            level, cur_selected, cur_performance, data, cue,
                            direction, threshold, decision)
                    continue
                for bin_id in bins:
                    indexes = np.where(X_disc == bin_id)[0]
                    interval = X_interval[indexes]
                    try:
                        # Sanity check: every row in one bin should share one interval.
                        if len(np.unique(interval, axis=0)) != 1:
                            print("Warning: rows in the same MDLP bin map to "
                                  "different intervals.")
                    except Exception:
                        pass  # the check is advisory only
                    interval = interval[0]
                    if interval[0] == float('-inf'):
                        # Left-open interval: split on its upper bound.
                        threshold = interval[1]
                        for direction in "><":
                            cur_selected = self.eval_point_split(
                                level, cur_selected, cur_performance, data, cue,
                                direction, threshold, decision)
                    elif interval[1] == float('inf'):
                        # Right-open interval: split on its lower bound.
                        threshold = interval[0]
                        for direction in "><":
                            cur_selected = self.eval_point_split(
                                level, cur_selected, cur_performance, data, cue,
                                direction, threshold, decision)
                    else:
                        # Bounded interval: evaluate a range split over its rows.
                        cur_selected = self.eval_range_split(
                            level, cur_selected, cur_performance, data, cue,
                            indexes, interval, decision)
                continue
            elif self.split_method == "percentile":
                thresholds = set(data[cue].quantile(
                    [x / 20.0 for x in range(1, 20)], interpolation='midpoint'))
            else:
                thresholds = [data[cue].median()]
            # Point splits, e.g. the median or 5%-step percentiles.
            for threshold in thresholds:
                for direction in "><":
                    cur_selected = self.eval_point_split(
                        level, cur_selected, cur_performance, data, cue,
                        direction, threshold, decision)
    self.computed_cache[structure] = cur_selected
    self.selected[t_id][level] = cur_selected['rule']
    self.performance_on_train[t_id][level] = (
        cur_selected['metrics'] + get_performance(cur_selected['metrics']))
    self.grow(cur_selected['undecided'], t_id, level + 1, cur_selected['metrics'])
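
# Minimal sketch of how the MDLP branch above turns bins into splits, assuming
# the mdlp-discretization package; the toy X/y below are made up. Open-ended
# intervals become point splits on their finite bound, bounded ones become
# range splits.
import numpy as np
from mdlp.discretization import MDLP

X = np.array([[3.0], [5.0], [7.0], [21.0], [25.0], [30.0]])
y = np.array([0, 0, 0, 1, 1, 1])
mdlp = MDLP()
X_disc = mdlp.fit_transform(X, y)
intervals = np.asarray(mdlp.cat2intervals(X_disc, 0))
for bin_id in np.unique(X_disc):
    low, high = intervals[np.where(X_disc == bin_id)[0][0]]
    if low == float('-inf'):
        print("point split at threshold", high)   # left-open bin
    elif high == float('inf'):
        print("point split at threshold", low)    # right-open bin
    else:
        print("range split over", (low, high))    # bounded bin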

import numpy as np
import pandas as pd
from mdlp.discretization import MDLP

train_raw = pd.read_csv("input/train.csv")
test_raw = pd.read_csv("input/test.csv")

# Drop NaNs; use only the Age feature itself to estimate bins.
train_sur_age = train_raw[['Survived', 'Age']].dropna(axis=0)
survived = train_sur_age['Survived'].values
age = train_sur_age['Age'].values.reshape(-1, 1)

# MDLP shuffles samples to break ties, so the resulting bins depend on
# random_state; repeat the discretization to estimate how stable they are.
n_bins = []
age_lim = []
n = 1000
for i in range(n):
    transformer = MDLP(random_state=i, continuous_features=None)
    age_dis = transformer.fit_transform(age, survived)
    age_bins = transformer.cat2intervals(age_dis, 0)
    n_bins.append(len(set(age_bins)))
    if len(set(age_bins)) == 2:
        age_lim.append(age_bins[0])
    elif len(set(age_bins)) > 2:
        print('\t ! more than two bins, n =', len(set(age_bins)))

print('* estimated N bins:', set(n_bins))
print('\t mean', np.mean(np.array(n_bins)))
print('* Age thresholds, frequencies')
# age_bins[0] is the interval of the first sample; its lower bound is the cut
# point when that sample falls in the upper bin, and -inf otherwise.
lim_val = np.array(age_lim)[:, 0]
sum_not_inf = 0
for val_i in set(lim_val):
    print('\t', val_i, sum(lim_val == val_i) / n)
    sum_not_inf += sum(lim_val == val_i)
print('\t', 'inf', (n - sum_not_inf) / n)
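
# A compact alternative to the manual frequency loop above, using
# collections.Counter on the same lim_val array (purely illustrative):
from collections import Counter

threshold_freq = Counter(lim_val)
for threshold, count in sorted(threshold_freq.items()):
    print('\t', threshold, count / n)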