# Example #1
def discretizer2json(discretizer: MDLP, data=None) -> List[dict]:
    """Serialize a fitted MDLP discretizer into a JSON-friendly structure.

    Returns one dict per feature with keys 'cutPoints', 'intervals',
    'max', and 'min'.  Features the discretizer left un-cut keep None
    for both 'cutPoints' and 'intervals'.  Open-ended interval bounds
    (+/-inf) are encoded as None.
    """
    # Shallow copy of the per-feature cut points (None = not discretized).
    cut_points = list(discretizer.cut_points_)
    maxs = discretizer.maxs_
    mins = discretizer.mins_

    category_intervals = [None] * len(cut_points)
    for feature_idx, feature_cuts in enumerate(cut_points):
        if feature_cuts is None:
            continue
        # n cut points partition the axis into n + 1 categories.
        categories = np.arange(len(feature_cuts) + 1)
        category_intervals[feature_idx] = [
            [None if low == -inf else low, None if high == inf else high]
            for low, high in discretizer.cat2intervals(categories, feature_idx)
        ]

    return [{
        'cutPoints': cut_points[idx],
        'intervals': category_intervals[idx],
        'max': maxs[idx],
        'min': mins[idx],
    } for idx in range(len(cut_points))]
# Example #2
class MDLPDiscretizer(BaseDiscretizer):
    """Discretizer backed by the MDLP (Minimum Description Length
    Principle) algorithm.  MDLP is supervised, so labels are mandatory.
    """

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        """Initialize the discretizer.

        :param data: training data used to fit the bins
        :param categorical_features: indices of categorical columns
        :param feature_names: column names
        :param labels: supervision targets; required for MDLP
        :param random_state: seed forwarded to the base discretizer
        :raises ValueError: if labels is None (MDLP cannot run unsupervised)
        """
        if labels is None:
            # Fixed: the original backslash-continued string embedded a long
            # run of indentation spaces inside the error message.
            raise ValueError('Labels must be not None when using '
                             'MDLPDiscretizer')
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        """Fit an MDLP transformer and return the finite bin edges.

        :param data: feature matrix to discretize
        :param labels: supervision targets for MDLP
        :return: list of 1-D numpy arrays of finite interval endpoints
        """
        self.transformer = MDLP()
        discretized = self.transformer.fit_transform(data, labels)
        bins = []
        # NOTE(review): iterates over the number of distinct labels rather
        # than the number of features — looks suspicious; confirm intent.
        # (Renamed from `i`, which was shadowed by the inner loop index.)
        for feature_idx in range(len(set(labels))):
            intervals = set(self.transformer.cat2intervals(discretized,
                                                           feature_idx))
            # Collect every interval endpoint, then drop the open-ended
            # +/-inf bounds so only finite edges remain.
            edges = set()
            for low, high in intervals:
                edges.add(low)
                edges.add(high)
            edges.discard(float('inf'))
            edges.discard(float('-inf'))
            bins.append(np.array(list(edges)))
        return bins
# Example #3
    def grow(self, data, t_id, level, cur_performance):
        """
        Recursively grow tree `t_id`: pick the best split for this level
        (memoized per structure prefix), record it, then recurse on the
        rows the chosen rule left undecided.

        :param data: current data for future tree growth
        :param t_id: tree id
        :param level: level id
        :param cur_performance: metrics accumulated on the path so far
        :return: None
        """
        # Stop at the configured maximum depth.
        if level >= self.max_depth:
            return
        if len(data) == 0:
            print "?????????????????????? Early Ends ???????????????????????"
            return
        self.tree_depths[t_id] = level
        # The decision this level commits to, taken from the tree structure.
        decision = self.structures[t_id][level]
        # Structure prefix up to this level, used as the memoization key so
        # trees sharing a prefix reuse the best split computed earlier.
        structure = tuple(self.structures[t_id][:level + 1])
        cur_selected = self.computed_cache.get(structure, None)
        # NOTE(review): as_matrix() was removed in modern pandas;
        # .to_numpy() / .values is the current equivalent.
        Y = data.as_matrix(columns=[self.target])
        # NOTE(review): truthiness test — a cached falsy result would be
        # recomputed; `cur_selected is None` may be the intended check.
        if not cur_selected:
            for cue in list(data):  # iterate over column names
                if cue in self.ignore or cue == self.target:
                    continue
                if self.split_method == "MDLP":
                    # Supervised discretization of this cue against the target.
                    mdlp = MDLP()
                    X = data.as_matrix(columns=[cue])
                    X_disc = mdlp.fit_transform(X, Y)
                    X_interval = np.asarray(mdlp.cat2intervals(X_disc, 0))
                    bins = np.unique(X_disc, axis=0)
                    if len(
                            bins
                    ) <= 1:  # MDLP return the whole range as one bin, use median instead.
                        threshold = data[cue].median()
                        for direction in "><":
                            cur_selected = self.eval_point_split(
                                level, cur_selected, cur_performance, data,
                                cue, direction, threshold, decision)
                        continue
                    # print ", ".join([cue, str(bins)+" bins"])
                    for bin in bins:
                        # Rows falling in this bin and the interval each maps to.
                        indexes = np.where(X_disc == bin)[0]
                        interval = X_interval[indexes]
                        try:
                            # Sanity check: every row of one bin should map
                            # to the same interval.
                            if len(np.unique(interval, axis=0)) != 1:
                                print "???????????????????????????????????????????????????"
                        except:
                            # NOTE(review): bare except silently swallows any
                            # error from the sanity check above.
                            print 'ha'
                        interval = interval[0]
                        if interval[0] == float('-inf'):
                            # Left-open interval: split on its upper bound.
                            threshold = interval[1]
                            for direction in "><":
                                cur_selected = self.eval_point_split(
                                    level, cur_selected, cur_performance, data,
                                    cue, direction, threshold, decision)
                        elif interval[1] == float('inf'):
                            # Right-open interval: split on its lower bound.
                            threshold = interval[0]
                            for direction in "><":
                                cur_selected = self.eval_point_split(
                                    level, cur_selected, cur_performance, data,
                                    cue, direction, threshold, decision)
                        else:
                            # Bounded interval: evaluate a range split.
                            cur_selected = self.eval_range_split(
                                level, cur_selected, cur_performance, data,
                                cue, indexes, interval, decision)
                    continue
                elif self.split_method == "percentile":
                    # Candidate thresholds at the 5%, 10%, ..., 95% percentiles.
                    thresholds = set(data[cue].quantile(
                        [x / 20.0 for x in range(1, 20)],
                        interpolation='midpoint'))
                else:
                    thresholds = [data[cue].median()]
                # point split, e.g. median or x% percentiles.
                for threshold in thresholds:
                    for direction in "><":
                        cur_selected = self.eval_point_split(
                            level, cur_selected, cur_performance, data, cue,
                            direction, threshold, decision)

            self.computed_cache[structure] = cur_selected
        self.selected[t_id][level] = cur_selected['rule']
        self.performance_on_train[t_id][level] = cur_selected[
            'metrics'] + get_performance(cur_selected['metrics'])
        # Recurse on the rows the chosen rule left undecided.
        self.grow(cur_selected['undecided'], t_id, level + 1,
                  cur_selected['metrics'])
# Example #4
# Estimate stable MDLP age bins for the Titanic data by re-fitting the
# discretizer under many random seeds and tallying the thresholds found.
train_raw = pd.read_csv("input/train.csv")
test_raw = pd.read_csv("input/test.csv")

# drop NaNs, use only the Age feature itself to estimate bins
survived_age = train_raw[['Survived', 'Age']].dropna(axis=0)
survived = survived_age['Survived'].values
age = survived_age['Age'].values.reshape(-1, 1)

n = 1000
bin_counts = []       # number of distinct intervals found per run
first_intervals = []  # interval of sample 0 when exactly two bins appear
for seed in range(n):
    transformer = MDLP(random_state=seed, continuous_features=None)
    age_codes = transformer.fit_transform(age, survived)
    age_bins = transformer.cat2intervals(age_codes, 0)
    distinct = len(set(age_bins))
    bin_counts.append(distinct)
    if distinct == 2:
        first_intervals.append(age_bins[0])
    elif distinct > 2:
        print('\t ! more than two bins, n=', distinct)

print('* estimated N bins:', set(bin_counts))
print('\t mean', np.mean(1. * np.array(bin_counts)))
print('* Age thresholds, frequencies')
# Lower bound of each recorded interval (may be -inf for the left bin).
lim_val = np.array(first_intervals)[:, 0]

sum_not_inf = 0
for threshold in set(lim_val):
    print('\t', threshold, (1. * sum(lim_val == threshold)) / n)
    sum_not_inf += sum(lim_val == threshold)
# Remaining runs (those that did not yield exactly two bins).
print('\t', 'inf', (n - sum_not_inf) / n)