    def regions_to_tree_improved(self,
                                 features_df,
                                 labels_df,
                                 regions,
                                 features,
                                 feature_mins,
                                 feature_maxs,
                                 max_samples=1):

        lines = self.find_lines(regions, features, feature_mins, feature_maxs)
        # Guard before touching lines: find_lines may return None or empty
        lines_keys = [] if lines is None else [key for key in lines if len(lines[key]) > 0]
        if len(lines_keys) == 0:
            # No candidate split lines left: return a majority-class leaf
            return DecisionTree(label=str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
                                value=None,
                                data=features_df)

        # This variant picks a candidate split line at random instead of
        # scoring candidates (compare the info-gain variant in Example #5)
        random_label = np.random.choice(lines_keys)
        random_value = np.random.choice(lines[random_label])
        data = DataFrame(features_df)
        data['cat'] = labels_df
        best_split_node = DecisionTree(
            data=data,
            label=random_label,
            value=random_value,
            left=DecisionTree(data=data[data[random_label] <= random_value]),
            right=DecisionTree(data=data[data[random_label] > random_value]))
        node = DecisionTree(label=best_split_node.label,
                            value=best_split_node.value,
                            data=best_split_node.data)

        # Tighten the bounds of the split feature for each child: the split
        # value is the new upper bound on the left and the new lower bound
        # on the right
        feature_mins_right = feature_mins.copy()
        feature_mins_right[node.label] = node.value
        feature_maxs_left = feature_maxs.copy()
        feature_maxs_left[node.label] = node.value
        regions_left = []
        regions_right = []
        for region in regions:
            if region[best_split_node.label][0] < best_split_node.value:
                regions_left.append(region)
            else:
                regions_right.append(region)
        # Recurse only when both children receive at least max_samples
        # samples; otherwise collapse this node into a leaf below. The
        # recursive calls forward max_samples so a non-default value sticks.
        if len(best_split_node.left.data) >= max_samples and len(
                best_split_node.right.data) >= max_samples:
            node.left = self.regions_to_tree_improved(
                best_split_node.left.data.drop('cat', axis=1),
                best_split_node.left.data[['cat']], regions_left, features,
                feature_mins, feature_maxs_left, max_samples)
            node.right = self.regions_to_tree_improved(
                best_split_node.right.data.drop('cat', axis=1),
                best_split_node.right.data[['cat']], regions_right, features,
                feature_mins_right, feature_maxs, max_samples)

        else:
            # One of the children has too few samples: turn this node into
            # a majority-class leaf instead of splitting further
            node.label = str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int))))
            node.value = None

        return node
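
All of the snippets in this listing build instances of a project-local `DecisionTree` class that is never shown. A minimal sketch of the container they appear to assume follows; the attribute names are taken from the calls above, and the `evaluate` helper is purely hypothetical:

# Minimal sketch of the DecisionTree container assumed by this listing;
# the real project class likely carries more functionality.
class DecisionTree:
    def __init__(self, label=None, value=None, data=None, left=None, right=None):
        self.label = label    # feature name at internal nodes, class at leaves
        self.value = value    # split threshold (None for leaves)
        self.data = data      # pandas DataFrame of samples at this node
        self.left = left      # subtree where feature <= value
        self.right = right    # subtree where feature > value

    def evaluate(self, sample):
        """Hypothetical helper: route one sample (dict or Series) to a leaf."""
        if self.value is None:
            return self.label
        if sample[self.label] <= self.value:
            return self.left.evaluate(sample)
        return self.right.evaluate(sample)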
Example #2
import numpy as np


def _convert_to_tree(dt, features):
    """Convert a fitted sklearn tree to a `decisiontree.decisiontree` object"""
    n_nodes = dt.tree_.node_count
    children_left = dt.tree_.children_left
    children_right = dt.tree_.children_right
    feature = dt.tree_.feature
    threshold = dt.tree_.threshold
    classes = dt.classes_

    # The tree structure can be traversed to compute various properties such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes)
    decision_trees = [None] * n_nodes
    for i in range(n_nodes):
        decision_trees[i] = DecisionTree()
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # If we have a test node
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    # Wire up children and fill in labels/values. sklearn marks a missing
    # child with -1, so any id >= 0 refers to a real child node.
    for i in range(n_nodes):

        if children_left[i] >= 0:
            decision_trees[i].left = decision_trees[children_left[i]]

        if children_right[i] >= 0:
            decision_trees[i].right = decision_trees[children_right[i]]

        if is_leaves[i]:
            # Leaf: label with the majority class of the samples reaching it
            decision_trees[i].label = classes[np.argmax(dt.tree_.value[i][0])]
            decision_trees[i].value = None
        else:
            # Internal node: label is the feature name, value the threshold
            decision_trees[i].label = features[feature[i]]
            decision_trees[i].value = threshold[i]

    return decision_trees[0]
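
For reference, a small usage sketch for `_convert_to_tree` with a fitted scikit-learn classifier. The iris data is only an example; `DecisionTree` is the project-local class sketched above:

# Usage sketch: convert a fitted sklearn tree into the project-local format
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
clf = DecisionTreeClassifier(max_depth=3, random_state=0)
clf.fit(iris.data, iris.target)

# Pass the feature names so internal nodes get readable labels
root = _convert_to_tree(clf, list(iris.feature_names))
print(root.label, root.value)  # the root split feature and its threshold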
Example #3

    def construct_tree(self, training_feature_vectors, labels, current_depth=0):
        # Guard against an empty partition before searching for a split
        if len(labels) == 0:
            return DecisionTree(label=self.default, value=None, data=None)

        # First find the best split feature (and its type) for this partition
        feature, feature_type = self.find_split_feature(training_feature_vectors.copy(), labels.copy())

        data = DataFrame(training_feature_vectors.copy())
        data['cat'] = labels

        # Only pre-pruning is enabled at the moment (QUEST already produces compact trees)
        if feature is None or len(data) == 0 or len(training_feature_vectors.index) <= self.max_nr_nodes \
                or len(np.unique(data['cat'])) == 1 or self.all_feature_vectors_equal(training_feature_vectors)\
                or current_depth >= self.max_depth:
            # Create a leaf labelled with the most frequent class
            label = np.argmax(np.bincount(data['cat'].values.astype(int)))
            return DecisionTree(label=label.astype(str), value=None, data=data)

        # If we don't need to pre-prune, find the best split point for the chosen feature
        split_point = self.find_best_split_point(data.copy(), feature, feature_type)

        if split_point is None or math.isnan(split_point):
            label = np.argmax(np.bincount(data['cat'].values.astype(int)))
            return DecisionTree(label=label.astype(str), value=None, data=data)

        # Divide the data on the chosen feature and split point, then recurse
        split_node = self.divide_data(data.copy(), feature, split_point)
        if len(split_node.left.data) == 0 or len(split_node.right.data) == 0:
            # Degenerate split: fall back to a majority-class leaf
            label = np.argmax(np.bincount(data['cat'].values.astype(int)))
            return DecisionTree(label=label.astype(str), value=None, data=data)
        node = DecisionTree(label=split_node.label, value=split_node.value, data=split_node.data)
        node.left = self.construct_tree(split_node.left.data.drop('cat', axis=1),
                                        split_node.left.data[['cat']], current_depth+1)
        node.right = self.construct_tree(split_node.right.data.drop('cat', axis=1),
                                         split_node.right.data[['cat']], current_depth+1)

        return node
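
`construct_tree` relies on helpers (`find_split_feature`, `find_best_split_point`, `divide_data`) that are not part of this listing. A plausible sketch of `divide_data`, inferred purely from its call site above rather than from the project's actual implementation:

    # Hypothetical sketch of the divide_data helper used above: it must
    # return a DecisionTree whose children carry the two data partitions
    # around the split point, with the full data kept on the parent.
    def divide_data(self, data, feature, split_point):
        return DecisionTree(
            label=feature,
            value=split_point,
            data=data,
            left=DecisionTree(data=data[data[feature] <= split_point]),
            right=DecisionTree(data=data[data[feature] > split_point]))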
Example #4

    def decision_tree_from_text(self, lines):

        dt = DecisionTree()

        if '<=' in lines[0] or '>' in lines[0]:
            # Internal node: "name: feature <= value", followed by the left
            # subtree, then "name: feature > value" and the right subtree
            node_name = lines[0].split(':')[0].lstrip()
            label, value = lines[0].split(':')[1].split('<=')
            label = ' '.join(label.strip().split('.'))
            value = value.lstrip().split()[0]
            dt.label = label
            dt.value = float(value)
            dt.left = self.decision_tree_from_text(lines[1:])
            # Scan forward for the matching "> value" line of the same node,
            # which marks the start of the right subtree
            counter = 1
            while lines[counter].split(':')[0].lstrip() != node_name:
                counter += 1
            dt.right = self.decision_tree_from_text(lines[counter + 1:])
        else:
            # Terminal node: parse the integer class label (avoids eval())
            dt.label = int(float(lines[0].split(':')[1].lstrip()))

        return dt
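
A usage sketch for `decision_tree_from_text`. The input format is inferred from the parser itself, not from any documented specification, and the concrete lines below are illustrative only; `parser` stands for an instance of the containing class:

# Illustrative input shaped to match what the parser above expects
lines = [
    'n0: petal.width <= 0.8',   # internal node n0
    'n1: 0',                    # left leaf: class 0
    'n0: petal.width > 0.8',    # matching "greater than" line for n0
    'n2: 1',                    # right leaf: class 1
]
tree = parser.decision_tree_from_text(lines)
print(tree.label, tree.value)              # 'petal width' 0.8
print(tree.left.label, tree.right.label)   # 0 1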
Example #5
    def regions_to_tree_improved(self,
                                 features_df,
                                 labels_df,
                                 regions,
                                 features,
                                 feature_mins,
                                 feature_maxs,
                                 max_samples=5):
        lines = self.find_lines(regions, features, feature_mins, feature_maxs)

        # No candidate split lines: return a majority-class leaf
        if lines is None or len(lines) == 0:
            return DecisionTree(label=str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
                                value=None,
                                data=features_df)

        # Score each candidate line by information gain and keep the best one
        info_gains = self.calculate_info_gains(lines, features_df, labels_df)

        if len(info_gains) == 0:
            # No informative split found: return a majority-class leaf
            return DecisionTree(label=str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int)))),
                                value=None,
                                data=features_df)

        best_split_node = max(info_gains.items(),
                              key=operator.itemgetter(1))[0]
        node = DecisionTree(label=best_split_node.label,
                            value=best_split_node.value,
                            data=best_split_node.data)

        # Recurse with the split data and tighten the bounds of the chosen
        # feature: the split value becomes the upper bound for the left
        # child and the lower bound for the right child
        feature_mins_right = feature_mins.copy()
        feature_mins_right[node.label] = node.value
        feature_maxs_left = feature_maxs.copy()
        feature_maxs_left[node.label] = node.value
        regions_left = []
        regions_right = []
        for region in regions:
            if region[best_split_node.label][0] < best_split_node.value:
                regions_left.append(region)
            else:
                regions_right.append(region)
        # Recurse while at least one side still covers enough regions; the
        # recursive calls forward max_samples so a non-default value sticks
        if len(regions_left) >= max_samples or len(
                regions_right) >= max_samples:
            node.left = self.regions_to_tree_improved(
                best_split_node.left.data.drop('cat', axis=1),
                best_split_node.left.data[['cat']], regions_left, features,
                feature_mins, feature_maxs_left, max_samples)
            node.right = self.regions_to_tree_improved(
                best_split_node.right.data.drop('cat', axis=1),
                best_split_node.right.data[['cat']], regions_right, features,
                feature_mins_right, feature_maxs, max_samples)
        else:
            # Both children cover too few regions: collapse this node into
            # a majority-class leaf
            node.label = str(
                np.argmax(np.bincount(labels_df['cat'].values.astype(int))))
            node.value = None

        return node
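
None of the `regions_to_tree_improved` arguments are constructed in this listing. From the accesses above (`region[label][0]` read as a lower bound, `feature_mins[label] = value`), the inputs appear to be shaped roughly as follows; this is an inferred sketch, not the project's documented format:

# Inferred shapes, based only on how the method above indexes these objects
features = ['f1', 'f2']                     # feature column names

feature_mins = {'f1': 0.0, 'f2': 0.0}       # per-feature lower bounds
feature_maxs = {'f1': 1.0, 'f2': 1.0}       # per-feature upper bounds

# Each region maps a feature name to its [lower, upper] interval;
# region[feature][0] is read as the lower bound when partitioning
regions = [
    {'f1': [0.0, 0.5], 'f2': [0.0, 1.0]},
    {'f1': [0.5, 1.0], 'f2': [0.0, 1.0]},
]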