Example 1
class TreeKnnClassifier(abstract_classifier):
    def __init__(self, criterion, depth, k, samples, labels):
        self.tree = DecisionTreeClassifier(criterion=criterion,
                                           max_depth=depth)
        self.tree.fit(X=samples, y=labels)

        factory = knn_factory(k)
        train_sets = self.getLeafIDs(samples, labels)
        self.knn_classifiers = {
            leaf: factory.train(train_sets[leaf]['samples'],
                                train_sets[leaf]['labels'],
                                train_sets[leaf]['weights'])
            for leaf in train_sets.keys()
        }

    def getLeafIDs(self, data, labels):
        train_sets = {}
        for sample, label in zip(data, labels):
            leaf = self.getLeaf(sample)
            try:
                train_sets[leaf]['samples'].append(sample)
                train_sets[leaf]['labels'].append(label)

            except KeyError:
                train_sets[leaf] = {
                    'samples': [sample],
                    'labels': [label],
                    'weights': self.generateWeights(sample)
                }

        return train_sets

    def getLeaf(self, sample):
        return self.tree.decision_path(sample.reshape(1, -1)).indices[-1]

    def generateWeights(self, sample):
        weights = np.ones(sample.shape)
        for node in self.tree.decision_path(sample.reshape(1,
                                                           -1)).indices[:-1]:
            weights[self.tree.tree_.feature[node]] = 0

        return weights

    def classify(self, sample):
        """
        Finds relevant knn classifier, and returns it's result on the given sample.
        :param sample:
        :return:
        """
        return self.knn_classifiers[self.getLeaf(sample)].classify(sample)
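
The classifier above finds a sample's leaf with decision_path(...).indices[-1], relying on the fact that scikit-learn numbers tree nodes in depth-first order, so ids increase from the root down and the last node on a path is the leaf. A minimal standalone check of that assumption (iris data, not part of the original snippet):

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
tree = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)

sample = X[0]
# last node id on the decision path == leaf id returned by apply()
leaf_from_path = tree.decision_path(sample.reshape(1, -1)).indices[-1]
leaf_from_apply = tree.apply(sample.reshape(1, -1))[0]
assert leaf_from_path == leaf_from_apply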
Example 2
class DecisionTreeModel:
    # Initialize a DecisionTreeModel object whose "model" attribute holds an actual
    # DecisionTreeClassifier object from scikit-learn.
    def __init__(self,*args,**kwargs):
        self.model = DecisionTreeClassifier(*args, **kwargs)

    def get_model(self):
        return self.model

    def apply(self,X,check_input=True):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        return self.model.apply(X,check_input)

    def cost_complexity_pruning_path(self,X,y,sample_weight=None):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        if (isinstance(y,TabularData)):
            y=DataConversion.extract_y(y)
        return self.model.cost_complexity_pruning_path(X,y,sample_weight)        
    def decision_path(self,X,check_input=True):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        return self.model.decision_path(X,check_input)
    
    def fit(self,X,y,sample_weight=None,check_input=True,X_idx_sorted=None):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        if (isinstance(y,TabularData)):
            y=DataConversion.extract_y(y)
        self.model.fit(X,y,sample_weight,check_input,X_idx_sorted)
        return self

    def predict(self,X,check_input=True):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        return self.model.predict(X,check_input)

    def predict_log_proba(self,X):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        return self.model.predict_log_proba(X)

    def predict_proba(self,X,check_input=True):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        return self.model.predict_proba(X,check_input)

    def score(self,X,y,sample_weight=None):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        if (isinstance(y,TabularData)):
            y=DataConversion.extract_y(y)
        return self.model.score(X,y,sample_weight)

    def __getattribute__(self, item):
        try:
            return super().__getattribute__(item)
        except AttributeError:
            pass
        return getattr(self.model, item)
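
A note on the delegation at the end of the wrapper above: __getattribute__ runs on every attribute access, so a lighter-weight alternative (illustrative sketch, not part of the original class) is __getattr__, which Python calls only when normal lookup fails:

class DelegatingModel:
    """Forward attribute lookups that fail on the wrapper to the wrapped model."""

    def __init__(self, model):
        self.model = model

    def __getattr__(self, item):
        # Called only when normal attribute lookup on the wrapper fails.
        return getattr(self.model, item)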
Example 3
def get_decision_paths(model: tree.DecisionTreeClassifier, data, selection):
    selected_rows = data.loc[selection.astype(bool), :]

    d_path = model.decision_path(selected_rows)
    paths = set()

    leaf_id = model.apply(selected_rows)
    feature = model.tree_.feature
    threshold = model.tree_.threshold

    for sample_id in range(len(selected_rows.index)):
        node_idx = d_path.indices[d_path.indptr[sample_id]:d_path.
                                  indptr[sample_id + 1]]

        rules = []

        for node_id in node_idx:
            if leaf_id[sample_id] == node_id:
                continue

            sign = None
            if selected_rows.iloc[sample_id,
                                  feature[node_id]] <= threshold[node_id]:
                sign = " <= "
            else:
                sign = " >= "

            rule = (data.columns[feature[node_id]] + sign +
                    str(round(threshold[node_id], 2)))
            rules.append(rule)
        paths.add(tuple(rules))

    paths = [list(path) for path in paths]
    return paths
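
A hedged usage sketch for the helper above, on toy data (the DataFrame, the fitted tree, and the 0/1 selection mask below are illustrative, not from the original project):

import numpy as np
from sklearn import tree
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
data = iris.data  # pandas DataFrame of features
model = tree.DecisionTreeClassifier(max_depth=2, random_state=0).fit(data, iris.target)

selection = np.zeros(len(data))
selection[:10] = 1  # explain only the first ten rows
print(get_decision_paths(model, data, selection))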
Example 4
    def test_decisiontree_classifier_decision_path_leaf(self):
        model = DecisionTreeClassifier(max_depth=2)
        X, y = make_classification(10, n_features=4, random_state=42)
        X = X[:, :2]
        model.fit(X, y)
        initial_types = [('input', FloatTensorType((None, X.shape[1])))]
        model_onnx = convert_sklearn(model,
                                     initial_types=initial_types,
                                     options={
                                         id(model): {
                                             'decision_leaf': True,
                                             'decision_path': True,
                                             'zipmap': False
                                         }
                                     },
                                     target_opset=TARGET_OPSET)
        sess = InferenceSession(model_onnx.SerializeToString())
        res = sess.run(None, {'input': X.astype(np.float32)})
        pred = model.predict(X)
        assert_almost_equal(pred, res[0].ravel())
        prob = model.predict_proba(X)
        assert_almost_equal(prob, res[1])

        dec = model.decision_path(X)

        exp_path = binary_array_to_string(dec.todense())
        exp_leaf = path_to_leaf(model.tree_, dec.todense())
        assert exp_path == res[2].ravel().tolist()
        assert exp_leaf.tolist() == res[3].ravel().tolist()
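
The assertions above compare the ONNX outputs against a string encoding of the dense decision-path matrix; binary_array_to_string and path_to_leaf appear to be helpers from the converter's test suite. A rough idea of the kind of encoding involved (an illustrative guess, not their actual implementation):

import numpy as np

def dense_paths_to_strings(dense):
    # One '0'/'1' string per sample, one character per tree node,
    # e.g. dense_paths_to_strings(model.decision_path(X).todense()).
    return ["".join(str(int(v)) for v in np.asarray(row).ravel()) for row in dense]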
Example 5
def decision_plot(new_X_train2, new_y_train2, feature_names, test, model,
                  classify):
    dt = DecisionTreeClassifier(random_state=0,
                                criterion='entropy',
                                max_depth=1)

    dt.fit(new_X_train2, new_y_train2)
    if classify == 'rf':
        print("Decision Tree Predicts for Instance:" + str(dt.predict(test)) +
              " and Random Forests predicted:" + str(model.predict(test)))
    elif classify == 'xg':
        print("Decision Tree Predicts for Instance:" + str(dt.predict(test)) +
              " and XGboost predicted:" + str(model.predict(test)))

    fidelityPreds = dt.predict(new_X_train2)
    print("Let's see fidelity", accuracy_score(new_y_train2, fidelityPreds))

    graph = Source(
        export_graphviz(dt,
                        out_file=None,
                        feature_names=feature_names,
                        class_names=dt.classes_,
                        filled=True))
    display(SVG(graph.pipe(format='svg')))
    print("Lets find out the path for this specific instance!")
    for i in dt.decision_path(test):
        print(i)
    return dt
Example 6
def demoOne():
    dataSet, labels = getDataSet()
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(dataSet, labels)
    treePlot(clf)
    print(clf.tree_.max_depth)
    print(clf.decision_path([[0, 0]]))
    print(clf.get_params())
    print(clf.predict_proba([[0, 0]]))
Example 7
def main() -> None:
    iris = load_iris()
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(iris.data, iris.target)

    # sepal length, sepal width, petal length, petal width
    X = np.array([[5.0, 2.9, 1.0, 4.85]])
    #X = np.array([[5.0, 2.9, 1.0, 0.2]])
    print(clf.predict(X))
    print(clf.decision_path(X))
    print(clf.decision_path(X).todense())
    with open('iris-dtree.dot', mode='w') as f:
        export_graphviz(clf,
                        out_file=f,
                        rounded=True,
                        feature_names=iris.feature_names,
                        class_names=iris.target_names,
                        special_characters=True)
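
The last two prints above show the same path information in sparse and dense form; the non-zero columns of a dense row are exactly the node ids the sample visits, root first. A tiny standalone illustration of reading them back (separate from the snippet above):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

data = load_iris()
model = DecisionTreeClassifier(random_state=0).fit(data.data, data.target)
row = model.decision_path(data.data[:1]).todense()
print(np.nonzero(np.asarray(row).ravel())[0])  # node ids on the path, root first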
Example 8
def getPath(X, input, conf, model):
    """
    Get the path from Local Interpretable Model-agnostic Explanation Tree
    :param X: the whole inputs
    :param sess: TF session
    :param x: input placeholder
    :param preds: the model's symbolic output
    :param input: instance to interpret
    :param conf: the configuration of dataset
    :return: the path for the decision of given instance
    """

    # use the original implementation of LIME
    explainer = lime_tabular.LimeTabularExplainer(
        X,
        feature_names=conf.feature_name,
        class_names=conf.class_name,
        categorical_features=conf.categorical_features,
        discretize_continuous=True)
    g_data = explainer.generate_instance(input, num_samples=5000)
    #print(g_data.shape)
    #g_labels = model_argmax(sess, x, preds, g_data)
    g_labels = model.predict(g_data)
    '''
    with open('CexSet.csv', 'a', newline='') as csvfile:
        writer = cv.writer(csvfile)
        writer.writerows(g_data)
    '''

    # build the interpretable tree
    tree = DecisionTreeClassifier(
        random_state=2019)  #min_samples_split=0.05, min_samples_leaf =0.01
    tree.fit(g_data, g_labels)

    # get the path for decision
    path_index = tree.decision_path(np.array([input])).indices
    path = []
    for i in range(len(path_index)):
        node = path_index[i]
        i = i + 1
        f = tree.tree_.feature[node]
        if f != -2:
            left_count = tree.tree_.n_node_samples[
                tree.tree_.children_left[node]]
            right_count = tree.tree_.n_node_samples[
                tree.tree_.children_right[node]]
            left_confidence = 1.0 * left_count / (left_count + right_count)
            right_confidence = 1.0 - left_confidence
            if tree.tree_.children_left[node] == path_index[i]:
                path.append(
                    [f, "<=", tree.tree_.threshold[node], left_confidence])
            else:
                path.append(
                    [f, ">", tree.tree_.threshold[node], right_confidence])
    return path
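
The loop above walks consecutive nodes of the surrogate tree's decision path and records a (feature, operator, threshold, confidence) rule at each internal node, where confidence is the fraction of training samples that followed the chosen branch. A compact standalone variant on synthetic data (a sketch, not the original getPath):

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(200, n_features=4, random_state=0)
surrogate = DecisionTreeClassifier(random_state=0).fit(X, y)
instance = X[0]

path = surrogate.decision_path(instance.reshape(1, -1)).indices
rules = []
for node, nxt in zip(path[:-1], path[1:]):  # pair each internal node with its successor
    feat = surrogate.tree_.feature[node]
    thr = surrogate.tree_.threshold[node]
    left = surrogate.tree_.children_left[node]
    right = surrogate.tree_.children_right[node]
    n_left = surrogate.tree_.n_node_samples[left]
    n_right = surrogate.tree_.n_node_samples[right]
    if nxt == left:
        rules.append([feat, "<=", thr, n_left / (n_left + n_right)])
    else:
        rules.append([feat, ">", thr, n_right / (n_left + n_right)])
print(rules)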
Example 9
    def test_decisiontreeclassifier_decision_path(self):
        model = DecisionTreeClassifier(max_depth=2)
        X, y = make_classification(10, n_features=4, random_state=42)
        X = X[:, :2].astype(numpy.float32)
        model.fit(X, y)
        model_onnx = to_onnx(
            model,
            X,
            options={id(model): {
                         'decision_path': True,
                         'zipmap': False
                     }})
        sess = OnnxInference(model_onnx)
        res = sess.run({'X': X})
        pred = model.predict(X)
        self.assertEqualArray(pred, res['label'].ravel())
        prob = model.predict_proba(X)
        self.assertEqualArray(prob, res['probabilities'])
        dec = model.decision_path(X)
        exp = binary_array_to_string(dec.todense())
        self.assertEqual(exp, res['decision_path'].ravel().tolist())
Example 10
class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin,
                                 TransformerMixin):
    """ A transformer that generalizes data to representative points.

    Learns data generalizations based on an original model's predictions
    and a target accuracy. Once the generalizations are learned, can
    receive one or more data records and transform them to representative
    points based on the learned generalization.

    An alternative way to use the transformer is to supply ``cells`` and
    ``features`` in init or set_params and those will be used to transform
    data to representatives. In this case, fit must still be called but
    there is no need to supply it with ``X`` and ``y``, and there is no
    need to supply an existing ``estimator`` to init.

    In summary, either ``estimator`` and ``target_accuracy`` should be
    supplied or ``cells`` and ``features`` should be supplied.

    Parameters
    ----------
    estimator : estimator, optional
        The original model for which generalization is being performed.
        Should be pre-fitted.

    target_accuracy : float, optional
        The required accuracy when applying the base model to the
        generalized data. Accuracy is measured relative to the original
        accuracy of the model.

    features : list of str, optional
        The feature names, in the order that they appear in the data.

    cells : list of object, optional
        The cells used to generalize records. Each cell must define a
        range or subset of categories for each feature, as well as a
        representative value for each feature.
        This parameter should be used when instantiating a transformer
        object without first fitting it.

    Attributes
    ----------
    cells_ : list of object
        The cells used to generalize records, as learned when calling fit.

    ncp_ : float
        The NCP (information loss) score of the resulting generalization,
        as measured on the training data.

    generalizations_ : object
        The generalizations that were learned (actual feature ranges).

    Notes
    -----


    """
    def __init__(self,
                 estimator=None,
                 target_accuracy=0.998,
                 features=None,
                 cells=None):
        self.estimator = estimator
        self.target_accuracy = target_accuracy
        self.features = features
        self.cells = cells

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and contained
            subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        ret = {}
        ret['target_accuracy'] = self.target_accuracy
        if deep:
            ret['features'] = copy.deepcopy(self.features)
            ret['cells'] = copy.deepcopy(self.cells)
            ret['estimator'] = self.estimator
        else:
            ret['features'] = copy.copy(self.features)
            ret['cells'] = copy.copy(self.cells)
        return ret

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Returns
        -------
        self : object
            Returns self.
        """
        if 'target_accuracy' in params:
            self.target_accuracy = params['target_accuracy']
        if 'features' in params:
            self.features = params['features']
        if 'cells' in params:
            self.cells = params['cells']
        return self

    def fit_transform(self, X=None, y=None):
        """Learns the generalizations based on training data, and applies them to the data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
            The training input samples.
        y : array-like, shape (n_samples,), optional
            The target values. An array of int.
            This should contain the predictions of the original model on ``X``.

        Returns
        -------
        X_transformed : ndarray, shape (n_samples, n_features)
            The array containing the representative values to which each record
            in ``X`` is mapped.
        """
        self.fit(X, y)
        return self.transform(X)

    def fit(self, X=None, y=None):
        """Learns the generalizations based on training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
            The training input samples.
        y : array-like, shape (n_samples,), optional
            The target values. An array of int.
            This should contain the predictions of the original model on ``X``.

        Returns
        -------
        self : object
            Returns self.
        """

        # take into account that estimator, X, y, cells, features may be None

        if X is not None and y is not None:
            X, y = check_X_y(X, y, accept_sparse=True)
            self.n_features_ = X.shape[1]
        elif self.features:
            self.n_features_ = len(self.features)
        else:
            self.n_features_ = 0

        if self.features:
            self._features = self.features
        # if features is None, use numbers instead of names
        elif self.n_features_ != 0:
            self._features = [i for i in range(self.n_features_)]
        else:
            self._features = None

        if self.cells:
            self.cells_ = self.cells
        else:
            self.cells_ = {}

        # Going to fit
        # (currently not dealing with option to fit with only X and y and no estimator)
        if self.estimator and X is not None and y is not None:
            # divide dataset into train and test
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, stratify=y, test_size=0.4, random_state=18)

            # collect feature data (such as min, max)
            train_data = pd.DataFrame(X_train, columns=self._features)
            feature_data = {}
            for feature in self._features:
                if feature not in feature_data:
                    values = list(train_data.loc[:, feature])
                    fd = {}
                    fd['min'] = min(values)
                    fd['max'] = max(values)
                    feature_data[feature] = fd

            self.cells_ = {}
            self.dt_ = DecisionTreeClassifier(random_state=0,
                                              min_samples_split=2,
                                              min_samples_leaf=1)
            self.dt_.fit(X_train, y_train)
            self._calculate_cells()
            self._modify_cells()
            nodes = self._get_nodes_level(0)
            self._attach_cells_representatives(X_train, y_train, nodes)
            # self.cells_ currently holds the generalization created from the tree leaves

            # apply generalizations to test data
            generalized = self._generalize(X_test, nodes, self.cells_,
                                           self.cells_by_id_)

            # check accuracy
            accuracy = self.estimator.score(generalized, y_test)
            print('Initial accuracy is %f' % accuracy)

            # if accuracy above threshold, improve generalization
            if accuracy > self.target_accuracy:
                level = 1
                while accuracy > self.target_accuracy:
                    nodes = self._get_nodes_level(level)
                    self._calculate_level_cells(level)
                    self._attach_cells_representatives(X_train, y_train, nodes)
                    generalized = self._generalize(X_test, nodes, self.cells_,
                                                   self.cells_by_id_)
                    accuracy = self.estimator.score(generalized, y_test)
                    print('Level: %d, accuracy: %f' % (level, accuracy))
                    level += 1

            # if accuracy below threshold, improve accuracy by removing features from generalization
            if accuracy < self.target_accuracy:
                while accuracy < self.target_accuracy:
                    self._calculate_generalizations()
                    removed_feature = self._remove_feature_from_generalization(
                        X_test, nodes, y_test, feature_data)
                    if not removed_feature:
                        break
                    generalized = self._generalize(X_test, nodes, self.cells_,
                                                   self.cells_by_id_)
                    accuracy = self.estimator.score(generalized, y_test)
                    print('Removed feature: %s, accuracy: %f' %
                          (removed_feature, accuracy))

            # self.cells_ currently holds the chosen generalization based on target accuracy

            # calculate iLoss
            self.ncp_ = self._calculate_ncp(X_test, self.generalizations_,
                                            feature_data)

        # Return the transformer
        return self

    def transform(self, X):
        """ Transforms data records to representative points.

        Parameters
        ----------
        X : {array-like, sparse-matrix}, shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        X_transformed : ndarray, shape (n_samples, n_features)
            The array containing the representative values to which each record in
            ``X`` is mapped.
        """

        # Check if fit has been called
        msg = 'This %(name)s instance is not initialized yet. ' \
              'Call "fit" or "set_params" with ' \
              'appropriate arguments before using this method.'
        check_is_fitted(self, ['cells', 'features'], msg=msg)

        # Input validation
        X = check_array(X, accept_sparse=True)
        if X.shape[1] != self.n_features_ and self.n_features_ != 0:
            raise ValueError('Shape of input is different from what was seen'
                             'in `fit`')

        if not self._features:
            self._features = [i for i in range(X.shape[1])]

        representatives = pd.DataFrame(columns=self._features)  # only columns
        generalized = pd.DataFrame(X, columns=self._features,
                                   copy=True)  # original data
        mapped = np.zeros(X.shape[0])  # to mark records we already mapped

        # iterate over cells (leaves in decision tree)
        for i in range(len(self.cells_)):
            # Copy the representatives from the cells into another data structure:
            # iterate over features in test data
            for feature in self._features:
                # if feature has a representative value in the cell and should not
                # be left untouched, take the representative value
                if feature in self.cells_[i]['representative'] and \
                        ( 'untouched' not in self.cells_[i] \
                        or feature not in self.cells_[i]['untouched'] ):
                    representatives.loc[
                        i, feature] = self.cells_[i]['representative'][feature]
                # else, drop the feature (removes from representatives columns that
                # do not have a representative value or should remain untouched)
                elif feature in representatives.columns.tolist():
                    representatives = representatives.drop(feature, axis=1)

            # get the indexes of all records that map to this cell
            indexes = self._get_record_indexes_for_cell(
                X, self.cells_[i], mapped)

            # replace the values in the representative columns with the representative
            # values (leaves others untouched)
            if not representatives.columns.empty:
                generalized.loc[
                    indexes,
                    representatives.columns] = representatives.loc[i].values

        return generalized.to_numpy()

    def _get_record_indexes_for_cell(self, X, cell, mapped):
        return [
            i for i, x in enumerate(X)
            if not mapped.item(i) and self._cell_contains(cell, x, i, mapped)
        ]

    def _cell_contains(self, cell, x, i, mapped):
        for f in self._features:
            if f in cell['ranges']:
                if not self._cell_contains_numeric(f, cell['ranges'][f], x):
                    return False
            else:
                #TODO: exception - feature not defined
                pass
        # Mark as mapped
        mapped.itemset(i, 1)
        return True

    def _cell_contains_numeric(self, f, range, x):
        i = self._features.index(f)
        # convert x to ndarray to allow indexing
        a = np.array(x)
        value = a.item(i)
        if range['start']:
            if value <= range['start']:
                return False
        if range['end']:
            if value > range['end']:
                return False
        return True

    def _calculate_cells(self):
        self.cells_by_id_ = {}
        self.cells_ = self._calculate_cells_recursive(0)

    def _calculate_cells_recursive(self, node):
        feature_index = self.dt_.tree_.feature[node]
        if feature_index == -2:
            # this is a leaf
            label = self._calculate_cell_label(node)
            hist = [int(i) for i in self.dt_.tree_.value[node][0]]
            cell = {
                'label': label,
                'hist': hist,
                'ranges': {},
                'id': int(node)
            }
            return [cell]

        cells = []
        feature = self._features[feature_index]
        threshold = self.dt_.tree_.threshold[node]
        left_child = self.dt_.tree_.children_left[node]
        right_child = self.dt_.tree_.children_right[node]

        left_child_cells = self._calculate_cells_recursive(left_child)
        for cell in left_child_cells:
            if feature not in cell['ranges'].keys():
                cell['ranges'][feature] = {'start': None, 'end': None}
            if cell['ranges'][feature]['end'] is None:
                cell['ranges'][feature]['end'] = threshold
            cells.append(cell)
            self.cells_by_id_[cell['id']] = cell

        right_child_cells = self._calculate_cells_recursive(right_child)
        for cell in right_child_cells:
            if feature not in cell['ranges'].keys():
                cell['ranges'][feature] = {'start': None, 'end': None}
            if cell['ranges'][feature]['start'] is None:
                cell['ranges'][feature]['start'] = threshold
            cells.append(cell)
            self.cells_by_id_[cell['id']] = cell

        return cells

    def _calculate_cell_label(self, node):
        label_hist = self.dt_.tree_.value[node][0]
        return int(self.dt_.classes_[np.argmax(label_hist)])

    def _modify_cells(self):
        cells = []
        for cell in self.cells_:
            new_cell = {
                'id': cell['id'],
                'label': cell['label'],
                'ranges': {},
                'categories': {},
                'hist': cell['hist'],
                'representative': None
            }
            for feature in self._features:
                if feature in cell['ranges'].keys():
                    new_cell['ranges'][feature] = cell['ranges'][feature]
                else:
                    new_cell['ranges'][feature] = {'start': None, 'end': None}
            cells.append(new_cell)
            self.cells_by_id_[new_cell['id']] = new_cell
        self.cells_ = cells

    def _calculate_level_cells(self, level):
        if level < 0 or level > self.dt_.get_depth():
            #TODO: exception 'Illegal level %d' % level
            pass

        if level > 0:
            new_cells = []
            new_cells_by_id = {}
            nodes = self._get_nodes_level(level)
            for node in nodes:
                if self.dt_.tree_.feature[node] == -2:  # leaf node
                    new_cell = self.cells_by_id_[node]
                else:
                    left_child = self.dt_.tree_.children_left[node]
                    right_child = self.dt_.tree_.children_right[node]
                    left_cell = self.cells_by_id_[left_child]
                    right_cell = self.cells_by_id_[right_child]
                    new_cell = {
                        'id': int(node),
                        'ranges': {},
                        'categories': {},
                        'label': None,
                        'representative': None
                    }
                    for feature in left_cell['ranges'].keys():
                        new_cell['ranges'][feature] = {}
                        new_cell['ranges'][feature]['start'] = left_cell[
                            'ranges'][feature]['start']
                        new_cell['ranges'][feature]['end'] = right_cell[
                            'ranges'][feature]['start']
                    for feature in left_cell['categories'].keys():
                        new_cell['categories'][feature] = \
                            list(set(left_cell['categories'][feature]) |
                                 set(right_cell['categories'][feature]))
                    self._calculate_level_cell_label(left_cell, right_cell,
                                                     new_cell)
                new_cells.append(new_cell)
                new_cells_by_id[new_cell['id']] = new_cell
            self.cells_ = new_cells
            self.cells_by_id_ = new_cells_by_id
        # else: nothing to do, stay with previous cells

    def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
        new_cell['hist'] = [
            x + y for x, y in zip(left_cell['hist'], right_cell['hist'])
        ]
        new_cell['label'] = int(self.dt_.classes_[np.argmax(new_cell['hist'])])

    def _get_nodes_level(self, level):
        # level = distance from lowest leaf
        node_depth = np.zeros(shape=self.dt_.tree_.node_count, dtype=np.int64)
        is_leaves = np.zeros(shape=self.dt_.tree_.node_count, dtype=bool)
        stack = [(0, -1)]  # seed is the root node id and its parent depth
        while len(stack) > 0:
            node_id, parent_depth = stack.pop()
            node_depth[node_id] = parent_depth + 1

            if self.dt_.tree_.children_left[
                    node_id] != self.dt_.tree_.children_right[node_id]:
                stack.append(
                    (self.dt_.tree_.children_left[node_id], parent_depth + 1))
                stack.append(
                    (self.dt_.tree_.children_right[node_id], parent_depth + 1))
            else:
                is_leaves[node_id] = True

        max_depth = max(node_depth)
        depth = max_depth - level
        if depth < 0:
            return None
        return [
            i for i, x in enumerate(node_depth)
            if x == depth or (x < depth and is_leaves[i])
        ]

    def _attach_cells_representatives(self, samples, labels, level_nodes):
        samples_df = pd.DataFrame(samples, columns=self._features)
        labels_df = pd.DataFrame(labels, columns=['label'])
        samples_node_ids = self._find_sample_nodes(samples_df, level_nodes)
        for cell in self.cells_:
            cell['representative'] = {}
            # get all rows in cell
            indexes = [
                i for i, x in enumerate(samples_node_ids) if x == cell['id']
            ]
            sample_rows = samples_df.iloc[indexes]
            sample_labels = labels_df.iloc[indexes]['label'].values.tolist()
            # get rows with matching label
            indexes = [
                i for i, label in enumerate(sample_labels)
                if label == cell['label']
            ]
            match_samples = sample_rows.iloc[indexes]
            # find the "middle" of the cluster
            array = match_samples.values
            median = np.median(array, axis=0)
            # find the record closest to the median
            i = 0
            closest = 0
            min_dist = float("inf")
            for row in array:
                dist = distance.euclidean(row, median)
                if dist < min_dist:
                    min_dist = dist
                    closest = i
                i = i + 1
            row = match_samples.iloc[closest]
            # use its values as the representative
            for feature in cell['ranges'].keys():
                cell['representative'][feature] = row[feature].item()

    def _find_sample_nodes(self, samples, nodes):
        paths = self.dt_.decision_path(samples).toarray()
        nodeSet = set(nodes)
        return [(list(set([i
                           for i, v in enumerate(p) if v == 1]) & nodeSet))[0]
                for p in paths]

    def _generalize(self, data, level_nodes, cells, cells_by_id):
        representatives = pd.DataFrame(
            columns=self._features)  # empty except for columns
        generalized = pd.DataFrame(data, columns=self._features,
                                   copy=True)  # original data
        mapping_to_cells = self._map_to_cells(generalized, level_nodes,
                                              cells_by_id)
        # iterate over cells (leaves in decision tree)
        for i in range(len(cells)):
            # This code just copies the representatives from the cells into another data structure
            # iterate over features
            for feature in self._features:
                # if feature has a representative value in the cell and should not be left untouched,
                # take the representative value
                if feature in cells[i]['representative'] and (
                        'untouched' not in cells[i]
                        or feature not in cells[i]['untouched']):
                    representatives.loc[
                        i, feature] = cells[i]['representative'][feature]
                # else, drop the feature (removes from representatives columns that do not have a
                # representative value or should remain untouched)
                elif feature in representatives.columns.tolist():
                    representatives = representatives.drop(feature, axis=1)

            # get the indexes of all records that map to this cell
            indexes = [
                j for j in range(len(mapping_to_cells))
                if mapping_to_cells[j]['id'] == cells[i]['id']
            ]
            # replaces the values in the representative columns with the representative values
            # (leaves others untouched)
            if not representatives.columns.empty:
                generalized.loc[
                    indexes,
                    representatives.columns] = representatives.loc[i].values

        return generalized.to_numpy()

    def _map_to_cells(self, samples, nodes, cells_by_id):
        mapping_to_cells = []
        for index, row in samples.iterrows():
            cell = self._find_sample_cells([row], nodes, cells_by_id)[0]
            mapping_to_cells.append(cell)
        return mapping_to_cells

    def _find_sample_cells(self, samples, nodes, cells_by_id):
        node_ids = self._find_sample_nodes(samples, nodes)
        return [cells_by_id[nodeId] for nodeId in node_ids]

    def _remove_feature_from_generalization(self, samples, nodes, labels,
                                            feature_data):
        feature = self._get_feature_to_remove(samples, nodes, labels,
                                              feature_data)
        if not feature:
            return None
        GeneralizeToRepresentative._remove_feature_from_cells(
            self.cells_, self.cells_by_id_, feature)
        return feature

    def _get_feature_to_remove(self, samples, nodes, labels, feature_data):
        # We want to remove features with low iLoss (NCP) and high accuracy gain
        # (after removing them)
        ranges = self.generalizations_['ranges']
        range_counts = self._find_range_count(samples, ranges)
        total = samples.size
        range_min = sys.float_info.max
        remove_feature = None

        for feature in ranges.keys():
            if feature not in self.generalizations_['untouched']:
                feature_ncp = self._calc_ncp_numeric(ranges[feature],
                                                     range_counts[feature],
                                                     feature_data[feature],
                                                     total)
                if feature_ncp > 0:
                    # divide by accuracy gain
                    new_cells = copy.deepcopy(self.cells_)
                    cells_by_id = copy.deepcopy(self.cells_by_id_)
                    GeneralizeToRepresentative._remove_feature_from_cells(
                        new_cells, cells_by_id, feature)
                    generalized = self._generalize(samples, nodes, new_cells,
                                                   cells_by_id)
                    accuracy = self.estimator.score(generalized, labels)
                    feature_ncp = feature_ncp / accuracy
                if feature_ncp < range_min:
                    range_min = feature_ncp
                    remove_feature = feature

        print('feature to remove: ' +
              (remove_feature if remove_feature else ''))
        return remove_feature

    def _calculate_generalizations(self):
        self.generalizations_ = {
            'ranges':
            GeneralizeToRepresentative._calculate_ranges(self.cells_),
            'untouched':
            GeneralizeToRepresentative._calculate_untouched(self.cells_)
        }

    def _find_range_count(self, samples, ranges):
        samples_df = pd.DataFrame(samples, columns=self._features)
        range_counts = {}
        last_value = None
        for r in ranges.keys():
            range_counts[r] = []
            # if empty list, all samples should be counted
            if not ranges[r]:
                range_counts[r].append(samples_df.shape[0])
            else:
                for value in ranges[r]:
                    range_counts[r].append(
                        len(samples_df.loc[samples_df[r] <= value]))
                    last_value = value
                range_counts[r].append(
                    len(samples_df.loc[samples_df[r] > last_value]))
        return range_counts

    def _calculate_ncp(self, samples, generalizations, feature_data):
        # suppressed features are already taken care of within _calc_ncp_numeric
        ranges = generalizations['ranges']
        range_counts = self._find_range_count(samples, ranges)
        total = samples.shape[0]
        total_ncp = 0
        total_features = len(generalizations['untouched'])
        for feature in ranges.keys():
            feature_ncp = GeneralizeToRepresentative._calc_ncp_numeric(
                ranges[feature], range_counts[feature], feature_data[feature],
                total)
            total_ncp = total_ncp + feature_ncp
            total_features += 1
        if total_features == 0:
            return 0
        return total_ncp / total_features

    @staticmethod
    def _calculate_ranges(cells):
        ranges = {}
        for cell in cells:
            for feature in [
                    key for key in cell['ranges'].keys()
                    if 'untouched' not in cell or key not in cell['untouched']
            ]:
                if feature not in ranges.keys():
                    ranges[feature] = []
                if cell['ranges'][feature]['start'] is not None:
                    ranges[feature].append(cell['ranges'][feature]['start'])
                if cell['ranges'][feature]['end'] is not None:
                    ranges[feature].append(cell['ranges'][feature]['end'])
        for feature in ranges.keys():
            ranges[feature] = list(set(ranges[feature]))
            ranges[feature].sort()
        return ranges

    @staticmethod
    def _calculate_untouched(cells):
        untouched_lists = [
            cell['untouched'] if 'untouched' in cell else [] for cell in cells
        ]
        untouched = set(untouched_lists[0])
        untouched = untouched.intersection(*untouched_lists)
        return list(untouched)

    @staticmethod
    def _calc_ncp_numeric(feature_range, range_count, feature_data, total):
        # if there are no ranges, the feature is suppressed and its iLoss is 1
        if not feature_range:
            return 1
        # range only contains the split values, need to add min and max value of feature
        # to enable computing sizes of all ranges
        new_range = [feature_data['min']
                     ] + feature_range + [feature_data['max']]
        range_sizes = [b - a for a, b in zip(new_range[:-1], new_range[1:])]
        normalized_range_sizes = [
            s * n / total for s, n in zip(range_sizes, range_count)
        ]
        average_range_size = sum(normalized_range_sizes) / len(
            normalized_range_sizes)
        return average_range_size / (feature_data['max'] - feature_data['min'])
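        # Illustrative arithmetic (hypothetical numbers, not from the library's docs):
        # with feature_range=[5], feature min/max of 0/10, range_count=[30, 70] and
        # total=100: new_range=[0, 5, 10], range_sizes=[5, 5],
        # normalized_range_sizes=[1.5, 3.5], average_range_size=2.5, and the
        # returned NCP is 2.5 / (10 - 0) = 0.25.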

    @staticmethod
    def _remove_feature_from_cells(cells, cells_by_id, feature):
        for cell in cells:
            if 'untouched' not in cell:
                cell['untouched'] = []
            if feature in cell['ranges'].keys():
                del cell['ranges'][feature]
            else:
                del cell['categories'][feature]
            cell['untouched'].append(feature)
            cells_by_id[cell['id']] = cell.copy()
Example 11
# ==============================
# output
# ===============================
# attributes
print(dt_gini.classes_)
print(dt_gini.n_classes_)
print(dt_gini.n_features_)
print(dt_gini.n_outputs_)
print(dt_gini.feature_importances_)
print(dt_gini.max_features_)
print(dt_gini.tree_)


# methods
preds = dt_gini.predict(X_test)
preds_proba = dt_gini.predict_proba(X_test)
preds_log_proba = dt_gini.predict_log_proba(X_test)

print(preds)
print(preds_proba)
print(preds_log_proba)

accuracy = dt_gini.score(X_test, y_test)
print(accuracy)

print(dt_gini.decision_path(X_train))
print(dt_gini.decision_path(X_test))


for i in range(n_nodes):
    if is_leaves[i]:
        print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
    else:
        print(
            "%snode=%s test node: go to node %s if X[:, %s] <= %ss else to "
            "node %s." % (node_depth[i] * "\t", i, children_left[i], feature[i], threshold[i], children_right[i])
        )
print()

# First let's retrieve the decision path of each sample. The decision_path
# method allows us to retrieve the node indicator functions. A non-zero element
# of the indicator matrix at position (i, j) indicates that sample i goes
# through node j.

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also have the leaf ids reached by each sample.

leave_id = estimator.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]]

print("Rules used to predict sample %s: " % sample_id)
for node_id in node_index:
    if leave_id[sample_id] != node_id:
        continue
Example 13
def main():
    estimator = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
    estimator.fit(X_train, y_train)

    # Properties from the estimator
    n_nodes = estimator.tree_.node_count  # including decision nodes
    children_left = estimator.tree_.children_left  # id of the left child of the node
    # id of the right child of the node
    children_right = estimator.tree_.children_right
    feature = estimator.tree_.feature
    threshold = estimator.tree_.threshold

    # traverse the tree
    node_depth = np.zeros(n_nodes, dtype=np.int64)
    is_leaves = np.zeros(n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # If we have a test node
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    print("The binary tree structure has {} nodes and "
          "has the following tree structure:".format(n_nodes))
    for i in range(n_nodes):
        if is_leaves[i]:
            print("{}node={} leaf node.".format(node_depth[i] * "\t", i))
        else:
            print(
                "{}node={} test node: go to node {} if X[:, {}] <= {:.2f} else to "
                "node {}.".format(
                    node_depth[i] * "\t",
                    i,
                    children_left[i],
                    feature[i],
                    threshold[i],
                    children_right[i],
                ))

    node_indicator = estimator.decision_path(X_test)

    # Similarly, we can also have the leaf ids reached by each sample.
    leave_id = estimator.apply(X_test)

    # Now, it's possible to get the tests that were used to predict a sample or
    # a group of samples. First, let's make it for the sample.
    sample_id = 0
    node_index = node_indicator.indices[
        node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]

    # Each row holds the prediction value of each node
    output_value = estimator.tree_.value
    print(output_value)

    print('Rules used to predict sample {} with feature: {}: '.format(
        sample_id, ', '.join([str(feature) for feature in X_test[0]])))
    for node_id in node_index:
        if leave_id[sample_id] == node_id:
            continue

        threshold_sign = ">"
        if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
            threshold_sign = "<="

        print(
            "decision id node {} : (X_test[{}, {}] = {:.2f} {} {:.2f})".format(
                node_id, sample_id, feature[node_id], X_test[sample_id,
                                                             feature[node_id]],
                threshold_sign, threshold[node_id]))

    # Plotting graph
    dot = export_graphviz(estimator,
                          feature_names=feature_names,
                          class_names=class_names,
                          rounded=True,
                          proportion=False,
                          precision=2,
                          filled=True)
    graph = Source(dot)
    graph.view()

    print('debug line')
Example 14
                                 max_features='log2',
                                 max_leaf_nodes=None,
                                 min_samples_leaf=5,
                                 min_samples_split=2,
                                 min_weight_fraction_leaf=0.0,
                                 presort=False,
                                 random_state=1000,
                                 splitter='best')
    # DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth=3, min_samples_leaf=5)
    clf = clf.fit(X, Y)
    # nt
    y_pred = clf.predict(test)
    print('Prediction : ', y_pred)
    print('Actual value : ', y_true)
    print('Testing Validation Accuracy : ', accuracy_score(y_true, y_pred))
    print('Decision Path:', clf.decision_path(X, check_input=True))
    dotfile = open("dtree2.dot", 'w')
    tree.export_graphviz(clf, out_file=dotfile)
    dotfile.close()
    # Dump the trained decision tree classifier with Pickle
    model_name = 'dtc.pkl'
    # Open the file to save as pkl file
    model_name_pkl = open(model_name, 'wb')
    pickle.dump(clf, model_name_pkl)
    # Close the pickle instances
    model_name_pkl.close()
elif (args["algotype"] == "dtr"):
    clf = tree.DecisionTreeRegressor()
    clf = clf.fit(X, Y)
    y_pred = clf.predict(test)
    print('Prediction : ', y_pred)
Example 15
a = X_test[:1]

print("a:")
print(a)

#print('sk_pred: {}'.format(clf.predict(a)))
#print('true: {}'.format(y_test[:3]))

# shows the end point of the tree traversed by a sample
print("Returns the index of the leaf that each sample is predicted as:")
index_of_leaf = clf.apply(a)
print(index_of_leaf)

# decision path shows the nodes of the tree that were traversed by the sample.
print("decision path:")
d_path = clf.decision_path(a)
print(d_path)

print("nodes in the decision path:")
n_d_path = np.unique(np.sort(d_path.indices))
print(n_d_path)

print("probability of each class:")
print(clf.predict_proba(a))

print("Feature importances:")
feature_importances = clf.feature_importances_
print(feature_importances)

# accuracy - number of instances correctly classified
acsc = accuracy_score(y_test, y_pred)
# We can also retrieve the decision path of samples of interest. The
# ``decision_path`` method outputs an indicator matrix that allows us to
# retrieve the nodes the samples of interest traverse through. A non zero
# element in the indicator matrix at position ``(i, j)`` indicates that
# the sample ``i`` goes through the node ``j``. Or, for one sample ``i``, the
# positions of the non zero elements in row ``i`` of the indicator matrix
# designate the ids of the nodes that sample goes through.
#
# The leaf ids reached by samples of interest can be obtained with the
# ``apply`` method. This returns an array of the node ids of the leaves
# reached by each sample of interest. Using the leaf ids and the
# ``decision_path`` we can obtain the splitting conditions that were used to
# predict a sample or a group of samples. First, let's do it for one sample.
# Note that ``node_indicator`` is a sparse matrix.

node_indicator = clf.decision_path(X_test)
leaf_id = clf.apply(X_test)

sample_id = 0
# obtain ids of the nodes `sample_id` goes through, i.e., row `sample_id`
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]
]

print("Rules used to predict sample {id}:\n".format(id=sample_id))
for node_id in node_index:
    # continue to the next node if it is a leaf node
    if leaf_id[sample_id] == node_id:
        continue

    # check if value of the split feature for sample 0 is below threshold
Example 17
def get_python(X,
               y,
               cut=jieba.cut,
               n=100,
               min_pro=0.75,
               func_name='function',
               max_depth=5,
               min_samples_leaf=50,
               max_leaf_nodes=20):
    '''

    :param X: training texts
    :param y: training labels
    :param cut: tokenizer
    :param n: number of keywords to return
    :param min_pro: minimum predicted probability for writing a path into the code
    :param func_name: name of the generated function
    :param max_depth: decision tree parameter -> maximum depth
    :param min_samples_leaf: decision tree parameter -> minimum samples per leaf
    :param max_leaf_nodes: decision tree parameter -> maximum number of leaves
    :return:
    '''

    X = [get_word(i, cut) for i in X]
    vectorizer = CountVectorizer(max_features=3000)  # one-hot encoding
    X = vectorizer.fit_transform(X).toarray()
    print('Tokenization finished; now computing relevance...')

    PYTNON = '''def %s(s:str):
''' % (func_name)
    clf = DecisionTreeClassifier(max_depth=max_depth,
                                 min_samples_leaf=min_samples_leaf,
                                 max_leaf_nodes=max_leaf_nodes)
    clf.fit(X, y)
    print('Training finished; now searching for rules...')
    dot_data = StringIO()
    tree.export_graphviz(clf,
                         out_file=dot_data,
                         feature_names=vectorizer.get_feature_names(),
                         filled=True,
                         rounded=True,
                         special_characters=True)

    tree_info = dot_data.getvalue()

    jd = []
    lj = []
    node_info = []
    for i in tree_info.split('\n'):
        if re.search(r'\d+ \[label=<.*?>', i):
            jd.append(re.search(r'(\d+) \[label=<(.*?)<', i).group(1, 2))
        if re.search(r'\d+.*?value = ', i):
            node_info.append(
                re.search(r'(\d+).*?value = (\[.*?\])', i).group(1, 2))
        if re.search(r'\d+ -> \d+', i):
            lj.append(re.search(r'(\d+) -> (\d+)', i).group(1, 2))

    root = dict(jd)
    node_info = dict(node_info)

    node = []
    dlj = []
    for i in lj:
        if int(i[0]) in node:
            dlj.append((int(i[0]), int(i[1]), True))
        else:
            dlj.append((int(i[0]), int(i[1]), False))
            node.append(int(i[0]))

    all_writed = []
    for i in X:
        if clf.apply([i])[0] in all_writed:
            continue
        else:
            all_writed.append(clf.apply([i])[0])
        pytnon = ''
        tab = '\t'
        if clf.predict_proba([i])[0][1] > min_pro:
            pytnon += '\t# node info %s\n' % (node_info[str(clf.apply([i])[0])])
            last_node = 0
            for inx, node in enumerate(clf.decision_path([i]).toarray()[0]):
                if node and inx:
                    for i in dlj:
                        if i[0] == last_node and i[1] == inx:
                            if i[2]:
                                pytnon += tab
                                s = root[str(i[0])].split(' &le; ')[0]
                                pytnon += "if '%s' in s:\n" % (s)
                                tab += '\t'
                            else:
                                pytnon += tab
                                s = root[str(i[0])].split(' &le; ')[0]
                                pytnon += "if '%s' not in s:\n" % (s)
                                tab += '\t'
                    last_node = inx

            pytnon += tab
            pytnon += "return 1\n"
            PYTNON += pytnon

    PYTNON += '\treturn 0\n'
    score = clf.feature_importances_
    a = score
    word = vectorizer.get_feature_names()
    x = heapq.nlargest(n, range(len(a)), a.take)
    res = []
    for w in x:
        res.append([word[w], a[w]])
    return PYTNON, res
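
The function above recovers rules by parsing the Graphviz dot output; for comparison, scikit-learn also ships sklearn.tree.export_text, which prints the split structure directly (minimal sketch on toy data, separate from the original function):

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_text

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
print(export_text(clf, feature_names=["sl", "sw", "pl", "pw"]))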
Example 18
    def Decison(self, q1, q2, q3, q4, q5, Prezzo):
        print(F"Choices: {q1} - {q2} - {q3} - {q4} - {q5}")
        DELETE = -1
        NOTDELETE = 9
        carsData = pd.read_csv("Dataset/cars.csv")
        # print(carsData.info())
        # print(carsData.head())

        # Drop the columns that are not useful from the dataset
        carsData = carsData.drop('cilindri', axis=1)
        carsData = carsData.drop('larghezza', axis=1)
        carsData = carsData.drop('mpgcitta', axis=1)
        carsData = carsData.drop('symboling', axis=1)
        carsData = carsData.drop('aspirazione', axis=1)

        # Drop the cars above the average for the relevant characteristics and recompute it
        print(F"Starting examples (!Wne): {carsData.shape}")
        mediaAutostrada = carsData['mpgautostrada'].mean()
        carsData = carsData.drop(carsData[(carsData.mpgautostrada < 28)].index)

        minimoCilindrata = 87
        carsData = carsData.drop(carsData[(carsData.cilindrata < minimoCilindrata)].index)

        mediaAutostrada = carsData['mpgautostrada'].mean()
        mediaCavalli = carsData['cavalli'].mean()
        mediaCilindrata = carsData['cilindrata'].mean()
        mediaPeso = carsData['peso'].mean()
        mediaLunghezza = carsData['lunghezza'].mean()
        mediaAltezza = carsData['altezza'].mean()

        # Set the parameters chosen in the GUI and apply why-not encoding where necessary
        if q2 == "Three":
            porte = 2
            MINLUNGHEZZA = 170
            carsData = carsData.drop(carsData[(carsData.lunghezza > MINLUNGHEZZA)].index)
        else:
            porte = 4

        if q3 == True:
            family = NOTDELETE
            MINALTEZZA = 55.5
            MINLUNGHEZZA = 160
            carsData = carsData.drop(carsData[(carsData.lunghezza < MINLUNGHEZZA)].index)
            carsData = carsData.drop(carsData[(carsData.altezza < MINALTEZZA)].index)

            mediaLunghezza = carsData['lunghezza'].mean()
            mediaAltezza = carsData['altezza'].mean()

        else:
            family = DELETE
            carsData = carsData.drop(columns=['altezza', 'lunghezza'], axis=1)

        '''
        A.I.
        To work on datasets of the same size for x and y, we must assign carsData to data_tree after
        the last possible drop (modification) of the original dataset.
        It only makes sense to compute the average price on the properly filtered dataset.
        '''
        if not isinstance(Prezzo, int):
            Prezzo = carsData['prezzo'].mean()
        data_tree = carsData

        if q4 == "ANT" and ('rwd' in data_tree['trazione'].values) and ('fwd' in data_tree['trazione'].values):
            trazione = [1, 0]
            data_tree = pd.get_dummies(data_tree, columns=["trazione"])
        elif q4 == "POST" and ('rwd' in data_tree['trazione'].values) and ('fwd' in data_tree['trazione'].values):
            trazione = [0, 1]
            data_tree = pd.get_dummies(data_tree, columns=["trazione"])
        else:
            trazione = DELETE
            data_tree = data_tree.drop('trazione', axis=1)

        if q5 == "BENZ" and ('gas' in data_tree['carburante'].values) and (
                'diesel' in data_tree['carburante'].values):

            carburante = [0, 1]
            data_tree = pd.get_dummies(data_tree, columns=["carburante"])

        elif q5 == "DIS" and ('gas' in data_tree['carburante'].values) and (
                'diesel' in data_tree['carburante'].values):

            carburante = [1, 0]
            data_tree = pd.get_dummies(data_tree, columns=["carburante"])
        else:
            carburante = DELETE
            data_tree = data_tree.drop('carburante', axis=1)

        print(F"Esempi elaborati (!Wne): {data_tree.shape}")

        # Build the decision tree
        from sklearn.tree import DecisionTreeClassifier
        from IPython.display import Image
        from sklearn.tree import export_graphviz
        import os
        from subprocess import call
        from matplotlib import pyplot as plt

        x = data_tree.drop(['marca'], axis=1)
        y = carsData['marca']

        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

        print(F"Esempi su chi abbiamo fatto train e test: {x.shape}")

        tree = DecisionTreeClassifier(criterion="gini", max_depth=6)
        tree.fit(x_train, y_train)
        y_pred_train = tree.predict(x_train)
        y_pred = tree.predict(x_test)

        plt.scatter(y_train, y_pred_train)
        plt.xlabel("True value")
        plt.ylabel("Prediction")
        plt.show()

        accuracy_train = accuracy_score(y_train, y_pred_train)
        accuracy_test = accuracy_score(y_test, y_pred)

        '''
        Overfitting
        '''
        print("ACCURACY: TRAIN=%.4f TEST=%.4f" % (accuracy_train, accuracy_test))

        value_list = [porte]
        if family == NOTDELETE:
            value_list = value_list + [mediaLunghezza, mediaAltezza]

        value_list = value_list + [mediaPeso, mediaCilindrata,
                                   mediaCavalli, mediaAutostrada]

        value_list = value_list + [Prezzo]

        if trazione != DELETE:
            value_list = value_list + trazione
        if carburante != DELETE:
            value_list = value_list + carburante

        print(data_tree.columns.tolist())
        print(value_list)

        predizione = tree.predict([value_list])
        path = tree.decision_path([value_list])

        print(F"Predizione: {predizione[0]}")

        os.environ['PATH'] = os.environ['PATH'] + ';' + os.environ['CONDA_PREFIX'] + r"\Library\bin\graphviz"
        export_graphviz(tree, out_file="treetrip.dot", feature_names=None, rounded=True, precision=2,
                        filled=True, class_names=True)
        call(['dot', '-Tpng', 'treetrip.dot', '-o', 'treetrip.png'])
        Image(filename='treetrip.png')

        print(path)

        return predizione[0]
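
A minimal sketch, not part of the example above: value_list is assembled by hand in the
same order as data_tree's columns, which is fragile when columns are dropped or added
conditionally. Assuming the x, tree and value_list names from the method above (and that
value_list follows the column order of x), the query can be passed as a one-row DataFrame
so scikit-learn checks the feature names against those seen during fit:

import pandas as pd

# One-row query whose column names are taken from the training matrix x.
query = pd.DataFrame([value_list], columns=x.columns)
predizione = tree.predict(query)
path = tree.decision_path(query)
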
Esempio n. 19
0
def test_decision_path_hardcoded():
    X = iris.data
    y = iris.target
    est = DecisionTreeClassifier(random_state=0, max_depth=1).fit(X, y)
    node_indicator = est.decision_path(X[:2]).toarray()
    assert_array_equal(node_indicator, [[1, 1, 0], [1, 0, 1]])
Esempio n. 20
0
def decisiontree(parsetrees, sent, urlprm):
    """Create a decision tree to select among n trees."""
    # The class labels are the n-best trees 0..n
    # The attributes are the labeled spans in the trees; they split the n-best
    # trees into two sets with and without that span.
    spans = {}
    if len(parsetrees) <= 1:
        return '', 0, None
    for n, (_prob, tree, _, _) in enumerate(parsetrees):
        for span in getspans(tree):
            # simplest strategy: store presence of span as binary feature
            # perhaps better: use weight from tree probability
            spans.setdefault(span, set()).add(n)

    # create decision tree with scikit-learn
    features = list(spans)
    featurenames = [
        '[%s %s]' % (label, ' '.join(sent[n] for n in leaves))
        for label, leaves in features
    ]
    data = np.array([[n in spans[span] for span in features]
                     for n in range(len(parsetrees))],
                    dtype=bool)
    estimator = DecisionTreeClassifier(random_state=0)
    estimator.fit(data,
                  range(len(parsetrees)),
                  sample_weight=[prob for prob, _, _, _ in parsetrees])
    path = estimator.decision_path(data)

    def rec(tree, n=0, depth=0):
        """Recursively produce a string representation of a decision tree."""
        if tree.children_left[n] == tree.children_right[n]:
            x = tree.value[n].nonzero()[1][0]
            prob, _tree, _treestr, _fragments = parsetrees[x]
            thistree = (
                '%(n)d. [%(prob)s] '
                '<a href="/annotate/accept?%(urlprm)s">accept this tree</a>; '
                '<a href="/annotate/edit?%(urlprm)s">edit</a>; '
                '<a href="/annotate/deriv?%(urlprm)s">derivation</a>\n\n' %
                dict(n=x + 1,
                     prob=probstr(prob),
                     urlprm=urlencode(dict(urlprm, n=x + 1, dec=depth))))
            return ('<span id="d%d" style="display: none; ">%stree %d:\n'
                    '%s</span>' % (n, depth * '\t', x + 1, thistree))
        left = tree.children_left[n]
        right = tree.children_right[n]
        return ('<span id=d%(n)d style="display: %(display)s; ">'
                '%(indent)s%(constituent)s '
                '<a href="javascript: showhide(\'d%(right)s\', \'d%(left)s\', '
                '\'dd%(exright)s\', \'%(numtrees)s\'); ">'
                'good constituent</a> '
                '<a href="javascript: showhide(\'d%(left)s\', \'d%(right)s\', '
                '\'dd%(exleft)s\', \'%(numtrees)s\'); ">'
                'bad constituent</a> '
                '%(subtree1)s%(subtree2)s</span>' % dict(
                    n=n,
                    display='block' if n == 0 else 'none',
                    indent=depth * 4 * ' ',
                    constituent=featurenames[tree.feature[n]],
                    left=left,
                    right=right,
                    exleft=path[:, left].nonzero()[0][0],
                    exright=path[:, right].nonzero()[0][0],
                    numtrees=len(parsetrees),
                    subtree1=rec(tree, left, depth + 1),
                    subtree2=rec(tree, right, depth + 1),
                ))

    nodes = rec(estimator.tree_)
    leaves = []
    seen = set()
    for n in range(estimator.tree_.node_count):
        x = estimator.tree_.value[n].nonzero()[1][0]
        if x in seen:
            continue
        seen.add(x)
        _prob, xtree, _treestr, _fragments = parsetrees[x]
        thistree = DrawTree(xtree, sent).text(unicodelines=True,
                                              html=True,
                                              funcsep='-',
                                              morphsep='/',
                                              nodeprops='t%d' % (x + 1))
        leaves.append('<span id="dd%d" style="display: none; ">%s</span>' %
                      (x, thistree))
    return nodes + ''.join(leaves), estimator.tree_.max_depth, path
Esempio n. 21
0
    def Decison(self, q1, q2, q3, q4, q5, Prezzo):
        print(F"Choices: {q1} - {q2} - {q3} - {q4} - {q5}")
        DELETE = -1
        Lunghezza = 160
        ConsumoCitta = 24
        carsData = pd.read_csv("Dataset/cars.csv")
        # print(carsData.info())
        # print(carsData.head())

        # Drop the columns that are not useful from the dataset
        carsData = carsData.drop('cilindri', axis=1)
        carsData = carsData.drop('altezza', axis=1)
        carsData = carsData.drop('mpgautostrada', axis=1)
        carsData = carsData.drop('peso', axis=1)
        carsData = carsData.drop('symboling', axis=1)

        # Filter the cars on the relevant features, then recompute the means
        print(F"Starting examples (!Wne): {carsData.shape}")

        carsData = carsData.drop(
            carsData[(carsData.mpgcitta < ConsumoCitta)].index)
        if q3 == True:  # not an economy car
            carsData = carsData.drop(
                carsData[(carsData.lunghezza < Lunghezza)].index)
        else:
            carsData = carsData.drop(
                carsData[(carsData.lunghezza > Lunghezza)].index)

        mediaCitta = carsData['mpgcitta'].mean()
        mediaLunghezza = carsData['lunghezza'].mean()
        mediaLarghezza = carsData['larghezza'].mean()
        mediaCilindrata = carsData['cilindrata'].mean()
        mediaCavalli = carsData['cavalli'].mean()

        if not isinstance(Prezzo, int):
            Prezzo = carsData['prezzo'].mean()

        data_tree = carsData

        # Set the parameters chosen in the GUI and apply one-hot encoding where necessary
        if q2 == "Three":
            porte = 2
        else:
            porte = 4

        if q3 == True:
            AspirazioneMotore = [0, 1]
            data_tree = pd.get_dummies(data_tree,
                                       columns=["aspirazione"])  # one-hot encoding
        else:
            AspirazioneMotore = [1, 0]
            data_tree = pd.get_dummies(data_tree, columns=["aspirazione"])

        if q4 == "ANT" and ('rwd' in data_tree['trazione'].values) and (
                'fwd' in data_tree['trazione'].values):

            trazione = [1, 0]
            data_tree = pd.get_dummies(data_tree, columns=["trazione"])
        elif q4 == "POST" and ('rwd' in data_tree['trazione'].values) and (
                'fwd' in data_tree['trazione'].values):

            trazione = [0, 1]
            data_tree = pd.get_dummies(data_tree, columns=["trazione"])

        elif q3 == True:
            trazione = [1, 0]
            data_tree = pd.get_dummies(data_tree, columns=["trazione"])
        else:
            trazione = DELETE
            data_tree = data_tree.drop('trazione', axis=1)

        if q5 == "BENZ" and ('gas' in data_tree['carburante'].values) and (
                'diesel' in data_tree['carburante'].values):

            carburante = [0, 1]
            data_tree = pd.get_dummies(data_tree, columns=["carburante"])

        elif q5 == "DIS" and ('gas' in data_tree['carburante'].values) and (
                'diesel' in data_tree['carburante'].values):

            carburante = [1, 0]
            data_tree = pd.get_dummies(data_tree, columns=["carburante"])
        else:
            carburante = DELETE
            data_tree = data_tree.drop('carburante', axis=1)

        # Build the decision tree
        from sklearn.tree import DecisionTreeClassifier
        from IPython.display import Image
        from sklearn.tree import export_graphviz
        import os
        from subprocess import call
        from matplotlib import pyplot as plt

        x = data_tree.drop(['marca'], axis=1)
        y = carsData['marca']

        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.3,
                                                            random_state=0)

        print(F"Esempi su chi abbiamo fatto train e test: {x.shape}")

        tree = DecisionTreeClassifier(criterion="gini", max_depth=7)
        tree.fit(x_train, y_train)
        y_pred_train = tree.predict(x_train)
        y_pred = tree.predict(x_test)

        accuracy_train = accuracy_score(y_train, y_pred_train)
        accuracy_test = accuracy_score(y_test, y_pred)

        print("ACCURACY: TRAIN=%.4f TEST=%.4f" %
              (accuracy_train, accuracy_test))

        plt.scatter(y_train, y_pred_train)
        plt.xlabel("True value")
        plt.ylabel("Prediction")
        plt.show()

        value_list = [
            porte, mediaLunghezza, mediaLarghezza, mediaCilindrata,
            mediaCavalli, mediaCitta
        ]
        if trazione != DELETE:
            value_list = value_list + trazione
        if carburante != DELETE:
            value_list = value_list + carburante
        value_list = value_list + AspirazioneMotore
        value_list = value_list + [Prezzo]

        print(data_tree.columns.tolist())
        print(value_list)

        predizione = tree.predict([value_list])
        path = tree.decision_path([value_list])

        print(F"Predizione: {predizione[0]}")

        os.environ['PATH'] = os.environ['PATH'] + ';' + os.environ[
            'CONDA_PREFIX'] + r"\Library\bin\graphviz"
        export_graphviz(tree,
                        out_file="treecity.dot",
                        feature_names=None,
                        rounded=True,
                        precision=2,
                        filled=True,
                        class_names=True)
        call(['dot', '-Tpng', 'treecity.dot', '-o', 'treecity.png'])
        Image(filename='treecity.png')
        print(path)
        return predizione[0]
Esempio n. 22
0
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)

for node in graph.get_node_list():
    if node.get_attributes().get('label') is None:
        continue
    if 'samples = ' in node.get_attributes()['label']:
        labels = node.get_attributes()['label'].split('<br/>')
        for i, label in enumerate(labels):
            if label.startswith('samples = '):
                labels[i] = 'samples = 0'
        node.set('label', '<br/>'.join(labels))
        node.set_fillcolor('white')

samples = (x[:1])
decision_paths = my_tree.decision_path(samples)

for decision_path in decision_paths:
    for n, node_value in enumerate(decision_path.toarray()[0]):
        if node_value == 0:
            continue
        node = graph.get_node(str(n))[0]
        node.set_fillcolor('green')
        labels = node.get_attributes()['label'].split('<br/>')
        for i, label in enumerate(labels):
            if label.startswith('samples = '):
                labels[i] = 'samples = {}'.format(int(label.split('=')[1]) + 1)

        node.set('label', '<br/>'.join(labels))

filename = 'tree.png'
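
The fragment above stops right after assigning filename; assuming the intent is to render
the recoloured graph to disk, pydotplus exposes write_png on the graph object. A one-line
completion sketched under that assumption:

# Hypothetical continuation: save the graph with the highlighted decision path.
graph.write_png(filename)
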
Esempio n. 23
0
    fpr, tpr, thresholds = roc_curve(y_test.T, y_hat.T)

    y_hat_int = np.rint(y_hat).astype(int)
    tree = DecisionTreeClassifier()

    tree.fit(X_test.T, y_hat_int.T)
    apl = average_path_length(tree, X_test.T)

    apl_count = 0
    apl_test = 0
    point_count = 0
    for i in range(X_test.shape[1]):
        test = tree.predict(X_test.T[i, :].reshape([1, 14]))
        print("test= " + str(test))
        dense_matrix = tree.decision_path(X_test.T[i, :].reshape(
            [1, 14])).todense()
        print(dense_matrix)
        if (test == 0):
            point_count += 1
            dense_matrix = tree.decision_path(X_test.T[i, :].reshape(
                [1, 14])).todense()
            print(dense_matrix)
            count = 0
            for k in range(dense_matrix.shape[1]):
                if (dense_matrix[0, k] == 1):
                    count += 1
            print(count)
            apl_count = apl_count + count

    average_apl = float(apl_count) / point_count
    print(float(apl_test) / point_count)
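
A compact alternative to the densifying loop above, sketched with the same tree and X_test
names: decision_path returns a sparse node-indicator matrix with one row per sample, so the
per-sample path length is just the row sum.

import numpy as np

# Path length (number of visited nodes) for every sample in one call.
path_lengths = np.asarray(tree.decision_path(X_test.T).sum(axis=1)).ravel()
mask = tree.predict(X_test.T) == 0          # samples predicted as class 0, as above
average_apl_alt = path_lengths[mask].mean() if mask.any() else 0.0
print(average_apl_alt)
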
              "node %s." % (
                  node_depth[i] * "\t",
                  i,
                  children_left[i],
                  feature[i],
                  threshold[i],
                  children_right[i],
              ))
print()

# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also have the leaves ids reached by each sample.

leave_id = estimator.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    if leave_id[sample_id] != node_id:
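
The snippet is cut off inside the loop; in the upstream scikit-learn example this loop goes
on to print, for every internal node on the path, which side of the threshold the sample
fell on. A sketch along those lines, assuming X_test is a NumPy array and the estimator,
leave_id, node_index and sample_id names from above:

feature = estimator.tree_.feature
threshold = estimator.tree_.threshold

for node_id in node_index:
    if leave_id[sample_id] == node_id:
        continue  # skip the leaf that terminates the path
    sign = "<=" if X_test[sample_id, feature[node_id]] <= threshold[node_id] else ">"
    print("decision node %d : X[%d, %d] = %.3f %s %.3f"
          % (node_id, sample_id, feature[node_id],
             X_test[sample_id, feature[node_id]], sign, threshold[node_id]))
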
Esempio n. 25
0
class DTChoice:
    """Choice maker based on sklearn decision trees

    Parameters
    ----------

    train_set: A list of public databases

    mfs: A callable object for computing metafeatures on databases.  The
    returned metafeatures must be a dictionary object mapping metafeature names
    to their values. The mfs object must have a sensitivities attribute, which is
    a dictionary mapping metafeature names to their sensitivities.

    algs: A dictionary mapping names to algorithms. Each algorithm must implement
    a run method, which executes the algorithm on a database, and an error
    method, which computes the algorithm's error on a database.
    
    C: Value of C used to train the special regret-based decision tree

    trans: feature transformations to use. Best to be kept as 'default'
    """

    def __init__(self, train_set, mfs, algs, C=0, trans='default'):
        self.metafeatures = mfs
        self.algs = algs
        self.C = C
        usage = np.array(list(mfs.sensitivities.values()))
        usage[usage > 0] = 1
        self.is_used = usage
        
        if len(train_set) == 2:
            self.X = train_set[0]
            self.regrets = train_set[1]
        else:
            regrets = []
            X = []

            for t in tqdm(train_set):
                X.append(mfs(t))
                regrets.append({name: alg.error(t) for name, alg in
                    algs.items()})

            self.X = pd.DataFrame(X)
            self.regrets = pd.DataFrame(regrets)

        if trans == 'default':
            self.trans = MetaFeatureHelper.get_all_trans(self.X.shape[1])
        else:
            self.trans = np.identity(len(mfs))
        log_X = np.log(np.maximum(1e-8, self.X))
        self.T = pd.DataFrame([t(log_X) for t in
            self.trans]).reset_index(drop=True).T.reset_index(drop=True)

        self.y = self.regrets.idxmin(axis=1)
        self.model = DecisionTreeClassifier()
        self.retrain_model()
    
    @classmethod
    def from_dataframes(cls, mfs_array, regrets, mfs, algs, C=0, trans='default'):
        return cls((mfs_array, regrets), mfs, algs, C=C, trans=trans)

    #Change metafeatures
    def update_metas(self, train_set, mfs):
        self.X = pd.DataFrame([mfs(t) for t in train_set])
        self.T = pd.DataFrame([t(self.X) for t in
            self.trans]).reset_index(drop=True).T.reset_index(drop=True)
        usage = np.array(list(mfs.sensitivities.values()))
        usage[usage > 0] = 1
        self.is_used = usage
        self.retrain_model()

    #Helper method
    def retrain_model(self):
        self.model.fit(self.T, self.y, self.regrets, self.C)

    #Return the label of the best algorithm.
    def get_best_alg(self, data, budget):
        sens = self.metafeatures.sensitivities
        nnz = np.count_nonzero(self.is_used)
        feature_budget = budget / nnz
        X = self.metafeatures(data)
        #noisy_X = pd.DataFrame([{name: value + np.random.laplace(0, sens[name]/
        #                             feature_budget)
        #                         for name, value in self.metafeatures(data).items()}])
        noisy_X = pd.DataFrame(self.metafeatures(data), index=[0]) \
                + pd.DataFrame(self.metafeatures.sensitivities, index=[0]) \
                              .apply(lambda x: np.random.laplace(0,
                                  x/feature_budget))
        log_noisy_X = np.log(np.maximum(1e-8, noisy_X))
        noisy_T = pd.DataFrame([t(log_noisy_X) for t in
            self.trans]).reset_index(drop=True).T

        X = noisy_T
        S = self.X.shape[1]
        used = np.zeros(S)
        node_counts = self.model.decision_path(X).data-1 
        U = np.unique(node_counts[:-1])
        used[U[U < S]] = 1
        U = U[U >= S] - S
        used += np.any([self.trans[i].coefs for i in U], axis=0)
        used = used > 0
        nfeature_used = self.is_used.dot(used)
        alg = self.model.predict(X)[0]
        return alg, nfeature_used * feature_budget

    #Choose and run the best algorithm in a differentially private (DP) way
    def choose(self, data, ratio=0.3):
        budget = data.epsilon*ratio
        tot_eps = data.epsilon
        data.epsilon -= budget
        (best, used) = self.get_best_alg(data, budget)
        data.epsilon = tot_eps - used
        return self.algs[best]

    def get_errors(self, data, ratio=0.3):
        #data = copy.copy(data)
        budget = data.epsilon*ratio
        errors = pd.DataFrame([{name: alg.error(data)
                                for name, alg in self.algs.items()}])

        (best, used) = self.get_best_alg(data, budget)
        best_alg = self.algs[best]
        data.epsilon = data.epsilon - used
        R = best_alg.error(data)
        errors['cm'] = R
        return errors 

    def get_approximate_regret(self, return_std=False, test_ratio=0.3):
        """
        Splits data into training and test and returns average regrets on the
        test split for each algorithm and for this DTChoice object.

        The DTChoice regret is approximate (and an underestimate) for two 
        reasons. Let A = ratio*epsilon and B = (1-ratio)*epsilon.

        First, we don't add Laplace(A) noise to the metafeatures when we 
        predict on them.

        Second, the algorithm we choose isn't run with B budget---it's run with
        epsilon budget instead.

        """
        X_train, X_test, y_train, y_test = train_test_split(self.X,
                self.regrets, test_size=test_ratio)
        model = DecisionTreeClassifier()
        model.fit(X_train, y_train.idxmin(axis=1))
        algs = model.predict(X_test)
        perfs = y_test.lookup(y_test.index, algs)
        R = np.concatenate((np.array(y_test), perfs[:, None]), axis=1)
        R = R - np.min(R, axis=1)[:, None]
        if(return_std):
            return (R.mean(axis=0), R.std(axis=0))
        else:
            return R.mean(axis=0)

    def print_tree(self, of=None):
        dot_data = export_graphviz(self.model, out_file=of, filled=True,
                rounded=True)
        graph = graphviz.Source(dot_data)
        return graph

    def print_arith_coef(self, idx):
        coefs = self.trans[idx].coefs
        L = list(self.metafeatures.sensitivities.keys())
        top = []
        bot = []
        for i in range(len(L)):
            if coefs[i] == 1:
                top.append(L[i])
            elif coefs[i] == -1:
                bot.append(L[i])
        return '*'.join(top) + ' / ' + '*'.join(bot)
Esempio n. 26
0
#               "node %s."
#               % (node_depth[i] * "\t",
#                  i,
#                  children_left[i],
#                  attribute_dict[str(feature[i])],
#                  threshold[i],
#                  children_right[i],
#                  ))
# print()

# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

node_indicator = estimator.decision_path(X_validation)

# Similarly, we can also have the leaves ids reached by each sample.

leave_id = estimator.apply(X_validation)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    if leave_id[sample_id] != node_id:
Esempio n. 27
0
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

# 1. Dataset
iris_data = load_iris()

# 2. Split the data (training data and test data)
x_train, x_test, y_train, y_test = train_test_split(iris_data.data,
                                                    iris_data.target,
                                                    test_size=0.2)

# 3. Create the model => decision tree
model = DecisionTreeClassifier(random_state=42)

# 4. Train on the data: clf.fit(2-D X, 1-D y)
model.fit(x_train, y_train)

# Information about the DecisionTreeClassifier class
# print(help(DecisionTreeClassifier))
# Hyperparameters of DecisionTreeClassifier
# max_depth: maximum tree depth, max_features: maximum number of features,
# min_samples_split: minimum number of samples required to split a node
print(np.round(model.feature_importances_, 2))
# [0.01 0.05 0.55 0.4] = [sepal length, sepal width, petal length, petal width]

# Additional methods
print(model.get_depth())
print(model.get_n_leaves())
print(model.get_params())
print(model.decision_path(x_train))
print(model.score(x_test, y_test))  # same functionality as accuracy_score
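
The comment above names max_depth, max_features and min_samples_split; a short sketch
fitting the same iris split with those hyperparameters set explicitly (note the
scikit-learn spelling is min_samples_split):

# Same split as above, hyperparameters passed explicitly.
tuned = DecisionTreeClassifier(max_depth=3,
                               max_features=2,
                               min_samples_split=4,
                               random_state=42)
tuned.fit(x_train, y_train)
print(tuned.get_depth(), tuned.score(x_test, y_test))
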
Esempio n. 28
0
    #   - threshold, threshold value at the node
    #

    # Using those arrays, we can parse the tree structure:
    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    feature = clf.tree_.feature
    threshold = clf.tree_.threshold

    # First let's retrieve the decision path of each sample. The decision_path
    # method allows to retrieve the node indicator functions. A non zero element of
    # indicator matrix at the position (i, j) indicates that the sample i goes
    # through the node j.

    node_indicator = clf.decision_path(X_train)
    # Get the rule paths of the training set
    d_paths = node_indicator.todense()
    # Deduplicate the rule paths and count how many times each path occurs
    d_uniques, d_idxs, d_counts, = np.unique(d_paths,
                                             axis=0,
                                             return_counts=True,
                                             return_index=True)
    # Print the rules
    print('\nThe most precise rules are the following:')
    # i = 0
    # for rule in d_uniques:
    for i, item in enumerate(d_uniques):
        count_max_idx = np.argmax(d_counts)
        rule = d_uniques[count_max_idx]  # get the path traversed most often
        print("\nRules_{0}, passed counts:{1}".format(i, d_counts.max()))
Esempio n. 29
0
                  node_depth[i] * "\t",
                  i,
                  children_left[i],
                  feature[i],
                  threshold[i],
                  children_right[i],
              ))
print()
"""
# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.
"""

node_indicator = tree.decision_path(X)

# Similarly, we can also have the leaves ids reached by each sample.
leave_id = tree.apply(X)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

#sample_id = 0
#node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
#                                    node_indicator.indptr[sample_id + 1]]
#
#print('Rules used to predict sample %s: ' % sample_id)
#for node_id in node_index:
#    if leave_id[sample_id] == node_id:
#        continue
Esempio n. 30
0
    def _prune_reduced_error(
            cls,
            model: DecisionTreeClassifier,
            X: numpy.array,
            y: numpy.array,
            step_score_drop: float = 0,
            max_score_drop: float = 0) -> DecisionTreeClassifier:
        def _prune_tree(tree, node_to_prune):
            child_left = tree.children_left[node_to_prune]
            child_right = tree.children_right[node_to_prune]
            tree.children_left[child_left] = Tree.TREE_UNDEFINED
            tree.children_left[child_right] = Tree.TREE_UNDEFINED
            tree.children_right[child_left] = Tree.TREE_UNDEFINED
            tree.children_right[child_right] = Tree.TREE_UNDEFINED
            tree.children_left[node_to_prune] = Tree.TREE_LEAF
            tree.children_right[node_to_prune] = Tree.TREE_LEAF
            tree.feature[node_to_prune] = Tree.TREE_UNDEFINED

        model = deepcopy(model)
        tree = model.tree_
        changes = True
        checked = set()
        parents = {
            x: i
            for i, x in enumerate(tree.children_left) if x != Tree.TREE_LEAF
        }
        parents.update({
            x: i
            for i, x in enumerate(tree.children_right) if x != Tree.TREE_LEAF
        })
        leaves = list(numpy.where(tree.children_left == Tree.TREE_LEAF)[0])
        decision_path = {
            leaf: d.nonzero()[1]
            for leaf, d in zip(leaves,
                               model.decision_path(X).T[leaves])
        }
        y_predicted = model.predict(X)
        init_score = current_score = accuracy_score(y, y_predicted)
        while changes:
            changes = False
            for leaf_index, leaf1 in enumerate(leaves):
                if leaf1 not in parents:
                    continue
                parent = parents[leaf1]
                if parent in checked:
                    continue
                leaf2 = tree.children_right[parent]
                leaf2 = leaf2 if leaf2 != leaf1 else tree.children_left[parent]
                if tree.children_left[leaf2] != Tree.TREE_LEAF or \
                        tree.children_right[leaf2] != Tree.TREE_LEAF:
                    continue

                data_leaf1_index = decision_path[leaf1]
                data_leaf2_index = decision_path[leaf2]
                data_parent_index = numpy.concatenate(
                    (data_leaf1_index, data_leaf2_index))
                y_predicted_leaf1 = model.classes_[numpy.argmax(
                    tree.value[leaf1, 0, :])]
                y_predicted_leaf2 = model.classes_[numpy.argmax(
                    tree.value[leaf2, 0, :])]
                new_y = model.classes_[numpy.argmax(tree.value[parent, 0, :])]

                score_delta = (numpy.sum(new_y == y[data_parent_index]) -
                               numpy.sum(y_predicted_leaf1 == y[data_leaf1_index]) -
                               numpy.sum(y_predicted_leaf2 == y[data_leaf2_index])) \
                    / X.shape[0]

                if init_score != 0 and score_delta / init_score < max_score_drop or \
                        current_score != 0 and score_delta / current_score < step_score_drop:
                    checked.add(parent)
                    continue
                else:
                    current_score += score_delta
                    leaves.remove(leaf2)
                    leaves[leaf_index] = parent
                    _prune_tree(tree, parent)
                    y_predicted[data_parent_index] = new_y
                    del decision_path[leaf1], decision_path[leaf2]
                    decision_path[parent] = data_parent_index
                    changes = True
                    break
        return model
Esempio n. 31
0
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    depth[node_id] = parent_depth + 1

    # If we have a test node
    if (child_left[node_id] != child_right[node_id]):
        stack.append((child_left[node_id], parent_depth + 1))
        stack.append((child_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

print(" \nThe binary tree structure has %s nodes" % n_nodes)

node_indicator = classifier.decision_path(X_test)

# Similarly, we can also have the leaves ids reached by each sample.

leave_id = classifier.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]

#print(test_accuracy)
max_test_accuracy_pruning = 0
depth = classifier.tree_.max_depth