class TreeKnnClassifier(abstract_classifier):
    """Decision tree whose leaves each delegate to a dedicated kNN classifier.

    A ``DecisionTreeClassifier`` is fitted on the training data, the training
    samples are grouped by the leaf they end up in, and one kNN classifier per
    leaf is trained through ``knn_factory``.
    """

    def __init__(self, criterion, depth, k, samples, labels):
        """Fit the tree and train one kNN classifier per reached leaf.

        :param criterion: split criterion forwarded to the decision tree
        :param depth: ``max_depth`` forwarded to the decision tree
        :param k: argument forwarded to ``knn_factory``
        :param samples: training samples
        :param labels: training labels, aligned with ``samples``
        """
        self.tree = DecisionTreeClassifier(criterion=criterion,
                                           max_depth=depth)
        self.tree.fit(X=samples, y=labels)

        factory = knn_factory(k)
        per_leaf = self.getLeafIDs(samples, labels)
        self.knn_classifiers = {}
        for leaf, group in per_leaf.items():
            self.knn_classifiers[leaf] = factory.train(group['samples'],
                                                       group['labels'],
                                                       group['weights'])

    def getLeafIDs(self, data, labels):
        """Group the samples (and their labels) by the leaf they reach.

        :return: dict mapping a leaf node id to a dict with the keys
                 ``'samples'``, ``'labels'`` and ``'weights'``; the weights
                 are generated from the first sample seen for that leaf.
        """
        groups = {}
        for sample, label in zip(data, labels):
            leaf = self.getLeaf(sample)
            if leaf in groups:
                groups[leaf]['samples'].append(sample)
                groups[leaf]['labels'].append(label)
            else:
                groups[leaf] = {
                    'samples': [sample],
                    'labels': [label],
                    'weights': self.generateWeights(sample),
                }
        return groups

    def getLeaf(self, sample):
        """Return the last node id on the decision path of ``sample``."""
        path = self.tree.decision_path(sample.reshape(1, -1))
        return path.indices[-1]

    def generateWeights(self, sample):
        """Return a ones-array shaped like ``sample`` with zeros at every
        feature that is tested on the sample's decision path."""
        weights = np.ones(sample.shape)
        path_nodes = self.tree.decision_path(sample.reshape(1, -1)).indices
        # the last node on the path is excluded, as it is the reached leaf
        for inner_node in path_nodes[:-1]:
            tested_feature = self.tree.tree_.feature[inner_node]
            weights[tested_feature] = 0
        return weights

    def classify(self, sample):
        """
        Finds the kNN classifier of the leaf reached by the sample and
        returns its result on the given sample.
        :param sample: sample to classify
        :return: result of the leaf's kNN classifier
        """
        leaf_classifier = self.knn_classifiers[self.getLeaf(sample)]
        return leaf_classifier.classify(sample)
class DecisionTreeModel:
    """Thin wrapper around a scikit-learn ``DecisionTreeClassifier``.

    The wrapped estimator is stored in the ``model`` attribute. The methods
    defined here accept ``TabularData`` objects in place of raw arrays and
    convert them with ``DataConversion`` before delegating. Any attribute
    that is not defined on the wrapper itself is looked up on ``model``.
    """

    def __init__(self, *args, **kwargs):
        """Create the wrapped ``DecisionTreeClassifier`` from the arguments."""
        self.model = DecisionTreeClassifier(*args, **kwargs)

    @staticmethod
    def _as_X(X):
        """Convert a ``TabularData`` input to its feature matrix."""
        if isinstance(X, TabularData):
            return DataConversion.extract_X(X)
        return X

    @staticmethod
    def _as_y(y):
        """Convert a ``TabularData`` input to its target vector."""
        if isinstance(y, TabularData):
            return DataConversion.extract_y(y)
        return y

    def get_model(self):
        """Return the wrapped estimator."""
        return self.model

    def apply(self, X, check_input=True):
        """Delegate to ``model.apply`` after converting ``X``."""
        return self.model.apply(self._as_X(X), check_input)

    def cost_complexity_pruning_path(self, X, y, sample_weight=None):
        """Delegate to ``model.cost_complexity_pruning_path``."""
        return self.model.cost_complexity_pruning_path(
            self._as_X(X), self._as_y(y), sample_weight)

    def decision_path(self, X, check_input=True):
        """Delegate to ``model.decision_path`` after converting ``X``."""
        return self.model.decision_path(self._as_X(X), check_input)

    def fit(self, X, y, sample_weight=None, check_input=True,
            X_idx_sorted=None):
        """Fit the wrapped estimator and return this wrapper.

        ``X_idx_sorted`` is only forwarded when the caller supplies it, so
        the call also works with scikit-learn releases whose ``fit`` no
        longer accepts that parameter.
        """
        fit_kwargs = {'sample_weight': sample_weight,
                      'check_input': check_input}
        if X_idx_sorted is not None:
            fit_kwargs['X_idx_sorted'] = X_idx_sorted
        self.model.fit(self._as_X(X), self._as_y(y), **fit_kwargs)
        return self

    def predict(self, X, check_input=True):
        """Delegate to ``model.predict`` after converting ``X``."""
        return self.model.predict(self._as_X(X), check_input)

    def predict_log_proba(self, X):
        """Delegate to ``model.predict_log_proba`` after converting ``X``."""
        return self.model.predict_log_proba(self._as_X(X))

    def predict_proba(self, X, check_input=True):
        """Delegate to ``model.predict_proba`` after converting ``X``."""
        return self.model.predict_proba(self._as_X(X), check_input)

    def score(self, X, y, sample_weight=None):
        """Delegate to ``model.score`` after converting ``X`` and ``y``."""
        return self.model.score(self._as_X(X), self._as_y(y), sample_weight)

    def __getattr__(self, item):
        """Fall back to the wrapped estimator for unknown attributes.

        ``__getattr__`` only runs after normal lookup raised
        ``AttributeError``, so other exceptions are no longer swallowed.
        The guard on ``model`` prevents infinite recursion on instances
        whose ``__init__`` has not run (e.g. while being copied or
        unpickled).

        Raises
        ------
        AttributeError
            If neither the wrapper nor the wrapped estimator has ``item``.
        """
        if item == 'model':
            raise AttributeError(
                '%r object has no attribute %r' % (type(self).__name__, item))
        return getattr(self.model, item)
def get_decision_paths(model: tree.DecisionTreeClassifier, data, selection):
    """Collect the distinct decision rules followed by the selected rows.

    Parameters
    ----------
    model : fitted decision tree classifier
        Its ``decision_path``, ``apply`` and ``tree_`` are used.
    data : DataFrame
        Rows to explain; column labels are used in the rule text, so they
        must be strings.
    selection : array-like
        Mask (cast to bool) choosing the rows of ``data`` to explain.

    Returns
    -------
    list of list of str
        One entry per distinct path, in order of first appearance. Each rule
        reads ``"<column> <= <threshold>"`` for a left branch and
        ``"<column> > <threshold>"`` for a right branch, with the threshold
        rounded to two decimals.
    """
    selected_rows = data.loc[selection.astype(bool), :]
    d_path = model.decision_path(selected_rows)
    leaf_id = model.apply(selected_rows)
    feature = model.tree_.feature
    threshold = model.tree_.threshold

    # dict keys de-duplicate while keeping first-seen order, which makes the
    # returned list deterministic (a set would not)
    paths = {}
    for sample_id in range(len(selected_rows.index)):
        node_idx = d_path.indices[
            d_path.indptr[sample_id]:d_path.indptr[sample_id + 1]]
        rules = []
        for node_id in node_idx:
            if leaf_id[sample_id] == node_id:
                # the leaf holds no test
                continue
            value = selected_rows.iloc[sample_id, feature[node_id]]
            # samples failing the "<=" test take the right branch, i.e. the
            # value is strictly greater than the threshold
            sign = " <= " if value <= threshold[node_id] else " > "
            rules.append(data.columns[feature[node_id]] + sign +
                         str(round(threshold[node_id], 2)))
        paths[tuple(rules)] = None
    return [list(path) for path in paths]
def test_decisiontree_classifier_decision_path_leaf(self):
    """Converted tree must reproduce labels, probabilities, decision paths
    and reached leaves of the scikit-learn model."""
    features, target = make_classification(10, n_features=4, random_state=42)
    features = features[:, :2]
    model = DecisionTreeClassifier(max_depth=2)
    model.fit(features, target)

    conversion_options = {
        id(model): {
            'decision_leaf': True,
            'decision_path': True,
            'zipmap': False
        }
    }
    input_types = [('input', FloatTensorType((None, features.shape[1])))]
    model_onnx = convert_sklearn(model,
                                 initial_types=input_types,
                                 options=conversion_options,
                                 target_opset=TARGET_OPSET)

    session = InferenceSession(model_onnx.SerializeToString())
    outputs = session.run(None, {'input': features.astype(np.float32)})

    # outputs: [labels, probabilities, decision paths, leaves]
    assert_almost_equal(model.predict(features), outputs[0].ravel())
    assert_almost_equal(model.predict_proba(features), outputs[1])

    dense_path = model.decision_path(features)
    expected_paths = binary_array_to_string(dense_path.todense())
    expected_leaves = path_to_leaf(model.tree_, dense_path.todense())
    assert expected_paths == outputs[2].ravel().tolist()
    assert expected_leaves.tolist() == outputs[3].ravel().tolist()
def decision_plot(new_X_train2, new_y_train2, feature_names, test, model,
                  classify):
    """Fit a depth-1 entropy tree on the given data, report and draw it.

    Prints the tree's prediction for ``test`` next to the prediction of
    ``model`` (only when ``classify`` is ``'rf'`` or ``'xg'``), prints the
    tree's accuracy on its own training data, displays the tree as SVG and
    prints the decision path of ``test``.

    :return: the fitted ``DecisionTreeClassifier``
    """
    dt = DecisionTreeClassifier(random_state=0,
                                criterion='entropy',
                                max_depth=1)
    dt.fit(new_X_train2, new_y_train2)

    if classify == 'rf':
        other_model_text = " and Random Forests predicted:"
    elif classify == 'xg':
        other_model_text = " and XGboost predicted:"
    else:
        other_model_text = None
    if other_model_text is not None:
        print("Decision Tree Predicts for Instance:" + str(dt.predict(test)) +
              other_model_text + str(model.predict(test)))

    fidelityPreds = dt.predict(new_X_train2)
    print("Let's see fidelity", accuracy_score(new_y_train2, fidelityPreds))

    dot_source = export_graphviz(dt,
                                 out_file=None,
                                 feature_names=feature_names,
                                 class_names=dt.classes_,
                                 filled=True)
    rendered = Source(dot_source).pipe(format='svg')
    display(SVG(rendered))

    print("Lets find out the path for this specific instance!")
    for path_row in dt.decision_path(test):
        print(path_row)
    return dt
def demoOne():
    """Fit a decision tree on the demo data set, plot it and print its
    depth, the decision path and class probabilities of the point
    ``[0, 0]``, and the estimator parameters."""
    dataSet, labels = getDataSet()
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(dataSet, labels)
    treePlot(clf)

    query_point = [[0, 0]]
    print(clf.tree_.max_depth)
    print(clf.decision_path(query_point))
    print(clf.get_params())
    print(clf.predict_proba(query_point))
def main() -> None:
    """Fit a decision tree on the iris data, print the prediction and the
    decision path of one hand-written sample, and export the tree to
    ``iris-dtree.dot``."""
    iris = load_iris()
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(iris.data, iris.target)

    # column order: sepal length, sepal width, petal length, petal width
    X = np.array([[5.0, 2.9, 1.0, 4.85]])
    print(clf.predict(X))
    print(clf.decision_path(X))
    print(clf.decision_path(X).todense())

    with open('iris-dtree.dot', mode='w') as dot_file:
        export_graphviz(clf,
                        out_file=dot_file,
                        rounded=True,
                        feature_names=iris.feature_names,
                        class_names=iris.target_names,
                        special_characters=True)
def getPath(X, input, conf, model):
    """
    Get the decision path of an instance in a local surrogate decision tree.

    Samples are generated around ``input`` with a LIME tabular explainer,
    labelled with ``model.predict`` and used to fit a decision tree; the
    path of ``input`` through that tree is returned.

    :param X: the whole inputs, given to the LIME explainer
    :param input: instance to interpret
    :param conf: the configuration of dataset (``feature_name``,
                 ``class_name`` and ``categorical_features`` are read)
    :param model: object whose ``predict`` labels the generated samples
    :return: list of ``[feature index, "<=" or ">", threshold, confidence]``,
             one entry per test node on the path; confidence is the share of
             the node's child samples that went to the branch taken
    """
    explainer = lime_tabular.LimeTabularExplainer(
        X,
        feature_names=conf.feature_name,
        class_names=conf.class_name,
        categorical_features=conf.categorical_features,
        discretize_continuous=True)
    g_data = explainer.generate_instance(input, num_samples=5000)
    g_labels = model.predict(g_data)

    # build the interpretable tree
    surrogate = DecisionTreeClassifier(random_state=2019)
    surrogate.fit(g_data, g_labels)

    # walk the nodes visited by the instance
    path_index = surrogate.decision_path(np.array([input])).indices
    path = []
    for position, node in enumerate(path_index):
        f = surrogate.tree_.feature[node]
        if f == -2:
            # -2 marks a node without a test (leaf)
            continue
        left_child = surrogate.tree_.children_left[node]
        right_child = surrogate.tree_.children_right[node]
        left_count = surrogate.tree_.n_node_samples[left_child]
        right_count = surrogate.tree_.n_node_samples[right_child]
        left_confidence = 1.0 * left_count / (left_count + right_count)
        right_confidence = 1.0 - left_confidence
        split_value = surrogate.tree_.threshold[node]
        # the next node on the path tells which branch was taken
        if left_child == path_index[position + 1]:
            path.append([f, "<=", split_value, left_confidence])
        else:
            path.append([f, ">", split_value, right_confidence])
    return path
def test_decisiontreeclassifier_decision_path(self):
    """Converted tree must reproduce labels, probabilities and decision
    paths of the scikit-learn model."""
    features, target = make_classification(10, n_features=4, random_state=42)
    features = features[:, :2].astype(numpy.float32)
    model = DecisionTreeClassifier(max_depth=2)
    model.fit(features, target)

    conversion_options = {
        id(model): {
            'decision_path': True,
            'zipmap': False
        }
    }
    model_onnx = to_onnx(model, features, options=conversion_options)
    outputs = OnnxInference(model_onnx).run({'X': features})

    self.assertEqualArray(model.predict(features), outputs['label'].ravel())
    self.assertEqualArray(model.predict_proba(features),
                          outputs['probabilities'])

    dense_path = model.decision_path(features).todense()
    expected = binary_array_to_string(dense_path)
    self.assertEqual(expected, outputs['decision_path'].ravel().tolist())
class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin,
                                 TransformerMixin):
    """ A transformer that generalizes data to representative points.

    Learns data generalizations based on an original model's predictions
    and a target accuracy. Once the generalizations are learned, can
    receive one or more data records and transform them to representative
    points based on the learned generalization.

    An alternative way to use the transformer is to supply ``cells`` and
    ``features`` in init or set_params and those will be used to transform
    data to representatives. In this case, fit must still be called but
    there is no need to supply it with ``X`` and ``y``, and there is no
    need to supply an existing ``estimator`` to init.

    In summary, either ``estimator`` and ``target_accuracy`` should be
    supplied or ``cells`` and ``features`` should be supplied.

    Parameters
    ----------
    estimator : estimator, optional
        The original model for which generalization is being performed.
        Should be pre-fitted.

    target_accuracy : float, optional
        The required accuracy when applying the base model to the
        generalized data. Accuracy is measured relative to the original
        accuracy of the model.

    features : list of str, optional
        The feature names, in the order that they appear in the data.

    cells : list of object, optional
        The cells used to generalize records. Each cell must define a
        range or subset of categories for each feature, as well as a
        representative value for each feature.
        This parameter should be used when instantiating a transformer
        object without first fitting it.

    Attributes
    ----------
    cells_ : list of object
        The cells used to generalize records, as learned when calling fit.

    ncp_ : float
        The NCP (information loss) score of the resulting generalization,
        as measured on the training data.

    generalizations_ : object
        The generalizations that were learned (actual feature ranges).
    """

    def __init__(self, estimator=None, target_accuracy=0.998, features=None,
                 cells=None):
        self.estimator = estimator
        self.target_accuracy = target_accuracy
        self.features = features
        self.cells = cells

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        ret = {}
        ret['target_accuracy'] = self.target_accuracy
        if deep:
            ret['features'] = copy.deepcopy(self.features)
            ret['cells'] = copy.deepcopy(self.cells)
            ret['estimator'] = self.estimator
        else:
            ret['features'] = copy.copy(self.features)
            ret['cells'] = copy.copy(self.cells)
        return ret

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Recognised keys are ``estimator``, ``target_accuracy``, ``features``
        and ``cells``; any other key is ignored.

        Returns
        -------
        self : object
            Returns self.
        """
        # 'estimator' is handled too, so that the output of
        # get_params(deep=True) can be fed back without silently losing it
        if 'estimator' in params:
            self.estimator = params['estimator']
        if 'target_accuracy' in params:
            self.target_accuracy = params['target_accuracy']
        if 'features' in params:
            self.features = params['features']
        if 'cells' in params:
            self.cells = params['cells']
        return self

    def fit_transform(self, X=None, y=None):
        """Learns the generalizations based on training data, and applies
        them to the data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
            The training input samples.
        y : array-like, shape (n_samples,), optional
            The target values. An array of int.
            This should contain the predictions of the original model on ``X``.

        Returns
        -------
        X_transformed : ndarray, shape (n_samples, n_features)
            The array containing the representative values to which each
            record in ``X`` is mapped.
        """
        self.fit(X, y)
        return self.transform(X)

    def fit(self, X=None, y=None):
        """Learns the generalizations based on training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
            The training input samples.
        y : array-like, shape (n_samples,), optional
            The target values. An array of int.
            This should contain the predictions of the original model on ``X``.

        Returns
        -------
        self : object
            Returns self.
        """
        # take into account that estimator, X, y, cells, features may be None
        if X is not None and y is not None:
            X, y = check_X_y(X, y, accept_sparse=True)
            self.n_features_ = X.shape[1]
        elif self.features:
            self.n_features_ = len(self.features)
        else:
            self.n_features_ = 0

        if self.features:
            self._features = self.features
        # if features is None, use numbers instead of names
        elif self.n_features_ != 0:
            self._features = list(range(self.n_features_))
        else:
            self._features = None

        if self.cells:
            self.cells_ = self.cells
        else:
            self.cells_ = {}

        # Going to fit
        # (currently not dealing with option to fit with only X and y and
        # no estimator)
        if self.estimator and X is not None and y is not None:
            # divide dataset into train and test
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, stratify=y, test_size=0.4, random_state=18)

            # collect feature data (such as min, max)
            train_data = pd.DataFrame(X_train, columns=self._features)
            feature_data = {}
            for feature in self._features:
                if feature not in feature_data:
                    values = list(train_data.loc[:, feature])
                    feature_data[feature] = {
                        'min': min(values),
                        'max': max(values)
                    }

            self.cells_ = {}
            self.dt_ = DecisionTreeClassifier(random_state=0,
                                              min_samples_split=2,
                                              min_samples_leaf=1)
            self.dt_.fit(X_train, y_train)
            self._calculate_cells()
            self._modify_cells()
            nodes = self._get_nodes_level(0)
            self._attach_cells_representatives(X_train, y_train, nodes)
            # self.cells_ currently holds the generalization created from
            # the tree leaves

            # apply generalizations to test data
            generalized = self._generalize(X_test, nodes, self.cells_,
                                           self.cells_by_id_)

            # check accuracy
            accuracy = self.estimator.score(generalized, y_test)
            print('Initial accuracy is %f' % accuracy)

            # if accuracy above threshold, improve generalization
            if accuracy > self.target_accuracy:
                level = 1
                while accuracy > self.target_accuracy:
                    level_nodes = self._get_nodes_level(level)
                    if level_nodes is None:
                        # the cells already sit at the root of the tree,
                        # there is no higher level to generalize to
                        break
                    nodes = level_nodes
                    self._calculate_level_cells(level)
                    self._attach_cells_representatives(
                        X_train, y_train, nodes)
                    generalized = self._generalize(X_test, nodes,
                                                   self.cells_,
                                                   self.cells_by_id_)
                    accuracy = self.estimator.score(generalized, y_test)
                    print('Level: %d, accuracy: %f' % (level, accuracy))
                    level += 1

            # if accuracy below threshold, improve accuracy by removing
            # features from generalization
            if accuracy < self.target_accuracy:
                while accuracy < self.target_accuracy:
                    self._calculate_generalizations()
                    removed_feature = \
                        self._remove_feature_from_generalization(
                            X_test, nodes, y_test, feature_data)
                    # compare against None: 0 is a valid feature name when
                    # features are numbered
                    if removed_feature is None:
                        break
                    generalized = self._generalize(X_test, nodes,
                                                   self.cells_,
                                                   self.cells_by_id_)
                    accuracy = self.estimator.score(generalized, y_test)
                    print('Removed feature: %s, accuracy: %f' %
                          (removed_feature, accuracy))

            # self.cells_ currently holds the chosen generalization based
            # on target accuracy

            # recompute the generalizations from the final cells, so that
            # they exist even when no feature was removed and reflect the
            # last removed feature otherwise
            self._calculate_generalizations()

            # calculate iLoss
            self.ncp_ = self._calculate_ncp(X_test, self.generalizations_,
                                            feature_data)

        # Return the transformer
        return self

    def transform(self, X):
        """ Transforms data records to representative points.

        Parameters
        ----------
        X : {array-like, sparse-matrix}, shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        X_transformed : ndarray, shape (n_samples, n_features)
            The array containing the representative values to which each
            record in ``X`` is mapped.

        Raises
        ------
        ValueError
            If the number of columns differs from what was seen in fit.
        """
        # Check if fit has been called
        msg = ('This %(name)s instance is not initialized yet. '
               'Call ‘fit’ or ‘set_params’ with '
               'appropriate arguments before using this method.')
        check_is_fitted(self, ['cells', 'features'], msg=msg)

        # Input validation
        X = check_array(X, accept_sparse=True)
        if X.shape[1] != self.n_features_ and self.n_features_ != 0:
            raise ValueError('Shape of input is different from what was seen '
                             'in `fit`')

        if not self._features:
            self._features = list(range(X.shape[1]))

        representatives = pd.DataFrame(
            columns=self._features)  # only columns
        generalized = pd.DataFrame(X, columns=self._features,
                                   copy=True)  # original data
        mapped = np.zeros(X.shape[0])  # to mark records we already mapped

        # iterate over cells (leaves in decision tree)
        for i in range(len(self.cells_)):
            cell = self.cells_[i]
            # Copy the representatives from the cells into another data
            # structure: iterate over features in test data
            for feature in self._features:
                # if feature has a representative value in the cell and
                # should not be left untouched, take the representative
                # value
                if feature in cell['representative'] and (
                        'untouched' not in cell
                        or feature not in cell['untouched']):
                    representatives.loc[
                        i, feature] = cell['representative'][feature]
                # else, drop the feature (removes from representatives
                # columns that do not have a representative value or
                # should remain untouched)
                elif feature in representatives.columns.tolist():
                    representatives = representatives.drop(feature, axis=1)

            # get the indexes of all records that map to this cell
            indexes = self._get_record_indexes_for_cell(X, cell, mapped)

            # replace the values in the representative columns with the
            # representative values (leaves others untouched)
            if not representatives.columns.empty:
                generalized.loc[
                    indexes,
                    representatives.columns] = representatives.loc[i].values

        return generalized.to_numpy()

    def _get_record_indexes_for_cell(self, X, cell, mapped):
        """Return the indexes of not-yet-mapped records contained in cell."""
        return [
            i for i, x in enumerate(X)
            if not mapped.item(i) and self._cell_contains(cell, x, i, mapped)
        ]

    def _cell_contains(self, cell, x, i, mapped):
        """Check whether record ``x`` lies in ``cell``; on success mark
        position ``i`` of ``mapped`` and return True."""
        for f in self._features:
            if f in cell['ranges']:
                if not self._cell_contains_numeric(f, cell['ranges'][f], x):
                    return False
            else:
                # TODO: exception - feature not defined
                pass
        # Mark as mapped (plain index assignment; ndarray.itemset is no
        # longer available in NumPy 2)
        mapped[i] = 1
        return True

    def _cell_contains_numeric(self, f, range, x):
        """Check whether the value of feature ``f`` in record ``x`` lies in
        the half-open interval ``(start, end]``; a ``None`` bound is open."""
        i = self._features.index(f)
        # convert x to ndarray to allow indexing
        a = np.array(x)
        value = a.item(i)
        # compare against None explicitly: a bound of exactly 0 is valid
        if range['start'] is not None:
            if value <= range['start']:
                return False
        if range['end'] is not None:
            if value > range['end']:
                return False
        return True

    def _calculate_cells(self):
        """Build one cell per leaf of the fitted tree."""
        self.cells_by_id_ = {}
        self.cells_ = self._calculate_cells_recursive(0)

    def _calculate_cells_recursive(self, node):
        """Return the cells of all leaves below ``node``, with the ranges
        implied by the splits between ``node`` and each leaf."""
        feature_index = self.dt_.tree_.feature[node]
        if feature_index == -2:
            # this is a leaf
            label = self._calculate_cell_label(node)
            hist = [int(i) for i in self.dt_.tree_.value[node][0]]
            cell = {
                'label': label,
                'hist': hist,
                'ranges': {},
                'id': int(node)
            }
            return [cell]

        cells = []
        feature = self._features[feature_index]
        threshold = self.dt_.tree_.threshold[node]
        left_child = self.dt_.tree_.children_left[node]
        right_child = self.dt_.tree_.children_right[node]

        # left subtree: values <= threshold, so the threshold is an upper
        # bound unless a deeper split already set a tighter one
        left_child_cells = self._calculate_cells_recursive(left_child)
        for cell in left_child_cells:
            if feature not in cell['ranges']:
                cell['ranges'][feature] = {'start': None, 'end': None}
            if cell['ranges'][feature]['end'] is None:
                cell['ranges'][feature]['end'] = threshold
            cells.append(cell)
            self.cells_by_id_[cell['id']] = cell

        # right subtree: values > threshold, so it is a lower bound
        right_child_cells = self._calculate_cells_recursive(right_child)
        for cell in right_child_cells:
            if feature not in cell['ranges']:
                cell['ranges'][feature] = {'start': None, 'end': None}
            if cell['ranges'][feature]['start'] is None:
                cell['ranges'][feature]['start'] = threshold
            cells.append(cell)
            self.cells_by_id_[cell['id']] = cell

        return cells

    def _calculate_cell_label(self, node):
        """Return the class with the largest value at ``node``."""
        label_hist = self.dt_.tree_.value[node][0]
        return int(self.dt_.classes_[np.argmax(label_hist)])

    def _modify_cells(self):
        """Rebuild the cells so that each one has a range entry for every
        feature, plus empty ``categories`` and ``representative`` slots."""
        cells = []
        for cell in self.cells_:
            new_cell = {
                'id': cell['id'],
                'label': cell['label'],
                'ranges': {},
                'categories': {},
                'hist': cell['hist'],
                'representative': None
            }
            for feature in self._features:
                if feature in cell['ranges']:
                    new_cell['ranges'][feature] = cell['ranges'][feature]
                else:
                    new_cell['ranges'][feature] = {
                        'start': None,
                        'end': None
                    }
            cells.append(new_cell)
            self.cells_by_id_[new_cell['id']] = new_cell
        self.cells_ = cells

    def _calculate_level_cells(self, level):
        """Replace the cells by those of the tree nodes at ``level``,
        merging the cells of the two children of every non-leaf node."""
        if level < 0 or level > self.dt_.get_depth():
            # TODO: exception 'Illegal level %d' % level
            pass

        if level > 0:
            new_cells = []
            new_cells_by_id = {}
            nodes = self._get_nodes_level(level)
            for node in nodes:
                if self.dt_.tree_.feature[node] == -2:  # leaf node
                    new_cell = self.cells_by_id_[node]
                else:
                    left_child = self.dt_.tree_.children_left[node]
                    right_child = self.dt_.tree_.children_right[node]
                    left_cell = self.cells_by_id_[left_child]
                    right_cell = self.cells_by_id_[right_child]
                    new_cell = {
                        'id': int(node),
                        'ranges': {},
                        'categories': {},
                        'label': None,
                        'representative': None
                    }
                    for feature in left_cell['ranges'].keys():
                        new_cell['ranges'][feature] = {}
                        new_cell['ranges'][feature]['start'] = left_cell[
                            'ranges'][feature]['start']
                        # NOTE(review): the merged end is taken from the
                        # right cell's 'start' rather than its 'end' -
                        # confirm this is intended
                        new_cell['ranges'][feature]['end'] = right_cell[
                            'ranges'][feature]['start']
                    for feature in left_cell['categories'].keys():
                        new_cell['categories'][feature] = \
                            list(set(left_cell['categories'][feature]) |
                                 set(right_cell['categories'][feature]))
                    self._calculate_level_cell_label(left_cell, right_cell,
                                                     new_cell)
                new_cells.append(new_cell)
                new_cells_by_id[new_cell['id']] = new_cell
            self.cells_ = new_cells
            self.cells_by_id_ = new_cells_by_id
        # else: nothing to do, stay with previous cells

    def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
        """Set ``hist`` (element-wise sum of the children) and the
        resulting majority ``label`` on ``new_cell``."""
        new_cell['hist'] = [
            x + y for x, y in zip(left_cell['hist'], right_cell['hist'])
        ]
        new_cell['label'] = int(
            self.dt_.classes_[np.argmax(new_cell['hist'])])

    def _get_nodes_level(self, level):
        """Return the node ids forming the tree cut ``level`` steps above
        the deepest leaf, or None when ``level`` exceeds the tree depth.

        The cut holds every node at that depth plus all shallower leaves.
        """
        # level = distance from lowest leaf
        node_depth = np.zeros(shape=self.dt_.tree_.node_count,
                              dtype=np.int64)
        is_leaves = np.zeros(shape=self.dt_.tree_.node_count, dtype=bool)
        stack = [(0, -1)]  # seed is the root node id and its parent depth
        while len(stack) > 0:
            node_id, parent_depth = stack.pop()
            node_depth[node_id] = parent_depth + 1

            # a node whose children differ is a test node
            if self.dt_.tree_.children_left[
                    node_id] != self.dt_.tree_.children_right[node_id]:
                stack.append(
                    (self.dt_.tree_.children_left[node_id], parent_depth + 1))
                stack.append((self.dt_.tree_.children_right[node_id],
                              parent_depth + 1))
            else:
                is_leaves[node_id] = True

        max_depth = max(node_depth)
        depth = max_depth - level
        if depth < 0:
            return None
        return [
            i for i, x in enumerate(node_depth)
            if x == depth or (x < depth and is_leaves[i])
        ]

    def _attach_cells_representatives(self, samples, labels, level_nodes):
        """Set the representative of every cell to the sample that carries
        the cell's label and is closest to the median of such samples."""
        samples_df = pd.DataFrame(samples, columns=self._features)
        labels_df = pd.DataFrame(labels, columns=['label'])
        samples_node_ids = self._find_sample_nodes(samples_df, level_nodes)
        for cell in self.cells_:
            cell['representative'] = {}
            # get all rows in cell
            indexes = [
                i for i, x in enumerate(samples_node_ids) if x == cell['id']
            ]
            sample_rows = samples_df.iloc[indexes]
            sample_labels = labels_df.iloc[indexes]['label'].values.tolist()
            # get rows with matching label
            indexes = [
                i for i, label in enumerate(sample_labels)
                if label == cell['label']
            ]
            match_samples = sample_rows.iloc[indexes]
            # find the "middle" of the cluster
            array = match_samples.values
            median = np.median(array, axis=0)
            # find the record closest to the median
            closest = len(array)
            min_dist = float("inf")
            for position, candidate in enumerate(array):
                dist = distance.euclidean(candidate, median)
                if dist < min_dist:
                    min_dist = dist
                    closest = position
            row = match_samples.iloc[closest]
            # use its values as the representative
            for feature in cell['ranges'].keys():
                cell['representative'][feature] = row[feature].item()

    def _find_sample_nodes(self, samples, nodes):
        """For every sample return the node of ``nodes`` that lies on its
        decision path."""
        paths = self.dt_.decision_path(samples).toarray()
        nodeSet = set(nodes)
        return [(list(set([i for i, v in enumerate(p) if v == 1])
                      & nodeSet))[0] for p in paths]

    def _generalize(self, data, level_nodes, cells, cells_by_id):
        """Return a copy of ``data`` (as ndarray) in which every record
        carries the representative values of the cell it maps to."""
        representatives = pd.DataFrame(
            columns=self._features)  # empty except for columns
        generalized = pd.DataFrame(data, columns=self._features,
                                   copy=True)  # original data
        mapping_to_cells = self._map_to_cells(generalized, level_nodes,
                                              cells_by_id)
        # iterate over cells (leaves in decision tree)
        for i in range(len(cells)):
            cell = cells[i]
            # This code just copies the representatives from the cells
            # into another data structure
            # iterate over features
            for feature in self._features:
                # if feature has a representative value in the cell and
                # should not be left untouched, take the representative
                # value
                if feature in cell['representative'] and (
                        'untouched' not in cell
                        or feature not in cell['untouched']):
                    representatives.loc[
                        i, feature] = cell['representative'][feature]
                # else, drop the feature (removes from representatives
                # columns that do not have a representative value or
                # should remain untouched)
                elif feature in representatives.columns.tolist():
                    representatives = representatives.drop(feature, axis=1)

            # get the indexes of all records that map to this cell
            indexes = [
                j for j in range(len(mapping_to_cells))
                if mapping_to_cells[j]['id'] == cell['id']
            ]
            # replaces the values in the representative columns with the
            # representative values (leaves others untouched)
            if not representatives.columns.empty:
                generalized.loc[
                    indexes,
                    representatives.columns] = representatives.loc[i].values

        return generalized.to_numpy()

    def _map_to_cells(self, samples, nodes, cells_by_id):
        """Return, per row of ``samples``, the cell the row maps to."""
        mapping_to_cells = []
        for index, row in samples.iterrows():
            cell = self._find_sample_cells([row], nodes, cells_by_id)[0]
            mapping_to_cells.append(cell)
        return mapping_to_cells

    def _find_sample_cells(self, samples, nodes, cells_by_id):
        """Return the cells matching the path nodes of the samples."""
        node_ids = self._find_sample_nodes(samples, nodes)
        return [cells_by_id[nodeId] for nodeId in node_ids]

    def _remove_feature_from_generalization(self, samples, nodes, labels,
                                            feature_data):
        """Pick a feature, mark it untouched in all cells and return it;
        return None when no feature can be removed."""
        feature = self._get_feature_to_remove(samples, nodes, labels,
                                              feature_data)
        # compare against None: 0 is a valid feature name when features
        # are numbered
        if feature is None:
            return None
        GeneralizeToRepresentative._remove_feature_from_cells(
            self.cells_, self.cells_by_id_, feature)
        return feature

    def _get_feature_to_remove(self, samples, nodes, labels, feature_data):
        """Return the feature with the lowest NCP-per-accuracy score, or
        None when there is no candidate."""
        # We want to remove features with low iLoss (NCP) and high accuracy
        # gain (after removing them)
        ranges = self.generalizations_['ranges']
        range_counts = self._find_range_count(samples, ranges)
        # number of records, consistent with _calculate_ncp
        total = samples.shape[0]
        range_min = sys.float_info.max
        remove_feature = None

        for feature in ranges.keys():
            if feature not in self.generalizations_['untouched']:
                feature_ncp = self._calc_ncp_numeric(ranges[feature],
                                                     range_counts[feature],
                                                     feature_data[feature],
                                                     total)
                if feature_ncp > 0:
                    # divide by accuracy gain
                    new_cells = copy.deepcopy(self.cells_)
                    cells_by_id = copy.deepcopy(self.cells_by_id_)
                    GeneralizeToRepresentative._remove_feature_from_cells(
                        new_cells, cells_by_id, feature)
                    generalized = self._generalize(samples, nodes, new_cells,
                                                   cells_by_id)
                    accuracy = self.estimator.score(generalized, labels)
                    feature_ncp = feature_ncp / accuracy
                if feature_ncp < range_min:
                    range_min = feature_ncp
                    remove_feature = feature

        # str() keeps this working for numbered (integer) feature names
        print('feature to remove: ' +
              (str(remove_feature) if remove_feature is not None else ''))
        return remove_feature

    def _calculate_generalizations(self):
        """Recompute ``generalizations_`` from the current cells."""
        self.generalizations_ = {
            'ranges':
            GeneralizeToRepresentative._calculate_ranges(self.cells_),
            'untouched':
            GeneralizeToRepresentative._calculate_untouched(self.cells_)
        }

    def _find_range_count(self, samples, ranges):
        """Count, per feature, the samples at or below every split value,
        followed by the samples above the last split value."""
        samples_df = pd.DataFrame(samples, columns=self._features)
        range_counts = {}
        last_value = None
        for r in ranges.keys():
            range_counts[r] = []
            # if empty list, all samples should be counted
            if not ranges[r]:
                range_counts[r].append(samples_df.shape[0])
            else:
                for value in ranges[r]:
                    range_counts[r].append(
                        len(samples_df.loc[samples_df[r] <= value]))
                    last_value = value
                range_counts[r].append(
                    len(samples_df.loc[samples_df[r] > last_value]))
        return range_counts

    def _calculate_ncp(self, samples, generalizations, feature_data):
        """Return the NCP averaged over ranged and untouched features."""
        # supressed features are already taken care of within
        # _calc_ncp_numeric
        ranges = generalizations['ranges']
        range_counts = self._find_range_count(samples, ranges)
        total = samples.shape[0]
        total_ncp = 0
        total_features = len(generalizations['untouched'])
        for feature in ranges.keys():
            feature_ncp = GeneralizeToRepresentative._calc_ncp_numeric(
                ranges[feature], range_counts[feature],
                feature_data[feature], total)
            total_ncp = total_ncp + feature_ncp
            total_features += 1
        if total_features == 0:
            return 0
        return total_ncp / total_features

    @staticmethod
    def _calculate_ranges(cells):
        """Return, per feature that is not untouched, the sorted distinct
        range bounds found in the cells."""
        ranges = {}
        for cell in cells:
            for feature in [
                    key for key in cell['ranges'].keys()
                    if 'untouched' not in cell or key not in cell['untouched']
            ]:
                if feature not in ranges:
                    ranges[feature] = []
                if cell['ranges'][feature]['start'] is not None:
                    ranges[feature].append(cell['ranges'][feature]['start'])
                if cell['ranges'][feature]['end'] is not None:
                    ranges[feature].append(cell['ranges'][feature]['end'])
        for feature in ranges.keys():
            ranges[feature] = list(set(ranges[feature]))
            ranges[feature].sort()
        return ranges

    @staticmethod
    def _calculate_untouched(cells):
        """Return the features that are untouched in every cell."""
        untouched_lists = [
            cell['untouched'] if 'untouched' in cell else []
            for cell in cells
        ]
        if not untouched_lists:
            # no cells, hence nothing can be untouched
            return []
        untouched = set(untouched_lists[0])
        untouched = untouched.intersection(*untouched_lists)
        return list(untouched)

    @staticmethod
    def _calc_ncp_numeric(feature_range, range_count, feature_data, total):
        """Return the NCP of one numeric feature from its split values and
        the sample counts produced by ``_find_range_count``."""
        # if there are no ranges, feature is supressed and iLoss is 1
        if not feature_range:
            return 1
        # range only contains the split values, need to add min and max
        # value of feature to enable computing sizes of all ranges
        new_range = [feature_data['min']
                     ] + feature_range + [feature_data['max']]
        range_sizes = [b - a for a, b in zip(new_range, new_range[1:])]
        normalized_range_sizes = [
            s * n / total for s, n in zip(range_sizes, range_count)
        ]
        average_range_size = sum(normalized_range_sizes) / len(
            normalized_range_sizes)
        return average_range_size / (feature_data['max'] -
                                     feature_data['min'])

    @staticmethod
    def _remove_feature_from_cells(cells, cells_by_id, feature):
        """Drop ``feature`` from the ranges (or else the categories) of
        every cell, record it as untouched and refresh ``cells_by_id``."""
        for cell in cells:
            if 'untouched' not in cell:
                cell['untouched'] = []
            if feature in cell['ranges']:
                del cell['ranges'][feature]
            else:
                del cell['categories'][feature]
            cell['untouched'].append(feature)
            cells_by_id[cell['id']] = cell.copy()
# ==============================
# output
# ===============================

# fitted attributes of the tree, printed one per line
for _attribute in ("classes_", "n_classes_", "n_features_", "n_outputs_",
                   "feature_importances_", "max_features_", "tree_"):
    print(getattr(dt_gini, _attribute))

# prediction methods: compute all three, then print them in the same order
preds = dt_gini.predict(X_test)
preds_proba = dt_gini.predict_proba(X_test)
preds_log_proba = dt_gini.predict_log_proba(X_test)
for _prediction in (preds, preds_proba, preds_log_proba):
    print(_prediction)

# score on the test split
accuracy = dt_gini.score(X_test, y_test)
print(accuracy)

# decision paths of the training split, then of the test split
for _split in (X_train, X_test):
    print(dt_gini.decision_path(_split))
# Print one line per node, indented by one tab per depth level.
# n_nodes, is_leaves, node_depth, children_left, children_right, feature and
# threshold are defined before this fragment.
for i in range(n_nodes):
    if is_leaves[i]:
        print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
    else:
        # NOTE(review): "%ss" prints the threshold directly followed by a
        # literal "s" - looks like a typo for "%s", confirm before changing
        print(
            "%snode=%s test node: go to node %s if X[:, %s] <= %ss else to "
            "node %s." % (node_depth[i] * "\t", i, children_left[i],
                          feature[i], threshold[i], children_right[i])
        )
print()

# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.
node_indicator = estimator.decision_path(X_test)

# Similarly, we can also have the leaves ids reached by each sample.
leave_id = estimator.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.
sample_id = 0
# node ids visited by the sample: its slice of the indicator's index array
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

print("Rules used to predict sample %s: " % sample_id)
for node_id in node_index:
    # NOTE(review): as written this skips every node except the reached leaf,
    # and the visible loop body ends here - the rest of the loop appears to be
    # cut off; confirm whether "==" was intended
    if leave_id[sample_id] != node_id:
        continue
def main():
    """Fit a three-leaf decision tree, print its layout and explain one sample.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``,
    ``feature_names`` and ``class_names``. Prints the node structure, the
    per-node values and the split tests applied to the first test sample,
    then opens a Graphviz rendering of the tree.
    """
    estimator = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
    estimator.fit(X_train, y_train)

    # Parallel arrays describing the fitted tree, all indexed by node id.
    fitted = estimator.tree_
    n_nodes = fitted.node_count
    children_left = fitted.children_left
    children_right = fitted.children_right
    feature = fitted.feature
    threshold = fitted.threshold

    # Depth-first walk from the root (node 0) recording depth and leaf status.
    node_depth = np.zeros(n_nodes, dtype=np.int64)
    is_leaves = np.zeros(n_nodes, dtype=bool)
    pending = [(0, -1)]  # (node id, depth of its parent)
    while pending:
        current, depth_above = pending.pop()
        node_depth[current] = depth_above + 1
        left_child = children_left[current]
        right_child = children_right[current]
        if left_child == right_child:
            # Both child slots hold the same value: this node has no split.
            is_leaves[current] = True
            continue
        pending.append((left_child, depth_above + 1))
        pending.append((right_child, depth_above + 1))

    print("The binary tree structure has {} nodes and "
          "has the following tree structure:".format(n_nodes))
    for i in range(n_nodes):
        indent = node_depth[i] * "\t"
        if not is_leaves[i]:
            print(
                "{}node={} test node: go to node {} if X[:, {}] <= {:.2f} else to "
                "node {}.".format(
                    indent,
                    i,
                    children_left[i],
                    feature[i],
                    threshold[i],
                    children_right[i],
                ))
        else:
            print("{}node={} leaf node.".format(indent, i))

    # Row i of the indicator matrix marks the nodes sample i passes through;
    # ``apply`` gives the id of the leaf each sample ends in.
    node_indicator = estimator.decision_path(X_test)
    leave_id = estimator.apply(X_test)

    # Explain a single sample: slice its row out of the CSR index arrays.
    sample_id = 0
    row_start = node_indicator.indptr[sample_id]
    row_stop = node_indicator.indptr[sample_id + 1]
    node_index = node_indicator.indices[row_start:row_stop]

    # Each row holds the prediction value of each node.
    output_value = estimator.tree_.value
    print(output_value)

    sample_values = ', '.join(str(value) for value in X_test[sample_id])
    print('Rules used to predict sample {} with feature: {}: '.format(
        sample_id, sample_values))
    for node_id in node_index:
        if leave_id[sample_id] == node_id:
            # The leaf itself carries no test.
            continue
        observed = X_test[sample_id, feature[node_id]]
        threshold_sign = "<=" if observed <= threshold[node_id] else ">"
        print(
            "decision id node {} : (X_test[{}, {}] = {:.2f} {} {:.2f})".format(
                node_id, sample_id, feature[node_id], observed,
                threshold_sign, threshold[node_id]))

    # Render the tree with Graphviz and open it in the default viewer.
    dot = export_graphviz(estimator,
                          feature_names=feature_names,
                          class_names=class_names,
                          rounded=True,
                          proportion=False,
                          precision=2,
                          filled=True)
    graph = Source(dot)
    graph.view()
    print('debug line')
max_features='log2', max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=1000, splitter='best') # DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth=3, min_samples_leaf=5) clf = clf.fit(X, Y) # nt y_pred = clf.predict(test) print('Prediction : ', y_pred) print('Actual value : ', y_true) print('Testing Validation Accuracy : ', accuracy_score(y_true, y_pred)) print('Decision Path:', clf.decision_path(X, check_input=True)) dotfile = open("dtree2.dot", 'w') tree.export_graphviz(clf, out_file=dotfile) dotfile.close() # Dump the trained decision tree classifier with Pickle model_name = 'dtc.pkl' # Open the file to save as pkl file model_name_pkl = open(model_name, 'wb') pickle.dump(clf, model_name_pkl) # Close the pickle instances model_name_pkl.close() elif (args["algotype"] == "dtr"): clf = tree.DecisionTreeRegressor() clf = clf.fit(X, Y) y_pred = clf.predict(test) print('Prediction : ', y_pred)
# Inspect how the fitted classifier treats the first test sample.
a = X_test[:1]
print("a:", a, sep="\n")

# Leaf reached by the sample (end point of its traversal).
index_of_leaf = clf.apply(a)
print("Returns the index of the leaf that each sample is predicted as:",
      index_of_leaf, sep="\n")

# Sparse indicator of the nodes the sample traverses.
d_path = clf.decision_path(a)
print("decision path:", d_path, sep="\n")

# Distinct node ids on the path; np.unique already returns them sorted.
n_d_path = np.unique(d_path.indices)
print("nodes in the decision path:", n_d_path, sep="\n")

print("probability of each class:", clf.predict_proba(a), sep="\n")

feature_importances = clf.feature_importances_
print("Feature importances:", feature_importances, sep="\n")

# Accuracy: fraction of test instances classified correctly.
acsc = accuracy_score(y_test, y_pred)
# We can also retrieve the decision path of samples of interest. The # ``decision_path`` method outputs an indicator matrix that allows us to # retrieve the nodes the samples of interest traverse through. A non zero # element in the indicator matrix at position ``(i, j)`` indicates that # the sample ``i`` goes through the node ``j``. Or, for one sample ``i``, the # positions of the non zero elements in row ``i`` of the indicator matrix # designate the ids of the nodes that sample goes through. # # The leaf ids reached by samples of interest can be obtained with the # ``apply`` method. This returns an array of the node ids of the leaves # reached by each sample of interest. Using the leaf ids and the # ``decision_path`` we can obtain the splitting conditions that were used to # predict a sample or a group of samples. First, let's do it for one sample. # Note that ``node_index`` is a sparse matrix. node_indicator = clf.decision_path(X_test) leaf_id = clf.apply(X_test) sample_id = 0 # obtain ids of the nodes `sample_id` goes through, i.e., row `sample_id` node_index = node_indicator.indices[ node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1] ] print("Rules used to predict sample {id}:\n".format(id=sample_id)) for node_id in node_index: # continue to the next node if it is a leaf node if leaf_id[sample_id] == node_id: continue # check if value of the split feature for sample 0 is below threshold
def get_python(X, y, cut=jieba.cut, n=100, min_pro=0.75, func_name='function',
               max_depth=5, min_samples_leaf=50, max_leaf_nodes=20):
    '''
    Train a decision tree on bag-of-words counts and emit it as Python source.

    The texts are tokenised, vectorised with ``CountVectorizer`` and used to
    fit a ``DecisionTreeClassifier``. For every leaf whose probability of
    class index 1 exceeds ``min_pro``, the chain of word tests leading to it
    is written as nested ``if '<word>' in s`` / ``not in s`` statements that
    end in ``return 1``; the generated function falls through to ``return 0``.

    :param X: training texts
    :param y: training labels
    :param cut: tokenizer
    :param n: number of keywords to return
    :param min_pro: minimum probability for a leaf to be written into the code
    :param func_name: name of the generated function
    :param max_depth: decision tree parameter -> depth
    :param min_samples_leaf: decision tree parameter -> minimum samples per leaf
    :param max_leaf_nodes: decision tree parameter -> maximum number of leaves
    :return: ``(source, keywords)`` where ``keywords`` is a list of
        ``[word, importance]`` pairs for the ``n`` most important features
    '''
    X = [get_word(text, cut) for text in X]
    vectorizer = CountVectorizer(max_features=3000)  # bag-of-words encoding
    X = vectorizer.fit_transform(X).toarray()
    print('分词完毕,现在开始计算相关性...')
    PYTNON = 'def %s(s:str):\n' % (func_name)

    clf = DecisionTreeClassifier(max_depth=max_depth,
                                 min_samples_leaf=min_samples_leaf,
                                 max_leaf_nodes=max_leaf_nodes)
    clf.fit(X, y)
    print('训练完毕,现在开始寻找规则...')

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out().
    try:
        words = list(vectorizer.get_feature_names_out())
    except AttributeError:
        words = vectorizer.get_feature_names()

    dot_data = StringIO()
    tree.export_graphviz(clf,
                         out_file=dot_data,
                         feature_names=words,
                         filled=True,
                         rounded=True,
                         special_characters=True)
    tree_info = dot_data.getvalue()

    # Raw-string patterns: '\d' and '\[' are invalid escapes in plain literals.
    label_probe = re.compile(r'\d+ \[label=<.*?>')
    label_capture = re.compile(r'(\d+) \[label=<(.*?)<')
    value_probe = re.compile(r'\d+.*?value = ')
    value_capture = re.compile(r'(\d+).*?value = (\[.*?\])')
    edge_capture = re.compile(r'(\d+) -> (\d+)')
    # The split test may be rendered as a literal sign, an HTML entity or
    # ASCII depending on the export options; accept all three spellings.
    test_separator = re.compile(r' (?:≤|&le;|<=) ')

    labels = []      # (node id, first line of the node label)
    values = []      # (node id, textual class-count vector)
    raw_edges = []   # (parent id, child id) in DOT order
    for line in tree_info.split('\n'):
        if label_probe.search(line):
            labels.append(label_capture.search(line).group(1, 2))
        if value_probe.search(line):
            values.append(value_capture.search(line).group(1, 2))
        if edge_capture.search(line):
            raw_edges.append(edge_capture.search(line).group(1, 2))
    root = dict(labels)
    node_info = dict(values)

    # Flag each edge: False for the first edge leaving a parent, True for the
    # second one. The flag selects the "in s" branch below.
    seen_parents = set()
    edge_is_second = {}
    for parent, child in raw_edges:
        parent, child = int(parent), int(child)
        edge_is_second[(parent, child)] = parent in seen_parents
        seen_parents.add(parent)

    # One batched call instead of re-applying the tree several times per row.
    leaf_ids = clf.apply(X)
    written_leaves = set()
    for sample, leaf in zip(X, leaf_ids):
        if leaf in written_leaves:
            continue
        written_leaves.add(leaf)
        if not clf.predict_proba([sample])[0][1] > min_pro:
            continue

        pieces = ['\t#节点信息%s\n' % (node_info[str(leaf)])]
        tab = '\t'
        last_node = 0
        on_path = clf.decision_path([sample]).toarray()[0]
        for node_id, visited in enumerate(on_path):
            # Skip nodes off the path and the root, which has no incoming edge.
            if not (visited and node_id):
                continue
            is_second = edge_is_second.get((last_node, node_id))
            if is_second is not None:
                word = test_separator.split(root[str(last_node)])[0]
                membership = 'in' if is_second else 'not in'
                pieces.append("%sif '%s' %s s:\n" % (tab, word, membership))
                tab += '\t'
            last_node = node_id
        pieces.append(tab + "return 1\n")
        PYTNON += ''.join(pieces)
    PYTNON += '\treturn 0\n'

    # Keywords: the n features with the largest importance.
    importances = clf.feature_importances_
    top = heapq.nlargest(n, range(len(importances)), importances.take)
    res = [[words[idx], importances[idx]] for idx in top]
    return PYTNON, res
def Decison(self, q1, q2, q3, q4, q5, Prezzo):
    """Recommend a car brand ('marca') from the questionnaire answers.

    Loads ``Dataset/cars.csv``, filters and encodes it according to the
    answers, fits a depth-6 decision tree predicting ``marca``, then predicts
    the brand for a synthetic car built from the answers and the column means.

    :param q1: only echoed in the "Choices" printout
    :param q2: ``"Three"`` selects 2 doors and drops cars longer than 170;
        anything else selects 4 doors
    :param q3: ``True`` keeps only cars at least 160 long and 55.5 high;
        otherwise the height and length columns are dropped
    :param q4: ``"ANT"`` or ``"POST"`` one-hot encodes ``trazione`` when both
        'rwd' and 'fwd' are present; otherwise the column is dropped
    :param q5: ``"BENZ"`` or ``"DIS"`` one-hot encodes ``carburante`` when
        both 'gas' and 'diesel' are present; otherwise the column is dropped
    :param Prezzo: price to query with; when not an ``int`` the mean price of
        the filtered dataset is used
    :return: the predicted brand (first element of ``tree.predict``)
    """
    print(F"Choices: {q1} - {q2} - {q3} - {q4} - {q5}")
    # Sentinels: DELETE marks a feature excluded from the query vector,
    # NOTDELETE marks the size features as kept.
    DELETE = -1
    NOTDELETE = 9
    carsData = pd.read_csv("Dataset/cars.csv")
    # print(carsData.info())
    # print(carsData.head())
    # Drop the columns that are not useful for this dataset
    carsData = carsData.drop('cilindri', axis=1)
    carsData = carsData.drop('larghezza', axis=1)
    carsData = carsData.drop('mpgcitta', axis=1)
    carsData = carsData.drop('symboling', axis=1)
    carsData = carsData.drop('aspirazione', axis=1)
    # Filter the cars on the relevant characteristics, then recompute the means
    print(F"Esempi di partenza (!Wne): {carsData.shape}")
    # This first mean is overwritten after the filtering below.
    mediaAutostrada = carsData['mpgautostrada'].mean()
    carsData = carsData.drop(carsData[(carsData.mpgautostrada < 28)].index)
    minimoCilindrata = 87
    carsData = carsData.drop(carsData[(carsData.cilindrata < minimoCilindrata)].index)
    mediaAutostrada = carsData['mpgautostrada'].mean()
    mediaCavalli = carsData['cavalli'].mean()
    mediaCilindrata = carsData['cilindrata'].mean()
    mediaPeso = carsData['peso'].mean()
    mediaLunghezza = carsData['lunghezza'].mean()
    mediaAltezza = carsData['altezza'].mean()
    # Apply the parameters chosen in the GUI and one-hot encode
    # (pd.get_dummies) where needed
    if q2 == "Three":
        porte = 2
        MINLUNGHEZZA = 170
        carsData = carsData.drop(carsData[(carsData.lunghezza > MINLUNGHEZZA)].index)
    else:
        porte = 4
    if q3 == True:
        family = NOTDELETE
        MINALTEZZA = 55.5
        MINLUNGHEZZA = 160
        carsData = carsData.drop(carsData[(carsData.lunghezza < MINLUNGHEZZA)].index)
        carsData = carsData.drop(carsData[(carsData.altezza < MINALTEZZA)].index)
        # Means are refreshed because rows were removed just above.
        mediaLunghezza = carsData['lunghezza'].mean()
        mediaAltezza = carsData['altezza'].mean()
    else:
        family = DELETE
        carsData = carsData.drop(columns=['altezza', 'lunghezza'], axis=1)
    ''' I.A. Per lavorare su dataset della stessa dimensione x e y dobbiamo associre carsData a data_tree sopo l'ultima possibile drop (modifica) al dataset originale. Ha senso calcolare il prezzo medio solo sul dataset opportunamente calcolato. '''
    # Summary of the note above: data_tree is taken from carsData only after
    # the last row drop so that x and y keep the same length, and the mean
    # price is computed on the filtered dataset.
    if not isinstance(Prezzo, int):
        Prezzo = carsData['prezzo'].mean()
    data_tree = carsData
    # ``trazione`` / ``carburante`` hold the dummy values appended to the
    # query vector, or DELETE when the column is dropped instead of encoded.
    if q4 == "ANT" and ('rwd' in data_tree['trazione'].values) and ('fwd' in data_tree['trazione'].values):
        trazione = [1, 0]
        data_tree = pd.get_dummies(data_tree, columns=["trazione"])
    elif q4 == "POST" and ('rwd' in data_tree['trazione'].values) and ('fwd' in data_tree['trazione'].values):
        trazione = [0, 1]
        data_tree = pd.get_dummies(data_tree, columns=["trazione"])
    else:
        trazione = DELETE
        data_tree = data_tree.drop('trazione', axis=1)
    if q5 == "BENZ" and ('gas' in data_tree['carburante'].values) and (
            'diesel' in data_tree['carburante'].values):
        carburante = [0, 1]
        data_tree = pd.get_dummies(data_tree, columns=["carburante"])
    elif q5 == "DIS" and ('gas' in data_tree['carburante'].values) and (
            'diesel' in data_tree['carburante'].values):
        carburante = [1, 0]
        data_tree = pd.get_dummies(data_tree, columns=["carburante"])
    else:
        carburante = DELETE
        data_tree = data_tree.drop('carburante', axis=1)
    print(F"Esempi elaborati (!Wne): {data_tree.shape}")
    # Build the decision tree
    from sklearn.tree import DecisionTreeClassifier
    from IPython.display import Image
    from sklearn.tree import export_graphviz
    import os
    from subprocess import call
    from matplotlib import pyplot as plt
    x = data_tree.drop(['marca'], axis=1)
    y = carsData['marca']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
    print(F"Esempi su chi abbiamo fatto train e test: {x.shape}")
    tree = DecisionTreeClassifier(criterion="gini", max_depth=6)
    tree.fit(x_train, y_train)
    y_pred_train = tree.predict(x_train)
    y_pred = tree.predict(x_test)
    plt.scatter(y_train, y_pred_train)
    plt.xlabel("True value")
    plt.ylabel("Prediction")
    plt.show()
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred)
    ''' Overfitting '''
    print("ACCURACY: TRAIN=%.4f TEST=%.4f" % (accuracy_train, accuracy_test))
    # Assemble the query row by hand.
    # NOTE(review): the order here presumably has to match the column order
    # of ``x``; the two printouts below allow a visual check - confirm.
    value_list = [porte]
    if family == NOTDELETE:
        value_list = value_list + [mediaLunghezza, mediaAltezza]
    value_list = value_list + [mediaPeso, mediaCilindrata, mediaCavalli, mediaAutostrada]
    value_list = value_list + [Prezzo]
    if trazione != DELETE:
        value_list = value_list + trazione
    if carburante != DELETE:
        value_list = value_list + carburante
    print(data_tree.columns.tolist())
    print(value_list)
    predizione = tree.predict([value_list])
    path = tree.decision_path([value_list])
    print(F"Predizione: {predizione[0]}")
    # Extend PATH with the Graphviz directory of the active conda environment;
    # raises KeyError when CONDA_PREFIX is not set.
    os.environ['PATH'] = os.environ['PATH'] + ';' + os.environ['CONDA_PREFIX'] + r"\Library\bin\graphviz"
    export_graphviz(tree, out_file="treetrip.dot", feature_names=None, rounded=True, precision=2, filled=True, class_names=True)
    # Convert the DOT file to PNG with the external ``dot`` program.
    call(['dot', '-Tpng', 'treetrip.dot', '-o', 'treetrip.png'])
    Image(filename='treetrip.png')
    print(path)
    return predizione[0]
def test_decision_path_hardcoded():
    """Check ``decision_path`` of a depth-1 tree against a fixed indicator matrix."""
    features, target = iris.data, iris.target
    stump = DecisionTreeClassifier(random_state=0, max_depth=1)
    stump.fit(features, target)
    # A depth-1 tree has three nodes: the root and its two children.
    expected = [[1, 1, 0], [1, 0, 1]]
    observed = stump.decision_path(features[:2]).toarray()
    assert_array_equal(observed, expected)
def decisiontree(parsetrees, sent, urlprm):
    """Create a decision tree to select among n trees.

    :param parsetrees: sequence of ``(prob, tree, treestr, fragments)``
        tuples, one per candidate parse.
    :param sent: the sentence as a sequence of tokens, indexed by the leaf
        positions found in the spans.
    :param urlprm: mapping of URL parameters; ``n`` and ``dec`` are added
        per leaf before encoding.
    :returns: a tuple ``(html, depth, path)`` with the HTML for the
        interactive decision tree followed by the hidden tree drawings, the
        maximum depth of the fitted tree, and the decision-path indicator
        matrix of the training rows. With fewer than two candidate trees
        there is nothing to choose and ``('', 0, None)`` is returned.
    """
    # The class labels are the n-best trees 0..n
    # The attributes are the labeled spans in the trees; they split the n-best
    # trees into two sets with and without that span.
    spans = {}
    if len(parsetrees) <= 1:
        return '', 0, None
    for n, (_prob, tree, _, _) in enumerate(parsetrees):
        for span in getspans(tree):
            # simplest strategy: store presence of span as binary feature
            # perhaps better: use weight from tree probability
            spans.setdefault(span, set()).add(n)

    # create decision tree with scikit-learn
    features = list(spans)
    featurenames = [
        '[%s %s]' % (label, ' '.join(sent[n] for n in leaves))
        for label, leaves in features
    ]
    # One row per candidate tree, one boolean column per span. The builtin
    # ``bool`` is used because the ``np.bool`` alias was removed in NumPy 1.24.
    data = np.array([[n in spans[span] for span in features]
                     for n in range(len(parsetrees))],
                    dtype=bool)
    estimator = DecisionTreeClassifier(random_state=0)
    estimator.fit(data,
                  range(len(parsetrees)),
                  sample_weight=[prob for prob, _, _, _ in parsetrees])
    path = estimator.decision_path(data)

    def rec(tree, n=0, depth=0):
        """Recursively produce a string representation of a decision tree."""
        if tree.children_left[n] == tree.children_right[n]:
            # Leaf: the non-zero class column identifies the selected tree.
            x = tree.value[n].nonzero()[1][0]
            prob, _tree, _treestr, _fragments = parsetrees[x]
            thistree = (
                '%(n)d. [%(prob)s] '
                '<a href="/annotate/accept?%(urlprm)s">accept this tree</a>; '
                '<a href="/annotate/edit?%(urlprm)s">edit</a>; '
                '<a href="/annotate/deriv?%(urlprm)s">derivation</a>\n\n'
                % dict(n=x + 1,
                       prob=probstr(prob),
                       urlprm=urlencode(dict(urlprm, n=x + 1, dec=depth))))
            return ('<span id="d%d" style="display: none; ">%stree %d:\n'
                    '%s</span>' % (n, depth * '\t', x + 1, thistree))
        left = tree.children_left[n]
        right = tree.children_right[n]
        # exleft / exright: index of the first training row (candidate tree)
        # whose decision path passes through that child.
        return ('<span id=d%(n)d style="display: %(display)s; ">'
                '%(indent)s%(constituent)s '
                '<a href="javascript: showhide(\'d%(right)s\', \'d%(left)s\', '
                '\'dd%(exright)s\', \'%(numtrees)s\'); ">'
                'good constituent</a> '
                '<a href="javascript: showhide(\'d%(left)s\', \'d%(right)s\', '
                '\'dd%(exleft)s\', \'%(numtrees)s\'); ">'
                'bad constituent</a> '
                '%(subtree1)s%(subtree2)s</span>' % dict(
                    n=n,
                    display='block' if n == 0 else 'none',
                    indent=depth * 4 * ' ',
                    constituent=featurenames[tree.feature[n]],
                    left=left,
                    right=right,
                    exleft=path[:, left].nonzero()[0][0],
                    exright=path[:, right].nonzero()[0][0],
                    numtrees=len(parsetrees),
                    subtree1=rec(tree, left, depth + 1),
                    subtree2=rec(tree, right, depth + 1),
                ))

    nodes = rec(estimator.tree_)

    # Emit one hidden drawing per distinct tree referenced by any node.
    leaves = []
    seen = set()
    for n in range(estimator.tree_.node_count):
        x = estimator.tree_.value[n].nonzero()[1][0]
        if x in seen:
            continue
        seen.add(x)
        _prob, xtree, _treestr, _fragments = parsetrees[x]
        thistree = DrawTree(xtree, sent).text(unicodelines=True,
                                              html=True,
                                              funcsep='-',
                                              morphsep='/',
                                              nodeprops='t%d' % (x + 1))
        leaves.append('<span id="dd%d" style="display: none; ">%s</span>' %
                      (x, thistree))
    return nodes + ''.join(leaves), estimator.tree_.max_depth, path
def Decison(self, q1, q2, q3, q4, q5, Prezzo):
    """Recommend a car brand ('marca') from the questionnaire answers.

    Loads ``Dataset/cars.csv``, keeps cars with ``mpgcitta`` of at least 24,
    filters on length according to ``q3``, one-hot encodes the categorical
    columns, fits a depth-7 decision tree predicting ``marca`` and predicts
    the brand for a synthetic car built from the answers and column means.

    :param q1: only echoed in the "Choices" printout
    :param q2: ``"Three"`` selects 2 doors, anything else 4 doors
    :param q3: ``True`` keeps cars at least 160 long, otherwise cars at most
        160 long; it also selects the aspiration dummy values and serves as
        a fallback for the drive-type encoding
    :param q4: ``"ANT"`` or ``"POST"`` select the drive-type dummy values
        when both 'rwd' and 'fwd' are present in the data
    :param q5: ``"BENZ"`` or ``"DIS"`` select the fuel dummy values when
        both 'gas' and 'diesel' are present; otherwise the column is dropped
    :param Prezzo: price to query with; when not an ``int`` the mean price of
        the filtered dataset is used
    :return: the predicted brand (first element of ``tree.predict``)
    """
    print(F"Choices: {q1} - {q2} - {q3} - {q4} - {q5}")
    # Sentinel marking a feature that is excluded from the query vector.
    DELETE = -1
    Lunghezza = 160
    ConsumoCitta = 24
    carsData = pd.read_csv("Dataset/cars.csv")
    # print(carsData.info())
    # print(carsData.head())
    # Drop the columns that are not useful for this dataset
    carsData = carsData.drop('cilindri', axis=1)
    carsData = carsData.drop('altezza', axis=1)
    carsData = carsData.drop('mpgautostrada', axis=1)
    carsData = carsData.drop('peso', axis=1)
    carsData = carsData.drop('symboling', axis=1)
    # Filter the cars on the relevant characteristics, then compute the means
    print(F"Esempi di partenza (!Wne): {carsData.shape}")
    carsData = carsData.drop(
        carsData[(carsData.mpgcitta < ConsumoCitta)].index)
    if q3 == True:  # not a city car ("nonUtilitaria")
        carsData = carsData.drop(
            carsData[(carsData.lunghezza < Lunghezza)].index)
    else:
        carsData = carsData.drop(
            carsData[(carsData.lunghezza > Lunghezza)].index)
    mediaCitta = carsData['mpgcitta'].mean()
    mediaLunghezza = carsData['lunghezza'].mean()
    mediaLarghezza = carsData['larghezza'].mean()
    mediaCilindrata = carsData['cilindrata'].mean()
    mediaCavalli = carsData['cavalli'].mean()
    if not isinstance(Prezzo, int):
        Prezzo = carsData['prezzo'].mean()
    data_tree = carsData
    # Apply the parameters chosen in the GUI and one-hot encode
    # (pd.get_dummies) where needed
    if q2 == "Three":
        porte = 2
    else:
        porte = 4
    # Both branches encode the column identically; only the dummy values
    # placed in the query vector differ.
    if q3 == True:
        AspirazioneMotore = [0, 1]
        data_tree = pd.get_dummies(data_tree,
                                   columns=["aspirazione"
                                            ])  # one-hot encoding
    else:
        AspirazioneMotore = [1, 0]
        data_tree = pd.get_dummies(data_tree, columns=["aspirazione"])
    # ``trazione`` / ``carburante`` hold the dummy values appended to the
    # query vector, or DELETE when the column is dropped instead of encoded.
    if q4 == "ANT" and ('rwd' in data_tree['trazione'].values) and (
            'fwd' in data_tree['trazione'].values):
        trazione = [1, 0]
        data_tree = pd.get_dummies(data_tree, columns=["trazione"])
    elif q4 == "POST" and ('rwd' in data_tree['trazione'].values) and (
            'fwd' in data_tree['trazione'].values):
        trazione = [0, 1]
        data_tree = pd.get_dummies(data_tree, columns=["trazione"])
    elif q3 == True:
        trazione = [1, 0]
        data_tree = pd.get_dummies(data_tree, columns=["trazione"])
    else:
        trazione = DELETE
        data_tree = data_tree.drop('trazione', axis=1)
    if q5 == "BENZ" and ('gas' in data_tree['carburante'].values) and (
            'diesel' in data_tree['carburante'].values):
        carburante = [0, 1]
        data_tree = pd.get_dummies(data_tree, columns=["carburante"])
    elif q5 == "DIS" and ('gas' in data_tree['carburante'].values) and (
            'diesel' in data_tree['carburante'].values):
        carburante = [1, 0]
        data_tree = pd.get_dummies(data_tree, columns=["carburante"])
    else:
        carburante = DELETE
        data_tree = data_tree.drop('carburante', axis=1)
    # Build the decision tree
    from sklearn.tree import DecisionTreeClassifier
    from IPython.display import Image
    from sklearn.tree import export_graphviz
    import os
    from subprocess import call
    from matplotlib import pyplot as plt
    x = data_tree.drop(['marca'], axis=1)
    y = carsData['marca']
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
    print(F"Esempi su chi abbiamo fatto train e test: {x.shape}")
    tree = DecisionTreeClassifier(criterion="gini", max_depth=7)
    tree.fit(x_train, y_train)
    y_pred_train = tree.predict(x_train)
    y_pred = tree.predict(x_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred)
    print("ACCURACY: TRAIN=%.4f TEST=%.4f" % (accuracy_train, accuracy_test))
    plt.scatter(y_train, y_pred_train)
    plt.xlabel("True value")
    plt.ylabel("Prediction")
    plt.show()
    # Assemble the query row by hand.
    # NOTE(review): the order here presumably has to match the column order
    # of ``x``; the two printouts below allow a visual check - confirm.
    value_list = [
        porte, mediaLunghezza, mediaLarghezza, mediaCilindrata, mediaCavalli,
        mediaCitta
    ]
    if trazione != DELETE:
        value_list = value_list + trazione
    if carburante != DELETE:
        value_list = value_list + carburante
    value_list = value_list + AspirazioneMotore
    value_list = value_list + [Prezzo]
    print(data_tree.columns.tolist())
    print(value_list)
    predizione = tree.predict([value_list])
    path = tree.decision_path([value_list])
    print(F"Predizione: {predizione[0]}")
    # Extend PATH with the Graphviz directory of the active conda environment;
    # raises KeyError when CONDA_PREFIX is not set.
    os.environ['PATH'] = os.environ['PATH'] + ';' + os.environ[
        'CONDA_PREFIX'] + r"\Library\bin\graphviz"
    export_graphviz(tree,
                    out_file="treecity.dot",
                    feature_names=None,
                    rounded=True,
                    precision=2,
                    filled=True,
                    class_names=True)
    # Convert the DOT file to PNG with the external ``dot`` program.
    call(['dot', '-Tpng', 'treecity.dot', '-o', 'treecity.png'])
    Image(filename='treecity.png')
    print(path)
    return predizione[0]
special_characters=True) graph = pydotplus.graph_from_dot_data(dot_data) for node in graph.get_node_list(): if node.get_attributes().get('label') is None: continue if 'samples = ' in node.get_attributes()['label']: labels = node.get_attributes()['label'].split('<br/>') for i, label in enumerate(labels): if label.startswith('samples = '): labels[i] = 'samples = 0' node.set('label', '<br/>'.join(labels)) node.set_fillcolor('white') samples = (x[:1]) decision_paths = my_tree.decision_path(samples) for decision_path in decision_paths: for n, node_value in enumerate(decision_path.toarray()[0]): if node_value == 0: continue node = graph.get_node(str(n))[0] node.set_fillcolor('green') labels = node.get_attributes()['label'].split('<br/>') for i, label in enumerate(labels): if label.startswith('samples = '): labels[i] = 'samples = {}'.format(int(label.split('=')[1]) + 1) node.set('label', '<br/>'.join(labels)) filename = 'tree.png'
fpr, tpr, thresholds = roc_curve(y_test.T, y_hat.T) y_hat_int = np.rint(y_hat).astype(int) tree = DecisionTreeClassifier() tree.fit(X_test.T, y_hat_int.T) apl = average_path_length(tree, X_test.T) apl_count = 0 apl_test = 0 point_count = 0 for i in range(X_test.shape[1]): test = tree.predict(X_test.T[i, :].reshape([1, 14])) print("test= " + str(test)) dense_matrix = tree.decision_path(X_test.T[i, :].reshape( [1, 14])).todense() print(dense_matrix) if (test == 0): point_count += 1 dense_matrix = tree.decision_path(X_test.T[i, :].reshape( [1, 14])).todense() print(dense_matrix) count = 0 for k in range(dense_matrix.shape[1]): if (dense_matrix[0, k] == 1): count += 1 print(count) apl_count = apl_count + count average_apl = float(apl_count) / point_count print(float(apl_test) / point_count)
"node %s." % ( node_depth[i] * "\t", i, children_left[i], feature[i], threshold[i], children_right[i], )) print() # First let's retrieve the decision path of each sample. The decision_path # method allows to retrieve the node indicator functions. A non zero element of # indicator matrix at the position (i, j) indicates that the sample i goes # through the node j. node_indicator = estimator.decision_path(X_test) # Similarly, we can also have the leaves ids reached by each sample. leave_id = estimator.apply(X_test) # Now, it's possible to get the tests that were used to predict a sample or # a group of samples. First, let's make it for the sample. sample_id = 0 node_index = node_indicator.indices[ node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]] print('Rules used to predict sample %s: ' % sample_id) for node_id in node_index: if leave_id[sample_id] != node_id:
class DTChoice:
    """Choice maker based on sklearn decision trees

    Parameters
    ----------
    train_set:
        A list of public databases, or a two-element
        ``(metafeatures, regrets)`` pair of DataFrames (see
        ``from_dataframes``).
    mfs:
        A callable object for computing metafeatures on databases. The
        returned metafeatures must be a dictionary object mapping metafeature
        names to their values. The mfs object must have a ``sensitivities``
        attribute, which is a dictionary mapping metafeature names to their
        sensitivities.
    algs:
        A dictionary mapping names to algorithms. Each algorithm must
        implement a run method, which executes the algorithm on a database,
        and an error method, which computes the algorithm's error on a
        database.
    C:
        Value of C used to train the special regret-based decision tree
    trans:
        feature transformations to use. Best to be kept as 'default'
    """

    def __init__(self, train_set, mfs, algs, C=0, trans='default'):
        self.metafeatures = mfs
        self.algs = algs
        self.C = C
        self.is_used = self._usage_mask(mfs)
        # NOTE(review): any two-element ``train_set`` is treated as a
        # (metafeatures, regrets) pair, so a list of exactly two databases
        # is misread - confirm callers never pass one.
        if len(train_set) == 2:
            self.X = train_set[0]
            self.regrets = train_set[1]
        else:
            regrets = []
            X = []
            for t in tqdm(train_set):
                X.append(mfs(t))
                regrets.append({name: alg.error(t)
                                for name, alg in algs.items()})
            self.X = pd.DataFrame(X)
            self.regrets = pd.DataFrame(regrets)
        if trans == 'default':
            self.trans = MetaFeatureHelper.get_all_trans(self.X.shape[1])
        else:
            # NOTE(review): rows of an identity matrix are not callable, yet
            # every transformation is called below - verify this branch.
            self.trans = np.identity(len(mfs))
        self.T = self._transform(self.X).reset_index(drop=True)
        # Label of each training row: the algorithm with the smallest regret.
        self.y = self.regrets.idxmin(axis=1)
        self.model = DecisionTreeClassifier()
        self.retrain_model()

    @staticmethod
    def _usage_mask(mfs):
        """Return a 0/1 array flagging metafeatures with positive sensitivity."""
        usage = np.array(list(mfs.sensitivities.values()))
        usage[usage > 0] = 1
        return usage

    def _transform(self, X):
        """Apply every transformation to the clipped log of ``X``; one column each."""
        log_X = np.log(np.maximum(1e-8, X))
        return pd.DataFrame([t(log_X) for t in self.trans]) \
            .reset_index(drop=True).T

    @classmethod
    def from_dataframes(cls, mfs_array, regrets, mfs, algs, C=0,
                        trans='default'):
        """Build a DTChoice from precomputed metafeature and regret frames."""
        return cls((mfs_array, regrets), mfs, algs, C=C, trans=trans)

    # Change metafeatures
    def update_metas(self, train_set, mfs):
        """Recompute the metafeatures of ``train_set`` with ``mfs`` and retrain."""
        self.X = pd.DataFrame([mfs(t) for t in train_set])
        # Same log-space transformation as __init__ and get_best_alg, so the
        # model is trained on the feature space it later predicts on.
        self.T = self._transform(self.X).reset_index(drop=True)
        self.is_used = self._usage_mask(mfs)
        self.retrain_model()

    # Helper method
    def retrain_model(self):
        """Fit the model on the transformed features, labels, regrets and C."""
        self.model.fit(self.T, self.y, self.regrets, self.C)

    # Return the label of the best algorithm.
    def get_best_alg(self, data, budget):
        """Predict the best algorithm for ``data`` from noisy metafeatures.

        ``budget`` is divided evenly over the metafeatures flagged in
        ``is_used``; Laplace noise with scale ``sensitivity / share`` is added
        to every metafeature before the log transform and prediction.

        Returns ``(label, spent)`` where ``spent`` is the per-feature share
        multiplied by the number of used metafeatures found on the decision
        path.
        """
        nnz = np.count_nonzero(self.is_used)
        feature_budget = budget / nnz
        noisy_X = pd.DataFrame(self.metafeatures(data), index=[0]) \
            + pd.DataFrame(self.metafeatures.sensitivities, index=[0]) \
            .apply(lambda x: np.random.laplace(0, x/feature_budget))
        X = self._transform(noisy_X)

        # Work out which raw metafeatures the decision path relied on.
        # NOTE(review): this reads ``.data`` of the decision-path matrix and
        # treats the values as feature indices - confirm against the tree
        # implementation in use.
        S = self.X.shape[1]
        used = np.zeros(S)
        node_counts = self.model.decision_path(X).data-1
        U = np.unique(node_counts[:-1])
        used[U[U < S]] = 1
        U = U[U >= S] - S
        used += np.any([self.trans[i].coefs for i in U], axis=0)
        used = used > 0
        nfeature_used = self.is_used.dot(used)
        alg = self.model.predict(X)[0]
        return alg, nfeature_used * feature_budget

    # Choose and run the best algorithm in a DP way
    def choose(self, data, ratio=0.3):
        """Pick an algorithm for ``data``, charging the selection to its epsilon.

        A ``ratio`` share of ``data.epsilon`` is offered to the selection;
        afterwards ``data.epsilon`` is the original value minus what the
        selection reports as spent. Returns the chosen algorithm object.
        """
        budget = data.epsilon*ratio
        tot_eps = data.epsilon
        data.epsilon -= budget
        (best, used) = self.get_best_alg(data, budget)
        data.epsilon = tot_eps - used
        return self.algs[best]

    def get_errors(self, data, ratio=0.3):
        """Return a one-row frame of every algorithm's error plus column ``'cm'``.

        ``'cm'`` is the error of the selected algorithm, evaluated after the
        selection cost has been subtracted from ``data.epsilon``.
        """
        budget = data.epsilon*ratio
        errors = pd.DataFrame([{name: alg.error(data)
                                for name, alg in self.algs.items()}])
        (best, used) = self.get_best_alg(data, budget)
        best_alg = self.algs[best]
        data.epsilon = data.epsilon - used
        R = best_alg.error(data)
        errors['cm'] = R
        return errors

    def get_approximate_regret(self, return_std=False, test_ratio=0.3):
        """
        Splits data into training and test and returns average regrets on the
        test split for each algorithm and for this DTChoice object (last
        entry). With ``return_std`` a ``(mean, std)`` pair is returned.

        The DTChoice regret is approximate (and an underestimate) for two
        reasons. Let A = ratio*epsilon and B = (1-ratio)*epsilon. First, we
        don't add Laplace(A) noise to the metafeatures when we predict on
        them. Second, the algorithm we choose isn't run with B budget---it's
        run with epsilon budget instead.
        """
        # Split the regret table, not the labels: the rest of the method
        # needs per-algorithm regrets for every test row.
        X_train, X_test, regrets_train, regrets_test = train_test_split(
            self.X, self.regrets, test_size=test_ratio)
        model = DecisionTreeClassifier()
        model.fit(X_train, regrets_train.idxmin(axis=1))
        algs = model.predict(X_test)
        # Regret of the predicted algorithm on each test row
        # (DataFrame.lookup was removed in pandas 2.0).
        test_values = np.asarray(regrets_test)
        chosen_cols = regrets_test.columns.get_indexer(algs)
        perfs = test_values[np.arange(len(test_values)), chosen_cols]
        R = np.concatenate((test_values, perfs[:, None]), axis=1)
        R = R - np.min(R, axis=1)[:, None]
        if return_std:
            return (R.mean(axis=0), R.std(axis=0))
        return R.mean(axis=0)

    def print_tree(self, of=None):
        """Export the model to Graphviz.

        Returns a ``graphviz.Source`` when ``of`` is None. When ``of`` is
        given the DOT text goes to that file and None is returned.
        """
        dot_data = export_graphviz(self.model, out_file=of,
                                   filled=True, rounded=True)
        if dot_data is None:
            # export_graphviz returns the DOT text only without out_file.
            return None
        return graphviz.Source(dot_data)

    def print_arith_coef(self, idx):
        """Render transformation ``idx`` as ``'a*b / c*d'``.

        Metafeatures with coefficient 1 form the numerator, those with
        coefficient -1 the denominator.
        """
        coefs = self.trans[idx].coefs
        names = list(self.metafeatures.sensitivities.keys())
        top = []
        bot = []
        for i, name in enumerate(names):
            if coefs[i] == 1:
                top.append(name)
            elif coefs[i] == -1:
                bot.append(name)
        return '*'.join(top) + ' / ' + '*'.join(bot)
# "node %s." # % (node_depth[i] * "\t", # i, # children_left[i], # attribute_dict[str(feature[i])], # threshold[i], # children_right[i], # )) # print() # First let's retrieve the decision path of each sample. The decision_path # method allows to retrieve the node indicator functions. A non zero element of # indicator matrix at the position (i, j) indicates that the sample i goes # through the node j. node_indicator = estimator.decision_path(X_validation) # Similarly, we can also have the leaves ids reached by each sample. leave_id = estimator.apply(X_validation) # Now, it's possible to get the tests that were used to predict a sample or # a group of samples. First, let's make it for the sample. sample_id = 0 node_index = node_indicator.indices[ node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]] print('Rules used to predict sample %s: ' % sample_id) for node_id in node_index: if leave_id[sample_id] != node_id:
from sklearn import tree
import matplotlib.pyplot as plt
import seaborn as sb

# 1. Data set
iris_data = load_iris()

# 2. Split the data into a training part and a prediction (test) part
x_train, x_test, y_train, y_test = train_test_split(
    iris_data.data,
    iris_data.target,
    test_size=0.2,
)

# 3. Create the model => decision tree
model = DecisionTreeClassifier(random_state=42)

# 4. Train: fit(2-D features, 1-D labels)
model.fit(x_train, y_train)

# Class information for DecisionTreeClassifier:
# print(help(DecisionTreeClassifier))
# Hyper-parameters of DecisionTreeClassifier:
# max_depth - maximum tree depth, max_features - maximum number of features,
# min_samples_split - minimum number of samples needed to split a node

# e.g. [0.01 0.05 0.55 0.4] = [sepal length, sepal width, petal length, petal width]
print(np.round(model.feature_importances_, 2))

# Additional methods
for report in (model.get_depth, model.get_n_leaves, model.get_params):
    print(report())
print(model.decision_path(x_train))
print(model.score(x_test, y_test))  # same result as accuracy_score
# - threshold, threshold value at the node # # Using those arrays, we can parse the tree structure: n_nodes = clf.tree_.node_count children_left = clf.tree_.children_left children_right = clf.tree_.children_right feature = clf.tree_.feature threshold = clf.tree_.threshold # First let's retrieve the decision path of each sample. The decision_path # method allows to retrieve the node indicator functions. A non zero element of # indicator matrix at the position (i, j) indicates that the sample i goes # through the node j. node_indicator = clf.decision_path(X_train) # 获取训练集规则路径 d_paths = node_indicator.todense() # 规则路径去重并统计对应路径的出现次数 d_uniques, d_idxs, d_counts, = np.unique(d_paths, axis=0, return_counts=True, return_index=True) # 规则打印 print('\nThe most precise rules are the following:') # i = 0 # for rule in d_uniques: for i, item in enumerate(d_uniques): count_max_idx = np.argmax(d_counts) rule = d_uniques[count_max_idx] # 获取通过次数最多的路径 print("\nRules_{0}, passed counts:{1}".format(i, d_counts.max()))
node_depth[i] * "\t", i, children_left[i], feature[i], threshold[i], children_right[i], )) print() """ # First let's retrieve the decision path of each sample. The decision_path # method allows to retrieve the node indicator functions. A non zero element of # indicator matrix at the position (i, j) indicates that the sample i goes # through the node j. """ node_indicator = tree.decision_path(X) # Similarly, we can also have the leaves ids reached by each sample. leave_id = tree.apply(X) # Now, it's possible to get the tests that were used to predict a sample or # a group of samples. First, let's make it for the sample. #sample_id = 0 #node_index = node_indicator.indices[node_indicator.indptr[sample_id]: # node_indicator.indptr[sample_id + 1]] # #print('Rules used to predict sample %s: ' % sample_id) #for node_id in node_index: # if leave_id[sample_id] == node_id: # continue
def _prune_reduced_error(
        cls,
        model: DecisionTreeClassifier,
        X: numpy.array,
        y: numpy.array,
        step_score_drop: float = 0,
        max_score_drop: float = 0) -> DecisionTreeClassifier:
    """Return a pruned deep copy of ``model`` with sibling leaf pairs merged.

    The function repeatedly looks for a parent whose two children are both
    leaves and evaluates, on ``(X, y)``, how the accuracy would change if the
    parent predicted its own majority class instead.  The pair is merged
    unless the change is rejected by one of the two thresholds.  After every
    merge the scan restarts; it stops after a full pass without a merge.

    Args:
        model: Fitted classifier; it is deep-copied and the copy is edited.
        X: Samples used to score candidate merges.
        y: Labels for ``X``; indexed with integer index arrays.
        step_score_drop: A merge is rejected when ``score_delta /
            current_score`` is below this value (checked only when
            ``current_score`` is non-zero).
        max_score_drop: A merge is rejected when ``score_delta /
            init_score`` is below this value (checked only when
            ``init_score`` is non-zero).

    Returns:
        The pruned copy of ``model``.
    """
    pruned = deepcopy(model)
    nodes = pruned.tree_

    def collapse(node):
        """Make ``node`` a leaf and mark both former children as undefined."""
        for child in (nodes.children_left[node], nodes.children_right[node]):
            nodes.children_left[child] = Tree.TREE_UNDEFINED
            nodes.children_right[child] = Tree.TREE_UNDEFINED
        nodes.children_left[node] = Tree.TREE_LEAF
        nodes.children_right[node] = Tree.TREE_LEAF
        nodes.feature[node] = Tree.TREE_UNDEFINED

    def majority_label(node):
        """Class with the largest entry in ``nodes.value`` for ``node``."""
        return pruned.classes_[numpy.argmax(nodes.value[node, 0, :])]

    # child id -> parent id, left links first, then right links.
    parent_of = {}
    for child_ids in (nodes.children_left, nodes.children_right):
        for node_id, child in enumerate(child_ids):
            if child != Tree.TREE_LEAF:
                parent_of[child] = node_id

    leaves = list(numpy.nonzero(nodes.children_left == Tree.TREE_LEAF)[0])

    # leaf id -> indices of the samples in X whose decision path hits it.
    samples_at = {}
    for leaf, path_row in zip(leaves, pruned.decision_path(X).T[leaves]):
        samples_at[leaf] = path_row.nonzero()[1]

    predicted = pruned.predict(X)
    init_score = current_score = accuracy_score(y, predicted)

    checked = set()  # parents whose merge has already been rejected
    while True:
        for slot, leaf in enumerate(leaves):
            parent = parent_of.get(leaf)
            if parent is None or parent in checked:
                continue

            # The sibling is whichever child of the parent is not ``leaf``.
            sibling = nodes.children_right[parent]
            if sibling == leaf:
                sibling = nodes.children_left[parent]
            sibling_is_leaf = (
                nodes.children_left[sibling] == Tree.TREE_LEAF
                and nodes.children_right[sibling] == Tree.TREE_LEAF)
            if not sibling_is_leaf:
                continue

            rows_leaf = samples_at[leaf]
            rows_sibling = samples_at[sibling]
            rows_parent = numpy.concatenate((rows_leaf, rows_sibling))

            label_leaf = majority_label(leaf)
            label_sibling = majority_label(sibling)
            merged_label = majority_label(parent)

            # Change in the number of correct predictions, as a fraction of
            # all samples in X.
            hits_merged = numpy.sum(merged_label == y[rows_parent])
            hits_leaf = numpy.sum(label_leaf == y[rows_leaf])
            hits_sibling = numpy.sum(label_sibling == y[rows_sibling])
            score_delta = (hits_merged - hits_leaf - hits_sibling) \
                / X.shape[0]

            if (init_score != 0
                    and score_delta / init_score < max_score_drop) or \
                    (current_score != 0
                     and score_delta / current_score < step_score_drop):
                checked.add(parent)
                continue

            # Accept the merge: the parent takes the place of ``leaf``.
            current_score += score_delta
            leaves.remove(sibling)
            leaves[slot] = parent
            collapse(parent)
            predicted[rows_parent] = merged_label
            del samples_at[leaf], samples_at[sibling]
            samples_at[parent] = rows_parent
            break  # restart the scan over the updated leaf list
        else:
            # A complete pass produced no merge: nothing left to prune.
            break
    return pruned
# Walk the tree from the root, recording each node's depth and flagging the
# nodes whose two child ids are equal.
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # start at the root node id, paired with its parent depth
while stack:
    node_id, parent_depth = stack.pop()
    current_depth = parent_depth + 1
    depth[node_id] = current_depth

    left_id, right_id = child_left[node_id], child_right[node_id]
    if left_id == right_id:
        # Equal child ids: this node is not a test node.
        is_leaves[node_id] = True
        continue

    # Test node: visit both children one level deeper.
    stack.append((left_id, current_depth))
    stack.append((right_id, current_depth))

print(" \nThe binary tree structure has %s nodes" % n_nodes)

# Node indicator matrix for the test samples.
node_indicator = classifier.decision_path(X_test)

# Ids of the leaves reached by each test sample.
leave_id = classifier.apply(X_test)

# Node ids on the path of a single sample, read from the indicator matrix's
# indptr/indices arrays.
sample_id = 0
path_start = node_indicator.indptr[sample_id]
path_stop = node_indicator.indptr[sample_id + 1]
node_index = node_indicator.indices[path_start:path_stop]

#print(test_accuracy)
max_test_accuracy_pruning = 0
depth = classifier.tree_.max_depth
def test_decision_path_hardcoded():
    """Check decision_path on a depth-1 tree against a fixed indicator matrix."""
    stump = DecisionTreeClassifier(random_state=0, max_depth=1)
    stump = stump.fit(iris.data, iris.target)

    # Only the first two samples are routed through the fitted tree.
    first_two = iris.data[:2]
    observed = stump.decision_path(first_two).toarray()

    expected = [[1, 1, 0],
                [1, 0, 1]]
    assert_array_equal(observed, expected)