Example #1
def compute_tree_ensemble_accuracy(trees, X_test, y_test):
    # Accumulate a weighted sum of every tree's class probabilities, then
    # take the arg-max class and compare it against the labels.
    weights_list = prepare_uniform_weights(2, len(trees))
    weights_norm = normalize_weights(weights_list)
    out_weight = None
    for tree_id, tree in enumerate(trees):
        pred = tree.predict_proba(X_test)  # shape (n_samples, n_classes)
        weighted = weights_norm[tree_id] * torch.tensor(pred,
                                                        dtype=torch.float)
        out_weight = weighted if out_weight is None else out_weight + weighted

    _, pred_label = torch.max(out_weight.data, 1)
    correct_num = (pred_label == torch.LongTensor(y_test)).sum().item()
    return correct_num / len(y_test)
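A minimal harness for the function above (not from the original source): prepare_uniform_weights and normalize_weights are project helpers that are not shown, so the stubs below are assumptions about their behavior.

import torch
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

def prepare_uniform_weights(n_classes, n_trees):
    # Assumed behavior: one uniform weight per tree.
    return torch.ones(n_trees)

def normalize_weights(weights):
    # Assumed behavior: rescale the weights to sum to 1.
    return weights / weights.sum()

X, y = make_classification(n_samples=200, random_state=0)
trees = [DecisionTreeClassifier(max_depth=d, random_state=0).fit(X, y)
         for d in (2, 4)]
print(compute_tree_ensemble_accuracy(trees, X, y))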
Example #2
def best_prediction_threshold(tree, X_test, y_test, lower_bound, gap):
    '''
    Find the probability threshold that gives the best classification
    accuracy on the test data.
    Inputs:
        tree: fitted decision tree classifier
        X_test: dataframe of features used for testing
        y_test: series of labels used for testing
        lower_bound: smallest threshold to try
        gap: step size between candidate thresholds
    Returns:
        best_threshold: threshold with the highest overall accuracy
        best_accuracy: overall accuracy at that threshold
        current_acc0: accuracy on cases labelled 0 in the testing data
        current_acc1: accuracy on cases labelled 1 in the testing data
    '''
    gn, g0n, g1n = len(y_test), list(y_test).count(0), list(y_test).count(1)
    y_predp = tree.predict_proba(X_test)[:, 1]
    best_threshold, best_accuracy = None, 0
    current_acc0, current_acc1 = 0, 0

    for threshold in np.arange(lower_bound, 1, gap):
        # Binarize the probabilities at this threshold without overwriting
        # y_predp, which must stay numeric for later iterations.
        y_pred = pd.Series((y_predp > threshold).astype(int))
        evaluated = list(zip(y_pred, y_test))

        acc_all, acc_0, acc_1 = accuracy_calculation(gn, g0n, g1n, evaluated)
        if acc_all > best_accuracy:
            best_accuracy = acc_all
            current_acc0, current_acc1 = acc_0, acc_1
            best_threshold = threshold

    return best_threshold, best_accuracy, current_acc0, current_acc1
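accuracy_calculation is not defined in this example; a plausible stand-in, consistent with how it is called (overall accuracy plus per-class accuracies for labels 0 and 1), might look like this:

def accuracy_calculation(gn, g0n, g1n, evaluated):
    # gn, g0n, g1n: total, class-0, and class-1 sample counts;
    # evaluated: list of (prediction, truth) pairs.
    correct = sum(1 for pred, true in evaluated if pred == true)
    correct0 = sum(1 for pred, true in evaluated if true == 0 and pred == 0)
    correct1 = sum(1 for pred, true in evaluated if true == 1 and pred == 1)
    return correct / gn, correct0 / g0n, correct1 / g1n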
Example #3
def yaraified_rf_prediction(rf, tree_thres, percent_match, X):
    # For each sample, return the fraction of trees whose positive-class
    # probability exceeds tree_thres. (percent_match is unused here; the
    # caller presumably compares the returned fractions against it.)
    results = []
    for tree in rf.estimators_:
        results.append(tree.predict_proba(X)[:, -1] > tree_thres)
    results = np.array(results)
    return results.transpose().sum(axis=1) / len(rf.estimators_)
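Usage sketch for the function above (synthetic data; tree_thres=0.5 chosen arbitrarily):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, random_state=0)
rf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
print(yaraified_rf_prediction(rf, tree_thres=0.5, percent_match=0.5, X=X[:5]))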
Example #4
def tree_predicition(tree, input_data):
    # Run the tree prediction.
    prediction = tree.predict_proba(input_data)
    # classes_ gives the column order of predict_proba's output.
    classes = tree.classes_
    # Map each output class to its predicted probability.
    return dict(zip(classes, prediction[0]))
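Usage sketch on the iris data (assumed setup); the returned dict maps each class label to its probability:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
clf = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target)
print(tree_predicition(clf, iris.data[:1]))  # e.g. {0: 1.0, 1: 0.0, 2: 0.0}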
Example #5
def fedboost(trees, args, net_dataidx_map, X_train, y_train, X_test, y_test,
             task_type):
    # Greedy forward selection: each party repeatedly adds the tree that
    # most lowers its local log-loss (the party's own local trees are
    # penalized by args.lambda_boost), then evaluates the selected ensemble
    # on the test set.
    for party_id in range(args.n_parties):
        dataidxs = net_dataidx_map[party_id]
        X_train_local = X_train[dataidxs]
        y_train_local = y_train[dataidxs]
        current_pred = np.zeros((len(y_train_local), 2))
        ensemble_tree_ids = np.zeros(args.n_ensemble_models, dtype=int)
        isselected = np.zeros(len(trees), dtype=int)
        for final_tree_id in range(args.n_ensemble_models):
            temp_loss = float("inf")
            temp_tree_id = -1
            for tree_id, tree in enumerate(trees):
                if isselected[tree_id] == 1:
                    continue
                if task_type == "binary_cls":
                    temp_pred = current_pred + tree.predict_proba(
                        X_train_local)
                    current_pred_norm = preprocessing.normalize(temp_pred,
                                                                axis=1,
                                                                norm='l1')
                    current_loss = metrics.log_loss(y_train_local,
                                                    current_pred_norm)
                    if tree_id in range(party_id * args.n_local_models,
                                        (party_id + 1) * args.n_local_models):
                        current_loss += args.lambda_boost
                    if current_loss < temp_loss:
                        temp_loss = current_loss
                        temp_tree_id = tree_id
                elif task_type == "reg":
                    print("not supported yet!")
                    exit(1)
            ensemble_tree_ids[final_tree_id] = temp_tree_id
            current_pred += args.lr * trees[temp_tree_id].predict_proba(
                X_train_local)
            isselected[temp_tree_id] = 1
        ens_acc = compute_tree_ensemble_accuracy(
            [trees[i] for i in ensemble_tree_ids], X_test, y_test)
        logger.info("In party %d" % party_id)
        logger.info("Selected trees %s" %
                    " ".join(str(e) for e in ensemble_tree_ids))
        logger.info("Boost acc: %f" % ens_acc)
Example #6
def produce_probabilities(trees, X):
    # One row per sample, one column per tree: each tree's positive-class
    # probability.
    probas = [tree.predict_proba(X)[:, 1] for tree in trees]
    return np.array(probas).transpose()
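Usage sketch (synthetic data): the result has one row per sample and one column per tree.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=50, random_state=1)
trees = [DecisionTreeClassifier(max_depth=d, random_state=1).fit(X, y)
         for d in (1, 3)]
print(produce_probabilities(trees, X[:3]).shape)  # (3, 2)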
Example #7
# Class 0: Iris-Setosa,
#       1: Iris-Versicolor
#       2: Iris-Virginica
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_text

iris = load_iris()
print("Target values (all 150)")
print(iris.target[0:150])

# Build the tree
arbol = DecisionTreeClassifier(max_depth=5, random_state=100)
# Number of levels: 5
arbol.fit(iris.data, iris.target)  # train the tree

# Get predictions
print("PREDICTIONS ----------------------------")
print(arbol.predict(iris.data[100:140]))
# tree.plot_tree(arbol)

r = export_text(arbol, feature_names=iris['feature_names'])
print(r)

# To inspect the probabilities we can use the predict_proba method
print(arbol.predict_proba(iris.data[47:53]))

# The first class (Setosa) is the first column, the second class the second
# column, and so on. This is the result:
# [[1.         0.         0.        ]
#  [1.         0.         0.        ]
#  [1.         0.         0.        ]
#  [0.         0.90740741 0.09259259]
#  [0.         0.90740741 0.09259259]
#  [0.         0.90740741 0.09259259]]
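The column order of predict_proba always follows the estimator's classes_ attribute, which is worth checking explicitly rather than assuming:

print(arbol.classes_)                # [0 1 2]
for cls, name in zip(arbol.classes_, iris.target_names):
    print(cls, name)                 # 0 setosa, 1 versicolor, 2 virginica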
Example #8
    clf = tree.DecisionTreeClassifier()
    tree = clf.fit(data, target)  # note: rebinds `tree`, shadowing the module

    regiao = 'Grande Florianópolis'
    localizacao = 'URBANA'
    serie = '8º ano'

    regiao_enc = encoders['regiao'].transform([regiao])[0]
    localizacao_enc = encoders['localizacao'].transform([localizacao])[0]
    serie_enc = encoders['serie'].transform([serie])[0]

    prediction_enc = tree.predict([[regiao_enc, localizacao_enc, serie_enc]])
    prediction = encoders['status'].inverse_transform(prediction_enc)[0]

    proba = tree.predict_proba([[regiao_enc, localizacao_enc, serie_enc]])[0]

    # Output
    print('\nClassifier accuracy: ' + str(tree.score(data, target)))

    print('\nThe prediction returned: ' + prediction)
    if prediction == 'Suficiente':
        print('The student meets the prerequisites of Bolsa Família.')
    else:
        print(
            'The student does not meet the prerequisites of Bolsa Família '
            'and should be removed from the program.'
        )

    print('\nWeight of each variable:')
    for i in range(len(default_csv[0]) - 1):
        print('\t' + default_csv[0][i] + ': ' +
              str(tree.feature_importances_[i]))
Example #9
# naive bayes (58%)

nb = MultinomialNB()
grid = GridSearchCV(nb, {}, cv=5)
grid.fit(X_small, y)

# SVM

clf = svm.SVC()
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': [.000001, .00001, .0001, .001, .01, .1, 1, 10, 100,
                        1000, 10000],
              'C': [float(x) for x in range(1, 50)],
              'degree': range(1, 20)}
grid = GridSearchCV(clf, param_grid , cv=5)
grid.fit(X_small, y)

smoothie_tree.predict(c[smoothie_features])

tree.predict_proba(np.array(c.loc[1][smoothie_features]).reshape(1, -1))

# Write a program that takes in a time and returns the advantageous areas
# (zips) of SF and the disadvantageous areas (zips); a sketch of the
# time-bucketing piece follows the period constants below.

SAN_FRANCISCO_ZIP_CODES = [94102, 94109, 94123, 94117, 94134, 94112, 94124, 94121, 94133, 94116, 94115, 94110, 94127, 94114, 94107, 94132, 94122, 94103, 94105, 94104, 94108, 94118, 94158, 94111, 94131, 94130, 94014, 94129, 94015]
# [94014, 94015] not present in sf zip codes I parsed for application


from datetime import datetime

time = datetime.now()

WEEKEND_DAYS    = ['Saturday', 'Sunday']
EARLY_MORNING   = [5,6,7]
LATE_MORNING    = [8,9,10]
EARLY_AFTERNOON = [11,12,13]
LATE_AFTERNOON  = [14,15,16]
EARLY_EVENING   = [17,18,19]
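A minimal sketch of the bucketing described above, using only the constants defined here (the overnight hours and the zip-ranking model itself are not shown in the source):

def time_period(dt):
    # Map an hour onto the named periods above; hours outside 5-19 fall
    # through to 'other'.
    for name, hours in [('early_morning', EARLY_MORNING),
                        ('late_morning', LATE_MORNING),
                        ('early_afternoon', EARLY_AFTERNOON),
                        ('late_afternoon', LATE_AFTERNOON),
                        ('early_evening', EARLY_EVENING)]:
        if dt.hour in hours:
            return name
    return 'other'

print(time_period(time), time.strftime('%A') in WEEKEND_DAYS)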
Example #10
Y1B

clfR2 = tree.DecisionTreeRegressor()
clfR2 = clfR2.fit(X1A, Y1A)
clfR2.predict([[26, 2]])   # expected salary is 30
# Note: DecisionTreeRegressor has no predict_proba; only classifiers do.

clfC2 = tree.DecisionTreeClassifier()
clfC2 = clfC2.fit(X1A, Y1B)
Y1B1 = np.array(pd.get_dummies(data['mnc']))
Y1B1
clfC2 = clfC2.fit(X1A, Y1B1)
clfC2.predict([[26, 2]])
data
clfC2.predict(X1A)
clfC2.predict_proba(X1A)

data
IVs = ['age','experience']
DV1 = ['salary']
DV2 = ['mnc']

#plotting trees
#plot_tree(decision_tree, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, proportion=False, rotate='deprecated', rounded=False, precision=3, ax=None, fontsize=None)

#regression tree
tree.plot_tree(clfR2)  # class names not available for a regressor
tree.plot_tree(clfR2, feature_names=IVs, class_names=DV1, filled=True, node_ids=True, proportion=True, fontsize=10)

#classification tree
tree.plot_tree(clfC2)
Example #11
Xtrain = trainM[:, 1:]
ytrain = trainM[:, 0]
ytest = testM[:, 0]
Xtest = testM[:, 1:]

#Grid search with param grid having different set of parameters
param_grid = {
    'max_depth': np.arange(3, 10),
    # 'min_impurity_split' was removed from scikit-learn; use
    # 'min_impurity_decrease' instead if an impurity stopping rule is needed.
    'min_samples_leaf': np.arange(1, 5),
    'min_samples_split': np.arange(3, 10)
}

from sklearn.model_selection import GridSearchCV
tree = GridSearchCV(DecisionTreeClassifier(), param_grid)  # rebinds `tree`

tree.fit(Xtrain, ytrain)
tree_preds = tree.predict_proba(Xtest)[:, 1]

print("Best accuracy possible and best parameters to achieve them ")

print(tree.best_score_, tree.best_params_)

#calculating weights based on imbalanced data
from sklearn.utils import class_weight as cw_utils
class_weight = cw_utils.compute_class_weight('balanced',
                                             classes=np.unique(ytrain),
                                             y=ytrain)
cw = {
    1: class_weight[0],
    2: class_weight[1],
    3: class_weight[2],
    4: class_weight[3],
    5: class_weight[4],
    6: class_weight[5],
}
Example #12
    def predict_proba(self, X_test):
        # Average the positive-class probability across all bagged trees.
        Y_pred = np.zeros(len(X_test))
        for tree in self.tree_bags:
            Y_pred += tree.predict_proba(X_test)[:, 1]
        return Y_pred / self.n_tree
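This method belongs to a bagging-style wrapper whose class definition is not shown; a minimal sketch of a plausible enclosing class (the bootstrap training scheme is an assumption):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

class TreeBagger:
    def __init__(self, n_tree=10):
        self.n_tree = n_tree
        self.tree_bags = []

    def fit(self, X, y):
        rng = np.random.default_rng(0)
        for _ in range(self.n_tree):
            idx = rng.integers(0, len(X), len(X))  # bootstrap sample
            self.tree_bags.append(DecisionTreeClassifier().fit(X[idx], y[idx]))
        return self

    def predict_proba(self, X_test):  # as in Example #12 above
        Y_pred = np.zeros(len(X_test))
        for tree in self.tree_bags:
            Y_pred += tree.predict_proba(X_test)[:, 1]
        return Y_pred / self.n_tree

X = np.array([[0., 0.], [1., 1.], [0., 1.], [1., 0.]] * 10)
y = np.array([0, 1, 0, 1] * 10)
print(TreeBagger(n_tree=5).fit(X, y).predict_proba(X[:3]))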
Example #13
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn import tree
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X_train, y_train)
p_n = neigh.predict_proba(X_valid)

svm = SVC(probability=True)
svm.fit(X_train, y_train)
p_svm = svm.predict_proba(X_valid)

tree = tree.DecisionTreeClassifier()  # rebinds `tree`, shadowing the module
tree.fit(X_train, y_train)
p_tree = tree.predict_proba(X_valid)

##############################################
# (A commented-out Keras CNN section followed here: mnist imports,
# Sequential/Dense/Conv2D layer setup, and batch_size / num_classes /
# epochs constants.)
Example #14
def obtain_predicted_probabilities(tree, test_df, xcol):
    '''
    Obtain predicted probabilities of success for test data given a tree
    classifier and a list of the x columns. 
    '''
    return tree.predict_proba(test_df[xcol])[:,1]
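Usage sketch with a toy dataframe (column names assumed):

import pandas as pd
from sklearn.tree import DecisionTreeClassifier

df = pd.DataFrame({'a': [0, 1, 0, 1], 'b': [1, 1, 0, 0],
                   'label': [0, 1, 0, 1]})
xcol = ['a', 'b']
clf = DecisionTreeClassifier().fit(df[xcol], df['label'])
print(obtain_predicted_probabilities(clf, df, xcol))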
Example #15
scores = []
f1 = []
for i in classifiers:
    print(i)
    i.fit(X_train, y_train)
    y_pred = i.predict(X_test)
    cm = metrics.confusion_matrix(y_test, y_pred)
    scores.append(float(cm[0][0] + cm[1][1]) / cm.sum())
    f1.append(skl.metrics.f1_score(y_test, y_pred))

results = pd.DataFrame(list(zip(scores, f1)))

tree = DecisionTreeClassifier(criterion='gini', max_depth=15)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)        # hard labels for the metrics below
y_prob = tree.predict_proba(X_test)  # probabilities for the ROC curve
cm = metrics.confusion_matrix(y_test, y_pred)
float(cm[0][0] + cm[1][1]) / cm.sum()
skl.metrics.f1_score(y_test, y_pred)

y_prob[:, 1]
from sklearn.metrics import roc_curve

fpr1, tpr1, thresholds1 = roc_curve(y_test, y_prob[:, 1])
len(tpr1)
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(fpr1, tpr1, lw=1, label='ROC curve')
plt.axis([-0.05, 1.05, -0.05, 1.05])
plt.title("ROC curve")
plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Chance')
Example #16
def train_a_student_tree(trees,
                         public_data,
                         public_data_label,
                         n_classes,
                         stu_model,
                         gamma,
                         filter_query,
                         threshold=None,
                         n_partition=None,
                         apply_consistency=False,
                         is_final_student=False):
    vote_counts = np.zeros((len(public_data_label), n_classes))
    for tree_id, tree in enumerate(trees):
        y_pred = tree.predict(public_data)
        y_prob = tree.predict_proba(public_data)
        # print("y_pred:", y_pred)
        if is_final_student and apply_consistency:
            if tree_id % n_partition == 0:
                votes_base = y_pred
                votes_flag = np.ones(len(y_pred), dtype=int)
            else:
                for i, y in enumerate(y_pred):
                    if votes_flag[i]:
                        if int(y) != votes_base[i]:
                            votes_flag[i] = 0
                    if (tree_id % n_partition) == (n_partition -
                                                   1) and votes_flag[i]:
                        vote_counts[i][int(y)] += n_partition
        else:
            for i, y in enumerate(y_pred):
                if threshold is not None:
                    # y_prob[i] is a full probability vector; gate the vote
                    # on the tree's confidence in its predicted class.
                    if y_prob[i].max() >= threshold:
                        vote_counts[i][int(y)] += 1
                else:
                    vote_counts[i][int(y)] += 1
    vote_counts_origin = copy.deepcopy(vote_counts).astype("int")

    if gamma != 0:
        for i in range(vote_counts.shape[0]):
            vote_counts[i] += np.random.laplace(loc=0.0,
                                                scale=float(1.0 / gamma),
                                                size=vote_counts.shape[1])
    final_pred = np.argmax(vote_counts, axis=1)
    logger.info(
        "Labeling acc %f" %
        ((final_pred == public_data_label).sum() / len(public_data_label)))

    if filter_query:
        confident_query_idx = []
        for idx, row in enumerate(vote_counts_origin):
            top2_counts = row[np.argsort(row)[-2:]]
            if top2_counts[1] - top2_counts[0] > 2:
                # if top2_counts[1] > args.n_teacher_each_partition * args.query_filter_threshold:
                confident_query_idx.append(idx)

        print("len confident query idx:", len(confident_query_idx))
        logger.info("len confident query idx: %d" % len(confident_query_idx))
        # local_query_ds = data.Subset(public_ds, confident_query_idx)

        public_data = public_data[confident_query_idx]
        final_pred = [final_pred[i] for i in confident_query_idx]
        # query_data_size = int(len(y_test) * args.query_portion)

    stu_model.fit(public_data, final_pred)

    top1_class_counts = np.zeros(500)
    top2_class_counts = np.zeros(500)
    top_diff_counts = np.zeros(500)
    top2_counts_differ_one = 0
    for row in vote_counts_origin:
        # print(row)
        top2_counts = row[np.argsort(row)[-2:]]
        if top2_counts[1] - top2_counts[0] <= 1:
            top2_counts_differ_one += 1
        # print(top2_counts[1] - top2_counts[0])
        top_diff_counts[top2_counts[1] - top2_counts[0]] += 1
        top1_class_counts[top2_counts[1]] += 1
        top2_class_counts[top2_counts[0]] += 1

    return top2_counts_differ_one, vote_counts_origin
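The aggregation above is essentially a Laplace-noised plurality vote over the teachers' predictions, with gamma acting as the inverse noise scale; a toy illustration of just that step (assumed counts):

import numpy as np

vote_counts = np.array([[7., 3.], [5., 5.]])  # two queries, two classes
gamma = 0.5
noisy = vote_counts + np.random.laplace(0.0, 1.0 / gamma, vote_counts.shape)
print(np.argmax(noisy, axis=1))  # noisy winning class per query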
Example #17
# (tail of a confusion-matrix heatmap call whose opening line was lost in
# extraction)
#             yticklabels=['Non-default', 'Default'])
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title("Confusion Matrix - Decision Tree")

# In[55]:

print(classification_report(y_test, y_pred_DT))

# In[56]:

import os

# In[57]:

y_pred_proba_DT = tree.predict_proba(X_test)[:, 1]
fpr_DT, tpr_DT, _ = metrics.roc_curve(y_test, y_pred_proba_DT)
auc_DT = metrics.roc_auc_score(y_test, y_pred_proba_DT)

# In[61]:

plt.figure(figsize=(10, 7))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_DT, tpr_DT, label="Decision Tree, auc=" + str(round(auc_DT, 2)))
plt.legend(loc=4, title='Models', facecolor='white')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
Example #18
pre_score = lr.predict_proba(X_test)[:, 1]
print("Logistic regression model parameters: {parameter}".format(parameter=lr))
auto_model_analysis("logistic_regression", Y_train, train_pred, Y_test,
                    test_pred, pre_score)

# In[17]:

# Decision tree
from sklearn import tree
tree = tree.DecisionTreeClassifier(criterion="entropy",
                                   splitter='best',
                                   max_depth=4)  # rebinds `tree`
tree.fit(X_train, Y_train)
train_pred = tree.predict(X_train)
test_pred = tree.predict(X_test)
pre_score = tree.predict_proba(X_test)[:, 1]
print("决策树模型参数:{parameter}".format(parameter=tree))
auto_model_analysis("decision tree", Y_train, train_pred, Y_test, test_pred,
                    pre_score)

# In[18]:

# Neural network
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='sgd',
                    activation='tanh',
                    hidden_layer_sizes=(7, 28, 56, 2),
                    alpha=0.05,
                    learning_rate_init=0.02,
                    max_iter=10000)
mlp.fit(X_train, Y_train)
Example #19
def export_decision_path2(random_tree,
                          x,
                          out_file=None,
                          feature_names=None,
                          label='all',
                          special_characters=False,
                          node_ids=False,
                          rounded=True,
                          proportion=False,
                          impurity=True,
                          class_names=None):
    def recurse(the_tree, node_id):
        if node_id == _tree.TREE_LEAF:
            raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)
        left_child = the_tree.children_left[node_id]
        right_child = the_tree.children_right[node_id]
        if left_child != _tree.TREE_LEAF:
            child2parent[left_child] = (LEFT, node_id)
            child2parent[right_child] = (RIGHT, node_id)
            recurse(the_tree, left_child)
            recurse(the_tree, right_child)
        else:
            leafs.append(node_id)

    def node_to_str(tree, node, criterion):
        node_id = node[1]
        node_pos = node[0]
        # Generate the node content string
        if tree.n_outputs == 1:
            value = tree.value[node_id][0, :]
        else:
            value = tree.value[node_id]

        # Should labels be shown?
        labels = (label == 'root' and node_id == 0) or label == 'all'

        # PostScript compatibility for special characters
        if special_characters:
            characters = ['&#35;', '<SUB>', '</SUB>', '&le;', '<br/>', '>']
            node_string = '<'
        else:
            characters = ['#', '[', ']', '<=', '\\n', '"', '>']
            node_string = '"'

        # Write node ID
        if node_ids:
            if labels:
                node_string += 'node '
            node_string += characters[0] + str(node_id) + characters[4]

        # Write decision criteria
        if tree.children_left[node_id] != _tree.TREE_LEAF:
            # Always write node decision criteria, except for leaves
            if feature_names is not None:
                feature = feature_names[tree.feature[node_id]]
            else:
                feature = "X%s%s%s" % (characters[1], tree.feature[node_id],
                                       characters[2])
            node_string += '%s %s %s%s' % (
                feature, characters[3] if node_pos == 1 else characters[6],
                round(tree.threshold[node_id], 4), characters[4])

        # Write impurity
        if impurity:
            if isinstance(criterion, _criterion.FriedmanMSE):
                criterion = "friedman_mse"
            elif not isinstance(criterion, six.string_types):
                criterion = "impurity"
            if labels:
                node_string += '%s = ' % criterion
            node_string += (str(round(tree.impurity[node_id], 4)) +
                            characters[4])

        # Write node sample count
        if labels:
            node_string += 'samples = '
        if proportion:
            percent = (100. * tree.n_node_samples[node_id] /
                       float(tree.n_node_samples[0]))
            node_string += (str(round(percent, 1)) + '%' + characters[4])
        else:
            node_string += (str(tree.n_node_samples[node_id]) + characters[4])

        # Write node class distribution / regression value
        if proportion and tree.n_classes[0] != 1:
            # For classification this will show the proportion of samples
            value = value / tree.weighted_n_node_samples[node_id]
        if labels:
            node_string += 'value = '
        if tree.n_classes[0] == 1:
            # Regression
            value_text = np.around(value, 4)
        elif proportion:
            # Classification
            value_text = np.around(value, 2)
        elif np.all(np.equal(np.mod(value, 1), 0)):
            # Classification without floating-point weights
            value_text = value.astype(int)
        else:
            # Classification with floating-point weights
            value_text = np.around(value, 4)
        # Strip whitespace
        value_text = str(value_text.astype('S32')).replace("b'", "'")
        value_text = value_text.replace("' '", ", ").replace("'", "")
        if tree.n_classes[0] == 1 and tree.n_outputs == 1:
            value_text = value_text.replace("[", "").replace("]", "")
        value_text = value_text.replace("\n ", characters[4])
        node_string += value_text + characters[4]

        # Write node majority class
        if (class_names is not None and tree.n_classes[0] != 1
                and tree.n_outputs == 1):
            # Only done for single-output classification trees
            if labels:
                node_string += 'class = '
            if class_names is not True:
                class_name = class_names[np.argmax(value)]
            else:
                class_name = "y%s%s%s" % (characters[1], np.argmax(value),
                                          characters[2])
            node_string += class_name

        # Clean up any trailing newlines
        if node_string[-2:] == '\\n':
            node_string = node_string[:-2]
        if node_string[-5:] == '<br/>':
            node_string = node_string[:-5]

        return node_string + characters[5]

    # open out file
    return_string = False
    own_file = False
    if isinstance(out_file, six.string_types):
        if six.PY3:
            out_file = open(out_file, "w", encoding="utf-8")
        else:
            out_file = open(out_file, "wb")
        own_file = True
    if out_file is None:
        return_string = True
        out_file = six.StringIO()

    out_file.write('digraph Decision_path {\n')
    out_file.write('rankdir = LR;\n')
    out_file.write('node [shape=box];\n')

    scores = []
    for tree in random_tree.estimators_:
        scores.append((tree, float(tree.predict_proba(x)[:, 1])))
    iter = 0
    for each in sorted(scores, key=lambda x: x[1], reverse=True)[0:10]:
        child2parent = {}
        leafs = []
        recurse(each[0].tree_, 0)

        leaf_of_path = -1
        path = each[0].decision_path(x)[0].todense()[0, :].tolist()[0]
        print(path)
        idx = len(path) - 1
        while True:
            if path[idx] == 1:
                leaf_of_path = idx
                break
            idx -= 1

        for leaf in leafs:
            if leaf != leaf_of_path:
                continue
            path = []
            cur_node = (0, leaf)
            while True:
                path.append(cur_node)
                if cur_node[1] == 0:
                    break
                cur_node = child2parent[cur_node[1]]
            path.reverse()
            for node in path:
                out_file.write(
                    'f%dt%d [label=%s];\n' %
                    (iter, node[1],
                     node_to_str(each[0].tree_, node, each[0].criterion)))
                if node[1] != 0:
                    out_file.write(
                        'f%dt%d -> f%dt%d;\n' %
                        (iter, child2parent[node[1]][1], iter, node[1]))
        iter += 1
    out_file.write('}')

    if return_string:
        return out_file.getvalue()
    if own_file:
        out_file.close()
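Usage sketch. The function relies on module-level names not shown in the example: LEFT/RIGHT position markers, six, and sklearn's private _tree / _criterion modules (the imports and constant values below are assumptions). x must be a single row, since the score uses float(tree.predict_proba(x)[:, 1]).

import numpy as np
import six
from sklearn.tree import _tree, _criterion
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

LEFT, RIGHT = 0, 1  # assumed values for the position constants

X, y = make_classification(n_samples=100, random_state=0)
rf = RandomForestClassifier(n_estimators=5, random_state=0).fit(X, y)
dot = export_decision_path2(rf, X[:1])  # DOT source for the top-10 tree paths
print(dot[:80])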