def test_rf_classifier_decision_path_leaf(self):
     model = RandomForestClassifier(n_estimators=3, max_depth=3)
     X, y = make_classification(3, n_features=4, random_state=42)
     X = X[:, :2]
     model.fit(X, y)
     initial_types = [('input', FloatTensorType((None, X.shape[1])))]
     model_onnx = convert_sklearn(model,
                                  initial_types=initial_types,
                                  options={
                                      id(model): {
                                          'decision_leaf': True,
                                          'decision_path': True,
                                          'zipmap': False
                                      }
                                  },
                                  target_opset=TARGET_OPSET)
     sess = InferenceSession(model_onnx.SerializeToString())
     res = sess.run(None, {'input': X.astype(numpy.float32)})
     pred = model.predict(X)
     assert_almost_equal(pred, res[0].ravel())
     dec = model.decision_path(X)
     exp_leaf = path_to_leaf(model.estimators_, dec[0].todense(), dec[1])
     exp_path = binary_array_to_string(dec[0].todense())
     got_path = numpy.array([''.join(row) for row in res[2]])
     assert exp_path == got_path.ravel().tolist()
     assert exp_leaf.tolist() == res[3].tolist()
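# binary_array_to_string and path_to_leaf above are test helpers; a minimal
# sketch of what they might do (an assumption, not the library's actual code):
def binary_array_to_string_sketch(mat):
    # one '0'/'1' string per row of the dense node-indicator matrix
    return [''.join('1' if v else '0' for v in numpy.asarray(row).ravel())
            for row in mat]

def path_to_leaf_sketch(estimators, dense_path, n_nodes_ptr):
    # per sample and tree, the leaf is the last visited node inside that
    # tree's block of columns [n_nodes_ptr[t], n_nodes_ptr[t + 1])
    leaves = numpy.zeros((dense_path.shape[0], len(estimators)), dtype=numpy.int64)
    for t in range(len(estimators)):
        block = numpy.asarray(dense_path[:, n_nodes_ptr[t]:n_nodes_ptr[t + 1]])
        for i in range(block.shape[0]):
            leaves[i, t] = numpy.nonzero(block[i])[0][-1]
    return leaves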
def test_drf_classifier_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.RandomForestClassifier

    # Run the h2o4gpu version of RandomForestClassifier
    drf = Solver(backend=backend, random_state=1234, oob_score=True)
    print("h2o4gpu fit()")
    drf.fit(X, y)

    # Run the scikit-learn version of RandomForestClassifier
    from sklearn.ensemble import RandomForestClassifier
    drf_sk = RandomForestClassifier(random_state=1234, oob_score=True, max_depth=3)
    print("Scikit fit()")
    drf_sk.fit(X, y)

    if backend == "sklearn":
        assert (drf.predict(X) == drf_sk.predict(X)).all()
        assert (drf.predict_log_proba(X) == drf_sk.predict_log_proba(X)).all()
        assert (drf.predict_proba(X) == drf_sk.predict_proba(X)).all()
        assert drf.score(X, y) == drf_sk.score(X, y)  # score returns a scalar accuracy
        assert (drf.decision_path(X)[1] == drf_sk.decision_path(X)[1]).all()
        assert (drf.apply(X) == drf_sk.apply(X)).all()

        print("Estimators")
        print(drf.estimators_)
        print(drf_sk.estimators_)

        print("n_features")
        print(drf.n_features_)
        print(drf_sk.n_features_)
        assert drf.n_features_ == drf_sk.n_features_

        print("n_classes_")
        print(drf.n_classes_)
        print(drf_sk.n_classes_)
        assert drf.n_classes_ == drf_sk.n_classes_

        print("n_features")
        print(drf.classes_)
        print(drf_sk.classes_)
        assert (drf.classes_ == drf_sk.classes_).all()

        print("n_outputs")
        print(drf.n_outputs_)
        print(drf_sk.n_outputs_)
        assert drf.n_outputs_ == drf_sk.n_outputs_

        print("Feature importance")
        print(drf.feature_importances_)
        print(drf_sk.feature_importances_)
        assert (drf.feature_importances_ == drf_sk.feature_importances_).all()

        print("oob_score")
        print(drf.oob_score_)
        print(drf_sk.oob_score_)
        assert drf.oob_score_ == drf_sk.oob_score_
    def generate_mapping(self, X, y):
        X = self.dummy_encoder.transform(X.copy(deep=True))
        y = y.copy(deep=True)

        mapping = []

        for switch in self.dummy_encoder.mapping:
            col = switch.get('col')
            values = switch.get('mapping').copy(deep=True)

            if isinstance(self.max_depth, int):
                max_depth = self.max_depth
            elif isinstance(self.max_depth, float):
                max_depth = round(self.max_depth * values.shape[1])
            else:
                max_depth = min(self.max_depth[1],
                                round(self.max_depth[0] * values.shape[1]))
            if max_depth == 0:
                continue

            forest = RandomForestClassifier(
                max_depth=max_depth,
                n_estimators=self.n_estimators,
                n_jobs=self.n_jobs,
            )

            forest.fit(X[values.columns], y)

            subsets = self.get_subsets(forest.decision_path(values))
            subset_df = pd.DataFrame(data=subsets,
                                     index=values.index,
                                     columns=[
                                         '{col}_subset_{i}'.format(col=col,
                                                                   i=i)
                                         for i in range(subsets.shape[1])
                                     ])

            base_df = values.join(subset_df)

            mapping.append({'col': col, 'mapping': base_df})

        return mapping
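    def get_subsets(self, decision_path_result):
        # Sketch of an assumed implementation (the real method lives elsewhere
        # on this class): expand the (indicator, n_nodes_ptr) pair returned by
        # forest.decision_path into a dense 0/1 array of node memberships,
        # dropping each tree's root column, which is 1 for every row.
        indicator, n_nodes_ptr = decision_path_result
        roots = set(int(i) for i in n_nodes_ptr[:-1])
        keep = [i for i in range(indicator.shape[1]) if i not in roots]
        return indicator.toarray()[:, keep]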
Example #4
 def test_randomforestclassifier_decision_path(self):
     model = RandomForestClassifier(max_depth=2, n_estimators=2)
     X, y = make_classification(10, n_features=4, random_state=42)
     X = X[:, :2].astype(numpy.float32)
     model.fit(X, y)
     model_onnx = to_onnx(
         model,
         X,
         options={id(model): {
                      'decision_path': True,
                      'zipmap': False
                  }})
     sess = OnnxInference(model_onnx)
     res = sess.run({'X': X})
     pred = model.predict(X)
     self.assertEqualArray(pred, res['label'].ravel())
     prob = model.predict_proba(X)
     self.assertEqualArray(prob, res['probabilities'])
     dec = model.decision_path(X)
     exp = binary_array_to_string(dec[0].todense())
     got = numpy.array([''.join(row) for row in res['decision_path']])
     self.assertEqual(exp, got.tolist())
Example #5
 def test_randomforestclassifier_decision_path(self):
     model = RandomForestClassifier(max_depth=2, n_estimators=2)
     X, y = make_classification(10, n_features=4, random_state=42)
     X = X[:, :2]
     model.fit(X, y)
     initial_types = [('input', FloatTensorType((None, X.shape[1])))]
     model_onnx = convert_sklearn(
         model,
         initial_types=initial_types,
         options={id(model): {
                      'decision_path': True,
                      'zipmap': False
                  }})
     sess = InferenceSession(model_onnx.SerializeToString())
     res = sess.run(None, {'input': X.astype(numpy.float32)})
     pred = model.predict(X)
     assert_almost_equal(pred, res[0].ravel())
     prob = model.predict_proba(X)
     assert_almost_equal(prob, res[1])
     dec = model.decision_path(X)
     exp = binary_array_to_string(dec[0].todense())
     got = numpy.array([''.join(row) for row in res[2]])
     assert exp == got.ravel().tolist()
n_outputs = clf.n_outputs_  # the number of outputs when the model is built

importance = clf.feature_importances_  # an array containing the fractional importance of each feature

oob_score = clf.oob_score_  # score of the training dataset using an out-of-bag estimate:
# the mean accuracy each sample gets from the trees that did not see it during fitting

oob_decision_func = clf.oob_decision_function_

# now looking at the methods
leaf_indices = clf.apply(
    x_test
)  # apply() reports which leaf each row of x_test ends up in, for every tree

# using decision_path -
indicator, n_nodes_ptr = clf.decision_path(x_test)
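# decision_path returns a sparse CSR indicator of shape
# (n_samples, total number of nodes across all trees); n_nodes_ptr delimits
# each tree's block of columns. For example, the nodes visited by the first
# test sample in the first tree can be recovered like this:
first_tree_block = indicator[:, n_nodes_ptr[0]:n_nodes_ptr[1]]
nodes_visited_by_sample_0 = first_tree_block[0].nonzero()[1]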

parameters = clf.get_params()

predicted_array = clf.predict(x_test)  # running the test set through the model

log_mean_predicted_class = clf.predict_log_proba(x_test)

mean_predicted_class = clf.predict_proba(x_test)

mean_accuracy = clf.score(
    x_test, y_test
)  # returns the mean accuracy of the predictions on the test data (for a
# classifier, score is accuracy, not the R**2 coefficient of determination)

# calculate the mean accuracy of the model on the training set
accuracy_train = clf.score(x_train, y_train)
labels = folhas.pop('Class').values
images = folhas.values

rf = RandomForestClassifier(n_estimators=25)
fit = rf.fit(images, labels)
predict = cross_val_predict(rf, images, labels, cv=10)
score = cross_val_score(rf, images, labels, cv=10)
prob = rf.predict_proba(images)

print("\n\n\n\n\nLeaf Values:\n")
print(folhas.head(340))
print("\n\nProbability:\n")
print(prob)
print("\n\nCross validation score:\n")
print(score)
print("\n\nDecision path:\n")
print(rf.decision_path(images))
print("\n\nAccuracy:\n")
print(accuracy_score(labels, predict))

feat_importances = pd.Series(rf.feature_importances_, index=folhas.columns)
feat_importances.nlargest(14).plot(kind='barh',
                                   title='Feature Importance')
cm = confusion_matrix(labels, predict)
plt.matshow(cm)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('CONFUSION MATRIX')
plt.colorbar()
plt.show()
Example #9
        ],
    ]
    with open('dataset/gcc_bash.pickle', 'rb') as f:
        gcc_bash = pickle.load(f)
    with open('dataset/clang_bash.pickle', 'rb') as f:
        clang_bash = pickle.load(f)
    with open('dataset/tcc_bash.pickle', 'rb') as f:
        tcc_bash = pickle.load(f)
    # grep_clang_tcc_features, grep_clang_tcc_labels, _ = prepare_pos(clang_grep, tcc_grep)
    bash_gcc_clang_features, bash_gcc_clang_labels, bash_gcc_clang_names = prepare_pos(
        gcc_bash, clang_bash)
    bash_tcc_clang_features, bash_tcc_clang_labels, bash_tcc_clang_names = prepare_pos(
        tcc_bash, clang_bash)

    rfc = RandomForestClassifier(random_state=42)
    train_features, train_labels = prepare_training_data(training_data_files)
    rfc.fit(train_features, train_labels)
    paths, _ = rfc.decision_path(bash_gcc_clang_features)

    base, _ = rfc.decision_path([bash_tcc_clang_features[10]])
    sims = [(cos_sim(path, base), i) for i, path in enumerate(paths)]
    sims = sorted(sims, key=lambda x: x[0], reverse=True)
    for sim, i in sims[:3]:
        print(bash_gcc_clang_names[i])
        print(sim)
    for sim, i in sims[-3:]:
        print(bash_gcc_clang_names[i])
        print(sim)
    print(bash_gcc_clang_names[sims[0][1]])
    print(bash_tcc_clang_names[10])
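# cos_sim above is assumed to be a small local helper; a sketch that works on
# the sparse rows produced by iterating a decision_path indicator:
import numpy as np

def cos_sim_sketch(a, b):
    # cosine similarity between two sparse 1 x n row vectors
    a = np.asarray(a.todense()).ravel()
    b = np.asarray(b.todense()).ravel()
    return float(a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b)))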
Example #10
clf = RandomForestClassifier(max_features='sqrt',
                             n_jobs=2,
                             random_state=RANDOM_STATE)
clf.fit(train_data, train_label)

print('Read testing data...')
with open('testing.csv', 'r') as reader:
    test_data = []
    for line in reader.readlines():
        pixels = list(map(float, line.rstrip().split(',')))
        test_data.append(pixels)
print('Loaded ' + str(len(test_data)))

print('Predicting...')
test_data = np.array(test_data)
decision_path = clf.decision_path(test_data)
feature_label = []
for i in range(3088):
    feature_label.append(str(i))

# understand tree structure
# ref 1: http://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
# ref 2: https://stackoverflow.com/questions/40155128/plot-trees-for-a-random-forest-in-python-with-scikit-learn
# parse tree structure and save the info by txt, png, and dot

f = open('rf_tree_explanation.txt', 'a')
for treeIndex in range(len(clf.estimators_)):
    f.write('Tree number %d\n' % (treeIndex + 1))
    n_nodes = clf.estimators_[treeIndex].tree_.node_count
    children_left = clf.estimators_[treeIndex].tree_.children_left
    children_right = clf.estimators_[treeIndex].tree_.children_right
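    # A plausible continuation, following the "unveil tree structure" example
    # referenced above (a sketch, not the original code): derive each node's
    # depth and leaf status, then write one line per node.
    node_depth = np.zeros(n_nodes, dtype=np.int64)
    is_leaf = np.zeros(n_nodes, dtype=bool)
    stack = [(0, 0)]  # (node id, depth), starting from the root
    while stack:
        node_id, depth = stack.pop()
        node_depth[node_id] = depth
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], depth + 1))
            stack.append((children_right[node_id], depth + 1))
        else:
            is_leaf[node_id] = True
    for node_id in range(n_nodes):
        f.write('%snode %d (%s)\n' % ('  ' * node_depth[node_id], node_id,
                                      'leaf' if is_leaf[node_id] else 'split'))
Example #11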
    #data for both classification and regression
    X_train = np.random.rand(n_samples, 10)
    y_train = np.random.randint(num_classes, size=(n_samples))

    if (predicttype == 'classify'):
        forest = RandomForestClassifier(n_estimators=n_trees, oob_score=True)
    else:
        forest = RandomForestRegressor(n_estimators=n_trees, oob_score=True)

    oob_indices, oob_leaves_id, OOB_tree_indicator = {}, {}, {}
    #fit
    forest.fit(X_train, y_train)
    forest_oob_score = forest.oob_score_

    n_trees, train_size = forest.n_estimators, len(y_train)
    indicator, n_nodes_ptr = forest.decision_path(X_train)
    node_indicator = {}
    sample_index = {}
    for t, estimator in enumerate(forest):
        oob_indices[t] = _generate_unsampled_indices(estimator.random_state,
                                                     X_train.shape[0])
        oob_leaves_id[t] = estimator.apply(X_train[oob_indices[t], :])
        sample_index[t] = _generate_sample_indices(estimator.random_state,
                                                   n_samples)
        node_indicator[t] = indicator[:, n_nodes_ptr[t]:n_nodes_ptr[t + 1]]
    mean_vals = {}
    for t in range(n_trees):
        mean_vals[t] = np.zeros(node_indicator[t].shape[1])
        for node in range(node_indicator[t].shape[1]):
            r, c = node_indicator[t][:, node].nonzero()
            mean_vals[t][node] = np.mean(y_train[sample_index[t]][r])
Example #12
class WaveRandomForestClassifier(BaseEstimator, ClassifierMixin):
    """
    RandomForest based classifier but with nodes that are removed
    
    See Paper:
    Wavelet decomposition of Random Forests
    http://www.jmlr.org/papers/volume17/15-203/15-203.pdf
    """
    def __init__(
        self,
        n_estimators=100,
        criterion="gini",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features="auto",
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        bootstrap=True,
        oob_score=False,
        n_jobs=1,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
        nodes_to_keep=0.9,
    ):

        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.warm_start = warm_start
        self.class_weight = class_weight

        self.nodes_to_keep = nodes_to_keep

        self.forest = None

    def fit(self, X, y):

        # 1) create RandomForest
        self.forest = RandomForestClassifier(
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            bootstrap=self.bootstrap,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose,
            warm_start=self.warm_start,
            class_weight=self.class_weight,
        )

        # 2) fit it
        self.forest.fit(X, y)

        self.n_outputs_ = self.forest.n_outputs_

        # 3) retrieve node norms and values
        self.nodes_norm, self.nodes_value = compute_node_norm_classification_forest(
            self.forest)

        # 4) filter nodes
        self._nodes_order = np.argsort(-self.nodes_norm)

        if self.nodes_to_keep is not None:
            if self.nodes_to_keep < 1:
                nodes_to_keep = int(
                    len(self._nodes_order) * self.nodes_to_keep)
            else:
                nodes_to_keep = int(self.nodes_to_keep)

            self._ind_nodes_to_keep = self._nodes_order[:nodes_to_keep]
        else:
            self._ind_nodes_to_keep = None

        return self

    def _set_nodes_to_keep(self, nodes_to_keep):
        """ change the number of wavelets to keep without refitting the underlying random forest """
        self.nodes_to_keep = nodes_to_keep

        if self.forest is not None:

            if self.nodes_to_keep is None:
                self._ind_nodes_to_keep = None

            else:
                if self.nodes_to_keep < 1:
                    nodes_to_keep = int(
                        len(self._nodes_order) * self.nodes_to_keep)
                else:
                    nodes_to_keep = int(self.nodes_to_keep)

                self._ind_nodes_to_keep = self._nodes_order[:nodes_to_keep]

    def predict_proba(self, X):

        if self.forest is None:
            raise NotFittedError("You should fit the model first")

        path, _ = self.forest.decision_path(X)

        if self._ind_nodes_to_keep is not None:
            predict_proba_filtered = [
                path[:, self._ind_nodes_to_keep].dot(
                    self.nodes_value[self._ind_nodes_to_keep, n, :])
                for n in range(self.nodes_value.shape[1])
            ]
        else:
            predict_proba_filtered = [
                path[:, :].dot(self.nodes_value[:, n, :])
                for n in range(self.nodes_value.shape[1])
            ]

        for p in predict_proba_filtered:
            p[p < 0] = 0
            p[p > 1] = 1

        if len(predict_proba_filtered) == 1:
            return predict_proba_filtered[0]
        else:
            return predict_proba_filtered

    @property
    def classes_(self):
        return self.forest.classes_

    def predict(self, X):
        """Predict class for X.

        The predicted class of an input sample is a vote by the trees in
        the forest, weighted by their probability estimates. That is,
        the predicted class is the one with highest mean probability
        estimate across the trees.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        y : array of shape = [n_samples] or [n_samples, n_outputs]
            The predicted classes.
        """
        # Copied from base forest
        proba = self.predict_proba(X)

        if self.n_outputs_ == 1:
            return self.classes_.take(np.argmax(proba, axis=1), axis=0)

        else:
            n_samples = proba[0].shape[0]
            predictions = np.zeros((n_samples, self.n_outputs_))

            for k in range(self.n_outputs_):
                predictions[:, k] = self.classes_[k].take(np.argmax(proba[k],
                                                                    axis=1),
                                                          axis=0)

            return predictions

    def predict_log_proba(self, X):
        """Predict class log-probabilities for X.

        The predicted class log-probabilities of an input sample is computed as
        the log of the mean predicted class probabilities of the trees in the
        forest.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            such arrays if n_outputs > 1.
            The class log-probabilities of the input samples. The order of the
            classes corresponds to that in the attribute `classes_`.
        """
        # Copied from base forest
        proba = self.predict_proba(X)

        if self.n_outputs_ == 1:
            return np.log(proba)

        else:
            for k in range(self.n_outputs_):
                proba[k] = np.log(proba[k])

            return proba
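# A minimal usage sketch for the class above, assuming
# compute_node_norm_classification_forest is importable from the same module:
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(200, n_features=8, random_state=0)
wrf = WaveRandomForestClassifier(n_estimators=20, nodes_to_keep=0.5,
                                 random_state=0)
wrf.fit(X_demo, y_demo)
proba_demo = wrf.predict_proba(X_demo)  # probabilities rebuilt from the kept nodes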
Example #13
print(X_test.head())

rf = RandomForestClassifier(n_estimators=30,
                            max_depth=None,
                            min_samples_split=10,
                            class_weight="balanced"
                            #min_weight_fraction_leaf=0.02
                            )
rf.fit(X_train, y_train)
print("\n\n ---Random Forest Model---")
rf_roc_auc = roc_auc_score(y_test, rf.predict(X_test))
print("Random Forest AUC = %2.2f" % rf_roc_auc)

rf_accuracy = accuracy_score(y_test, rf.predict(X_test))
print("Random Forest Accuracy= %2.2f" % rf_accuracy)

tn, fp, fn, tp = confusion_matrix(y_test, rf.predict(X_test)).ravel()
print "True -ve:", tn
print "True +ve:", tp
print "False -ve:", fn
print "False +ve:", fp
print(classification_report(y_test, rf.predict(X_test)))
print rf.predict_proba(X_test)
print confusion_matrix(y_test, rf.predict(X_test))
print len(rf.decision_path(X_test)[1])

import joblib  # sklearn.externals.joblib is removed in recent scikit-learn

# save model
joblib.dump(rf, 'model.pkl')
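# loading the model back (illustrative; mirrors the dump above)
rf_restored = joblib.load('model.pkl')
print(rf_restored.score(X_test, y_test))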
Example #14
        combined_score = []
        
        # helper returning the boolean slicing condition for a combination of features
        def condition_combination(index_vals, df=x):
            condition = True
            for indx in index_vals:
                lo = min(feature_thresholds[indices[indx]][0])
                hi = max(feature_thresholds[indices[indx]][1])
                condition = condition & (df[features[indices[indx]]] >= lo) & \
                            (df[features[indices[indx]]] <= hi)
            return condition

        print("Feature ranking " + name + ":")
        
        if(i < 3):
            p = clf.predict(x)
            temp = x[(p == 1) & (p == y)].copy()
            temp = temp.to_numpy()
            for n, row in enumerate(clf.decision_path(temp).toarray()):
                for indx in np.nonzero(row)[-1][-2:-1]:
                    if(temp[n,clf.tree_.feature[indx]] <= clf.tree_.threshold[indx]):
                        feature_thresholds[clf.tree_.feature[indx]][1].append(clf.tree_.threshold[indx])
                    else:
                        feature_thresholds[clf.tree_.feature[indx]][0].append(clf.tree_.threshold[indx])
        else:
            for tree in clf.estimators_:
                p = tree.predict(x)
                temp = x[(p == 1) & (p == y)].copy()
                temp = temp.to_numpy()
                for n, row in enumerate(tree.decision_path(temp).toarray()):
                    for indx in np.nonzero(row)[-1][-2:-1]:
                        if(temp[n,tree.tree_.feature[indx]] <= tree.tree_.threshold[indx]):
                            feature_thresholds[tree.tree_.feature[indx]][1].append(tree.tree_.threshold[indx])
                        else:    
Example #15
    print('Wrong value of codebook')





# fit the forest
forest = RandomForestClassifier(n_estimators=100, max_depth=20,
                                min_samples_split=2, max_features=None,
                                criterion='entropy', n_jobs=-1).fit(data_tr, labels)
# make predictions
hs = forest.predict(data_te)
# what's the accuracy (in percent)
(forest.score(data_te, labels)) * 100
# how many matches is that
sum(labels == hs)

forest.decision_path(data_tr)

from IPython.display import display, Image
import pydotplus
import sklearn.tree as tree


for dtree in forest.estimators_:
    dot_data = tree.export_graphviz(dtree
                                    , out_file = None
                                    , filled   = True
                                    , rounded  = True
                                    , special_characters = True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    img = Image(graph.create_png())
    display(img)
Example #16
def Classifier_random_forest(Xfeat_test, Xfeat,y_each_patient_test, y_each_patient, selected_babies, \
                              selected_test, label,classweight, Used_classifier, drawing, lst, ChoosenKind,\
                              SamplingMeth,probability_threshold,ASprobLimit,N,crit,msl,deciding_performance_measure,dispinfo):

    #### CREATING THE sampleweight FOR SELECTED BABIES
    #### TRAIN CLASSIFIER
    meanaccLOO = []
    accLOO = []
    testsubject = []
    tpr_mean = []
    counter = 0
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    F1_macro_collect = []
    F1_micro_collect = []
    F1_weight_collect = []
    F1_all_collect = []
    K_collect = []
    preliminaryK = zeros(len(probthres_Grid))

    #CREATING TEST AND TRAIN SETS
    Selected_training = selected_babies
    X_train = [
        Xfeat[np.where(np.array(selected_babies) == k)[0][0]]
        for k in Selected_training
    ]  # combine only babies to train on in list
    y_train = [
        y_each_patient[np.where(np.array(selected_babies) == k)[0][0]]
        for k in Selected_training
    ]
    X_test = Xfeat_test[selected_test]
    y_test = y_each_patient_test[selected_test]
    X_train = vstack(
        X_train)  # merging the data from each list element into one matrix
    y_train = vstack(y_train)
    y_old = y_train[:]

    #SAMPLING TO EQUALIZE CLASS IMBALANCE

    X_train, y_train = cmplx_Oversampling(X_train, y_train, ChoosenKind,
                                          SamplingMeth, label)

    #CALCULATE THE WEIGHTS DUE TO CLASS IMBALANCE
    class_weight = 'balanced'
    CW = 0
    classlabels = ravel(
        y_old)  # compute_class_weight needs a 1d array of labels
    # Test whether all labels actually occur in the data; otherwise
    # compute_class_weight raises an error. If some are missing, use the found
    # labels instead. If only one class is present, classification cannot
    # work, so skip class weighting (CW stays 0).
    if (classweight == 1) and len(unique(classlabels)) == len(label):
        cW = compute_class_weight(class_weight, label, classlabels)
        cWdict = dict(
            zip(label, cW)
        )  # the class weights need to be a dictionary of the form {class_label: value}
        CW = 1
    elif (classweight == 1) and len(unique(classlabels)) != len(label):
        CW_label = unique(classlabels)  # the label values actually present in the data
        if len(CW_label) == 1:
            if dispinfo:
                print('classweight config skipped once as only one class exists')
            CW = 0
        else:
            if dispinfo:
                print('used labels are:', CW_label, 'instead of:', label)
            cW = compute_class_weight(class_weight, CW_label, classlabels)
            cWdict = dict(
                zip(label, cW)
            )  # the class weights need to be a dictionary of the form {class_label: value}
            CW = 1
    if dispinfo and CW == 1:
        disp(cWdict)

# The Random Forest / Extremely Randomized Trees / Gradient Boosting classifiers

    if Used_classifier == 'TR':
        if (classweight == 1) and CW == 1:
            clf = tree.DecisionTreeClassifier(criterion=crit, splitter="best", max_depth=None,\
                                              min_samples_split=2, min_samples_leaf=msl, \
                                              min_weight_fraction_leaf=0.0, max_features=None, \
                                              random_state=42, max_leaf_nodes=None, min_impurity_decrease=0.0,\
                                              min_impurity_split=None, class_weight=cWdict, presort=False)

        else:
            clf = tree.DecisionTreeClassifier(criterion=crit, splitter="best", max_depth=None,\
                                              min_samples_split=2, min_samples_leaf=msl, \
                                              min_weight_fraction_leaf=0.0, max_features=None, \
                                              random_state=42, max_leaf_nodes=None, min_impurity_decrease=0.0,\
                                              min_impurity_split=None,  presort=False)

    if Used_classifier == 'RF':
        if (classweight == 1) and CW == 1:
            clf = RandomForestClassifier(n_estimators=N, criterion=crit, max_depth=None, \
                                         min_samples_split=2, min_samples_leaf=msl, min_weight_fraction_leaf=0.0,\
                                         max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0,\
                                         min_impurity_split=None, bootstrap=True, oob_score=False,\
                                         n_jobs=1, random_state=42, verbose=0, warm_start=False,\
                                         class_weight=cWdict)

        else:
            clf = RandomForestClassifier(n_estimators=N, criterion=crit, max_depth=None, \
                                         min_samples_split=2, min_samples_leaf=msl, min_weight_fraction_leaf=0.0,\
                                         max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0,\
                                         min_impurity_split=None, bootstrap=True, oob_score=False,\
                                         n_jobs=1, random_state=42, verbose=0, warm_start=False,\
                                         )
    elif Used_classifier == 'ERF':
        if (classweight == 1) and CW == 1:
            clf = ExtraTreesClassifier(n_estimators=N, criterion=crit, max_depth=None,\
                                         min_samples_split=2, min_samples_leaf=msl, \
                                         max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0,\
                                         min_impurity_split=None, bootstrap=True, oob_score=False,\
                                         n_jobs=1, random_state=42, verbose=0, warm_start=False,\
                                         class_weight=cWdict)
        else:
            clf = ExtraTreesClassifier(n_estimators=N, criterion=crit, max_depth=None,\
                                         min_samples_split=2, min_samples_leaf=msl, min_weight_fraction_leaf=0.0,\
                                         max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0,\
                                         min_impurity_split=None, bootstrap=True, oob_score=False,\
                                         n_jobs=1, random_state=42, verbose=0, warm_start=False,\
                                         )
    elif Used_classifier == 'GB':
        clf = GradientBoostingClassifier(loss="deviance", learning_rate=0.1, n_estimators=1000, subsample=1, \
                            criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,\
                            max_depth=30, min_impurity_decrease=0.0, min_impurity_split=None, init=None, \
                            random_state=42, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')

    elif Used_classifier == 'LR':
        clf = LinearRegression(fit_intercept=True,
                               normalize=False,
                               copy_X=True,
                               n_jobs=1)

#Performance analysis
#        sys.exit('Jan werth')
    if len(label) < 2:
        print("please use at least two labels")

# F1 Kappa
    else:
        prediction = clf.fit(X_train, y_train.ravel()).predict(
            X_test)  # predict() uses the default 0.5 probability threshold to pick the class
        probs = (
            clf.fit(X_train, y_train.ravel()).predict_proba(X_test)
        )  # with the calculated probabilities we can choose our own threshold
        if probability_threshold:  # predict() always decides at probability 0.5; here we
            # also try much lower thresholds, checking whether any class other
            # than AS has a slightly elevated probability
            for k in range(len(probthres_Grid)):  # try different probability thresholds
                preliminary_pred = copy(prediction[:])
                probthres = probthres_Grid[k]
                for i in range(len(probs)):
                    if len(label) == 3:
                        if any(
                                probs[i, 1:] >= probthres
                        ) and probs[i, 0] < ASprobLimit[
                                0]:  #IF THE PROBABILITY IS HIGHER THAN ... USE THAT CLASS INSTEAD OF AS. But if AS is over ~0.7 still take AS
                            highprob = np.argmax(
                                probs[i, 1:]
                            )  # otherwise search for max prob of the labels other than AS
                            preliminary_pred[i] = label[
                                highprob +
                                1]  # change the label in predictions to the new found label; +1 as we cut the array before by 1. Otherwise false index
                    if len(label) > 3:
                        if any(
                                probs[i, 1:] >= probthres
                        ) and probs[i, 0] < ASprobLimit[
                                1]:  #IF THE PROBABILITY IS HIGHER THAN ... USE THAT CLASS INSTEAD OF AS. But if AS is over ~0.7 still take AS
                            highprob = np.argmax(
                                probs[i, 1:]
                            )  # otherwise search for max prob of the labels other than AS
                            preliminary_pred[i] = label[
                                highprob +
                                1]  # change the label in predictions to the new found label; +1 as we cut the array before by 1. Otherwise false index

                    elif (
                            probs[i, 1]
                    ) >= probthres:  # if we have only two labels searching for max does not work
                        preliminary_pred[i] = label[
                            1]  # change the label in prediction to the second label
#!!!!!!!! To change the performance measure used for classifier selection
                if deciding_performance_measure == 'Kappa':
                    preliminaryK[k] = cohen_kappa_score(
                        y_test.ravel(), preliminary_pred,
                        labels=label)  # Find the threshold where Kappa gets max
                elif deciding_performance_measure == 'F1_second_label':
                    preliminaryK[k] = f1_score(y_test.ravel(),
                                               preliminary_pred,
                                               labels=label,
                                               average=None)[1]
                elif deciding_performance_measure == 'F1_third_label':
                    preliminaryK[k] = f1_score(y_test.ravel(),
                                               preliminary_pred,
                                               labels=label,
                                               average=None)[2]
                elif deciding_performance_measure == 'F1_fourth_label':
                    preliminaryK[k] = f1_score(y_test.ravel(),
                                               preliminary_pred,
                                               labels=label,
                                               average=None)[3]
#!!!!!!!! To change the performance measure used for classifier selection
            maxK = preliminaryK.argmax(axis=0)
            if dispinfo:
                print('Used probability Thresh: %.2f' % probthres_Grid[maxK])
            probthres = probthres_Grid[
                maxK]  # repeat creating the predictions with the optimal probability threshold
            for i in range(len(probs)):
                if len(label) == 3:
                    if any(
                            probs[i, 1:] >= probthres
                    ) and probs[i, 0] < ASprobLimit[
                            0]:  #IF THE PROBABILITY IS HIGHER THAN ... USE THAT CLASS INSTEAD OF AS. But if AS is over ~0.7 still take AS
                        highprob = np.argmax(
                            probs[i, 1:]
                        )  # otherwise search for max prob of the labels other than AS
                        prediction[i] = label[
                            highprob +
                            1]  # change the label in predictions to the new found label; +1 as we cut the array before by 1. Otherwise false index
                if len(label) > 3:
                    if any(
                            probs[i, 1:] >= probthres
                    ) and probs[i, 0] < ASprobLimit[
                            1]:  #IF THE PROBABILITY IS HIGHER THAN ... USE THAT CLASS INSTEAD OF AS. But if AS is over ~0.7 still take AS
                        highprob = np.argmax(
                            probs[i, 1:]
                        )  # otherwise search for max prob of the labels other than AS
                        prediction[i] = label[highprob + 1]  # change the label as above
                elif (
                        probs[i, 1]
                ) >= probthres:  # if we have only two labels searching for max does not work
                    prediction[i] = label[
                        1]  # change the label in prediction to the second label

        scoring = clf.score(X_test, y_test.ravel(), sample_weight=None)
        Fimportances = clf.feature_importances_
        if Used_classifier != 'GB':
            Dpath = clf.decision_path(X_train)

        resultsF1_macro = f1_score(y_test.ravel(), prediction,
                                   average='macro')  #, pos_label=None)
        resultsF1_micro = f1_score(y_test.ravel(), prediction, average='micro')
        resultsF1_weight = f1_score(y_test.ravel(),
                                    prediction,
                                    average='weighted')
        resultsF1_all = f1_score(y_test.ravel(),
                                 prediction,
                                 labels=label,
                                 average=None)  #, pos_label=None)

        resultsK = cohen_kappa_score(y_test.ravel(), prediction, labels=label)

        if drawing and Used_classifier == 'TR':
            import graphviz
            from Loading_5min_mat_files_cECG import Class_dict, features_dict
            usedfeatures = list(
                (features_dict[k]) for k in lst
            )  # keep only the used features out of all those in features_dict
            usedlabels = list(
                (Class_dict[k]) for k in label
            )  # keep only the used labels out of all those in Class_dict

            with open("RF.txt", "w") as f:
                f = tree.export_graphviz(clf,
                                         out_file=f,
                                         feature_names=usedfeatures,
                                         class_names=usedlabels,
                                         filled=True,
                                         rounded=True)
#               with open("RF.dot", "w") as f:
#                      f = tree.export_graphviz(clf, out_file=f)
            with open("RF.svc", "w") as f:
                f = tree.export_graphviz(clf, out_file=f)
#               dot -Tpdf RF.dot -o RF.pdf
#               open -a preview RF.pdf

#               dot_data = tree.export_graphviz(clf, out_file=None)
#               graph = graphviz.Source(dot_data)
#               graph.render("Jan")
#
#               dot_data = tree.export_graphviz(clf, out_file=None,
#                                        feature_names=usedfeatures,
#                                        class_names=usedlabels,
#                                        filled=True, rounded=True,
#                                        special_characters=True)
#               graph = graphviz.Source(dot_data)
#               graph
#

    return resultsF1_macro, resultsK, resultsF1_micro, resultsF1_weight, resultsF1_all, Fimportances, scoring, prediction, probs
Example #17
print(ypred)
print(list(le.inverse_transform(ypred)))
print(lgb1.predict_proba(xtest))
print(accuracy_score(ytest,ypred))
print(accuracy_score(ytrain,ypred1))
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
rf=RandomForestClassifier(random_state=0,class_weight="balanced")
print(rf.fit(xtrain,ytrain))
ypred=(rf.predict(xtest))
ypred1=(rf.predict(xtrain))
print(ypred)
print(list(le.inverse_transform(ypred)))
print(classification_report(ypred,ytest))
print(rf.apply(xtest))
print(rf.decision_path(xtest))
print(rf.predict_proba(xtest))
print(rf.predict_log_proba(xtest))
print(rf.score(xtrain,ytrain))
rmse=math.sqrt(mean_squared_error(ytest,ypred))
print(rmse)
print(r2_score(ytest,ypred))
confusionmatrix=confusion_matrix(ypred,ytest)
print(confusionmatrix)
print(accuracy_score(ytest,ypred))
print(accuracy_score(ytrain,ypred1))
from sklearn.svm import SVC
svc=SVC(kernel="rbf",random_state=0,gamma=1,C=1,class_weight="balanced")
print(svc.fit(xtrain,ytrain))
ypred=svc.predict(xtest)
ypred1=svc.predict(xtrain)
Example #18
print(trainframe['label'])
trainframe['label'] = trainframe['label'].astype('int')
testframe['label'] = testframe['label'].astype('int')
print(testframe['label'])
with open('actualpredictionsRF', 'wb') as fp:
    pickle.dump(testframe['label'], fp)
print("completed encoding labels")
# #clf = RandomForestClassifier()
#clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
clf = clf.fit(matrix, trainframe['label'])
#clf = clf.fit(matrix, trainframe['label'])
#predictions = clf.predict(matrix2)
#pickle to save the data
print(clf.decision_path(matrix))
filename = 'randomforestFinal'
pickle.dump(clf, open(filename, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))
predictions = loaded_model.predict(matrix2)
'''
filename1='RandomForestpredict'
pickle.dump(predictions,open(filename1, 'wb'))
'''
#print(clf.feature_importances_)
#predictions = clf.predict(matrix2)
ts2 = time.time()
ts2 = ts2 - ts
print("time taken")
print(ts2)
Example #19
    'oob_score': [True, False],
    'random_state': [None, 2017]
}
RF_GS = RandomForestClassifier()
clf_GS_RF = GridSearchCV(RF_GS, RFparameters)
clf_GS_RF.fit(x_train_tfidf, y_train)
sorted(clf_GS_RF.cv_results_.keys())
GridPredRF = clf_GS_RF.predict(x_test_tfidf)
GS_RF_accuracy = metrics.accuracy_score(y_test, GridPredRF)
print('Accuracy score for clf_GS_RF: ', GS_RF_accuracy)
print('Best params for clf_GS_RF: ', clf_GS_RF.best_params_)
print('Detailed clf_GS_RF results: ', clf_GS_RF.cv_results_)
#print ('OOB score from clf_GS_RF : ', clf_GS_RF.oob_score_ )
#print ('OOB decision function from clf_GS_RF : ', RF_GS.oob_decision_function_ )
best_rf = clf_GS_RF.best_estimator_  # RF_GS itself is never fitted; GridSearchCV fits clones
print('Feature importances from clf_GS_RF : ', best_rf.feature_importances_)
best_rf.decision_path(x_train_tfidf)
print(clf_GS_RF.best_params_)
print(clf_GS_RF.best_score_)
# clf_GS_RF.grid_scores_ was removed from scikit-learn; cv_results_ above replaces it
#clf_GS_RF.error_score
print(metrics.confusion_matrix(y_test, GridPredRF))
#plt.plot( , clf_GS_RF)

for e in df['param_max_features'].value_counts().index:
    print(e)
    estimators = df.param_n_estimators[(df.param_criterion == 'entropy')
                                       & (df.param_oob_score == True)]
    mean_score = df.mean_test_score[(df.param_criterion == 'entropy')
                                    & (df.param_oob_score == True)]
    plt.plot(estimators[df['param_max_features'] == e],
             mean_score[df['param_max_features'] == e])
 plt.show()
 #plt.plot()
 
 y = train_data["Survived"]
 
 #PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
 features = ["Sex", "Age", "Pclass", "Embarked", "SibSp", "Parch"]
 X = pd.get_dummies(train_data[features])
 X_test = pd.get_dummies(test_data[features])
 
 model = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=1)
 model.fit(X, y)
 predictions = model.predict(X_test)
 
 print(model.feature_importances_)
 print(model.decision_path(X_test))
 
 from sklearn.tree import export_graphviz
 estimator = model.estimators_[5]
 # Export as dot file
 export_graphviz(estimator, out_file='tree.dot', 
                 feature_names = ["Sex", "Age", "Pclass", "Embarked", "SibSp", "Parch"],
                 class_names = "Survived",
                 rounded = True, proportion = False, 
                 precision = 2, filled = True)
 
 # Convert to png using system command (requires Graphviz)
 from subprocess import call
 #process = subprocess.Popen(command, stdout=tempFile, shell=True)
 #call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
 #set path=%path%;C:\Anaconda3\graphviz-2.38\release\bin
    clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
    print('RF Model fitted with ' + repr(40) +
          ' features\n RF feature importances\n')
    important_features = pd.Series(data=clf.feature_importances_,
                                   index=fdataDF.columns)
    feats = {}  # a dict to hold feature_name: feature_importance
    for feature, importance in zip(fdataDF.columns, clf.feature_importances_):
        feats[feature] = importance  #add the name/value pair

    importances = pd.DataFrame.from_dict(
        feats, orient='index').rename(columns={0: 'Gini-importance'})
    importances.sort_values(by='Gini-importance').plot(kind='bar', rot=90)
    plt.rcParams.update({'font.size': 5})
    plt.show()
    print('RF decision path')
    pprint.pprint(clf.decision_path(X_train))
    y_pred = clf.predict(X_test)
    print('RF Score')
    print(clf.score(X_test, y_test))
    print('RMSE\n')
    print(np.sqrt(metrics.mean_squared_error(y_pred, y_test)))
    # computing permutation importance of each feature in dataset
    perm = PermutationImportance(clf, random_state=1).fit(X_train, y_train)

    # # create a structured array
    # dtype = [('feature', str), ('permutation_weights', float)]
    features = np.array(X_train.columns.to_list())
    permF = np.array([X_train.columns.to_list(), perm.feature_importances_])
    rankedFeatureIds = perm.feature_importances_.argsort()[::-1]
    # [::-1] reverses argsort's ascending order: indices from most to least important
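    # e.g. print the five features the permutation importance ranks highest
    # (illustrative use of the arrays built above)
    for idx in rankedFeatureIds[:5]:
        print(features[idx], perm.feature_importances_[idx])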
Example #22
    score = clf.score(X_test, y_test)
    if score > current_best_score:
        current_best_score = score
        best_tree_shuffle = clf
    print(current_best_score)
#%%
res = clf.predict(X_test)
compteur = 0
nb_nuls_predis = 0
for i in range(len(X_test)):
    #print(res[i]==labels_test[i])
    if res[i] == y_test[i]:
        compteur += 1
    #print('\n')
print(str(compteur * 100 / len(X_test)) + "%")

#%%
plot_confusion_matrix(clf, X_test, y_test)

#%%
clf.predict_proba(X_test)
clf.decision_path(X_test)
print(clf.decision_path(X_test))
clf.score(X_test, y_test)
#%% do not overwrite foretV1 !!!
#from joblib import dump, load
#dump(clf, 'foretV2.joblib')
#%%
clf = load('foretV1.joblib')
#%%
clf.score(X_test, y_test)
Example #23

#########################################
# Negative figures. We still have raw scores.

#######################################
# Option *decision_path*
# ++++++++++++++++++++++
#
# *scikit-learn* implements a function to retrieve the
# decision path. It can be enabled by option *decision_path*.

clrrf = RandomForestClassifier(n_estimators=2, max_depth=2)
clrrf.fit(X_train, y_train)
clrrf.predict(X_test[:2])
paths, n_nodes_ptr = clrrf.decision_path(X_test[:2])
print(paths.todense())

model_def = to_onnx(clrrf, X_train.astype(numpy.float32),
                    options={id(clrrf): {'decision_path': True,
                                         'zipmap': False}})
sess = InferenceSession(model_def.SerializeToString())

##########################################
# The model produces 3 outputs.

print([o.name for o in sess.get_outputs()])

##########################################
# Let's display the last one.
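##########################################
# A possible continuation (a sketch: with zipmap=False and decision_path
# enabled, the last output holds the paths): run the session and print the
# decision paths as '0'/'1' strings.

res = sess.run(None, {'X': X_test[:2].astype(numpy.float32)})
print(res[-1])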
Example #24
# In[536]:

# Implies 300 Trees Generates the Highest Accuracy
plt.plot(xlabels, n_trees)
plt.xlabel('Trees in RandomForest')
plt.ylabel('Accuracy')
plt.title("RandomForest Optimization")


# #RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, #min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True, #oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)

# In[514]:

# Why does this decision path look like it's got more features than 5?
print(eeg_rf.score(test_x, test_y)*100)
eeg_rf.decision_path(test_x)


# ## Ensemble Classifiers

# In[64]:

# Voting Classifier ... accuracy lower than RandomForest (with 500 trees); is that possible?
voting = VotingClassifier(estimators=[('lr', lr_eeg), ('rf', eeg_rf), ('gnb', clf)], voting='hard')
voting = voting.fit(train_x_masked, train_y_masked)
print("Confusion Matrix ")
print("Classif: 0    1")
print(confusion_matrix(y_true=test_y_masked, y_pred=voting.predict(test_x_masked)))
print()
print("Voting Classifier Accuracy: ")
print((1-(902+289)/(666+902+289+1346))*100)
#To set the working directory
os.chdir("/Users/steven/Documents/dataMining/Kaggle/digitRecognizer")
cwd = os.getcwd()

# load the digit train & test csv files as DataFrames
digit_train_df = pd.read_csv("input/train.csv")
digit_test_df = pd.read_csv("input/test.csv")

# preview the data
digit_train_df.head()
digit_test_df.head()

#defining the training data set
X_train = digit_train_df.drop("label", axis=1)
Y_train = digit_train_df["label"]
X_test = digit_test_df

random_forest = RandomForestClassifier(n_estimators=10)

random_forest.fit(X_train, Y_train)

random_forest.decision_path(X_train)

Y_pred = random_forest.predict(X_test)

random_forest.score(X_train, Y_train)

submission = pd.DataFrame({"ImageId": X_test.index + 1, "Label": Y_pred})
submission.to_csv('RF_10.csv', index=False)