Example #1
class DecisionTreeClassifierImpl():

    def __init__(self, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight='balanced', presort=False):
        self._hyperparams = {
            'criterion': criterion,
            'splitter': splitter,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'random_state': random_state,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'class_weight': class_weight,
            'presort': presort}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
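The wrapper above assumes that SKLModel is an alias for scikit-learn's DecisionTreeClassifier and that the hyperparameter names (including presort and min_impurity_split, which newer scikit-learn releases have removed) match the installed version. A minimal usage sketch under those assumptions:

# Sketch only: SKLModel is assumed to be scikit-learn's DecisionTreeClassifier,
# and presort/min_impurity_split require an older scikit-learn (< 0.24).
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier as SKLModel

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifierImpl(max_depth=3, random_state=0)
clf.fit(X, y)
print(clf.predict(X[:5]))
print(clf.predict_proba(X[:5]).shape)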
Example #2
    def fit(self, X, y):
        """
        Fit the NearestCentroid model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        """

        self.y = y
        if self.fit_base:
            self.base_classifier.fit(X, y)
        distances = self.base_classifier.predict_proba(X)

        topNIndices, topNDistances = self._get_top_labels(distances)
        training_data = self._extract_features(topNIndices, topNDistances, y,
                                               distances)

        # create a decision tree for each label
        self.meta_classifiers = {}
        for label, training_samples_of_label in training_data.items():
            training_samples_of_label = np.matrix(training_samples_of_label)
            decision_tree = DecisionTreeClassifier(criterion="gini")
            decision_tree.fit(training_samples_of_label[:, 0:-1],
                              training_samples_of_label[:, -1:])
            self.meta_classifiers[label] = decision_tree
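The method above depends on project helpers (_get_top_labels, _extract_features) and attributes (base_classifier, fit_base) that are not shown. As an illustration of the underlying idea only, feeding a base classifier's predicted probabilities into a decision-tree meta-classifier, a self-contained sketch might look like this (it does not reproduce the per-label feature extraction of the original class):

# Illustrative sketch, not the class above: train a decision tree on the
# class-probability outputs of a base classifier.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
base = LogisticRegression(max_iter=1000).fit(X, y)
meta_features = base.predict_proba(X)      # one probability column per class
meta = DecisionTreeClassifier(criterion="gini").fit(meta_features, y)
print(meta.score(meta_features, y))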
Example #3
def decision_tree_depths():
    max_depths = [2, 4, 6, 8, 10, 12, 16, 18, 20, 25, 30, 40]

    columns = [
        'Max Depths', 'Training Score', 'Test Score', 'Train Time', 'Test Time'
    ]
    df = pd.DataFrame(columns=columns)

    for depth in max_depths:
        start_train = time.time()
        dt = DecisionTreeClassifier(max_depth=depth)
        print(dt)
        dt.fit(X_train, y_train)
        end_train = time.time() - start_train

        train_score = dt.score(X_train, y_train)
        start_test = time.time()
        test_score = dt.score(X_test, y_test)
        end_test = time.time() - start_test

        values = [depth, train_score, test_score, end_train, end_test]
        df.loc[len(df)] = values

        print(' '.join(str(col) for col in columns))
        print(' '.join(str(val) for val in values))

    df.to_excel('adult_dt.xls')
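decision_tree_depths() relies on module-level X_train/X_test/y_train/y_test plus pandas, time, and an Excel writer (writing .xls needs the xlwt engine, which newer pandas versions have dropped, so an .xlsx path is safer). A hypothetical setup under which the function runs; the original script's "adult" data loading is not shown, so a bundled dataset stands in:

# Hypothetical context for the function above.
import time
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)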
Example #4
def decision_tree_training_sets():
    training_set_sizes = [.1,.25,.5,.75,.9]

    columns = ['Training Set Size', 'Training Score', 'Test Score', 'Train Time', 'Test Time']
    df = pd.DataFrame(columns=columns)

    for training_set_size in training_set_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            encoded_data[list(set(encoded_data.columns) - set(['Target']))],
            encoded_data['Target'], train_size=training_set_size)
        scaler = preprocessing.StandardScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train.astype('float32')), columns=X_train.columns)
        X_test = scaler.transform(X_test.astype('float32'))

        start_train = time.time()
        dt = DecisionTreeClassifier(max_depth=8)
        print(dt)
        dt.fit(X_train, y_train)
        end_train = time.time() - start_train

        train_score = dt.score(X_train, y_train)
        start_test = time.time()
        test_score = dt.score(X_test, y_test)
        end_test = time.time() - start_test

        values = [training_set_size, train_score, test_score, end_train, end_test]
        df.loc[len(df)] = values

        print(' '.join(str(col) for col in columns))
        print(' '.join(str(val) for val in values))

    df.to_excel('diabetes_dt_training_sets.xls')
Example #5
def wrapper_for_decision_tree_in_sklearn(X, y, current_state_to_predict):
    clf = DecisionTreeClassifier()
    clf.fit(X, y)
    current_state_to_predict = np.array(current_state_to_predict).reshape(
        1, -1)
    predicted_state = clf.predict(current_state_to_predict)
    return predicted_state
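A hypothetical call of the wrapper above on a toy dataset, together with the imports its body needs:

# Imports assumed by wrapper_for_decision_tree_in_sklearn, plus a sample call.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
print(wrapper_for_decision_tree_in_sklearn(X, y, current_state_to_predict=X[0]))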
Example #7
def dtree(X, y, model_path):
    model = DecisionTreeClassifier()
    model.fit(X, y)
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
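dtree() assumes sklearn.metrics (as metrics) and joblib are imported; a sketch of calling it and reloading the persisted model, with a hypothetical model path:

# Assumed imports for dtree(), plus reloading the dumped model.
import joblib
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
dtree(X, y, "dt_model.joblib")            # hypothetical output path
restored = joblib.load("dt_model.joblib")
print(restored.score(X, y))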
Example #8
def wrapper_for_decision_tree_accuracy(X, y, relative_test_size):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=relative_test_size, random_state=42)

    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = accuracy_score(pred, y_test)

    return score
Example #10
    def create_decision_tree(self):
        '''Based on our experiments, the best model was a decision tree with the following parameters:'''

        tree = DecisionTreeClassifier(max_depth=65,
                                      min_samples_split=0.03,
                                      min_samples_leaf=3,
                                      max_features=8)
        tree.fit(self.X_train, self.Y_train)
        predicted_y = tree.predict(self.X_test)
        print(predicted_y)
        self.print_stats(predicted_y, "")
        self.test_df['learning_label'] = predicted_y
        self.test_df.to_csv('output/feature_extraction.csv',
                            encoding="latin-1")  # save the training dataset
Example #11
def MarginBoostClf(features, labels, max_depth, n_steps, margin):
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    clf_list = []
    for t in range(n_steps):
        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf = clf.fit(features, labels, sample_weight=weights)
        y_predict = clf.predict(features)
        incorrect = y_predict != labels
        # Error fraction
        estimator_error = np.mean(
            np.average(incorrect, weights=weights, axis=0))
        if (estimator_error >= 0.5):
            break
        step_size = 0.5 * (np.log((1 - estimator_error) / estimator_error) +
                           np.log(1 - margin) - np.log(1 + margin))
        norm_factor = 2 * pow(estimator_error * (1 - estimator_error), 0.5)

        for i in range(sample_size):
            if (labels[i] == y_predict[i]):
                weights[i] *= np.exp(-step_size) / norm_factor
            else:
                weights[i] *= np.exp(step_size) / norm_factor
        clf_list.append([clf, step_size])
    return clf_list
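MarginBoostClf returns [classifier, step_size] pairs but no combiner; the original project's evaluation helper is not shown. A minimal sketch of a weighted-vote predictor for such a list, assuming labels are encoded as -1/+1:

# Sketch of combining the boosted trees; assumes -1/+1 labels as in AdaBoost.
import numpy as np

def ensemble_predict(clf_list, X):
    agg = np.zeros(X.shape[0])
    for clf, step_size in clf_list:
        agg += step_size * clf.predict(X)   # each tree votes, weighted by its step size
    return np.sign(agg)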
Example #12
def BoostByMaj(features, labels, max_depth, gamma):
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k_pre = get_k_from_gamma(gamma, sample_size)
    k = k_pre
    #k = min(600, k_pre)
    print('k ', k)
    clf_list = []
    for i in range(k):
        estimator_error = 0.6
        countdown = 10
        while ((estimator_error >= 0.5) and (countdown >= 0)):
            clf = DecisionTreeClassifier(max_depth=max_depth)
            clf = clf.fit(features, labels, sample_weight=weights)
            y_predict = clf.predict(features)
            correct_ones = y_predict == labels
            incorrect_ones = y_predict != labels
            estimator_error = np.mean(
                np.average(incorrect_ones, weights=weights, axis=0))
            unweighted_estimator_error = np.mean(
                np.average(incorrect_ones, axis=0))
            countdown -= 1
        counts += correct_ones
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - i - 1 + counts
        weights = comb(k - i - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)

        print('i', i, 'error', estimator_error, 'unweighted_error',
              unweighted_estimator_error, 'wnorm',
              np.linalg.norm(weights, ord=1))
        weights = weights / np.linalg.norm(weights, ord=1)
        clf_list.append([clf, 1])
    return clf_list, weights
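get_k_from_gamma is a project helper that is not shown. In boost-by-majority the round count is usually chosen so that a weak learner with edge gamma drives the training error below 1/m, which suggests a stand-in like the following (an assumption, not the original definition):

# Hypothetical stand-in for the missing helper: k ~ ln(m) / (2 * gamma^2),
# rounded up and forced odd so the majority vote cannot tie.
import numpy as np

def get_k_from_gamma(gamma, sample_size):
    k = int(np.ceil(np.log(sample_size) / (2.0 * gamma ** 2)))
    return k if k % 2 == 1 else k + 1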
Example #13
def sklearn_titanic():
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.preprocessing import LabelEncoder
    total_df = pd.read_csv("titanic_clean.csv")
    total_df.drop(['cabin', 'boat', 'body', 'index'], axis=1, inplace=True)
    total_df.dropna(inplace=True)
    for col in total_df.columns.tolist():
        if str(total_df[col].dtype) == 'object':
            total_df[col] = LabelEncoder().fit_transform(total_df[col])

    total_num = total_df.shape[0]
    train_df = total_df.iloc[:int(total_num * 0.8)]
    test_df = total_df.iloc[int(total_num * 0.8):]

    clf = DecisionTreeClassifier()
    clf.fit(train_df.drop(['survived'], axis=1), train_df['survived'])
    print(clf.score(test_df.drop(['survived'], axis=1), test_df['survived']))
Example #14
def DeepBBM2(features, labels, max_depth, gamma, max_depth_range):
    num_features = features.shape[1]
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    D_weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k_pre = get_k_from_gamma(gamma, sample_size)
    k = k_pre
    #k = min(600, k_pre)
    normalizer = np.exp(1) * sample_size
    print('k ', k)
    clf_list = []
    rademacher_list = []
    for depth in max_depth_range:
        rademacher_list.append(
            calc_rademacher(depth, sample_size, num_features, normalizer))
    for t in range(k):
        best_loss = 10000
        best_error = 1
        best_depth = -1
        best_clf = DecisionTreeClassifier(max_depth=0)
        for depth in max_depth_range:
            new_clf_list, new_weights = DeepBoost(features,
                                                  labels,
                                                  1,
                                                  max_depth_range,
                                                  initial_weights=weights)

            new_clf = DecisionTreeClassifier(max_depth=depth)
            new_clf = new_clf.fit(features, labels, sample_weight=weights)
            new_error = eval_clf(new_clf, features, labels, weights)
            new_edge = new_error - 0.5
            new_sign_edge = np.sign(new_edge)
            new_loss = new_error + PARAM_lambda_2 * rademacher_list[depth - 1]
            #             print ('new_error', new_error, 'new_grad', new_grad)
            print('depth', depth, 'new_error', new_error, 'new_grad', new_loss)
            if (new_loss < best_loss):
                best_clf = new_clf
                best_loss = new_loss
                best_error = new_error
                best_depth = depth

        y_predict = best_clf.predict(features)
        correct_ones = y_predict == labels
        counts += correct_ones
        #         if (best_error >= 0.5):
        #             break;
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - t - 1 + counts
        weights = comb(k - t - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)

        print('i', t, 'error', best_error, 'wnorm',
              np.linalg.norm(weights, ord=1))
        weights = weights / np.linalg.norm(weights, ord=1)

        clf_list.append([best_clf, 1])
    return clf_list, weights
Example #15
def DeepBBM(features, labels, gamma, max_depth_range, PARAM_lambda_2):
    num_features = features.shape[1]
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k_pre = get_k_from_gamma(gamma, sample_size)
    k = k_pre
    #k = min(600, k_pre)
    normalizer = np.exp(1) * sample_size
    # print ('k ', k)
    clf_list = []
    rademacher_list = []
    for depth_index in range(len(max_depth_range)):
        depth = max_depth_range[depth_index]
        rademacher_list.append(
            calc_rademacher(depth, sample_size, num_features, normalizer))
    for t in range(k):
        best_loss = 10000
        best_error = 1
        best_depth = -1
        best_clf = DecisionTreeClassifier(max_depth=0)
        for depth_index in range(len(max_depth_range)):
            depth = max_depth_range[depth_index]
            new_clf = DecisionTreeClassifier(max_depth=depth)
            new_clf = new_clf.fit(features, labels, sample_weight=weights)
            new_error = eval_clf(new_clf, features, labels, weights)
            new_edge = new_error - 0.5
            new_sign_edge = np.sign(new_edge)
            new_loss = new_error + PARAM_lambda_2 * rademacher_list[depth_index]
            if (new_loss < best_loss):
                best_clf = new_clf
                best_loss = new_loss
                best_error = new_error
                best_depth = depth

        y_predict = best_clf.predict(features)
        correct_ones = y_predict == labels
        counts += correct_ones
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - t - 1 + counts
        weights = comb(k - t - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)

        clf_list.append([best_clf, 1, best_depth])
        # print ('i', t, 'error', best_error, 'wnorm', np.linalg.norm(weights, ord=1))

        if (np.max(coeff_1) < 0):
            break

        weights = weights / np.linalg.norm(weights, ord=1)

    return clf_list, weights
Example #16
def BrownBoost(features, labels, max_depth, total_time):
    sample_size = features.shape[0]
    clf_list = []
    r = np.zeros(sample_size)
    weights = np.array([])
    s = total_time
    # s is the remaining time, initialized to total_time
    T = total_time
    alpha = 0
    i = 0
    b = np.zeros(sample_size)
    while (s > 0 and i < 200):
        weights = np.exp(-(r + s)**2 / total_time)
        weights = weights / (np.sum(weights))
        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf = clf.fit(features, labels, sample_weight=weights)
        y_predict = clf.predict(features)
        incorrect = y_predict != labels
        # Error fraction
        estimator_error = np.mean(
            np.average(incorrect, weights=weights, axis=0))
        print('estimator_error is', estimator_error)
        if (estimator_error >= 0.5):
            break
        for j in range(sample_size):
            if (labels[j] == y_predict[j]):
                b[j] = 1
            else:
                b[j] = -1
        a = r + s
        (t, alpha) = SolveODE(a, b, s, sample_size, T)
        r += alpha * b
        s = s - t
        print(s)
        clf_list.append([clf, alpha])
        i += 1
    return clf_list
Example #17
def ret_trained_DT_clf(X, Y):
    clf = DecisionTreeClassifier(max_depth=3)
    clf.fit(X, Y)
    return clf
Example #18
target_b = [0 if target[i] == "ELK" else 1 for i in range(len(target))]
target_c = [0 if target[i] == "CATTLE" else 1 for i in range(len(target))]
X_train_deer, X_test_deer, y_train_deer, y_test_deer = train_test_split(
    train, target_a, random_state=0, test_size=0.3)
X_train_elk, X_test_elk, y_train_elk, y_test_elk = train_test_split(
    train, target_b, random_state=0, test_size=0.3)
X_train_cattle, X_test_cattle, y_train_cattle, y_test_cattle = train_test_split(
    train, target_c, random_state=0, test_size=0.3)

print("-----Question 1-----")
##Question 1##
###Decision Tree
print("-----DECISION TREE-----")
print("DEER confusion Matrix and accuracy score")
clf = DecisionTreeClassifier()
clf.fit(X_train_deer, y_train_deer)
y_pred = clf.predict(X_test_deer)  ##predict my y's based on x's
print(confusion_matrix(y_test_deer, y_pred))
print("Testing Score")
print(accuracy_score(y_test_deer, y_pred))  #
y_pred = clf.predict(X_train_deer)
print("Training Score")
print(accuracy_score(y_train_deer, y_pred))

print("ELK confusion matrix and accuracy score")
clf = DecisionTreeClassifier()
clf.fit(X_train_elk, y_train_elk)
y_pred = clf.predict(X_test_elk)
print(confusion_matrix(y_test_elk, y_pred))
print("Testing Score")
print(accuracy_score(y_test_elk, y_pred))
Example #19
'''
Created on 2019-01-04

Decision tree
'''
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier


def iris_type(s):
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return it[str(s, encoding="utf8")]


path = 'demo1_Iris.txt'  #
data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})

x, y = np.split(data, (4, ), axis=1)
x = x[:, :4]
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=1,
                                                    train_size=0.6)
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(x_train, y_train.ravel())  # fit on the training split only; fitting on all of x would leak x_test into training

print('feature_importances_', clf.feature_importances_)

y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))
Example #20
class Model(object):
    """
    The machine learning component of the tester.

    This component stores four different models:
    1) A model to decide between different types of events (drags and touches).
    2) A model to decide on the starting position for drags.
    3) A model to decide on the ending position for drags.
    4) A model to decide on the position of the touch.

    The input data are all the different known UI elements on the screen from
    the training data and whether or not they are visible on the screen.

    To acquire this, we first get the stored XML model and record the resource-id
    and class. We concatenate them into an array and mark as (1) for visible and (0)
    for not visible.
    """

    def __init__(self):
        self.symbols = {}
        self.action_data = None
        self.action_labels = None
        self.action_classifier = None
        self.drag_data = None
        self.drag_end_labels = None
        self.drag_end_classifier = None
        self.drag_start_labels = None
        self.drag_start_classifier = None
        self.touch_data = None
        self.touch_labels = None
        self.touch_classifier = None
        self.device_info = device.info

    def parse_events(self, queue):
        symbols = {"randomizer": 0}
        events = []

        all_data = []
        all_results = []
        drag_data = []
        drag_start_results = []
        drag_end_results = []
        touch_data = []
        touch_results = []

        while not queue.empty():
            event = queue.get()
            events.append(event)

            lst = event.state.start.as_list(symbols)
            lst[0] = random()
            all_data.append(lst)

            if event.action.is_drag():
                drag_data.append(lst)
                all_results.append(DRAG)

                start = event.changes.start()
                end = event.changes.end()
                drag_start_results.append(start.x * start.y)
                drag_end_results.append(end.x * end.y)

            if event.action.is_touch():
                touch_data.append(lst)
                all_results.append(TOUCH)

                start = event.changes.start()
                touch_results.append(start.x * start.y)

            if event.action.is_back():
                all_results.append(BACK)

        data = np.zeros((len(all_data), len(symbols)))
        for i, item in enumerate(all_data):
            data[i, : len(item)] = item[:]

        drags = np.zeros((len(drag_data), len(symbols)))
        for i, item in enumerate(drag_data):
            drags[i, : len(item)] = item[:]

        touches = np.zeros((len(touch_data), len(symbols)))
        for i, item in enumerate(touch_data):
            touches[i, : len(item)] = item[:]

        self.symbols = symbols

        self.action_data = data
        self.action_labels = np.array(all_results)

        self.drag_data = drags
        self.drag_start_labels = np.array(drag_start_results)
        self.drag_end_labels = np.array(drag_end_results)

        self.touch_data = touches
        self.touch_labels = np.array(touch_results)

    def train(self):
        self.action_classifier = DecisionTreeClassifier()
        self.action_classifier.fit(self.action_data, self.action_labels)

        self.drag_start_classifier = DecisionTreeRegressor()
        self.drag_start_classifier.fit(self.drag_data, self.drag_start_labels)

        self.drag_end_classifier = DecisionTreeRegressor()
        self.drag_end_classifier.fit(self.drag_data, self.drag_end_labels)

        self.touch_classifier = DecisionTreeRegressor()
        self.touch_classifier.fit(self.touch_data, self.touch_labels)

    def predict(self, state):
        input = state.as_list(self.symbols, False)
        input[0] = random()
        action = Action()

        type = self.action_classifier.predict(input)
        width = self.device_info["displayWidth"]
        if type == DRAG:
            start = self.drag_start_classifier.predict(input)[0]
            end = self.drag_end_classifier.predict(input)[0]
            start = Point(start % width, start / width)
            end = Point(end % width, end / width)

            action.init(ACTION_DRAG, start, end, 0.5)
        elif type == TOUCH:
            point = self.touch_classifier.predict(input)[0]
            point = Point(point % width, point / width)

            action.init(ACTION_TOUCH, point.x, point.y)
        elif type == BACK:
            action.init(ACTION_BACK)

        return action

    def save(self):
        pass
Example #21
def decision_tree_fit(X, y):
    clf = DecisionTreeClassifier(min_samples_leaf=5, random_state=42)
    return clf.fit(X, y)
Example #22
	build_iris(RidgeClassifierCV(), "RidgeIris", with_proba = False)
	build_iris(BaggingClassifier(RidgeClassifier(random_state = 13), random_state = 13, n_estimators = 3, max_features = 0.5), "RidgeEnsembleIris")
	build_iris(SGDClassifier(random_state = 13, max_iter = 100), "SGDIris", with_proba = False)
	build_iris(SGDClassifier(random_state = 13, loss = "log", max_iter = 100), "SGDLogIris")
	build_iris(SVC(), "SVCIris", with_proba = False)
	build_iris(NuSVC(), "NuSVCIris", with_proba = False)
	build_iris(VotingClassifier([("dt", DecisionTreeClassifier(random_state = 13)), ("nb", GaussianNB()), ("lr", LogisticRegression())]), "VotingEnsembleIris", with_proba = False)
	build_iris(OptimalXGBClassifier(objective = "multi:softprob", ntree_limit = 7), "XGBIris", ntree_limit = 7)

if "Iris" in datasets:
	mapper = DataFrameMapper([
		(iris_X.columns.values, ContinuousDomain())
	])
	iris_Xt = mapper.fit_transform(iris_X)
	dt_classifier = DecisionTreeClassifier(random_state = 13)
	dt_classifier.fit(iris_Xt, iris_y)
	lr_classifier = LogisticRegression(random_state = 13)
	lr_classifier.fit(iris_Xt, iris_y)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("estimator", SelectFirstEstimator([
			("X[2] <= 3", dt_classifier),
			(str(True), lr_classifier)
		]))
	])
	pipeline.active_fields = iris_X.columns.values
	pipeline.target_fields = ["Species"]
	store_pkl(pipeline, "SelectFirstIris")
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	store_csv(species, "SelectFirstIris")
Example #23

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train   = labels_train[:150]



### your code goes here


from sklearn.tree import DecisionTreeClassifier

vocab_list = vectorizer.get_feature_names_out()


dtc = DecisionTreeClassifier()
dtc.fit(features_train, labels_train)
pred = dtc.predict(features_test)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels_test, pred)
print(accuracy)
feature_importances = dtc.feature_importances_
for i in range(0, len(feature_importances)):
    if feature_importances[i] > 0.2:
        print("Importance = ", feature_importances[i], " number is ", i, " word is ", vocab_list[i])


Example #24
def main():
    print("Loading samples and labels")
    samples, labels, _ = load_files("data")
    print("Loaded {} samples".format(samples.shape[0]))

    sequence_dim = 100
    print("Converting to sequences of length {}".format(sequence_dim))
    samples, labels = make_sequences(samples, labels, sequence_dim)

    print("Number of samples from sequences: {}".format(samples.shape[0]))

    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)

    # flattened samples for Decision Tree
    flatSamples = samples.reshape(samples.shape[0], -1)  #tree!
    (trainSamples, testSamples, trainLabels,
     testLabels) = train_test_split(flatSamples,
                                    labels,
                                    test_size=0.25,
                                    random_state=42)

    print("=" * 20)
    print("Building DecisionTree model")
    model = DecisionTreeClassifier()
    model.fit(trainSamples, trainLabels)
    treeResults = model.predict(testSamples)
    print(
        confusion_matrix(testLabels.argmax(axis=1),
                         treeResults.argmax(axis=1)))
    print(
        classification_report(testLabels.argmax(axis=1),
                              treeResults.argmax(axis=1)))
    treeAcc = accuracy_score(testLabels.argmax(axis=1),
                             treeResults.argmax(axis=1))
    print("Accuracy Tree: {:.2f}".format(treeAcc))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(testLabels.argmax(axis=1),
                          treeResults.argmax(axis=1))))

    print("=" * 20)
    print("Building CNN model")

    (trainSamples, testSamples, trainLabels,
     testLabels) = train_test_split(samples,
                                    labels,
                                    test_size=0.25,
                                    random_state=42)
    inputShape = (samples.shape[1], samples.shape[2])
    model = Sequential()
    model.add(Conv1D(32, 10, padding="same", input_shape=inputShape))
    model.add(Activation("relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    model.add(Conv1D(64, 10, padding="same"))
    model.add(Activation("relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Conv1D(128, 10, padding="same"))
    model.add(Activation("relu"))
    model.add(Dropout(0.2))
    model.add(Flatten(input_shape=inputShape))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dense(64, activation='sigmoid'))
    model.add(Dense(labels.shape[1], activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])

    EPOCHS = 10
    BATCH = 128
    model.fit(trainSamples,
              trainLabels,
              batch_size=BATCH,
              epochs=EPOCHS,
              validation_data=(testSamples, testLabels))

    cnnResults = model.predict(testSamples)

    print(
        confusion_matrix(testLabels.argmax(axis=1), cnnResults.argmax(axis=1)))
    print(
        classification_report(testLabels.argmax(axis=1),
                              cnnResults.argmax(axis=1),
                              target_names=lb.classes_))
    print("CNN Accuracy: {:.2f}".format(
        accuracy_score(testLabels.argmax(axis=1), cnnResults.argmax(axis=1))))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(testLabels.argmax(axis=1),
                          cnnResults.argmax(axis=1))))
    input("")
Example #25
# with open('../feature_names.pickle', 'r') as pickled:
#    feature_names = pickle.load(pickled)

print "Loaded data; testing classifier..."

features_train, labels_train = ClassBalancingClassifierWrapper.rebalance(
    features_train, labels_train, ratio=2)

results = []
for i in range(15):
    print('Round', i)
    classifier = DecisionTreeClassifier()
    classifier = SKLPipeline([('feature_selection',
                               SelectPercentile(f_classif, 1)),
                              ('classification', classifier)])
    classifier.fit(features_train, labels_train)

    labels_test_predicted = classifier.predict(features_test)
    results.append(diff_binary_vectors(labels_test_predicted,
                                       labels_test_gold))

# support = classifier.steps[0][1].get_support(True)
# print 'Selected', len(support), 'features:'
# for index in support:
#    print '   ', feature_names[index]

print('Results:')
print(ClassificationMetrics.average(results, False))

# Visualize last round
'''
Example #26
            bestIndex = j
            bestP = data["pred_" + top16tags[j]][i]
    labelY.append(bestIndex)
    bestPossibleY.append(bestP)

print("Label Y stat:")
labelYStat = defaultdict(lambda: 0)
for ly in labelY:
    labelYStat[ly] = labelYStat[ly] + 1
for i in range(0, 16):
    print("\tindex " + str(i) + ": " + str(labelYStat[i]))

model_to_show = DecisionTreeClassifier(random_state=42, max_depth=5)
model = DecisionTreeClassifier(random_state=42, max_depth=30)
# model = RandomForestClassifier(n_estimators=25, random_state=42)
model.fit(X, labelY)
model_to_show.fit(X, labelY)

tree.export_graphviz(model_to_show,
                     out_file=OUTPUT_TREE_FILE,
                     feature_names=columns,
                     label='none')

predY = model.predict(X)

print("Pred Y stat:")
predYStat = defaultdict(lambda: 0)
for py in predY:
    predYStat[py] = predYStat[py] + 1
for i in range(0, 16):
    print("\tindex " + str(i) + ": " + str(predYStat[i]))
Example #27
from sklearn.metrics import f1_score
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, Y = make_classification(
    n_samples=10000, 
    n_features=1000,
    n_informative=10)

trai_x, test_x, trai_y, test_y = train_test_split(X, Y, train_size=0.8)

#
clf = DecisionTreeClassifier()
model = clf.fit(trai_x, trai_y)

trai_yp = model.predict(trai_x)
test_yp = model.predict(test_x)
print(
    model,
    "F1, Trai:%.6f, Test:%.6f" % 
    (f1_score(trai_y, trai_yp), f1_score(test_y, test_yp))
)

# 
clf = LogisticRegression()
model = clf.fit(trai_x, trai_y)

trai_yp = model.predict(trai_x)
test_yp = model.predict(test_x)
Example #28
def DeepBoost(features,
              labels,
              n_steps,
              max_depth_range,
              PARAM_lambda,
              PARAM_beta,
              initial_weights=None):
    num_features = features.shape[1]
    sample_size = features.shape[0]
    if initial_weights is None:
        weights = np.ones(sample_size) / sample_size
    else:
        weights = initial_weights
    normalizer = np.exp(1) * sample_size
    clf_list = []
    for t in range(n_steps):
        best_error = 0
        best_grad = 0
        best_index = -1  #?
        old_tree_is_best = False
        for j in range(len(clf_list)):
            triple = clf_list[j]
            alpha = triple[1]
            if (abs(alpha) >= kTolerance):
                old_clf = triple[0]
                tree_depth = triple[2]
                error = eval_clf(old_clf, features, labels, weights)
                edge = error - 0.5
                sign_edge = np.sign(edge)
                grad = gradient(error, tree_depth, alpha, sign_edge,
                                sample_size, num_features, normalizer,
                                PARAM_lambda, PARAM_beta)
                # print ('depth', tree_depth, 'error', error, 'grad', grad)
                if (abs(grad) > abs(best_grad)):
                    best_grad = grad
                    best_error = error
                    best_index = j
                    old_tree_is_best = True
        best_depth = -1
        for depth in max_depth_range:
            new_clf = DecisionTreeClassifier(max_depth=depth)
            new_clf = new_clf.fit(features, labels, sample_weight=weights)
            new_error = eval_clf(new_clf, features, labels, weights)
            new_edge = new_error - 0.5
            new_sign_edge = np.sign(new_edge)
            new_grad = gradient(new_error, depth, 0, new_sign_edge,
                                sample_size, num_features, normalizer,
                                PARAM_lambda, PARAM_beta)
            if (abs(new_grad) > abs(best_grad)):
                best_new_clf = new_clf
                best_grad = new_grad
                best_error = new_error
                best_depth = depth
                old_tree_is_best = False
        if old_tree_is_best:
            triple = clf_list[best_index]
            alpha = triple[1]
            clf = triple[0]
            depth = triple[2]
            eta = compute_eta(best_error, depth, alpha, sample_size,
                              num_features, normalizer, PARAM_lambda,
                              PARAM_beta)
            clf_list[best_index][1] += eta
        else:
            alpha = 0
            clf = best_new_clf
            depth = best_depth
            #print ('t', t, 'best_error', best_error)
            eta = compute_eta(best_error, depth, alpha, sample_size,
                              num_features, normalizer, PARAM_lambda,
                              PARAM_beta)
            clf_list.append([clf, eta, depth])
        old_normalizer = normalizer
        normalizer = 0
        y_predict = clf.predict(features)
        for i in range(sample_size):
            if (labels[i] == y_predict[i]):
                u = eta
            else:
                u = -eta
            weights[i] *= np.exp(-u)
            normalizer += weights[i]
        weights = weights / normalizer
        normalizer = normalizer * old_normalizer
    return clf_list, weights
Example #29
print("rmse: " + str(rmse))

rmse = rmseEval(data["tw"]['target'], combinedPrediction2)[1]
print("rmse: " + str(rmse))

print("identification test:")

identificationColumns = []
for c in columns["all"]:
    if c not in ['target', 'prediction', 'timestamp', 'location']:
        identificationColumns.append(c)

clf = DecisionTreeClassifier()

traintestX = generateTrainingData(data["all"], identificationColumns)

clf = clf.fit(traintestX, label)
prediction = clf.predict(traintestX)

a = accuracy_score(label, prediction)
print(str(a))

a = accuracy_score(label, prediction, normalize=False)
print(str(a))

a = confusion_matrix(label, prediction)
print(str(a))

# with open(OUTPUT_DIRECTORY + "dt.dot", 'w') as f:
#     f = tree.export_graphviz(clf, out_file=f, feature_names=identificationColumns)#, max_depth=10)
Example #30
def train(dataset):
    if (dataset == 'spambase'):
        features, labels, testing_features, true_labels = fetch_data_from_raw(
            'spambase')
    else:
        features, labels, testing_features, true_labels = fetch_npy_data(
            dataset)

    PARAM_lambda = 0.001
    PARAM_lambda_2 = 0.01
    PARAM_beta = 0.001
    gamma = 0.06
    tree_depth = 15
    depth_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    ## DeepBoost
    T = 200
    clf_list_db, weights = DeepBoost(features, labels, T, depth_range,
                                     PARAM_lambda, PARAM_beta)
    train_error_db = testEnsemble(clf_list_db, features, labels)
    test_error_db = testEnsemble(clf_list_db, testing_features, true_labels)

    print('db done')
    ## Deep BBM

    gamma_list = [0.15, 0.1, 0.08, 0.06]
    lambda_2_list = [1, 0.1, 0.01, 0.001, 0.0001]
    max_depth_list = [1, 2, 3, 5, 10, 15]

    # parameter search for Deep BBM
    # train_errors_dbbm = np.zeros([len(gamma_list), len(lambda_2_list), len(max_depth_list)])
    # test_errors_dbbm = np.zeros([len(gamma_list), len(lambda_2_list), len(max_depth_list)])
    # for i in range(len(gamma_list)):
    #     for j in range(len(lambda_2_list)):
    #         for k in range(len(max_depth_list)):
    #             gamma = gamma_list[i]
    #             lambda_2 = lambda_2_list[j]
    #             max_depth = max_depth_list[k]
    #             depth_range = []
    #             for l in range(max_depth):
    #                 depth_range.append(l+1)
    #             print (depth_range)
    #             clf_list_dbbm, weights = DeepBBM(features, labels, gamma, depth_range, lambda_2)
    #             train_error_dbbm = testEnsemble(clf_list_dbbm, features, labels)
    #             test_error_dbbm = testEnsemble(clf_list_dbbm, testing_features, true_labels)
    #             print ('ga', gamma, 'l2', lambda_2, 'md', max_depth, 'TrErr', train_error_dbbm, 'TeErr', test_error_dbbm)
    #             train_errors_dbbm[i, j, k] = train_error_dbbm
    #             test_errors_dbbm[i, j, k] = test_error_dbbm
    # np.save('TrErr_dbbm_ps_2', train_errors_dbbm)
    # np.save('TeErr_dbbm_ps_2', test_errors_dbbm)

    clf_list_dbbm, weights = DeepBBM(features, labels, gamma, depth_range,
                                     PARAM_lambda_2)
    train_error_dbbm = testEnsemble(clf_list_dbbm, features, labels)
    test_error_dbbm = testEnsemble(clf_list_dbbm, testing_features,
                                   true_labels)
    print('dbbm done')

    ## DecisionTreeClassifier
    dtc = DecisionTreeClassifier(max_depth=tree_depth)
    dtc = dtc.fit(features, labels)
    train_pred = dtc.predict(features)
    train_mse_dtc = ((train_pred - labels)**2).mean(axis=0)
    test_pred = dtc.predict(testing_features)
    # print (np.concatenate((np.expand_dims(pred, axis=1), np.expand_dims(true_labels, axis=1)), axis=1))
    test_mse_dtc = ((test_pred - true_labels)**2).mean(axis=0)

    ## Boost by Majority
    # gamma = 0.1
    clf_list_bbm, weights = BoostByMaj(features, labels, tree_depth, gamma)
    train_error_bbm = testEnsemble(clf_list_bbm, features, labels)
    test_error_bbm = testEnsemble(clf_list_bbm, testing_features, true_labels)
    #PlotMarginDistribution(clf_list_bbm, testing_features, true_labels)
    print('bbm done')

    # ## AdaBoost
    # T = 200
    clf_list_adb = AdaBoostClf(features, labels, tree_depth, T)
    train_error_adb = testEnsemble(clf_list_adb, features, labels)
    test_error_adb = testEnsemble(clf_list_adb, testing_features, true_labels)
    print('adb done')
    #PlotMarginDistribution(clf_list_adb, testing_features, true_labels)

    ## MarginBoost (from our homework)
    # T = 200
    margin = pow(2, -6)
    clf_list_mb = MarginBoostClf(features, labels, tree_depth, T, margin)
    train_error_mb = testEnsemble(clf_list_mb, features, labels)
    test_error_mb = testEnsemble(clf_list_mb, testing_features, true_labels)
    print('mb done')
    #PlotMarginDistribution(clf_list_mgb, testing_features, true_labels)

    ## BrownBoost
    total_time = 100
    clf_list_brown = BrownBoost(features, labels, tree_depth, total_time)
    train_error_brown = testEnsemble(clf_list_brown, features, labels)
    test_error_brown = testEnsemble(clf_list_brown, testing_features,
                                    true_labels)
    print('bb done')

    print('DeepBoost: train_error', train_error_db)
    print('DeepBoost: test_error', test_error_db)
    print('DeepBBM: train_error', train_error_dbbm)
    print('DeepBBM: test_error', test_error_dbbm)
    print('decision tree: train_mse', train_mse_dtc)
    print('decision tree: test_mse', test_mse_dtc)
    print('BBM: train_error', train_error_bbm)
    print('BBM: test_error', test_error_bbm)
    print('AdaBoost: train_error', train_error_adb)
    print('AdaBoost: test_error', test_error_adb)
    print('MarginBoost: train_error', train_error_mb)
    print('MarginBoost: test_error', test_error_mb)
    print('BrownBoost: train_error', train_error_brown)
    print('BrownBoost: test_error', test_error_brown)
Example #32
     '''
     Random forest
     '''
             
     print('Run random forest....')
     rr_model = RandomForestClassifier(n_estimators=100,max_depth=10, random_state=1)
     rr_model.fit(rel_train_X.relation_matrix, rel_train_Y.values)
     
     rf_pred_train = rr_model.predict(rel_train_X.relation_matrix)
     train_result.append(('RF', evaluateByF1(rf_pred_train, rel_train_Y.values)))
     
     rf_pred_test = rr_model.predict(test_X)
     test_result.append(('RF', evaluateByF1(rf_pred_test, test_Y)))
     
     
     print('Run decision tree....')
     id3_model = DecisionTreeClassifier(max_depth=10, random_state=1)
     id3_model.fit(rel_train_X.relation_matrix, rel_train_Y.values)
     
     id3_pred_train = id3_model.predict(rel_train_X.relation_matrix)
     train_result.append(('ID3', evaluateByF1(id3_pred_train, rel_train_Y.values)))
     
     id3_pred_test = id3_model.predict(test_X)
     test_result.append(('ID3', evaluateByF1(id3_pred_test, test_Y)))
     
     print('Performance of CBA and CMAR with different measures:')
     printList(train_result)
     printList(test_result)
     
 
 
Example #33
from numpy import loadtxt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

func = lambda x: 0.0 if x == b'False' else 1.0
all_data = loadtxt('flare.csv',
                   delimiter=',',
                   skiprows=1,
                   converters={32: func})
target = all_data[:, -1]
data = all_data[:, 0:-1]
train_x, test_x, train_y, test_y = train_test_split(data,
                                                    target,
                                                    test_size=0.25,
                                                    random_state=100)

# Part A - train the tree without pruning
clf = DecisionTreeClassifier(criterion="entropy")
clf.fit(train_x, train_y)
success_rate = clf.score(test_x, test_y)

# Part B - constrain growth so every leaf has at least 20 samples
clf = DecisionTreeClassifier(criterion="entropy", min_samples_leaf=20)
clf.fit(train_x, train_y)
success_rate = clf.score(test_x, test_y)
Example #34
y_train_age = data[pd.notnull(data.Age)][['Age']]
regresor.fit(X_train_age, y_train_age)

# TODO: verify this age prediction; it seems to work OK
# data['AgePredicted'] = np.where(pd.isnull(data.Age), regresor.predict(data[['Title', 'SibSp', 'Parch']]), None)
data['Age'] = np.where(pd.isnull(data.Age),
                       regresor.predict(data[['Title', 'SibSp', 'Parch']]),
                       data['Age'])

## predict the deck (Floor)
classifier = DecisionTreeClassifier(max_depth=3, min_samples_leaf=2)
#

X_train_floor = data[pd.notnull(data.Floor)][['Embarked', 'Pclass']]
y_train_floor = data[pd.notnull(data.Floor)]['Floor'].values.astype('int')
classifier.fit(X_train_floor, y_train_floor)

data['Floor'] = np.where(pd.isnull(data.Floor),
                         classifier.predict(data[['Embarked', 'Pclass']]),
                         data['Floor'])

## adjust the fare per ticket
data['TicketCounts'] = data.groupby('Ticket')['Ticket'].transform('count')

data['Fare'] = data['Fare'] / data['TicketCounts']

## drop unused columns
data = data.drop(['Ticket', 'Cabin', 'Name', 'SibSp', 'Parch', 'TicketCounts'],
                 axis=1)

## set an index on the column
Example #35
store_pkl(audit_mapper, "Audit.pkl")

audit_X = audit[:, 0:48]
audit_y = audit[:, 48]

audit_y = audit_y.astype(int)

print(audit_X.dtype, audit_y.dtype)

def predict_audit(classifier):
    adjusted = DataFrame(classifier.predict(audit_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(classifier.predict_proba(audit_X), columns = ["probability_0", "probability_1"])
    return pandas.concat((adjusted, adjusted_proba), axis = 1)

audit_tree = DecisionTreeClassifier(random_state = 13, min_samples_leaf = 5)
audit_tree.fit(audit_X, audit_y)

store_pkl(audit_tree, "DecisionTreeAudit.pkl")
store_csv(predict_audit(audit_tree), "DecisionTreeAudit.csv")

audit_forest = RandomForestClassifier(random_state = 13, min_samples_leaf = 5)
audit_forest.fit(audit_X, audit_y)

store_pkl(audit_forest, "RandomForestAudit.pkl")
store_csv(predict_audit(audit_forest), "RandomForestAudit.csv")

audit_regression = LogisticRegression()
audit_regression.fit(audit_X, audit_y)

store_pkl(audit_regression, "RegressionAudit.pkl")
store_csv(predict_audit(audit_regression), "RegressionAudit.csv")
Example #37
    dot_filename = mkstemp(suffix='.dot', dir=tmp_dir)[1]
    with open(dot_filename, "w") as out_file:
        export_graphviz(clf, out_file=out_file,
                        feature_names=feature_names,
                        class_names=class_names,
                        filled=True, rounded=True,
                        special_characters=True)

    from IPython.display import Image

    image_filename = image_filename or ('%s.png' % dot_filename)

    subprocess.call(('dot -Tpng -o %s %s' %
                     (image_filename, dot_filename)).split(' '))
    image = Image(filename=image_filename)
    os.remove(dot_filename)
    return image


from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

iris = datasets.load_iris()
X = iris.data
Y = iris.target

clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X, Y)
convert_decision_tree_to_ipython_image(clf, image_filename='tree.png')
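The helper above is shown without its opening lines; judging from its body, the imports it relies on (beyond the ones already listed in the excerpt) would be roughly the following, reconstructed as an assumption rather than copied from the original source:

# Imports the truncated convert_decision_tree_to_ipython_image helper appears to need.
import os
import subprocess
from tempfile import mkstemp
from sklearn.tree import export_graphviz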