def init_classifiers(model_config, observations, target):
    classifiers = {}

    for key in model_config.keys():
        print('Initializing classifier', key)
        if key == 'svm':
            best_estimator = search_best_param_for_model(
                key, SVC(), model_config[key], observations, target)
            classifiers[key] = best_estimator

        if key == 'decision_tree':
            best_estimator = search_best_param_for_model(
                key, DecisionTreeClassifier(), model_config[key], observations,
                target)
            classifiers[key] = best_estimator

        if key == 'random_forest':
            best_estimator = search_best_param_for_model(
                key, RandomForestClassifier(), model_config[key], observations,
                target)
            classifiers[key] = best_estimator

        if key == 'adaboost':
            best_estimator = search_best_param_for_model(
                key,
                AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
                adjust_adaboost_param(model_config[key]), observations, target)
            classifiers[key] = best_estimator

    return classifiers
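# The model_config dict and the search_best_param_for_model helper are not
# shown in this example. A minimal sketch of what they might look like,
# assuming a plain GridSearchCV search (the grids and names are hypothetical):
from sklearn.model_selection import GridSearchCV

example_model_config = {
    'svm': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'decision_tree': {'max_depth': [3, 5, None]},
}

def search_best_param_for_model(name, estimator, param_grid, observations,
                                target):
    # exhaustive grid search with 5-fold cross-validation
    print('Tuning', name)
    search = GridSearchCV(estimator, param_grid, cv=5)
    search.fit(observations, target)
    return search.best_estimator_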
Example 2
def wrapper_for_decision_tree_in_sklearn(X, y, current_state_to_predict):
    clf = DecisionTreeClassifier()
    clf.fit(X, y)
    current_state_to_predict = np.array(current_state_to_predict).reshape(
        1, -1)
    predicted_state = clf.predict(current_state_to_predict)
    return predicted_state
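# A usage sketch: train on the iris data and classify a single new
# observation (the sample values are arbitrary; assumes the imports used
# throughout these examples).
from sklearn.datasets import load_iris

iris = load_iris()
print(wrapper_for_decision_tree_in_sklearn(
    iris.data, iris.target, [5.1, 3.5, 1.4, 0.2]))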
Example 3
def BoostByMaj(features, labels, max_depth, gamma):
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k_pre = get_k_from_gamma(gamma, sample_size)
    k = k_pre
    #k = min(600, k_pre)
    print('k ', k)
    clf_list = []
    for i in range(k):
        estimator_error = 0.6
        countdown = 10
        while ((estimator_error >= 0.5) and (countdown >= 0)):
            clf = DecisionTreeClassifier(max_depth=max_depth)
            clf = clf.fit(features, labels, sample_weight=weights)
            y_predict = clf.predict(features)
            correct_ones = y_predict == labels
            incorrect_ones = y_predict != labels
            estimator_error = np.mean(
                np.average(incorrect_ones, weights=weights, axis=0))
            unweighted_estimator_error = np.mean(
                np.average(incorrect_ones, axis=0))
            countdown -= 1
        counts += correct_ones
        # Boost-by-Majority weight update: each example is weighted by the
        # binomial probability that the remaining k - i - 1 rounds still flip
        # its final majority vote.
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - i - 1 + counts
        weights = comb(k - i - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)

        print('i', i, 'error', estimator_error, 'unweighted_error',
              unweighted_estimator_error, 'wnorm',
              np.linalg.norm(weights, ord=1))
        weights = weights / np.linalg.norm(weights, ord=1)
        clf_list.append([clf, 1])
    return clf_list, weights
Example 4
def MarginBoostClf(features, labels, max_depth, n_steps, margin):
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    clf_list = []
    for t in range(n_steps):
        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf = clf.fit(features, labels, sample_weight=weights)
        y_predict = clf.predict(features)
        incorrect = y_predict != labels
        # Error fraction
        estimator_error = np.mean(
            np.average(incorrect, weights=weights, axis=0))
        if (estimator_error >= 0.5):
            break
        # AdaBoost step size, shifted by the target margin:
        # 0.5 * (ln((1 - err) / err) + ln((1 - margin) / (1 + margin)))
        step_size = 0.5 * (np.log((1 - estimator_error) / estimator_error) +
                           np.log(1 - margin) - np.log(1 + margin))
        norm_factor = 2 * pow(estimator_error * (1 - estimator_error), 0.5)

        for i in range(sample_size):
            if (labels[i] == y_predict[i]):
                weights[i] *= np.exp(-step_size) / norm_factor
            else:
                weights[i] *= np.exp(step_size) / norm_factor
        clf_list.append([clf, step_size])
    return clf_list
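# Both BoostByMaj and MarginBoostClf return a list of [classifier, coefficient]
# pairs. A minimal sketch of the matching ensemble predictor, assuming numpy
# as np and labels encoded as -1/+1:
def ensemble_predict(clf_list, features):
    score = sum(coeff * clf.predict(features) for clf, coeff in clf_list)
    return np.sign(score)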
Example 5
def train_ensemble_decision_tree_classifier():
    # tuned hyperparameters: min_samples_split, min_samples_leaf, max_leaf_nodes, splitter
    classifier1 = SklearnClassifier(DecisionTreeClassifier(random_state=0),
                                    sparse=False)
    classifier2 = SklearnClassifier(DecisionTreeClassifier(max_depth=20,
                                                           min_samples_split=3,
                                                           min_samples_leaf=4,
                                                           max_leaf_nodes=35,
                                                           splitter='best',
                                                           random_state=0),
                                    sparse=False)
    classifier3 = SklearnClassifier(DecisionTreeClassifier(max_depth=30,
                                                           min_samples_split=2,
                                                           min_samples_leaf=2,
                                                           max_leaf_nodes=40,
                                                           splitter='best',
                                                           random_state=0),
                                    sparse=False)
    test_classifiers = []
    test_classifiers.append(classifier1)
    test_classifiers.append(classifier2)
    test_classifiers.append(classifier3)

    trained_classifiers = []

    for classifier in test_classifiers:
        classifier = classifier.train(train_features)
        trained_classifiers.append(classifier)

    voted_classifier = VoteClassifier(trained_classifiers)
    save_classifier(voted_classifier, 'voted_classifier_decision_tree.pickle')

    print_and_get_split_dataset_accuracy(test_classifiers, train_features)
    print_voted_classifier_cross_validation_experiment_result(
        test_classifiers, train_features)
Example 6
    def fit(self, X, y):
        """
        Fit the NearestCentroid model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        """

        self.y = y
        if self.fit_base:
            self.base_classifier.fit(X, y)
        distances = self.base_classifier.predict_proba(X)

        topNIndices, topNDistances = self._get_top_labels(distances)
        training_data = self._extract_features(topNIndices, topNDistances, y,
                                               distances)

        # create a decision tree for each label
        self.meta_classifiers = {}
        for label, training_samples_of_label in training_data.items():
            training_samples_of_label = np.matrix(training_samples_of_label)
            decision_tree = DecisionTreeClassifier(criterion="gini")
            decision_tree.fit(training_samples_of_label[:, 0:-1],
                              training_samples_of_label[:, -1:])
            self.meta_classifiers[label] = decision_tree
Example 8
def dtree(X, y, model_path):
    model = DecisionTreeClassifier()
    model.fit(X, y)
    # NOTE: the report below is computed on the training data itself
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
Example 9
def wrapper_for_decision_tree_accuracy(X, y, relative_test_size):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=relative_test_size, random_state=42)

    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = accuracy_score(y_test, pred)

    return score
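# Usage sketch, e.g. holding out 30% of a dataset (X, y) as the test set:
# score = wrapper_for_decision_tree_accuracy(X, y, relative_test_size=0.3)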
Example 11
    def create_decision_tree(self):
        '''Based on our experiments, the best model was a decision tree with the following parameters.'''

        tree = DecisionTreeClassifier(max_depth=65,
                                      min_samples_split=0.03,
                                      min_samples_leaf=3,
                                      max_features=8)
        tree.fit(self.X_train, self.Y_train)
        predicted_y = tree.predict(self.X_test)
        print(predicted_y)
        self.print_stats(predicted_y, "")
        self.test_df['learning_label'] = predicted_y
        self.test_df.to_csv('output/feature_extraction.csv',
                            encoding="latin-1")  # save the training dataset
Example 13
def decision_tree():
    print "Run Decision Tree"
    pipeline = Pipeline([('count', CountVectorizer(ngram_range=(1, 2))),
                         ('tfidf', TfidfTransformer()),
                         ('classify', DecisionTreeClassifier())])

    print "Splitting into training and testing"
    cutoff = np.random.rand(len(data)) < 0.7
    train = data[cutoff]
    test = data[~cutoff]

    conversationsX = train["conversation"].values
    conversationsY = train["category"].values

    testX = test["conversation"].values
    testY = test["category"].values

    predictX = testSet["conversation"].values

    pipeline.fit(conversationsX, conversationsY)
    testYResults = pipeline.predict(testX)

    report = classification_report(testY, testYResults)
    print(report)

    predictions = pipeline.predict(predictX)
    return predictions
Example 14
def build_audit(classifier, name, with_proba=True):
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Employment", [
            LabelBinarizer(),
            SelectFromModel(EstimatorProxy(
                DecisionTreeClassifier(random_state=13)),
                            threshold="1.25 * mean")
        ]),
        ("Education", [
            LabelBinarizer(),
            SelectorProxy(
                SelectFromModel(EstimatorProxy(
                    RandomForestClassifier(random_state=13, n_estimators=3)),
                                threshold="median"))
        ]), ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(),
                        SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()), ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()), ("Hours", ContinuousDomain())
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if (with_proba == True):
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
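# Usage sketch (hypothetical call; audit_X / audit_y and the PMML helpers are
# assumed to be defined elsewhere in the module):
# build_audit(DecisionTreeClassifier(min_samples_leaf=5), "DecisionTreeAudit")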
Example 15
def train_ensemble_classifier():
    # classifier2 = SklearnClassifier(GaussianNB(), sparse=False)
    # classifier1 = SklearnClassifier(SVC(), sparse=False)
    # classifier3 = SklearnClassifier(RandomForestClassifier(), sparse=False)
    # classifier4 = SklearnClassifier(DecisionTreeClassifier(), sparse=False)
    classifier2 = SklearnClassifier(GaussianNB(), sparse=False)
    # degree is only used by the polynomial kernel; SVC defaults to RBF
    classifier1 = SklearnClassifier(SVC(degree=18, C=12), sparse=False)
    classifier3 = SklearnClassifier(RandomForestClassifier(max_depth=100,
                                                           n_estimators=10),
                                    sparse=False)
    classifier4 = SklearnClassifier(DecisionTreeClassifier(min_samples_split=2,
                                                           min_samples_leaf=2,
                                                           max_leaf_nodes=30,
                                                           splitter='best',
                                                           random_state=0),
                                    sparse=False)
    test_classifiers = []
    test_classifiers.append(classifier1)
    test_classifiers.append(classifier2)
    test_classifiers.append(classifier3)
    test_classifiers.append(classifier4)

    trained_classifiers = []

    for classifier in test_classifiers:
        classifier = classifier.train(train_features)
        trained_classifiers.append(classifier)

    voted_classifier = VoteClassifier(trained_classifiers)
    save_classifier(voted_classifier, 'voted_classifier.pickle')

    print_and_get_split_dataset_accuracy(test_classifiers, train_features)
    print_voted_classifier_cross_validation_experiment_result(
        test_classifiers, train_features)
Example 16
 def __init__(self,
              criterion='gini',
              splitter='best',
              max_depth=None,
              min_samples_split=2,
              min_samples_leaf=1,
              min_weight_fraction_leaf=0.0,
              max_features=None,
              random_state=None,
              max_leaf_nodes=None,
              min_impurity_decrease=0.0,
              min_impurity_split=None,
              class_weight='balanced',
              presort=False):
     self._hyperparams = {
         'criterion': criterion,
         'splitter': splitter,
         'max_depth': max_depth,
         'min_samples_split': min_samples_split,
         'min_samples_leaf': min_samples_leaf,
         'min_weight_fraction_leaf': min_weight_fraction_leaf,
         'max_features': max_features,
         'random_state': random_state,
         'max_leaf_nodes': max_leaf_nodes,
         'min_impurity_decrease': min_impurity_decrease,
         'min_impurity_split': min_impurity_split,
         'class_weight': class_weight,
         'presort': presort
     }
     self._wrapped_model = Op(**self._hyperparams)
Example 17
def sklearn_supervised(data=None,
                       label=None,
                       model_savepath='./models/classify.model',
                       model_name='SVM',
                       **sklearn_param):
    '''
    :param data: training texts
    :param label: labels of the training texts
    :param model_savepath: path where the trained model is saved
    :param model_name: which classifier to use: SVM, KNN, Logistic,
        DecisionTree or Naivebayes
    :return: the trained model
    '''

    if model_name == 'KNN':
        # k-nearest neighbours (n_neighbors defaults to 5)
        model = KNeighborsClassifier(**sklearn_param)
    elif model_name == 'SVM':
        # e.g. a linear kernel with penalty C=1.0, set via sklearn_param
        model = SVC(**sklearn_param)
    elif model_name == 'Logistic':
        # linear model with penalty C=1.0 by default
        model = LogisticRegression(**sklearn_param)
    elif model_name == 'DecisionTree':
        model = DecisionTreeClassifier(**sklearn_param)
    elif model_name == 'Naivebayes':
        model = GaussianNB()
    else:
        raise ValueError('unknown model_name: %s' % model_name)
    model.fit(data, label)

    if model_savepath is not None:
        joblib.dump(model, model_savepath)  # persist the model

    return model
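# Usage sketch on synthetic data (hypothetical; skips saving by passing
# model_savepath=None, and forwards max_depth to the decision tree):
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=10,
                                     random_state=0)
clf = sklearn_supervised(data=X_demo, label=y_demo, model_savepath=None,
                         model_name='DecisionTree', max_depth=5)
print(clf.score(X_demo, y_demo))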
Example 18
 def __init__(self, data, protectedIndex, protectedValue, numRounds=20,
              weakLearner=DecisionTreeClassifier(), computeError=boosting.weightedLabelError):
    self.splitData(data)
    _, self.hypotheses, self.alphas = boosting.detailedBoost(
       self.trainingData, numRounds, weakLearner, computeError)
    super().__init__(defaultThreshold=0, marginRange=(-1, 1), protectedIndex=protectedIndex,
                     protectedValue=protectedValue)
Example 19
def build_audit(classifier, name, with_proba = True, **pmml_options):
	continuous_mapper = DataFrameMapper([
		(["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
	])
	categorical_mapper = DataFrameMapper([
		(["Employment"], [CategoricalDomain(), LabelBinarizer(), SelectFromModel(DecisionTreeClassifier(random_state = 13))]),
		(["Education"], [CategoricalDomain(), LabelBinarizer(), SelectFromModel(RandomForestClassifier(random_state = 13, n_estimators = 3), threshold = "1.25 * mean")]),
		(["Marital"], [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		(["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k = 3)]),
		(["Gender"], [CategoricalDomain(), LabelBinarizer(neg_label = -3, pos_label = 3)]),
		(["Deductions"], [CategoricalDomain()]),
	])
	pipeline = Pipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	pipeline.configure(**pmml_options)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Example 20
def compare_sklearn_dt(chess_data, chess_target, credit_data, credit_target,
                       iris_data, iris_target, lens_data, lens_target,
                       vote_data, vote_targets):
    sk_dt = DecisionTreeClassifier(max_depth=5, min_samples_split=10)

    iris_scores = cross_val_score(sk_dt, iris_data, iris_target, cv=10)
    print('(SK-IRIS) Accuracy: {0:.2f}% (+/- {1:.2f})'.format(
        iris_scores.mean() * 100,
        iris_scores.std() * 2))
    lens_scores = cross_val_score(sk_dt, lens_data, lens_target, cv=10)
    print('(SK-LENS) Accuracy: {0:.2f}% (+/- {1:.2f})'.format(
        lens_scores.mean() * 100,
        lens_scores.std() * 2))
    votes_scores = cross_val_score(sk_dt, vote_data, vote_targets, cv=10)
    print('(SK-VOTES) Accuracy: {0:.2f}% (+/- {1:.2f})'.format(
        votes_scores.mean() * 100,
        votes_scores.std() * 2))
    credit_scores = cross_val_score(sk_dt, credit_data, credit_target, cv=10)
    print('(SK-CREDIT) Accuracy: {0:.2f}% (+/- {1:.2f})'.format(
        credit_scores.mean() * 100,
        credit_scores.std() * 2))
    chess_scores = cross_val_score(sk_dt, chess_data, chess_target, cv=10)
    print('(SK-CHESS) Accuracy: {0:.2f}% (+/- {1:.2f})'.format(
        chess_scores.mean() * 100,
        chess_scores.std() * 2))
Example 21
def sklearn_titanic():
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.preprocessing import LabelEncoder
    total_df = pd.read_csv("titanic_clean.csv")
    total_df.drop(['cabin', 'boat', 'body', 'index'], axis=1, inplace=True)
    total_df.dropna(inplace=True)
    for col in total_df.columns.tolist():
        if str(total_df[col].dtype) == 'object':
            total_df[col] = LabelEncoder().fit_transform(total_df[col])

    total_num = total_df.shape[0]
    train_df = total_df.iloc[:int(total_num * 0.8)]
    test_df = total_df.iloc[int(total_num * 0.8):]

    clf = DecisionTreeClassifier()
    clf.fit(train_df.drop(['survived'], axis=1), train_df['survived'])
    print(clf.score(test_df.drop(['survived'], axis=1), test_df['survived']))
Example 22
def use():
    # test use
    from sklearn.tree import DecisionTreeClassifier
    import sklearn.datasets
    path = 'model.pkl'
    iris = sklearn.datasets.load_iris()
    model = DecisionTreeClassifier()

    train(model, iris.data, iris.target)
    save(model, path)
    model = load(path)
    print(model.predict(iris.data))
Example 23
 def forest_fit(self, X, y):
     for i in range(self.n_estimators):
         self.trees["tree{}".format(i)] = DecisionTreeClassifier(
             max_features='auto')
         self.trees["tree{}".format(i)].fit(X, y)
         if i % 5 == 0:
             self.trees["SVM{}".format(i)] = SVC()
             self.trees["SVM{}".format(i)].fit(X, y)
Example 24
def train_individual_classifier():
    #classifier = SklearnClassifier(SVC(), sparse=False)
    classifier = SklearnClassifier(DecisionTreeClassifier(random_state=0),
                                   sparse=False)
    # classifier = SklearnClassifier(GaussianNB(), sparse=False)
    # classifier = SklearnClassifier(RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), sparse=False)
    print_cross_validation_experiment_result(classifier, train_features)
    classifier.train(train_features)
    save_classifier(classifier, 'my_classifier.pickle')
Example 25
def adjust_adaboost_param(tuning_param):
    if tuning_param['base_estimator_name'] == 'DecisionTreeClassifier':
        tuning_param['base_estimator'] = []

        for max_feature in tuning_param['base_estimator_max_features']:
            tuning_param['base_estimator'].append(
                DecisionTreeClassifier(max_features=max_feature))

        tuning_param.pop('base_estimator_name')
        tuning_param.pop('base_estimator_max_features')

    # return the grid unchanged when no base-estimator expansion applies
    return tuning_param
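# Illustration (hypothetical grid): adjust_adaboost_param rewrites
#   {'base_estimator_name': 'DecisionTreeClassifier',
#    'base_estimator_max_features': [0.5, 'sqrt'],
#    'n_estimators': [50, 100]}
# into
#   {'base_estimator': [DecisionTreeClassifier(max_features=0.5),
#                       DecisionTreeClassifier(max_features='sqrt')],
#    'n_estimators': [50, 100]}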
Example 26
def boost(trainingData,
          numRounds=20,
          weakLearner=DecisionTreeClassifier(),
          computeError=weightedLabelError):
    generator = adaboostGenerator(trainingData, weakLearner, numRounds,
                                  computeError)

    # drain the generator; keep only the final combined hypothesis
    for h, _, _ in generator:
        pass

    return h
Example 28
def decision_tree_training_sets():
    training_set_sizes = [0.1, 0.25, 0.5, 0.75, 0.9]

    columns = ['Training Set Size', 'Training Score', 'Test Score', 'Train Time', 'Test Time']
    df = pd.DataFrame(columns=columns)

    for training_set_size in training_set_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            encoded_data[list(set(encoded_data.columns) - set(['Target']))],
            encoded_data['Target'], train_size=training_set_size)
        scaler = preprocessing.StandardScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train.astype('float32')), columns=X_train.columns)
        X_test = scaler.transform(X_test.astype('float32'))

        start_train = time.time()
        dt = DecisionTreeClassifier(max_depth=8)
        print(dt)
        dt.fit(X_train, y_train)
        end_train = time.time() - start_train

        train_score = dt.score(X_train, y_train)
        start_test = time.time()
        test_score = dt.score(X_test, y_test)
        end_test = time.time() - start_test

        values = [training_set_size, train_score, test_score, end_train, end_test]
        df.loc[len(df)] = values

        print(' '.join(str(col) for col in columns))
        print(' '.join(str(val) for val in values))

    df.to_excel('diabetes_dt_training_sets.xls')
Example 29
class DecisionTreeClassifierImpl():

    def __init__(self, criterion='gini', splitter='best', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features=None,
                 random_state=None, max_leaf_nodes=None,
                 min_impurity_decrease=0.0, min_impurity_split=None,
                 class_weight='balanced', presort=False):
        self._hyperparams = {
            'criterion': criterion,
            'splitter': splitter,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'random_state': random_state,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'class_weight': class_weight,
            'presort': presort}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
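# Usage sketch (assumes SKLModel is sklearn's DecisionTreeClassifier and that
# X_train / y_train are already defined):
# clf = DecisionTreeClassifierImpl(max_depth=3).fit(X_train, y_train)
# probabilities = clf.predict_proba(X_train)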
Example 30
def decision_tree_depths():
    max_depths = [2, 4, 6, 8, 10, 12, 16, 18, 20, 25, 30, 40]

    columns = [
        'Max Depths', 'Training Score', 'Test Score', 'Train Time', 'Test Time'
    ]
    df = pd.DataFrame(columns=columns)

    for depth in max_depths:
        start_train = time.time()
        dt = DecisionTreeClassifier(max_depth=depth)
        print(dt)
        dt.fit(X_train, y_train)
        end_train = time.time() - start_train

        train_score = dt.score(X_train, y_train)
        start_test = time.time()
        test_score = dt.score(X_test, y_test)
        end_test = time.time() - start_test

        values = [depth, train_score, test_score, end_train, end_test]
        df.loc[len(df)] = values

        print(' '.join(str(col) for col in columns))
        print(' '.join(str(val) for val in values))

    df.to_excel('adult_dt.xls')
Example 31
def build_audit(classifier, name, with_proba=True, **kwargs):
    continuous_mapper = DataFrameMapper([("Age", ContinuousDomain()),
                                         ("Income", ContinuousDomain()),
                                         ("Hours", ContinuousDomain())])
    categorical_mapper = DataFrameMapper([
        ("Employment", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(DecisionTreeClassifier(random_state=13))
        ]),
        ("Education", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(RandomForestClassifier(random_state=13,
                                                   n_estimators=3),
                            threshold="1.25 * mean")
        ]),
        ("Marital", [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3)
        ]),
        ("Occupation",
         [CategoricalDomain(),
          LabelBinarizer(),
          SelectKBest(k=3)]),
        ("Gender",
         [CategoricalDomain(),
          LabelBinarizer(neg_label=-3, pos_label=3)]),
        ("Deductions", [CategoricalDomain(),
                        LabelEncoder()]),
    ])
    pipeline = Pipeline([
        ("union",
         FeatureUnion([("continuous", continuous_mapper),
                       ("categorical",
                        Pipeline([("mapper", categorical_mapper),
                                  ("polynomial", PolynomialFeatures())]))])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values,
                                  audit_y.name)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if (with_proba == True):
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Example 32
def detailedBoost(trainingData,
                  numRounds=20,
                  weakLearner=DecisionTreeClassifier(),
                  computeError=weightedLabelError,
                  diagnostic=None):
    generator = adaboostGenerator(trainingData, weakLearner, numRounds,
                                  computeError)

    for h, hypotheses, alphas in generator:
        if diagnostic is not None:
            diagnostic({'h': h, 'hypotheses': hypotheses, 'alphas': alphas})

    return h, hypotheses, alphas
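# Usage sketch: pass a diagnostic callback to watch the ensemble grow round
# by round, e.g. printing the number of boosting coefficients so far:
# detailedBoost(trainingData, numRounds=10,
#               diagnostic=lambda d: print(len(d['alphas'])))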
Example 33
def DeepBBM2(features, labels, max_depth, gamma, max_depth_range):
    num_features = features.shape[1]
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    D_weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k_pre = get_k_from_gamma(gamma, sample_size)
    k = k_pre
    #k = min(600, k_pre)
    normalizer = np.exp(1) * sample_size
    print('k ', k)
    clf_list = []
    rademacher_list = []
    for depth in max_depth_range:
        rademacher_list.append(
            calc_rademacher(depth, sample_size, num_features, normalizer))
    for t in range(k):
        best_loss = 10000
        best_error = 1
        best_depth = -1
        best_clf = DecisionTreeClassifier(max_depth=0)
        for depth in max_depth_range:
            # NOTE: this DeepBoost call appears in the original, but its
            # result is never used below
            new_clf_list, new_weights = DeepBoost(features,
                                                  labels,
                                                  1,
                                                  max_depth_range,
                                                  initial_weights=weights)

            new_clf = DecisionTreeClassifier(max_depth=depth)
            new_clf = new_clf.fit(features, labels, sample_weight=weights)
            new_error = eval_clf(new_clf, features, labels, weights)
            new_edge = new_error - 0.5
            new_sign_edge = np.sign(new_edge)
            # indexing assumes max_depth_range == range(1, N + 1)
            new_loss = new_error + PARAM_lambda_2 * rademacher_list[depth - 1]
            print('depth', depth, 'new_error', new_error, 'new_loss', new_loss)
            if (new_loss < best_loss):
                best_clf = new_clf
                best_loss = new_loss
                best_error = new_error
                best_depth = depth

        y_predict = best_clf.predict(features)
        correct_ones = y_predict == labels
        counts += correct_ones
        #         if (best_error >= 0.5):
        #             break;
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - t - 1 + counts
        weights = comb(k - t - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)

        print('i', t, 'error', best_error, 'wnorm',
              np.linalg.norm(weights, ord=1))
        weights = weights / np.linalg.norm(weights, ord=1)

        clf_list.append([best_clf, 1])
    return clf_list, weights
Example 35
class Model(object):
    """
    The machine learning component of the tester.

    This component stores four different models:
    1) A model to decide between different types of events (drags and touches).
    2) A model to decide on the starting position for drags.
    3) A model to decide on the ending position for drags.
    4) A model to decide on the position of the touch.

    The input data are all the different known UI elements on the screen from
    the training data and whether or not they are visible on the screen.

    To acquire this, we first read the stored XML model and record each
    element's resource-id and class. We concatenate these into an array and
    mark each element as 1 if visible and 0 if not.
    """

    def __init__(self):
        self.symbols = {}
        self.action_data = None
        self.action_labels = None
        self.action_classifier = None
        self.drag_data = None
        self.drag_end_labels = None
        self.drag_end_classifier = None
        self.drag_start_labels = None
        self.drag_start_classifier = None
        self.touch_data = None
        self.touch_labels = None
        self.touch_classifier = None
        self.device_info = device.info

    def parse_events(self, queue):
        symbols = {"randomizer": 0}
        events = []

        all_data = []
        all_results = []
        drag_data = []
        drag_start_results = []
        drag_end_results = []
        touch_data = []
        touch_results = []

        while not queue.empty():
            event = queue.get()
            events.append(event)

            lst = event.state.start.as_list(symbols)
            lst[0] = random()
            all_data.append(lst)

            if event.action.is_drag():
                drag_data.append(lst)
                all_results.append(DRAG)

                start = event.changes.start()
                end = event.changes.end()
                drag_start_results.append(start.x * start.y)
                drag_end_results.append(end.x * end.y)

            if event.action.is_touch():
                touch_data.append(lst)
                all_results.append(TOUCH)

                start = event.changes.start()
                touch_results.append(start.x * start.y)

            if event.action.is_back():
                all_results.append(BACK)

        data = np.zeros((len(all_data), len(symbols)))
        for i, item in enumerate(all_data):
            data[i, : len(item)] = item[:]

        drags = np.zeros((len(drag_data), len(symbols)))
        for i, item in enumerate(drag_data):
            drags[i, : len(item)] = item[:]

        touches = np.zeros((len(touch_data), len(symbols)))
        for i, item in enumerate(touch_data):
            touches[i, : len(item)] = item[:]

        self.symbols = symbols

        self.action_data = data
        self.action_labels = np.array(all_results)

        self.drag_data = drags
        self.drag_start_labels = np.array(drag_start_results)
        self.drag_end_labels = np.array(drag_end_results)

        self.touch_data = touches
        self.touch_labels = np.array(touch_results)

    def train(self):
        self.action_classifier = DecisionTreeClassifier()
        self.action_classifier.fit(self.action_data, self.action_labels)

        self.drag_start_classifier = DecisionTreeRegressor()
        self.drag_start_classifier.fit(self.drag_data, self.drag_start_labels)

        self.drag_end_classifier = DecisionTreeRegressor()
        self.drag_end_classifier.fit(self.drag_data, self.drag_end_labels)

        self.touch_classifier = DecisionTreeRegressor()
        self.touch_classifier.fit(self.touch_data, self.touch_labels)

    def predict(self, state):
        # sklearn estimators expect a 2D array of shape (1, n_features) for a
        # single sample
        features = np.array(state.as_list(self.symbols, False),
                            dtype=float).reshape(1, -1)
        features[0, 0] = random()
        action = Action()

        action_type = self.action_classifier.predict(features)[0]
        width = self.device_info["displayWidth"]
        if action_type == DRAG:
            start = self.drag_start_classifier.predict(features)[0]
            end = self.drag_end_classifier.predict(features)[0]
            start = Point(start % width, start / width)
            end = Point(end % width, end / width)

            action.init(ACTION_DRAG, start, end, 0.5)
        elif action_type == TOUCH:
            point = self.touch_classifier.predict(features)[0]
            point = Point(point % width, point / width)

            action.init(ACTION_TOUCH, point.x, point.y)
        elif action_type == BACK:
            action.init(ACTION_BACK)

        return action

    def save(self):
        pass
Example 36

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train   = labels_train[:150]



### your code goes here

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# use get_feature_names() instead on scikit-learn < 1.0
vocab_list = vectorizer.get_feature_names_out()

dtc = DecisionTreeClassifier()
dtc.fit(features_train, labels_train)
pred = dtc.predict(features_test)
accuracy = accuracy_score(labels_test, pred)
print(accuracy)

# flag suspiciously powerful features
feature_importances = dtc.feature_importances_
for i, importance in enumerate(feature_importances):
    if importance > 0.2:
        print("Importance =", importance, " number is", i,
              " word is", vocab_list[i])


Example 37
store_pkl(audit_mapper, "Audit.pkl")

audit_X = audit[:, 0:48]
audit_y = audit[:, 48]

audit_y = audit_y.astype(int)

print(audit_X.dtype, audit_y.dtype)

def predict_audit(classifier):
    adjusted = DataFrame(classifier.predict(audit_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(classifier.predict_proba(audit_X), columns = ["probability_0", "probability_1"])
    return pandas.concat((adjusted, adjusted_proba), axis = 1)

audit_tree = DecisionTreeClassifier(random_state = 13, min_samples_leaf = 5)
audit_tree.fit(audit_X, audit_y)

store_pkl(audit_tree, "DecisionTreeAudit.pkl")
store_csv(predict_audit(audit_tree), "DecisionTreeAudit.csv")

audit_forest = RandomForestClassifier(random_state = 13, min_samples_leaf = 5)
audit_forest.fit(audit_X, audit_y)

store_pkl(audit_forest, "RandomForestAudit.pkl")
store_csv(predict_audit(audit_forest), "RandomForestAudit.csv")

audit_regression = LogisticRegression()
audit_regression.fit(audit_X, audit_y)

store_pkl(audit_regression, "RegressionAudit.pkl")
def decision_tree_fit(X, y):
    clf = DecisionTreeClassifier(min_samples_leaf=5, random_state=42)
    return clf.fit(X, y)