Example #1
def train_ensemble_decision_tree_classifier():
    #min_samples_split, min_samples_leaf, max_leaf_nodes, splitter
    classifier1 = SklearnClassifier(DecisionTreeClassifier(random_state=0),
                                    sparse=False)
    classifier2 = SklearnClassifier(DecisionTreeClassifier(max_depth=20,
                                                           min_samples_split=3,
                                                           min_samples_leaf=4,
                                                           max_leaf_nodes=35,
                                                           splitter='best',
                                                           random_state=0),
                                    sparse=False)
    classifier3 = SklearnClassifier(DecisionTreeClassifier(max_depth=30,
                                                           min_samples_split=2,
                                                           min_samples_leaf=2,
                                                           max_leaf_nodes=40,
                                                           splitter='best',
                                                           random_state=0),
                                    sparse=False)
    test_classifiers = []
    test_classifiers.append(classifier1)
    test_classifiers.append(classifier2)
    test_classifiers.append(classifier3)

    trained_classifiers = []

    for classifier in test_classifiers:
        classifier = classifier.train(train_features)
        trained_classifiers.append(classifier)

    voted_classifier = VoteClassifier(trained_classifiers)
    save_classifier(voted_classifier, 'voted_classifier_decision_tree.pickle')

    print_and_get_split_dataset_accuracy(test_classifiers, train_features)
    print_voted_classifier_cross_validation_experiment_result(
        test_classifiers, train_features)
Example #2
def init_classifiers(model_config, observations, target):
    classifiers = {}

    for key in model_config.keys():
        print('Initializing classifier', key)
        if key == 'svm':
            best_estimator = search_best_param_for_model(
                key, SVC(), model_config[key], observations, target)
            classifiers[key] = best_estimator

        if key == 'decision_tree':
            best_estimator = search_best_param_for_model(
                key, DecisionTreeClassifier(), model_config[key], observations,
                target)
            classifiers[key] = best_estimator

        if key == 'random_forest':
            best_estimator = search_best_param_for_model(
                key, RandomForestClassifier(), model_config[key], observations,
                target)
            classifiers[key] = best_estimator

        if key == 'adaboost':
            best_estimator = search_best_param_for_model(
                key,
                AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
                adjust_adaboost_param(model_config[key]), observations, target)
            classifiers[key] = best_estimator

    return classifiers
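A minimal sketch of the kind of model_config dict init_classifiers appears to expect; the concrete hyperparameter grids below are illustrative assumptions, and observations/target are whatever feature matrix and labels the caller already has:

# Hypothetical config: keys match the branches above; values are parameter grids
# that search_best_param_for_model presumably forwards to a parameter search.
model_config = {
    'svm': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'decision_tree': {'max_depth': [5, 10, None], 'min_samples_split': [2, 5]},
    'random_forest': {'n_estimators': [10, 50], 'max_depth': [5, 10]},
    'adaboost': {'n_estimators': [50, 100],
                 'base_estimator_name': 'DecisionTreeClassifier',
                 'base_estimator_max_features': [0.5, 1.0]},
}
classifiers = init_classifiers(model_config, observations, target)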
Example #3
def DeepBBM2(features, labels, max_depth, gamma, max_depth_range):
    num_features = features.shape[1]
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    D_weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k_pre = get_k_from_gamma(gamma, sample_size)
    k = k_pre
    #k = min(600, k_pre)
    normalizer = np.exp(1) * sample_size
    print('k ', k)
    clf_list = []
    rademacher_list = []
    for depth in max_depth_range:
        rademacher_list.append(
            calc_rademacher(depth, sample_size, num_features, normalizer))
    for t in range(k):
        best_loss = 10000
        best_error = 1
        best_depth = -1
        best_clf = DecisionTreeClassifier(max_depth=0)
        for depth in max_depth_range:
            new_clf_list, new_weights = DeepBoost(features,
                                                  labels,
                                                  1,
                                                  max_depth_range,
                                                  initial_weights=weights)

            new_clf = DecisionTreeClassifier(max_depth=depth)
            new_clf = new_clf.fit(features, labels, sample_weight=weights)
            new_error = eval_clf(new_clf, features, labels, weights)
            new_edge = new_error - 0.5
            new_sign_edge = np.sign(new_edge)
            new_loss = new_error + PARAM_lambda_2 * rademacher_list[depth - 1]
            #             print ('new_error', new_error, 'new_grad', new_grad)
            print('depth', depth, 'new_error', new_error, 'new_loss', new_loss)
            if (new_loss < best_loss):
                best_clf = new_clf
                best_loss = new_loss
                best_error = new_error
                best_depth = depth

        y_predict = best_clf.predict(features)
        correct_ones = y_predict == labels
        counts += correct_ones
        #         if (best_error >= 0.5):
        #             break;
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - t - 1 + counts
        weights = comb(k - t - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)

        print('i', t, 'error', best_error, 'wnorm',
              np.linalg.norm(weights, ord=1))
        weights = weights / np.linalg.norm(weights, ord=1)

        clf_list.append([best_clf, 1])
    return clf_list, weights
Example #4
def DeepBBM(features, labels, gamma, max_depth_range, PARAM_lambda_2):
    num_features = features.shape[1]
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k_pre = get_k_from_gamma(gamma, sample_size)
    k = k_pre
    #k = min(600, k_pre)
    normalizer = np.exp(1) * sample_size
    # print ('k ', k)
    clf_list = []
    rademacher_list = []
    for depth_index in range(len(max_depth_range)):
        depth = max_depth_range[depth_index]
        rademacher_list.append(
            calc_rademacher(depth, sample_size, num_features, normalizer))
    for t in range(k):
        best_loss = 10000
        best_error = 1
        best_depth = -1
        best_clf = DecisionTreeClassifier(max_depth=0)
        for depth_index in range(len(max_depth_range)):
            depth = max_depth_range[depth_index]
            new_clf = DecisionTreeClassifier(max_depth=depth)
            new_clf = new_clf.fit(features, labels, sample_weight=weights)
            new_error = eval_clf(new_clf, features, labels, weights)
            new_edge = new_error - 0.5
            new_sign_edge = np.sign(new_edge)
            new_loss = new_error + PARAM_lambda_2 * rademacher_list[depth_index]
            if (new_loss < best_loss):
                best_clf = new_clf
                best_loss = new_loss
                best_error = new_error
                best_depth = depth

        y_predict = best_clf.predict(features)
        correct_ones = y_predict == labels
        counts += correct_ones
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - t - 1 + counts
        weights = comb(k - t - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)

        clf_list.append([best_clf, 1, best_depth])
        # print ('i', t, 'error', best_error, 'wnorm', np.linalg.norm(weights, ord=1))

        if (np.max(coeff_1) < 0):
            break

        weights = weights / np.linalg.norm(weights, ord=1)

    return clf_list, weights
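DeepBBM returns a list of [classifier, weight, depth] triples with unit weights; a hedged sketch of the corresponding boost-by-majority style prediction (a plain majority vote over rounds, assuming labels in {-1, +1} and a NumPy feature matrix):

import numpy as np

def bbm_predict(clf_list, features):
    # Unweighted majority vote across the boosting rounds.
    votes = np.zeros(features.shape[0])
    for clf, weight, depth in clf_list:
        votes += weight * clf.predict(features)
    return np.sign(votes)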
Example #5
def sklearn_supervised(data=None,
                       label=None,
                       model_savepath='./models/classify.model',
                       model_name='SVM',
                       **sklearn_param):
    '''
    :param data: training texts
    :param label: labels of the training texts
    :param model_savepath: path where the trained model is saved
    :param model_name: machine learning classification model: SVM, KNN, Logistic
    :return: the trained model
    '''

    if model_name == 'KNN':
        # KNN, n_neighbors=5 by default
        model = KNeighborsClassifier(**sklearn_param)
        model.fit(data, label)
    elif model_name == 'SVM':
        # linear kernel, penalty coefficient C=1.0
        model = SVC(**sklearn_param)
        model.fit(data, label)
    elif model_name == 'Logistic':
        model = LogisticRegression(**sklearn_param)  # linear model, penalty coefficient 1
        model.fit(data, label)
    elif model_name == 'DecisionTree':
        model = DecisionTreeClassifier(**sklearn_param)
        model.fit(data, label)
    elif model_name == 'Naivebayes':
        model = GaussianNB()
        model.fit(data, label)

    if model_savepath is not None:
        joblib.dump(model, model_savepath)  # save the model

    return model
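A minimal usage sketch, assuming the scikit-learn estimators and joblib used above are imported in the same module; the iris data is purely illustrative:

from sklearn.datasets import load_iris

iris = load_iris()
model = sklearn_supervised(data=iris.data,
                           label=iris.target,
                           model_savepath=None,  # skip persisting the model in this sketch
                           model_name='DecisionTree',
                           max_depth=3)
print(model.predict(iris.data[:5]))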
Example #6
    def fit(self, X, y):
        """
        Fit the NearestCentroid model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        """

        self.y = y
        if self.fit_base:
            self.base_classifier.fit(X, y)
        distances = self.base_classifier.predict_proba(X)

        topNIndices, topNDistances = self._get_top_labels(distances)
        training_data = self._extract_features(topNIndices, topNDistances, y,
                                               distances)

        # create a decision tree for each label
        self.meta_classifiers = {}
        for label, training_samples_of_label in training_data.items():
            training_samples_of_label = np.matrix(training_samples_of_label)
            decision_tree = DecisionTreeClassifier(criterion="gini")
            decision_tree.fit(training_samples_of_label[:, 0:-1],
                              training_samples_of_label[:, -1:])
            self.meta_classifiers[label] = decision_tree
Example #7
def compare_sklearn_dt(chess_data, chess_target, credit_data, credit_target,
                       iris_data, iris_target, lens_data, lens_target,
                       vote_data, vote_targets):
    sk_dt = DecisionTreeClassifier(max_depth=5, min_samples_split=10)

    iris_scores = cross_val_score(sk_dt, iris_data, iris_target, cv=10)
    print('(SK-IRIS) Accuracy: {0:.2f}% (+/- {1:.2f})'.format(
        iris_scores.mean() * 100,
        iris_scores.std() * 2))
    lens_scores = cross_val_score(sk_dt, lens_data, lens_target, cv=10)
    print('(SK-LENS) Accuracy: {0:.2f}% (+/- {1:.2f})'.format(
        lens_scores.mean() * 100,
        lens_scores.std() * 2))
    votes_scores = cross_val_score(sk_dt, vote_data, vote_targets, cv=10)
    print('(SK-VOTES) Accuracy: {0:.2f}% (+/- {1:.2f})'.format(
        votes_scores.mean() * 100,
        votes_scores.std() * 2))
    credit_scores = cross_val_score(sk_dt, credit_data, credit_target, cv=10)
    print('(SK-CREDIT) Accuracy: {0:.2f}% (+/- {1:.2f})'.format(
        credit_scores.mean() * 100,
        credit_scores.std() * 2))
    chess_scores = cross_val_score(sk_dt, chess_data, chess_target, cv=10)
    print('(SK-CHESS) Accuracy: {0:.2f}% (+/- {1:.2f})'.format(
        chess_scores.mean() * 100,
        chess_scores.std() * 2))
Example #8
def train_ensemble_classifier():
    # classifier2 = SklearnClassifier(GaussianNB(), sparse=False)
    # classifier1 = SklearnClassifier(SVC(), sparse=False)
    # classifier3 = SklearnClassifier(RandomForestClassifier(), sparse=False)
    # classifier4 = SklearnClassifier(DecisionTreeClassifier(), sparse=False)
    classifier2 = SklearnClassifier(GaussianNB(), sparse=False)
    classifier1 = SklearnClassifier(SVC(degree=18, C=12), sparse=False)
    classifier3 = SklearnClassifier(RandomForestClassifier(max_depth=100,
                                                           n_estimators=10),
                                    sparse=False)
    classifier4 = SklearnClassifier(DecisionTreeClassifier(min_samples_split=2,
                                                           min_samples_leaf=2,
                                                           max_leaf_nodes=30,
                                                           splitter='best',
                                                           random_state=0),
                                    sparse=False)
    test_classifiers = []
    test_classifiers.append(classifier1)
    test_classifiers.append(classifier2)
    test_classifiers.append(classifier3)
    test_classifiers.append(classifier4)

    trained_classifiers = []

    for classifier in test_classifiers:
        classifier = classifier.train(train_features)
        trained_classifiers.append(classifier)

    voted_classifier = VoteClassifier(trained_classifiers)
    save_classifier(voted_classifier, 'voted_classifier.pickle')

    print_and_get_split_dataset_accuracy(test_classifiers, train_features)
    print_voted_classifier_cross_validation_experiment_result(
        test_classifiers, train_features)
Example #9
def MarginBoostClf(features, labels, max_depth, n_steps, margin):
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    clf_list = []
    for t in range(n_steps):
        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf = clf.fit(features, labels, sample_weight=weights)
        y_predict = clf.predict(features)
        incorrect = y_predict != labels
        # Error fraction
        estimator_error = np.mean(
            np.average(incorrect, weights=weights, axis=0))
        if (estimator_error >= 0.5):
            break
        step_size = 0.5 * (np.log((1 - estimator_error) / estimator_error) +
                           np.log(1 - margin) - np.log(1 + margin))
        norm_factor = 2 * pow(estimator_error * (1 - estimator_error), 0.5)

        for i in range(sample_size):
            if (labels[i] == y_predict[i]):
                weights[i] *= np.exp(-step_size) / norm_factor
            else:
                weights[i] *= np.exp(step_size) / norm_factor
        clf_list.append([clf, step_size])
    return clf_list
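MarginBoostClf returns [classifier, step_size] pairs; a hedged sketch of combining them into a weighted-vote prediction (this sign-vote rule is an assumption consistent with the step sizes above and presumes labels in {-1, +1}):

import numpy as np

def margin_boost_predict(clf_list, features):
    # Weighted vote: sum step_size_t * h_t(x) over rounds, then take the sign.
    scores = np.zeros(features.shape[0])
    for clf, step_size in clf_list:
        scores += step_size * clf.predict(features)
    return np.sign(scores)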
Example #10
    def __init__(self, data, protectedIndex, protectedValue, numRounds=20,
                 weakLearner=DecisionTreeClassifier(), computeError=boosting.weightedLabelError):
        self.splitData(data)
        _, self.hypotheses, self.alphas = boosting.detailedBoost(
            self.trainingData, numRounds, weakLearner, computeError)
        super().__init__(defaultThreshold=0, marginRange=(-1, 1), protectedIndex=protectedIndex,
                         protectedValue=protectedValue)
Example #11
def decision_tree_training_sets():
    training_set_sizes = [.1,.25,.5,.75,.9]

    columns = ['Training Set Size', 'Training Score', 'Test Score', 'Train Time', 'Test Time']
    df = pd.DataFrame(columns=columns)

    for training_set_size in training_set_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            encoded_data[list(set(encoded_data.columns) - set(['Target']))],
            encoded_data['Target'], train_size=training_set_size)
        scaler = preprocessing.StandardScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train.astype('float32')), columns=X_train.columns)
        X_test = scaler.transform(X_test.astype('float32'))

        start_train = time.time()
        dt = DecisionTreeClassifier(max_depth=8)
        print(dt)
        dt.fit(X_train, y_train)
        end_train = time.time() - start_train

        train_score = dt.score(X_train, y_train)
        start_test = time.time()
        test_score = dt.score(X_test, y_test)
        end_test = time.time() - start_test

        values = [training_set_size, train_score, test_score, end_train, end_test]
        df.loc[len(df)] = values

        print(' '.join(str(col) for col in columns))
        print(' '.join(str(val) for val in values))

    df.to_excel('diabetes_dt_training_sets.xls')
Example #12
def decision_tree():
    print "Run Decision Tree"
    pipeline = Pipeline([('count', CountVectorizer(ngram_range=(1, 2))),
                         ('tfidf', TfidfTransformer()),
                         ('classify', DecisionTreeClassifier())])

    print "Splitting into training and testing"
    cutoff = np.random.rand(len(data)) < 0.7
    train = data[cutoff]
    test = data[~cutoff]

    conversationsX = train["conversation"].values
    conversationsY = train["category"].values

    testX = test["conversation"].values
    testY = test["category"].values

    predictX = testSet["conversation"].values

    pipeline.fit(conversationsX, conversationsY)
    testYResults = pipeline.predict(testX)

    report = classification_report(testY, testYResults)
    print(report)

    predictions = pipeline.predict(predictX)
    return predictions
Example #13
def build_audit(classifier, name, with_proba=True):
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Employment", [
            LabelBinarizer(),
            SelectFromModel(EstimatorProxy(
                DecisionTreeClassifier(random_state=13)),
                            threshold="1.25 * mean")
        ]),
        ("Education", [
            LabelBinarizer(),
            SelectorProxy(
                SelectFromModel(EstimatorProxy(
                    RandomForestClassifier(random_state=13, n_estimators=3)),
                                threshold="median"))
        ]), ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(),
                        SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()), ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()), ("Hours", ContinuousDomain())
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if (with_proba == True):
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Example #14
def wrapper_for_decision_tree_in_sklearn(X, y, current_state_to_predict):
    clf = DecisionTreeClassifier()
    clf.fit(X, y)
    current_state_to_predict = np.array(current_state_to_predict).reshape(
        1, -1)
    predicted_state = clf.predict(current_state_to_predict)
    return predicted_state
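A small illustrative call; the toy feature matrix, labels, and query state are made up, and numpy plus DecisionTreeClassifier are assumed to be imported as in the snippet's module:

X = [[0, 0], [0, 1], [1, 0], [1, 1]]
y = [0, 1, 1, 0]
print(wrapper_for_decision_tree_in_sklearn(X, y, [1, 1]))  # prints the label predicted for state [1, 1]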
Example #15
def build_audit(classifier, name, with_proba = True, **pmml_options):
	continuous_mapper = DataFrameMapper([
		(["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
	])
	categorical_mapper = DataFrameMapper([
		(["Employment"], [CategoricalDomain(), LabelBinarizer(), SelectFromModel(DecisionTreeClassifier(random_state = 13))]),
		(["Education"], [CategoricalDomain(), LabelBinarizer(), SelectFromModel(RandomForestClassifier(random_state = 13, n_estimators = 3), threshold = "1.25 * mean")]),
		(["Marital"], [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		(["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k = 3)]),
		(["Gender"], [CategoricalDomain(), LabelBinarizer(neg_label = -3, pos_label = 3)]),
		(["Deductions"], [CategoricalDomain()]),
	])
	pipeline = Pipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	pipeline.configure(**pmml_options)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Example #16
def BoostByMaj(features, labels, max_depth, gamma):
    sample_size = features.shape[0]
    weights = np.ones(sample_size) / sample_size
    counts = np.zeros(sample_size)
    k_pre = get_k_from_gamma(gamma, sample_size)
    k = k_pre
    #k = min(600, k_pre)
    print('k ', k)
    clf_list = []
    for i in range(k):
        estimator_error = 0.6
        countdown = 10
        while ((estimator_error >= 0.5) and (countdown >= 0)):
            clf = DecisionTreeClassifier(max_depth=max_depth)
            clf = clf.fit(features, labels, sample_weight=weights)
            y_predict = clf.predict(features)
            correct_ones = y_predict == labels
            incorrect_ones = y_predict != labels
            estimator_error = np.mean(
                np.average(incorrect_ones, weights=weights, axis=0))
            unweighted_estimator_error = np.mean(
                np.average(incorrect_ones, axis=0))
            countdown -= 1
        counts += correct_ones
        coeff_1 = int(np.floor(k / 2)) - counts
        coeff_2 = int(np.ceil(k / 2)) - i - 1 + counts
        weights = comb(k - i - 1, coeff_1) * pow(0.5 + gamma, coeff_1) * pow(
            0.5 - gamma, coeff_2)

        print('i', i, 'error', estimator_error, 'unweighted_error',
              unweighted_estimator_error, 'wnorm',
              np.linalg.norm(weights, ord=1))
        weights = weights / np.linalg.norm(weights, ord=1)
        clf_list.append([clf, 1])
    return clf_list, weights
Example #17
def decision_tree_depths():
    max_depths = [2, 4, 6, 8, 10, 12, 16, 18, 20, 25, 30, 40]

    columns = [
        'Max Depths', 'Training Score', 'Test Score', 'Train Time', 'Test Time'
    ]
    df = pd.DataFrame(columns=columns)

    for depth in max_depths:
        start_train = time.time()
        dt = DecisionTreeClassifier(max_depth=depth)
        print(dt)
        dt.fit(X_train, y_train)
        end_train = time.time() - start_train

        train_score = dt.score(X_train, y_train)
        start_test = time.time()
        test_score = dt.score(X_test, y_test)
        end_test = time.time() - start_test

        values = [depth, train_score, test_score, end_train, end_test]
        df.loc[len(df)] = values

        print(' '.join(str(col) for col in columns))
        print(' '.join(str(val) for val in values))

    df.to_excel('adult_dt.xls')
Example #18
    def forest_fit(self, X, y):
        for i in range(self.n_estimators):
            self.trees["tree{}".format(i)] = DecisionTreeClassifier(
                max_features='auto')
            self.trees["tree{}".format(i)].fit(X, y)
            if i % 5 == 0:
                self.trees["SVM{}".format(i)] = SVC()
                self.trees["SVM{}".format(i)].fit(X, y)
Example #19
def dtree(X, y, model_path):
    model = DecisionTreeClassifier()
    model.fit(X, y)
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
Example #20
def train_individual_classifier():
    #classifier = SklearnClassifier(SVC(), sparse=False)
    classifier = SklearnClassifier(DecisionTreeClassifier(random_state=0),
                                   sparse=False)
    # classifier = SklearnClassifier(GaussianNB(), sparse=False)
    # classifier = SklearnClassifier(RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), sparse=False)
    print_cross_validation_experiment_result(classifier, train_features)
    classifier.train(train_features)
    save_classifier(classifier, 'my_classifier.pickle')
Example #21
def wrapper_for_decision_tree_accuracy(X, y, relative_test_size):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=relative_test_size, random_state=42)

    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = accuracy_score(pred, y_test)

    return score
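A minimal usage sketch on a standard dataset; the iris data and the 30% hold-out size are illustrative, and train_test_split, DecisionTreeClassifier, and accuracy_score are assumed to be imported as in the snippet's module:

from sklearn.datasets import load_iris

iris = load_iris()
print(wrapper_for_decision_tree_accuracy(iris.data, iris.target, 0.3))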
Example #22
def boost(trainingData,
          numRounds=20,
          weakLearner=DecisionTreeClassifier(),
          computeError=weightedLabelError):
    generator = adaboostGenerator(trainingData, weakLearner, numRounds,
                                  computeError)

    for h, _, _ in generator:
        pass

    return h
Example #23
def adjust_adaboost_param(tuning_param):
    if tuning_param['base_estimator_name'] == 'DecisionTreeClassifier':
        tuning_param['base_estimator'] = []

        for max_feature in tuning_param['base_estimator_max_features']:
            tuning_param['base_estimator'].append(
                DecisionTreeClassifier(max_features=max_feature))

        tuning_param.pop('base_estimator_name')
        tuning_param.pop('base_estimator_max_features')
        return tuning_param
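A before-and-after sketch of what this helper appears to do to an AdaBoost tuning dict; the concrete values are illustrative assumptions:

tuning_param = {
    'n_estimators': [50, 100],
    'base_estimator_name': 'DecisionTreeClassifier',
    'base_estimator_max_features': [0.5, 1.0],
}
adjusted = adjust_adaboost_param(tuning_param)
# 'base_estimator' now holds [DecisionTreeClassifier(max_features=0.5),
#  DecisionTreeClassifier(max_features=1.0)], and the two helper keys have been
# popped, leaving a grid a parameter search can consume directly.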
Example #24
def build_audit(classifier, name, with_proba=True, **kwargs):
    continuous_mapper = DataFrameMapper([("Age", ContinuousDomain()),
                                         ("Income", ContinuousDomain()),
                                         ("Hours", ContinuousDomain())])
    categorical_mapper = DataFrameMapper([
        ("Employment", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(DecisionTreeClassifier(random_state=13))
        ]),
        ("Education", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(RandomForestClassifier(random_state=13,
                                                   n_estimators=3),
                            threshold="1.25 * mean")
        ]),
        ("Marital", [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3)
        ]),
        ("Occupation",
         [CategoricalDomain(),
          LabelBinarizer(),
          SelectKBest(k=3)]),
        ("Gender",
         [CategoricalDomain(),
          LabelBinarizer(neg_label=-3, pos_label=3)]),
        ("Deductions", [CategoricalDomain(),
                        LabelEncoder()]),
    ])
    pipeline = Pipeline([
        ("union",
         FeatureUnion([("continuous", continuous_mapper),
                       ("categorical",
                        Pipeline([("mapper", categorical_mapper),
                                  ("polynomial", PolynomialFeatures())]))])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values,
                                  audit_y.name)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if (with_proba == True):
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Example #25
def detailedBoost(trainingData,
                  numRounds=20,
                  weakLearner=DecisionTreeClassifier(),
                  computeError=weightedLabelError,
                  diagnostic=None):
    generator = adaboostGenerator(trainingData, weakLearner, numRounds,
                                  computeError)

    for h, hypotheses, alphas in generator:
        if diagnostic is not None:
            diagnostic({'h': h, 'hypotheses': hypotheses, 'alphas': alphas})

    return h, hypotheses, alphas
Example #26
    def create_decision_tree(self):
        ''' Based on experiments, our best model was the decision tree with the following params: '''

        tree = DecisionTreeClassifier(max_depth=65,
                                      min_samples_split=0.03,
                                      min_samples_leaf=3,
                                      max_features=8)
        tree.fit(self.X_train, self.Y_train)
        predicted_y = tree.predict(self.X_test)
        print(predicted_y)
        self.print_stats(predicted_y, "")
        self.test_df['learning_label'] = predicted_y
        self.test_df.to_csv('output/feature_extraction.csv',
                            encoding="latin-1")  # save the training dataset
Example #27
    def test_train_generates_random_experts(self):
        instances, labels = numpy.random.randn(50, 10), numpy.random.choice(
            ["a", "b", "c"], size=50)
        tests = numpy.random.randn(50, 10)
        ft = EnsembleTrainer(base_estimator=DecisionTreeClassifier(),
                             centroid_picker=RandomCentroidPicker(),
                             weigher_sampler=ExponentialWeigher(1, 2))
        experts = ft.train(2, instances, labels)
        self.assertEqual(2, len(experts))
        self.assertFalse(
            numpy.array_equal(experts[0].centroid, experts[1].centroid))
        self.assertFalse(
            numpy.array_equal(experts[0].predict(tests),
                              experts[1].predict(tests)))
Example #28
    def __init__(self, uniform_variables, knn=50, iterations=10,
                 base_estimator=DecisionTreeClassifier(max_depth=6),
                 train_variables=None, learning_rate=10, efficiencies_as_sum=True):
        """This classifier tries to obtain flat efficiency in signal by
        changing the weights of training sample. Doesn't use boosting or whatever

        :type base_estimator: BaseEstimator
        """
        self.base_estimator = base_estimator
        self.uniform_variables = uniform_variables
        self.knn = knn
        self.iterations = iterations
        self.train_variables = train_variables
        self.learning_rate = learning_rate
        self.efficiencies_as_sum = efficiencies_as_sum
Example #29
    def _validate_estimator(self):
        """Check the estimator and set the base_estimator_ attribute."""
        super(AdaCostClassifier, self)._validate_estimator(default=DecisionTreeClassifier(max_depth=1))

        #  SAMME-R requires predict_proba-enabled base estimators
        if self.algorithm == 'SAMME.R':
            if not hasattr(self.base_estimator_, 'predict_proba'):
                raise TypeError(
                    "AdaCostClassifier with algorithm='SAMME.R' requires "
                    "that the weak learner supports the calculation of class "
                    "probabilities with a predict_proba method.\n"
                    "Please change the base estimator or set "
                    "algorithm='SAMME' instead.")
        if not has_fit_parameter(self.base_estimator_, "sample_weight"):
            raise ValueError("%s doesn't support sample_weight."
                             % self.base_estimator_.__class__.__name__)
Example #30
def sklearn_titanic():
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.preprocessing import LabelEncoder
    total_df = pd.read_csv("titanic_clean.csv")
    total_df.drop(['cabin', 'boat', 'body', 'index'], axis=1, inplace=True)
    total_df.dropna(inplace=True)
    for col in total_df.columns.tolist():
        if str(total_df[col].dtype) == 'object':
            total_df[col] = LabelEncoder().fit_transform(total_df[col])

    total_num = total_df.shape[0]
    train_df = total_df.iloc[:int(total_num * 0.8)]
    test_df = total_df.iloc[int(total_num * 0.8):]

    clf = DecisionTreeClassifier()
    clf.fit(train_df.drop(['survived'], axis=1), train_df['survived'])
    print(clf.score(test_df.drop(['survived'], axis=1), test_df['survived']))