def get_best_c45_classifier(train, label_col, skf_tune):
    """Tune the C4.5 confidence factor (cf) via cross-validation.

    Evaluates cf values 0.05, 0.10, ..., 1.0 on each tuning fold, computes
    the mean misclassification error per cf, and returns a C45Constructor
    whose ``cf`` attribute is set to the best value (no tree is returned).

    Args:
        train: DataFrame holding both the features and the label column.
        label_col: Name of the label column inside ``train``.
        skf_tune: Iterable of (train_idx, val_idx) positional index pairs
            (e.g. a pre-0.18 sklearn StratifiedKFold object).

    Returns:
        A C45Constructor configured with the error-minimising ``cf``.
    """
    c45 = C45Constructor()
    cfs = np.arange(0.05, 1.05, 0.05)
    # One error list per candidate cf; one entry appended per fold.
    cfs_errors = {cf: [] for cf in cfs}

    for train_tune_idx, val_tune_idx in skf_tune:
        train_tune = train.iloc[train_tune_idx, :]
        X_train_tune = train_tune.drop(label_col, axis=1)
        y_train_tune = train_tune[label_col]
        val_tune = train.iloc[val_tune_idx, :]
        X_val_tune = val_tune.drop(label_col, axis=1)
        y_val_tune = val_tune[label_col]
        for cf in cfs:
            c45.cf = cf
            tree = c45.construct_tree(X_train_tune, y_train_tune)
            predictions = tree.evaluate_multiple(X_val_tune).astype(int)
            # accuracy_score expects (y_true, y_pred); the original call had
            # the arguments swapped (harmless for accuracy, which is
            # symmetric, but misleading to readers).
            cfs_errors[cf].append(
                1 - accuracy_score(y_val_tune, predictions, normalize=True))

    # Reduce each per-fold error list to its mean.
    for cf in cfs:
        cfs_errors[cf] = np.mean(cfs_errors[cf])

    # Pick the cf with the smallest mean error; min-over-keys with key=get
    # replaces the operator.itemgetter dance.
    c45.cf = min(cfs_errors, key=cfs_errors.get)
    return c45
# --- Example #2 (score: 0) ---
        feature_column_names[best_features_rf[k]]]
# Copy the Boruta-selected feature columns into the reduced feature frame.
# NOTE(review): new_features_boruta, new_features_rf, feature_column_names,
# best_features_boruta, num_features_boruta and features_df are defined
# earlier in the file, outside this chunk -- verify against the full script.
for k in range(num_features_boruta):
    new_features_boruta[feature_column_names[best_features_boruta[
        k]]] = features_df[feature_column_names[best_features_boruta[k]]]

# One list of per-fold confusion matrices per feature-selection method.
confusion_matrices = {}
confusion_matrices['RF'] = []
confusion_matrices['Boruta'] = []

features_df_rf = new_features_rf
features_df_boruta = new_features_boruta

# Feature names exclude the label column 'cat'.
feature_column_names_rf = list(set(features_df_rf.columns) - set(['cat']))
feature_column_names_boruta = list(
    set(features_df_boruta.columns) - set(['cat']))
c45 = C45Constructor(cf=0.15)

# Stratified CV on the labels. sklearn.cross_validation is the deprecated
# pre-0.18 API (removed in sklearn 0.20); modern code uses
# sklearn.model_selection.StratifiedKFold instead.
skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'],
                                               n_folds=N_FOLDS,
                                               shuffle=True,
                                               random_state=SEED)

for train_index, test_index in skf:
    # Per-fold train/test splits for each feature set and for the labels.
    train_features_df_rf, test_features_df_rf = features_df_rf.iloc[
        train_index, :].copy(), features_df_rf.iloc[test_index, :].copy()
    train_features_df_boruta, test_features_df_boruta = features_df_boruta.iloc[
        train_index, :].copy(), features_df_boruta.iloc[test_index, :].copy()
    train_labels_df, test_labels_df = labels_df.iloc[
        train_index, :].copy(), labels_df.iloc[test_index, :].copy()

    train_features_df_rf = train_features_df_rf.reset_index(drop=True)
    # Loop body truncated here by the chunk boundary.
# Per-feature min/max bookkeeping, used later for normalisation/plotting.
feature_maxs = {}
# NOTE(review): feature_mins is assigned below but its initialisation
# (`feature_mins = {}`) lies outside this chunk -- confirm it precedes this.
feature_column_names = list(set(df.columns) - set(['disease']))

for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])
df = df.reset_index(drop=True)
# Split into a single-column label frame ('cat') and a label-free feature frame.
labels_df = DataFrame()
labels_df['cat'] = df['disease'].copy()
features_df = df.copy()
features_df = features_df.drop('disease', axis=1)
# Column-wise max scaling (assumes non-zero column maxima).
features_df = features_df / features_df.max()
train_labels_df = labels_df
train_features_df = features_df

# The three tree constructors compared in this experiment.
c45 = C45Constructor(cf=0.01)
cart = CARTConstructor(min_samples_leaf=10, max_depth=6)
quest = QuestConstructor(default=1,
                         max_nr_nodes=1,
                         discrete_thresh=25,
                         alpha=0.05)
tree_constructors = [c45, cart, quest]

# One list of per-fold confusion matrices per reported algorithm title.
tree_confusion_matrices = {}
titles = ["C4.5", "Boosted C4.5", "Genetic"]
for title in titles:
    tree_confusion_matrices[title] = []

skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'],
                                               n_folds=N_FOLDS,
                                               shuffle=True,
feature_mins = {}
feature_maxs = {}
# Feature names exclude the label column 'class'.
feature_column_names = list(set(df.columns) - set(['class']))

# Record per-feature min/max for later use.
for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])
df = df.reset_index(drop=True)
# Single-column label frame ('cat') plus label-free feature frame.
labels_df = DataFrame()
labels_df['cat'] = df['class'].copy()
features_df = df.copy()
features_df = features_df.drop('class', axis=1)
train_labels_df = labels_df
train_features_df = features_df

# Tree constructors under comparison (hyper-parameters hand-tuned).
c45 = C45Constructor(cf=0.95)
cart = CARTConstructor(max_depth=12, min_samples_leaf=2)
quest = QuestConstructor(default=1,
                         max_nr_nodes=1,
                         discrete_thresh=10,
                         alpha=0.99)
# Earlier hyper-parameter choices, kept for reference:
# c45 = C45Constructor(cf=0.75)
# cart = CARTConstructor(max_depth=10, min_samples_leaf=2)
# quest = QuestConstructor(default=1, max_nr_nodes=2, discrete_thresh=10, alpha=0.9)
tree_constructors = [c45, cart, quest]

# One confusion-matrix list per constructor, plus the merged "Genetic" tree.
tree_confusion_matrices = {}
for tree_constructor in tree_constructors:
    tree_confusion_matrices[tree_constructor.get_name()] = []
tree_confusion_matrices["Genetic"] = []
feature_mins = {}
feature_maxs = {}
# Feature names exclude the label column 'Name'.
feature_column_names = list(set(df.columns) - set(['Name']))

# Per-feature min/max (original 8-space loop-body indent kept byte-identical).
for feature in feature_column_names:
        feature_mins[feature] = np.min(df[feature])
        feature_maxs[feature] = np.max(df[feature])
df=df.reset_index(drop=True)
# Single-column label frame ('cat') plus label-free feature frame.
labels_df = DataFrame()
labels_df['cat'] = df['Name'].copy()
features_df = df.copy()
features_df = features_df.drop('Name', axis=1)
train_labels_df = labels_df
train_features_df = features_df

# Tree constructors under comparison.
c45 = C45Constructor(cf=1.0)
cart = CARTConstructor(max_depth=5, min_samples_leaf=2)
quest = QuestConstructor(default=1, max_nr_nodes=2, discrete_thresh=1, alpha=0.0000001)
tree_constructors = [c45, cart, quest]

# One confusion-matrix list per constructor, plus the merged "Genetic" tree.
tree_confusion_matrices = {}
for tree_constructor in tree_constructors:
    tree_confusion_matrices[tree_constructor.get_name()] = []
tree_confusion_matrices["Genetic"] = []

# Deprecated pre-0.18 sklearn CV API (labels passed to the constructor).
skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'], n_folds=N_FOLDS, shuffle=True, random_state=SEED)

for train_index, test_index in skf:
    # Per-fold positional splits of features and labels.
    train_features_df, test_features_df = features_df.iloc[train_index,:].copy(), features_df.iloc[test_index,:].copy()
    train_labels_df, test_labels_df = labels_df.iloc[train_index,:].copy(), labels_df.iloc[test_index,:].copy()
    train_features_df = train_features_df.reset_index(drop=True)
    # Loop body truncated here by the chunk boundary.
# --- Example #6 (score: 0) ---
feature_maxs = {}
# NOTE(review): feature_mins is assigned below but initialised outside this
# chunk -- confirm `feature_mins = {}` precedes this fragment.
feature_column_names = list(set(df.columns) - set(['Name']))

# Per-feature min/max (original 8-space loop-body indent kept byte-identical).
for feature in feature_column_names:
        feature_mins[feature] = np.min(df[feature])
        feature_maxs[feature] = np.max(df[feature])
df=df.reset_index(drop=True)
# Single-column label frame ('cat') plus label-free feature frame.
labels_df = DataFrame()
labels_df['cat'] = df['Name'].copy()
features_df = df.copy()
features_df = features_df.drop('Name', axis=1)
# Column-wise max scaling.
features_df = features_df/features_df.max()
train_labels_df = labels_df
train_features_df = features_df

c45 = C45Constructor(cf=0.75)

tree_confusion_matrices = {}
titles = ["Unaugmented C4.5", "Augmented C4.5"]

# Pre-0.18 sklearn API: StratifiedKFold takes the label vector directly.
skf = StratifiedKFold(labels_df['cat'], n_folds=5, shuffle=True, random_state=SEED)

for train_index, test_index in skf:
    train_features_df, test_features_df = features_df.iloc[train_index,:].copy(), features_df.iloc[test_index,:].copy()
    train_labels_df, test_labels_df = labels_df.iloc[train_index,:].copy(), labels_df.iloc[test_index,:].copy()
    # Reset indices so positional and label-based access agree within the fold.
    train_features_df = train_features_df.reset_index(drop=True)
    test_features_df = test_features_df.reset_index(drop=True)
    train_labels_df = train_labels_df.reset_index(drop=True)
    test_labels_df = test_labels_df.reset_index(drop=True)
    # Re-attach labels to the training features.
    train_df = train_features_df.copy()
    train_df['cat'] = train_labels_df['cat'].copy()
    # Loop body truncated here by the chunk boundary.
# --- Example #7 (score: 0) ---
        df[feature] += np.min(df[feature]) * (-1)
        feature_mins[feature] = 0
    else:
        feature_mins[feature] = np.min(df[feature])

    feature_maxs[feature] = np.max(df[feature])

df=df.reset_index(drop=True)
# Single-column label frame ('cat') plus label-free feature frame.
labels_df = DataFrame()
labels_df['cat'] = df['class'].copy()
features_df = df.copy()
features_df = features_df.drop('class', axis=1)
train_labels_df = labels_df
train_features_df = features_df

# Tree constructors under comparison.
c45 = C45Constructor(cf=0.65)
cart = CARTConstructor(min_samples_leaf=5, max_depth=6)
quest = QuestConstructor(default=1, max_nr_nodes=3, discrete_thresh=1, alpha=0.25)
tree_constructors = [c45, cart, quest]
# tree_constructors = [quest]

# One confusion-matrix list per constructor, plus the merged "Genetic" tree.
tree_confusion_matrices = {}
for tree_constructor in tree_constructors:
    tree_confusion_matrices[tree_constructor.get_name()] = []
tree_confusion_matrices["Genetic"] = []

# NOTE(review): random_state has no effect when shuffle=False (later sklearn
# versions reject this combination) -- confirm shuffle=False is intended.
skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'], n_folds=N_FOLDS, shuffle=False, random_state=SEED)

for train_index, test_index in skf:
    # Per-fold positional splits of features and labels.
    train_features_df, test_features_df = features_df.iloc[train_index,:].copy(), features_df.iloc[test_index,:].copy()
    train_labels_df, test_labels_df = labels_df.iloc[train_index,:].copy(), labels_df.iloc[test_index,:].copy()
    # Loop body truncated here by the chunk boundary.
# --- Example #8 (score: 0) ---
import time
# Pre-0.18 sklearn API (removed in sklearn 0.20).
from sklearn.cross_validation import StratifiedShuffleSplit

from constructors.c45orangeconstructor import C45Constructor
from constructors.treemerger import DecisionTreeMerger
from constructors.treemerger_clean import DecisionTreeMergerClean
from data.load_datasets import load_led7

import numpy as np

merger = DecisionTreeMergerClean()
merger2 = DecisionTreeMerger()
# load_led7 unpacks to (dataframe, feature names, label column name, dataset name).
df, features, label, name = load_led7()
c45 = C45Constructor()

# A single stratified 50/50 train/test split, seeded for reproducibility.
skf = StratifiedShuffleSplit(df[label], 1, test_size=0.5, random_state=1337)

# Per-feature min/max over the full dataset.
feature_mins = {}
feature_maxs = {}
for feature in features:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

for fold, (train_idx, test_idx) in enumerate(skf):
    # print 'Fold', fold+1, '/', NR_FOLDS
    train = df.iloc[train_idx, :].reset_index(drop=True)
    X_train = train.drop(label, axis=1)
    y_train = train[label]
    test = df.iloc[test_idx, :].reset_index(drop=True)
    X_test = test.drop(label, axis=1)
    y_test = test[label]
    # Loop body truncated here by the chunk boundary.
# --- Example #9 (score: 0) ---
from constructors.c45orangeconstructor import C45Constructor
from constructors.treemerger import DecisionTreeMerger
from constructors.treemerger_clean import DecisionTreeMergerClean
from data.load_datasets import load_austra
from sklearn import preprocessing

import numpy as np
import pandas as pd
import collections
import operator

merger = DecisionTreeMergerClean()
merger2 = DecisionTreeMerger()
# load_austra unpacks to (dataframe, feature names, label column name, dataset name).
df, features, label, name = load_austra()
c45 = C45Constructor(cf=0.0)

# NOTE(review): StratifiedShuffleSplit is not imported in this fragment --
# the import presumably lives outside this chunk; verify against full file.
# Single stratified 75/25 train/test split, seeded for reproducibility.
skf = StratifiedShuffleSplit(df[label], 1, test_size=0.25, random_state=1337)

# Fixed [0, 1] bounds: the features are min-max scaled further below.
feature_mins = {}
feature_maxs = {}
for feature in features:
    feature_mins[feature] = 0.0
    feature_maxs[feature] = 1.0

for fold, (train_idx, test_idx) in enumerate(skf):
    # print 'Fold', fold+1, '/', NR_FOLDS
    train = df.iloc[train_idx, :].reset_index(drop=True)
    X_train = train.drop(label, axis=1)
    x = X_train.values  #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    # Loop body truncated here by the chunk boundary.
# --- Example #10 (score: 0) ---
# Build the label frame and a two-feature training set so the C4.5 decision
# regions can be visualised in 2-D.
df = df.reset_index(drop=True)
labels_df = DataFrame()
labels_df['cat'] = df['disease'].copy()
features_df = df.copy()
features_df = features_df.drop('disease', axis=1)
train_labels_df = labels_df
# Restrict training to the two plotted features.
train_features_df = features_df[['max heartrate', 'resting blood pressure']]
# Axis bounds for the region plot below.
feature_mins = {}
feature_maxs = {}
feature_column_names = ['max heartrate', 'resting blood pressure']
for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

merger = DecisionTreeMerger()
# cart = CARTConstructor(min_samples_leaf=10, max_depth=2)
cart = C45Constructor(cf=1.0)
tree = cart.construct_tree(train_features_df, train_labels_df)
tree.populate_samples(train_features_df, train_labels_df['cat'])
tree.visualise("2d_tree")

# Convert the tree to axis-aligned decision regions and plot them.
regions = merger.decision_tree_to_decision_table(tree, train_features_df)
# BUG FIX: `print regions` is Python 2 statement syntax and a SyntaxError on
# Python 3; the single-argument call form behaves identically on both.
print(regions)
merger.plot_regions("2d_regions",
                    regions, ['1', '2'],
                    "max heartrate",
                    "resting blood pressure",
                    y_max=feature_maxs["resting blood pressure"],
                    x_max=feature_maxs["max heartrate"],
                    y_min=feature_mins["resting blood pressure"],
                    x_min=feature_mins["max heartrate"])