def get_best_cart_classifier(train, label_col, skf_tune):
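    """Grid-search max_depth and min_samples_split for a CART tree using the
    given stratified tuning folds and return a CARTConstructor configured
    with the combination that achieved the lowest mean validation error."""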
    cart = CARTConstructor()
    max_depths = np.arange(1, 21, 2)
    max_depths = np.append(max_depths, None)
    min_samples_splits = np.arange(1, 20, 1)

    errors = {}
    for max_depth in max_depths:
        for min_samples_split in min_samples_splits:
            errors[(max_depth, min_samples_split)] = []

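    # Evaluate every (max_depth, min_samples_split) pair on each tuning fold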
    for train_tune_idx, val_tune_idx in skf_tune:
        train_tune = train.iloc[train_tune_idx, :]
        X_train_tune = train_tune.drop(label_col, axis=1)
        y_train_tune = train_tune[label_col]
        val_tune = train.iloc[val_tune_idx, :]
        X_val_tune = val_tune.drop(label_col, axis=1)
        y_val_tune = val_tune[label_col]
        for max_depth in max_depths:
            for min_samples_split in min_samples_splits:
                cart.max_depth = max_depth
                cart.min_samples_split = min_samples_split
                tree = cart.construct_tree(X_train_tune, y_train_tune)
                predictions = tree.evaluate_multiple(X_val_tune).astype(int)
                errors[(max_depth, min_samples_split)].append(
                    1 - accuracy_score(y_val_tune, predictions))

    # Average each parameter combination's validation error over the folds
    for max_depth in max_depths:
        for min_samples_split in min_samples_splits:
            errors[(max_depth, min_samples_split)] = np.mean(
                errors[(max_depth, min_samples_split)])

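    # Keep the parameter pair with the lowest mean validation error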
    best_params = min(errors.items(), key=operator.itemgetter(1))[0]
    cart.max_depth = best_params[0]
    cart.min_samples_split = best_params[1]

    return cart
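
# Hypothetical usage sketch (the variable names below are illustrative, not
# part of the snippet): tune on a training DataFrame whose label column is
# 'cat', then build the final tree with the selected hyper-parameters.
# skf_tune = sklearn.cross_validation.StratifiedKFold(train['cat'], n_folds=3)
# best_cart = get_best_cart_classifier(train, 'cat', skf_tune)
# tree = best_cart.construct_tree(train.drop('cat', axis=1), train['cat'])
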
feature_column_names = list(set(df.columns) - set(['disease']))

feature_mins = {}
feature_maxs = {}
for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])
df = df.reset_index(drop=True)
labels_df = DataFrame()
labels_df['cat'] = df['disease'].copy()
features_df = df.copy()
features_df = features_df.drop('disease', axis=1)
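# Scale each feature column by its maximum value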
features_df = features_df / features_df.max()
train_labels_df = labels_df
train_features_df = features_df

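# Tree constructors with the hyper-parameters used for this dataset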
c45 = C45Constructor(cf=0.01)
cart = CARTConstructor(min_samples_leaf=10, max_depth=6)
quest = QuestConstructor(default=1,
                         max_nr_nodes=1,
                         discrete_thresh=25,
                         alpha=0.05)
tree_constructors = [c45, cart, quest]

tree_confusion_matrices = {}
titles = ["C4.5", "Boosted C4.5", "Genetic"]
for title in titles:
    tree_confusion_matrices[title] = []

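# Stratified k-fold cross-validation split over the class labels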
skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'],
                                               n_folds=N_FOLDS,
                                               shuffle=True,
                                               random_state=SEED)
feature_mins = {}
feature_maxs = {}
feature_column_names = list(set(df.columns) - set(['class']))

for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])
df = df.reset_index(drop=True)
labels_df = DataFrame()
labels_df['cat'] = df['class'].copy()
features_df = df.copy()
features_df = features_df.drop('class', axis=1)
train_labels_df = labels_df
train_features_df = features_df

c45 = C45Constructor(cf=0.95)
cart = CARTConstructor(max_depth=12, min_samples_leaf=2)
quest = QuestConstructor(default=1,
                         max_nr_nodes=1,
                         discrete_thresh=10,
                         alpha=0.99)
# c45 = C45Constructor(cf=0.75)
# cart = CARTConstructor(max_depth=10, min_samples_leaf=2)
# quest = QuestConstructor(default=1, max_nr_nodes=2, discrete_thresh=10, alpha=0.9)
tree_constructors = [c45, cart, quest]

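# One list of confusion matrices per constructor, plus one for the genetic merger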
tree_confusion_matrices = {}
for tree_constructor in tree_constructors:
    tree_confusion_matrices[tree_constructor.get_name()] = []
tree_confusion_matrices["Genetic"] = []

skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'],
                                               n_folds=N_FOLDS,
                                               shuffle=True,
                                               random_state=SEED)
# data_df = data_df.dropna()
#
# clusters = fclusterdata(data_df[["meanIntensity", "meanDuration"]], 0.1, criterion="distance")
# print clusters
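# Boruta feature selection: keep only the columns Boruta retains, then fit a
# CART tree on the reduced feature set and visualise it.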
label_df = DataFrame()
label_df["cat"] = features_df["diagnosis"]

features_df = features_df.drop("diagnosis", axis=1)
features_df = features_df.drop("id", axis=1)

best_features_boruta = boruta_py_feature_selection(features_df.values,
                                                   label_df['cat'].tolist(),
                                                   columns,
                                                   verbose=True,
                                                   percentile=80,
                                                   alpha=0.1)

num_features_boruta = len(best_features_boruta)

new_features_rf = DataFrame()
new_features_boruta = DataFrame()

for k in range(num_features_boruta):
    new_features_boruta[columns[best_features_boruta[k]]] = features_df[
        columns[best_features_boruta[k]]]

features_df_boruta = new_features_boruta

cart = CARTConstructor(min_samples_split=1)
tree = cart.construct_tree(new_features_boruta, labels=label_df)
tree.visualise("./test.pdf")
best_features = RF_feature_selection(features_df.values,
                                     labels_df['cat'].tolist(),
                                     feature_column_names,
                                     verbose=True)
num_features = len(best_features)
new_features = DataFrame()
for k in range(num_features):
    new_features[feature_column_names[best_features[k]]] = features_df[
        feature_column_names[best_features[k]]]
features_df = new_features
feature_column_names = list(set(features_df.columns) - set(['Survived']))

feature_mins = {}
feature_maxs = {}

for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

c45 = C45Constructor(cf=0.15)
cart = CARTConstructor(max_depth=3, min_samples_split=3)
quest = QuestConstructor(default=1, max_nr_nodes=3, discrete_thresh=10, alpha=0.1)
tree_constructors = [c45, cart, quest]

merger = DecisionTreeMerger()
train_df = features_df.copy()
train_df['cat'] = labels_df['cat'].copy()
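# Evolve a single merged decision tree from the constructors' trees with the
# genetic algorithm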
best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors, seed=SEED, num_iterations=10,
                                     num_mutations=5, population_size=10, max_samples=1, val_fraction=0.25,
                                     num_boosts=7)
# best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors, seed=SEED, num_iterations=1,
#                                      num_mutations=1, population_size=1, max_samples=1, val_fraction=0.05)
c45_tree = c45.construct_tree(features_df, labels_df)
quest_tree = quest.construct_tree(features_df, labels_df)
c45_tree.populate_samples(features_df, labels_df['cat'])
quest_tree.populate_samples(features_df, labels_df['cat'])
#     train_labels_df, test_labels_df = labels_df.iloc[train_index,:].copy(), labels_df.iloc[test_index,:].copy()
#     train_features_df = train_features_df.reset_index(drop=True)
#     test_features_df = test_features_df.reset_index(drop=True)
#     train_labels_df = train_labels_df.reset_index(drop=True)
#     test_labels_df = test_labels_df.reset_index(drop=True)

# train_features_df = features_df.head(int(0.8*len(features_df.index)))
# test_features_df = features_df.tail(int(0.2*len(features_df.index)))
# train_labels_df = labels_df.head(int(0.8*len(labels_df.index)))
# test_labels_df = labels_df.tail(int(0.2*len(labels_df.index)))

train_df = train_features_df.copy()
train_df['cat'] = train_labels_df['cat'].copy()

c45 = C45Constructor(cf=0.15)
cart = CARTConstructor(min_samples_leaf=10)
#c45_2 = C45Constructor(cf=0.15)
#c45_3 = C45Constructor(cf=0.75)
quest = QuestConstructor(default=1,
                         max_nr_nodes=5,
                         discrete_thresh=10,
                         alpha=0.1)
tree_constructors = [c45, cart, quest]
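# Construct a tree with each constructor and write its visualisation to ../data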
trees = []
for tree_constructor in tree_constructors:
    tree = tree_constructor.construct_tree(train_features_df, train_labels_df)
    tree.visualise(os.path.join('..', 'data', tree_constructor.get_name()))
    trees.append(tree)

merger = DecisionTreeMerger()