# Example #1
def get_best_quest_classifier(train, label_col, skf_tune):
    """Tune QUEST's ``alpha`` by cross-validated error rate.

    :param train: training DataFrame that still contains the label column
    :param label_col: name of the label column inside ``train``
    :param skf_tune: iterable of (train_idx, val_idx) fold index pairs
    :return: a QuestConstructor configured with the alpha that minimised
        the mean validation error across the tuning folds
    """
    quest = QuestConstructor()
    # Candidate significance levels (alternative: np.arange(0.001, 1, 0.01))
    candidate_alphas = [10e-5, 10e-4, 10e-3, 10e-2, 0.25, 0.5, 0.9, 0.99]

    # One error list per candidate, filled across the tuning folds
    fold_errors = {alpha: [] for alpha in candidate_alphas}

    for tune_train_idx, tune_val_idx in skf_tune:
        fold_train = train.iloc[tune_train_idx, :]
        fold_val = train.iloc[tune_val_idx, :]
        X_tr = fold_train.drop(label_col, axis=1)
        y_tr = fold_train[label_col]
        X_val = fold_val.drop(label_col, axis=1)
        y_val = fold_val[label_col]
        for alpha in candidate_alphas:
            quest.alpha = alpha
            tree = quest.construct_tree(X_tr, y_tr)
            preds = tree.evaluate_multiple(X_val).astype(int)
            # Error rate = 1 - accuracy on the held-out tuning fold
            fold_errors[alpha].append(
                1 - accuracy_score(preds, y_val, normalize=True))

    # Average each candidate's error over the folds, keep the best one
    mean_errors = {alpha: np.mean(errs)
                   for alpha, errs in fold_errors.items()}
    quest.alpha = min(mean_errors.items(), key=operator.itemgetter(1))[0]

    return quest
# Record each feature's observed range for later use
feature_mins.update({col: np.min(df[col]) for col in feature_column_names})
feature_maxs.update({col: np.max(df[col]) for col in feature_column_names})

df = df.reset_index(drop=True)

# Split into a label frame (conventional 'cat' column) and a feature frame,
# with features rescaled into [0, 1] by their column maxima.
labels_df = DataFrame()
labels_df['cat'] = df['disease'].copy()
features_df = df.copy().drop('disease', axis=1)
features_df = features_df / features_df.max()

# No held-out split at this point: train on everything
train_labels_df = labels_df
train_features_df = features_df

# Base tree learners to compare
c45 = C45Constructor(cf=0.01)
cart = CARTConstructor(min_samples_leaf=10, max_depth=6)
quest = QuestConstructor(default=1, max_nr_nodes=1, discrete_thresh=25,
                         alpha=0.05)
tree_constructors = [c45, cart, quest]

# One confusion-matrix accumulator per reported method
titles = ["C4.5", "Boosted C4.5", "Genetic"]
tree_confusion_matrices = {title: [] for title in titles}

# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# the modern equivalent is sklearn.model_selection.StratifiedKFold(...).split().
skf = sklearn.cross_validation.StratifiedKFold(labels_df['cat'],
                                               n_folds=N_FOLDS,
                                               shuffle=True,
                                               random_state=SEED)

# Per-fold evaluation loop.
# NOTE(review): the body below only resets `trees` — the loop appears
# truncated at this snippet boundary; the per-fold training/evaluation
# presumably follows in the original file.
for train_index, test_index in skf:
    trees = []
# Every column except the label ('Name') is a feature.
# (set difference, so the resulting order is arbitrary — as in the original)
feature_column_names = list(set(df.columns) - {'Name'})

# Track each feature's observed range
for col in feature_column_names:
    feature_mins[col] = np.min(df[col])
    feature_maxs[col] = np.max(df[col])

df = df.reset_index(drop=True)

# Labels go into their own frame under the conventional 'cat' column
labels_df = DataFrame()
labels_df['cat'] = df['Name'].copy()
features_df = df.copy().drop('Name', axis=1)

train_labels_df = labels_df
train_features_df = features_df

# Base tree learners for this dataset
c45 = C45Constructor(cf=1.0)
cart = CARTConstructor(max_depth=5, min_samples_leaf=2)
quest = QuestConstructor(default=1, max_nr_nodes=2, discrete_thresh=1,
                         alpha=0.0000001)
tree_constructors = [c45, cart, quest]

# One confusion-matrix list per constructor, plus one for the genetic merger
tree_confusion_matrices = {tc.get_name(): [] for tc in tree_constructors}
tree_confusion_matrices["Genetic"] = []

skf = sklearn.cross_validation.StratifiedKFold(
    labels_df['cat'], n_folds=N_FOLDS, shuffle=True, random_state=SEED)

# Per-fold split: copy the selected rows and re-index so positional access
# starts from 0 within every fold.
for train_index, test_index in skf:
    train_features_df, test_features_df = features_df.iloc[train_index,:].copy(), features_df.iloc[test_index,:].copy()
    train_labels_df, test_labels_df = labels_df.iloc[train_index,:].copy(), labels_df.iloc[test_index,:].copy()
    train_features_df = train_features_df.reset_index(drop=True)
    test_features_df = test_features_df.reset_index(drop=True)
    train_labels_df = train_labels_df.reset_index(drop=True)
    # NOTE(review): test_labels_df is never reset and the loop body ends
    # here — this snippet looks truncated (a commented-out reset for
    # test_labels_df appears further down in this paste).
# Keep only the top-ranked features, in the order given by best_features
new_features = DataFrame()
for k in range(num_features):
    col = feature_column_names[best_features[k]]
    new_features[col] = features_df[col]
features_df = new_features

# Recompute the feature list, dropping the label ('Survived') if present
feature_column_names = list(set(features_df.columns) - {'Survived'})

# Rebuild the per-feature range tables from the original frame
feature_mins = {}
feature_maxs = {}
for feature in feature_column_names:
    feature_mins[feature] = np.min(df[feature])
    feature_maxs[feature] = np.max(df[feature])

# Individual tree learners that seed the genetic merger
c45 = C45Constructor(cf=0.15)
cart = CARTConstructor(max_depth=3, min_samples_split=3)
quest = QuestConstructor(default=1, max_nr_nodes=3, discrete_thresh=10,
                         alpha=0.1)
tree_constructors = [c45, cart, quest]

# The genetic merger expects one frame carrying the label in column 'cat'
merger = DecisionTreeMerger()
train_df = features_df.copy()
train_df['cat'] = labels_df['cat'].copy()
best_tree = merger.genetic_algorithm(
    train_df, 'cat', tree_constructors, seed=SEED, num_iterations=10,
    num_mutations=5, population_size=10, max_samples=1, val_fraction=0.25,
    num_boosts=7)
# Faster debug configuration:
# best_tree = merger.genetic_algorithm(train_df, 'cat', tree_constructors,
#                                      seed=SEED, num_iterations=1,
#                                      num_mutations=1, population_size=1,
#                                      max_samples=1, val_fraction=0.05)

# Reference trees built on the full data; fill in leaf sample counts for
# the C4.5 and QUEST trees
c45_tree = c45.construct_tree(features_df, labels_df)
quest_tree = quest.construct_tree(features_df, labels_df)
c45_tree.populate_samples(features_df, labels_df['cat'])
quest_tree.populate_samples(features_df, labels_df['cat'])
cart_tree = cart.construct_tree(features_df, labels_df)
# Example #5
#     test_labels_df = test_labels_df.reset_index(drop=True)

# train_features_df = features_df.head(int(0.8*len(features_df.index)))
# test_features_df = features_df.tail(int(0.2*len(features_df.index)))
# train_labels_df = labels_df.head(int(0.8*len(labels_df.index)))
# test_labels_df = labels_df.tail(int(0.2*len(labels_df.index)))

# Combine features and labels into a single training frame for the merger
train_df = train_features_df.copy()
train_df['cat'] = train_labels_df['cat'].copy()

# Tree learners whose outputs seed the genetic algorithm
c45 = C45Constructor(cf=0.15)
cart = CARTConstructor(min_samples_leaf=10)
# c45_2 = C45Constructor(cf=0.15)
# c45_3 = C45Constructor(cf=0.75)
quest = QuestConstructor(default=1, max_nr_nodes=5, discrete_thresh=10,
                         alpha=0.1)
tree_constructors = [c45, cart, quest]

# Build each base tree and write its visualisation under ../data/<name>
trees = []
for constructor in tree_constructors:
    base_tree = constructor.construct_tree(train_features_df,
                                           train_labels_df)
    base_tree.visualise(
        os.path.join(os.path.join('..', 'data'), constructor.get_name()))
    trees.append(base_tree)

merger = DecisionTreeMerger()
best_tree, constructed_trees = merger.genetic_algorithm(
    train_df, 'cat', tree_constructors, seed=SEED, num_iterations=2)
# Example #6
            measurements_temp[dataset][algorithm] = measurements[dataset][algorithm]
    measurements_list.append(measurements_temp)

    target.write(write_preamble())
    for measurements_ in measurements_list:
        target.write(write_measurements(measurements_))
    target.write(write_figures(figure))
    target.write(write_footing())

    target.close()

datasets = load_all_datasets()

# Benchmark classifiers/constructors compared across all datasets
quest_bench = QUESTBenchConstructor()
guide = GUIDEConstructor()
quest = QuestConstructor()
inTrees = inTreesClassifier()
merger = DecisionTreeMergerClean()
NR_FOLDS = 3
# NOTE(review): the `print` statement below means this snippet is Python 2.
for dataset in datasets:
    print dataset['name'], len(dataset['dataframe'])
    # One result accumulator per algorithm, reset for every dataset
    conf_matrices = {'QUESTGilles': [], 'GUIDE': [], 'C4.5': [], 'CART': [], 'ISM': [], 'ISM_pruned': [],
                     'Genetic': [], 'CN2': [], 'QUESTLoh': [], 'inTrees': [], 'XGBoost': [], 'RF': []}  #
    avg_nodes = {'QUESTGilles': [], 'GUIDE': [], 'C4.5': [], 'CART': [], 'ISM': [], 'ISM_pruned': [],
                 'Genetic': [], 'CN2': [], 'QUESTLoh': [], 'inTrees': [], 'XGBoost': [], 'RF': []}  #
    times = {'QUESTGilles': [], 'GUIDE': [], 'C4.5': [], 'CART': [], 'ISM': [], 'ISM_pruned': [],
                 'Genetic': [], 'CN2': [], 'QUESTLoh': [], 'inTrees': [], 'XGBoost': [], 'RF': []}  #
    df = dataset['dataframe']
    label_col = dataset['label_col']
    feature_cols = dataset['feature_cols']
    # random_state=None -> fold assignment differs between runs
    skf = StratifiedKFold(df[label_col], n_folds=NR_FOLDS, shuffle=True, random_state=None)
    # NOTE(review): the loop body appears truncated here — the per-fold
    # benchmarking presumably continues in the original file.