예제 #1
0
def estimate_training_iterations(n_iterations=10,
                                 learning_rate_range=(0.001, 0.01, 0.1, 1.0)):
    """
    Track neural network error epoch by epoch for several learning rates.

    One NeuralNetwork is built per learning rate; each is trained one epoch
    at a time, and the error estimate is recorded after every epoch.

    :param n_iterations: number of single-epoch training rounds per network
    :param learning_rate_range: iterable of learning rates to evaluate
    :return: DataFrame with columns 'learning_rate', 'iteration',
        'training_error', 'test_error', 'training_accuracy' and
        'test_accuracy'; the per-network frames are concatenated as-is, so
        index values repeat across learning rates
    """
    header = ['learning_rate', 'iteration', 'training_error', 'test_error']
    higgs = load_higgs_train()

    def error_history(network):
        # Train one epoch at a time so the error can be sampled after each.
        rows = []
        for _ in range(n_iterations):
            network.train(train_epoch=1)
            epochs_done, train_err, test_err = network.estimate_error()
            rows.append(
                [network.learning_rate, epochs_done, train_err, test_err])
        return pd.DataFrame.from_records(rows, columns=header)

    frames = []
    for rate in learning_rate_range:
        frames.append(error_history(NeuralNetwork(higgs, rate)))
    result = pd.concat(frames)

    # Assumes the errors are percentages (hence the division by 100) --
    # confirm against NeuralNetwork.estimate_error.
    for kind in ('training', 'test'):
        result[kind + '_accuracy'] = 1 - result[kind + '_error'] / 100
    return result
예제 #2
0
def estimate_best_power():
    """
    Evaluate the KNN classifier for several powers of the distance metric.

    run_knn is called once for each power parameter (1, 2 and 3) and every
    result row is labelled with the matching metric name.

    :return: DataFrame indexed by 'metric' with 'training_score' and
        'test_score' columns (assumes run_knn yields exactly those two
        scores in that order -- confirm against run_knn)
    """
    metric_names = {1: 'manhattan', 2: 'euclidean', 3: 'minkowski'}
    higgs = load_higgs_train()
    rows = []
    for power in metric_names:
        scores = run_knn(data=higgs, power_parameter=power)
        rows.append([metric_names[power]] + list(scores))
    header = ['metric', 'training_score', 'test_score']
    return pd.DataFrame.from_records(rows, columns=header, index='metric')
예제 #3
0
def estimate_dataset_size():
    """
    Evaluate the SVM classifier on growing prefixes of the training data.

    The features, weights and labels returned by load_higgs_train are
    truncated to the first 10%, 20%, ..., 90% of their length and run_svm is
    called on each truncated triple.

    :return: DataFrame indexed by 'sample_size' (the number of feature rows
        used) with 'training_score' and 'test_score' columns (assumes
        run_svm yields exactly those two scores in that order -- confirm
        against run_svm)
    """
    features, weights, labels = load_higgs_train()
    records = []
    for n in range(1, 10):
        # Floor division keeps the slice bounds integral: plain `/` yields a
        # float under Python 3 (or `from __future__ import division`), which
        # makes the slice raise TypeError.
        f = features[:n * len(features) // 10]
        w = weights[:n * len(weights) // 10]
        l = labels[:n * len(labels) // 10]
        records.append([len(f)] + list(run_svm((f, w, l))))
    columns = ['sample_size', 'training_score', 'test_score']
    df = pd.DataFrame.from_records(records, columns=columns, index=columns[0])
    return df
예제 #4
0
def estimate_best_learning_rate():
    """
    Evaluate the Ada Boost classifier over a range of learning rates.

    run_AdaBoost is called once for every value of
    np.arange(0.2, 2.0, 0.2).

    :return: DataFrame indexed by 'learning_rate' with 'training_score' and
        'test_score' columns (assumes run_AdaBoost yields exactly those two
        scores in that order -- confirm against run_AdaBoost)
    """
    candidate_rates = np.arange(0.2, 2.0, 0.2)
    higgs = load_higgs_train()
    rows = []
    for rate in candidate_rates:
        scores = run_AdaBoost(data=higgs, learning_rate=rate)
        rows.append([rate] + list(scores))
    header = ['learning_rate', 'training_score', 'test_score']
    return pd.DataFrame.from_records(
        rows, columns=header, index='learning_rate')
예제 #5
0
def estimate_best_n_neighbours():
    """
    Evaluate the KNN classifier over a range of neighbour counts.

    run_knn is called once for every odd n_neighbours value from 1 to 25.

    :return: DataFrame indexed by 'n_neighbours' with 'training_score' and
        'test_score' columns (assumes run_knn yields exactly those two
        scores in that order -- confirm against run_knn)
    """
    candidate_counts = np.arange(1, 26, 2)
    higgs = load_higgs_train()
    rows = []
    for count in candidate_counts:
        scores = run_knn(data=higgs, n_neighbours=count)
        rows.append([count] + list(scores))
    header = ['n_neighbours', 'training_score', 'test_score']
    return pd.DataFrame.from_records(
        rows, columns=header, index='n_neighbours')
예제 #6
0
def estimate_best_learning_rate():
    """
    Score the Ada Boost classifier for each learning rate in a fixed grid.

    The grid is np.arange(0.2, 2.0, 0.2); every value is passed to
    run_AdaBoost as learning_rate together with the loaded training data.

    :return: DataFrame indexed by 'learning_rate' with 'training_score' and
        'test_score' columns (assumes run_AdaBoost yields exactly those two
        scores in that order -- confirm against run_AdaBoost)
    """
    grid = np.arange(0.2, 2.0, 0.2)
    training_data = load_higgs_train()

    def score_row(rate):
        # One output record: the setting followed by its scores.
        return [rate] + list(
            run_AdaBoost(data=training_data, learning_rate=rate))

    table = [score_row(rate) for rate in grid]
    names = ['learning_rate', 'training_score', 'test_score']
    return pd.DataFrame.from_records(table, columns=names, index=names[0])
예제 #7
0
def estimate_best_n_estimators():
    """
    Evaluate the Ada Boost classifier over a range of estimator counts.

    run_AdaBoost is called once for every n_estimators value in
    np.arange(30, 120, 5), i.e. 30, 35, ..., 115.

    :return: DataFrame indexed by 'n_estimators' with 'training_score' and
        'test_score' columns (assumes run_AdaBoost yields exactly those two
        scores in that order -- confirm against run_AdaBoost)
    """
    candidate_counts = np.arange(30, 120, 5)
    higgs = load_higgs_train()
    rows = []
    for count in candidate_counts:
        scores = run_AdaBoost(data=higgs, n_estimators=count)
        rows.append([count] + list(scores))
    header = ['n_estimators', 'training_score', 'test_score']
    return pd.DataFrame.from_records(
        rows, columns=header, index='n_estimators')
예제 #8
0
def estimate_best_n_estimators():
    """
    Score the Ada Boost classifier for each estimator count in a fixed grid.

    The grid is np.arange(30, 120, 5); every value is passed to run_AdaBoost
    as n_estimators together with the loaded training data.

    :return: DataFrame indexed by 'n_estimators' with 'training_score' and
        'test_score' columns (assumes run_AdaBoost yields exactly those two
        scores in that order -- confirm against run_AdaBoost)
    """
    grid = np.arange(30, 120, 5)
    training_data = load_higgs_train()

    def score_row(count):
        # One output record: the setting followed by its scores.
        return [count] + list(
            run_AdaBoost(data=training_data, n_estimators=count))

    table = [score_row(count) for count in grid]
    names = ['n_estimators', 'training_score', 'test_score']
    return pd.DataFrame.from_records(table, columns=names, index=names[0])
예제 #9
0
def estimate_hidden_units(hidden_units_range=range(1, 100), sample_size=None):
    """
    Train one neural network per hidden-layer size and collect its errors.

    The default range uses range() rather than the Python 2-only xrange(),
    so the module can also be imported under Python 3; both iterate over the
    same values 1..99.

    :param hidden_units_range: iterable of hidden unit counts to evaluate
    :param sample_size: forwarded to load_higgs_train as sample_size
    :return: DataFrame with columns 'hidden_units', 'iteration',
        'training_error', 'test_error', 'training_accuracy' and
        'test_accuracy'
    """
    data = load_higgs_train(sample_size=sample_size)

    def estimate_error(nn):
        nn.train()
        total_epochs, trn, tst = nn.estimate_error()
        return [nn.n_hidden_units, total_epochs, trn, tst]

    # Kept separate from `data` so the loaded dataset is not rebound to the
    # result rows.
    records = [estimate_error(NeuralNetwork(data, n_hidden_units=n_units))
               for n_units in hidden_units_range]

    df = pd.DataFrame.from_records(
        records,
        columns=['hidden_units', 'iteration', 'training_error', 'test_error'])

    # Assumes the errors are percentages (hence the division by 100) --
    # confirm against NeuralNetwork.estimate_error.
    df['training_accuracy'] = 1 - df['training_error'] / 100
    df['test_accuracy'] = 1 - df['test_error'] / 100
    return df
예제 #10
0
def estimate_best_min_samples_split():
    """
    Evaluate the decision tree classifier over a range of min_samples_split
    values, once with the 'gini' and once with the 'entropy' criterion.

    run_decision_tree is called twice for every even value from 2 to 118.

    :return: DataFrame indexed by 'min_sample_split' with
        'gini_training_score', 'gini_test_score', 'entropy_training_score'
        and 'entropy_test_score' columns (assumes run_decision_tree yields
        exactly a training and a test score in that order -- confirm against
        run_decision_tree)
    """
    # range() instead of the Python 2-only xrange(): same values, and the
    # function no longer raises NameError under Python 3.
    min_split_range = range(2, 120, 2)
    data = load_higgs_train()
    records = []
    for min_sample in min_split_range:
        gini_scores = run_decision_tree(
            data=data, criterion='gini', min_samples_split=min_sample)
        entropy_scores = run_decision_tree(
            data=data, criterion='entropy', min_samples_split=min_sample)
        records.append(
            [min_sample] + list(gini_scores) + list(entropy_scores))
    columns = ['min_sample_split', 'gini_training_score', 'gini_test_score',
               'entropy_training_score', 'entropy_test_score']
    df = pd.DataFrame.from_records(records, columns=columns, index=columns[0])
    return df
예제 #11
0
def estimate_best_c():
    """
    Evaluate the svm classifier over several values of C.

    run_svm is called once for each regularization term 1, 10, 100 and 1000;
    the resulting table is logged through LOGGER before being returned.

    :return: DataFrame indexed by 'C' with 'training_score' and 'test_score'
        columns (assumes run_svm yields exactly those two scores in that
        order -- confirm against run_svm)
    """
    candidates = [1, 10, 100, 1000]
    higgs = load_higgs_train()
    rows = []
    for c_value in candidates:
        scores = run_svm(data=higgs, regularization_term=c_value)
        rows.append([c_value] + list(scores))
    LOGGER.info('Performed evaluation of the C setting choice')
    header = ['C', 'training_score', 'test_score']
    result = pd.DataFrame.from_records(rows, columns=header, index='C')
    LOGGER.info(result)
    return result
예제 #12
0
def estimate_best_gamma():
    """
    Evaluate the svm classifier over several values of gamma.

    run_svm is called once for every value of np.arange(0.0, 1.0, 0.2),
    which starts at gamma = 0.0; the resulting table is logged through
    LOGGER before being returned.

    :return: DataFrame indexed by 'gamma' with 'training_score' and
        'test_score' columns (assumes run_svm yields exactly those two
        scores in that order -- confirm against run_svm)
    """
    candidates = np.arange(0.0, 1.0, 0.2)
    higgs = load_higgs_train()
    rows = []
    for gamma_value in candidates:
        scores = run_svm(data=higgs, gamma=gamma_value)
        rows.append([gamma_value] + list(scores))
    LOGGER.info('Performed evaluation of the gamma setting choice')
    header = ['gamma', 'training_score', 'test_score']
    result = pd.DataFrame.from_records(rows, columns=header, index='gamma')
    LOGGER.info(result)
    return result
예제 #13
0
def estimate_training_iterations(n_iterations=10, learning_rate_range=tuple([0.001, 0.01, 0.1, 1.0])):
    """
    Record neural network error after each training epoch, per learning rate.

    A separate NeuralNetwork is created for every learning rate, trained in
    single-epoch steps, and its error estimate is sampled after each step.

    :param n_iterations: number of single-epoch training steps per network
    :param learning_rate_range: iterable of learning rates to evaluate
    :return: DataFrame with columns 'learning_rate', 'iteration',
        'training_error', 'test_error', 'training_accuracy' and
        'test_accuracy'; per-network frames are concatenated without
        re-indexing, so index values repeat across learning rates
    """
    training_data = load_higgs_train()
    names = ['learning_rate', 'iteration', 'training_error', 'test_error']

    def sample_epoch(net):
        # Advance training by exactly one epoch, then read the errors.
        net.train(train_epoch=1)
        epochs_done, train_err, test_err = net.estimate_error()
        return [net.learning_rate, epochs_done, train_err, test_err]

    def trace(net):
        samples = [sample_epoch(net) for _ in range(n_iterations)]
        return pd.DataFrame.from_records(samples, columns=names)

    combined = pd.concat(
        [trace(NeuralNetwork(training_data, rate))
         for rate in learning_rate_range])

    # Assumes the errors are percentages (hence the division by 100) --
    # confirm against NeuralNetwork.estimate_error.
    combined['training_accuracy'] = 1 - combined['training_error'] / 100
    combined['test_accuracy'] = 1 - combined['test_error'] / 100
    return combined
예제 #14
0
def estimate_hidden_units(hidden_units_range=range(1, 100), sample_size=None):
    """
    Train one neural network per hidden-layer size and collect its errors.

    The default range uses range() rather than the Python 2-only xrange(),
    so the module can also be imported under Python 3; both iterate over the
    same values 1..99.

    :param hidden_units_range: iterable of hidden unit counts to evaluate
    :param sample_size: forwarded to load_higgs_train as sample_size
    :return: DataFrame with columns 'hidden_units', 'iteration',
        'training_error', 'test_error', 'training_accuracy' and
        'test_accuracy'
    """
    data = load_higgs_train(sample_size=sample_size)

    def estimate_error(nn):
        nn.train()
        total_epochs, trn, tst = nn.estimate_error()
        return [nn.n_hidden_units, total_epochs, trn, tst]

    # Kept separate from `data` so the loaded dataset is not rebound to the
    # result rows.
    records = [
        estimate_error(NeuralNetwork(data, n_hidden_units=n_units))
        for n_units in hidden_units_range
    ]

    df = pd.DataFrame.from_records(
        records,
        columns=['hidden_units', 'iteration', 'training_error', 'test_error'])

    # Assumes the errors are percentages (hence the division by 100) --
    # confirm against NeuralNetwork.estimate_error.
    df['training_accuracy'] = 1 - df['training_error'] / 100
    df['test_accuracy'] = 1 - df['test_error'] / 100
    return df