def estimate_training_iterations(n_iterations=10,
                                 learning_rate_range=tuple([0.001, 0.01, 0.1, 1.0])):
    """
    Record neural-network error round by round for several learning rates.

    For every value in ``learning_rate_range`` a ``NeuralNetwork`` is built on
    the data returned by ``load_higgs_train()`` and trained ``n_iterations``
    times with ``train_epoch=1``. After each training call the three values
    returned by ``estimate_error()`` are stored as iteration, training_error
    and test_error, next to the network's ``learning_rate``.

    :param n_iterations: number of train/measure rounds per network
    :param learning_rate_range: iterable of values passed as the second
        positional argument of ``NeuralNetwork``
    :return: DataFrame with the columns learning_rate, iteration,
        training_error, test_error, training_accuracy and test_accuracy.
        The per-network frames are concatenated unchanged, so index labels
        repeat for every network.
    """
    header = ['learning_rate', 'iteration', 'training_error', 'test_error']
    dataset = load_higgs_train()

    frames = []
    for rate in learning_rate_range:
        network = NeuralNetwork(dataset, rate)
        rows = []
        for _ in range(n_iterations):
            network.train(train_epoch=1)
            epochs_so_far, train_err, test_err = network.estimate_error()
            rows.append([network.learning_rate, epochs_so_far, train_err, test_err])
        frames.append(pd.DataFrame.from_records(rows, columns=header))

    result = pd.concat(frames)
    # accuracy = 1 - error / 100 (errors are presumably percentages -- confirm)
    for split in ('training', 'test'):
        result[split + '_accuracy'] = 1 - result[split + '_error'] / 100
    return result
def estimate_best_power():
    """
    Run the KNN classifier once per setting of the distance-metric power
    parameter and tabulate the scores.

    The powers 1, 2 and 3 are passed to ``run_knn`` as ``power_parameter``;
    the matching rows are labelled 'manhattan', 'euclidean' and 'minkowski'.

    :return: DataFrame indexed by metric with the columns training_score
        and test_score
    """
    metric_names = {1: 'manhattan', 2: 'euclidean', 3: 'minkowski'}
    dataset = load_higgs_train()

    rows = []
    for power in metric_names:
        scores = run_knn(data=dataset, power_parameter=power)
        rows.append([metric_names[power]] + list(scores))

    header = ['metric', 'training_score', 'test_score']
    return pd.DataFrame.from_records(rows, columns=header, index=header[0])
def estimate_dataset_size():
    """
    Score the SVM on growing prefixes of the training data.

    The data returned by ``load_higgs_train()`` is unpacked into features,
    weights and labels. For n = 1..9 the first n tenths of each sequence are
    passed to ``run_svm`` as a ``(features, weights, labels)`` tuple, and the
    result is recorded against the number of feature rows used.

    :return: DataFrame indexed by sample_size with the columns training_score
        and test_score
    """
    features, weights, labels = load_higgs_train()

    records = []
    # NOTE(review): range(1, 10) stops at nine tenths, so the full data set is
    # never evaluated -- confirm this is intended.
    for n in range(1, 10):
        # Floor division keeps the slice bounds integral. Plain ``/`` gives a
        # float under true division, which is not a valid slice index.
        f = features[:n * len(features) // 10]
        w = weights[:n * len(weights) // 10]
        l = labels[:n * len(labels) // 10]
        records.append([len(f)] + list(run_svm((f, w, l))))

    columns = ['sample_size', 'training_score', 'test_score']
    df = pd.DataFrame.from_records(records, columns=columns, index=columns[0])
    return df
def estimate_best_learning_rate():
    """
    Run the Ada Boost classifier with multiple settings of learning_rate and
    tabulate the scores.

    The candidate rates are ``np.arange(0.2, 2.0, 0.2)``; each one is passed
    to ``run_AdaBoost`` as ``learning_rate``.

    :return: DataFrame indexed by learning_rate with the columns
        training_score and test_score
    """
    candidate_rates = np.arange(0.2, 2.0, 0.2)
    dataset = load_higgs_train()

    rows = []
    for rate in candidate_rates:
        scores = run_AdaBoost(data=dataset, learning_rate=rate)
        rows.append([rate] + list(scores))

    header = ['learning_rate', 'training_score', 'test_score']
    return pd.DataFrame.from_records(rows, columns=header, index=header[0])
def estimate_best_n_neighbours():
    """
    Run the KNN classifier with multiple settings of n_neighbours and
    tabulate the scores.

    The odd values 1, 3, ..., 25 are each passed to ``run_knn`` as
    ``n_neighbours``.

    :return: DataFrame indexed by n_neighbours with the columns
        training_score and test_score
    """
    neighbour_counts = np.arange(1, 26, 2)
    dataset = load_higgs_train()

    def scored_row(k):
        # One table row: the setting followed by whatever run_knn yields.
        return [k] + list(run_knn(data=dataset, n_neighbours=k))

    rows = [scored_row(k) for k in neighbour_counts]
    header = ['n_neighbours', 'training_score', 'test_score']
    return pd.DataFrame.from_records(rows, columns=header, index=header[0])
def estimate_best_n_estimators():
    """
    Run the Ada Boost classifier with multiple settings of n_estimators and
    tabulate the scores.

    The values 30, 35, ..., 115 are each passed to ``run_AdaBoost`` as
    ``n_estimators``.

    :return: DataFrame indexed by n_estimators with the columns
        training_score and test_score
    """
    estimator_counts = np.arange(30, 120, 5)
    dataset = load_higgs_train()

    rows = []
    for count in estimator_counts:
        row = [count]
        row.extend(run_AdaBoost(data=dataset, n_estimators=count))
        rows.append(row)

    header = ['n_estimators', 'training_score', 'test_score']
    return pd.DataFrame.from_records(rows, columns=header, index=header[0])
def estimate_hidden_units(hidden_units_range=xrange(1, 100), sample_size=None):
    """
    Train one neural network per hidden-layer size and tabulate its error.

    For every value in ``hidden_units_range`` a ``NeuralNetwork`` is built
    with that ``n_hidden_units``, ``train()`` is called once, and the three
    values returned by ``estimate_error()`` are stored as iteration,
    training_error and test_error.

    :param hidden_units_range: iterable of hidden-unit counts to try
    :param sample_size: forwarded to ``load_higgs_train``
    :return: DataFrame with the columns hidden_units, iteration,
        training_error, test_error, training_accuracy and test_accuracy
    """
    dataset = load_higgs_train(sample_size=sample_size)

    rows = []
    for n_units in hidden_units_range:
        network = NeuralNetwork(dataset, n_hidden_units=n_units)
        network.train()
        epochs, train_err, test_err = network.estimate_error()
        rows.append([network.n_hidden_units, epochs, train_err, test_err])

    header = ['hidden_units', 'iteration', 'training_error', 'test_error']
    result = pd.DataFrame.from_records(rows, columns=header)
    # accuracy = 1 - error / 100 (errors are presumably percentages -- confirm)
    for split in ('training', 'test'):
        result[split + '_accuracy'] = 1 - result[split + '_error'] / 100
    return result
def estimate_best_min_samples_split():
    """
    Run the decision tree classifier with multiple settings of
    min_samples_split, once per split criterion, and tabulate the scores.

    The even values 2, 4, ..., 118 are each passed to ``run_decision_tree``
    as ``min_samples_split``, first with criterion 'gini' and then with
    criterion 'entropy'.

    :return: DataFrame indexed by min_sample_split with the columns
        gini_training_score, gini_test_score, entropy_training_score and
        entropy_test_score
    """
    split_thresholds = xrange(2, 120, 2)
    dataset = load_higgs_train()

    rows = []
    for threshold in split_thresholds:
        row = [threshold]
        # Column order depends on evaluating gini before entropy.
        for criterion in ('gini', 'entropy'):
            row.extend(run_decision_tree(data=dataset,
                                         criterion=criterion,
                                         min_samples_split=threshold))
        rows.append(row)

    header = ['min_sample_split',
              'gini_training_score', 'gini_test_score',
              'entropy_training_score', 'entropy_test_score']
    return pd.DataFrame.from_records(rows, columns=header, index=header[0])
def estimate_best_c():
    """
    Run the svm classifier with multiple settings of C and tabulate the
    scores.

    The values 1, 10, 100 and 1000 are each passed to ``run_svm`` as
    ``regularization_term``. A message is logged once all runs finish, and
    the resulting table is logged as well.

    :return: DataFrame indexed by C with the columns training_score and
        test_score
    """
    c_values = [10 ** exponent for exponent in range(4)]
    dataset = load_higgs_train()

    rows = []
    for c_value in c_values:
        scores = run_svm(data=dataset, regularization_term=c_value)
        rows.append([c_value] + list(scores))
    LOGGER.info('Performed evaluation of the C setting choice')

    header = ['C', 'training_score', 'test_score']
    table = pd.DataFrame.from_records(rows, columns=header, index=header[0])
    LOGGER.info(table)
    return table
def estimate_best_gamma():
    """
    Run the svm classifier with multiple settings of gamma and tabulate the
    scores.

    The candidate values are ``np.arange(0.0, 1.0, 0.2)``; each one is passed
    to ``run_svm`` as ``gamma``. A message is logged once all runs finish,
    and the resulting table is logged as well.

    :return: DataFrame indexed by gamma with the columns training_score and
        test_score
    """
    # NOTE(review): the grid starts at gamma = 0.0 -- confirm run_svm accepts it.
    gamma_values = np.arange(0.0, 1.0, 0.2)
    dataset = load_higgs_train()

    def scored_row(value):
        # One table row: the setting followed by whatever run_svm yields.
        return [value] + list(run_svm(data=dataset, gamma=value))

    rows = [scored_row(value) for value in gamma_values]
    LOGGER.info('Performed evaluation of the gamma setting choice')

    header = ['gamma', 'training_score', 'test_score']
    table = pd.DataFrame.from_records(rows, columns=header, index=header[0])
    LOGGER.info(table)
    return table
def estimate_training_iterations(n_iterations=10,
                                 learning_rate_range=tuple([0.001, 0.01, 0.1, 1.0])):
    """
    Collect a per-round error history of a neural network for each learning
    rate in ``learning_rate_range``.

    One ``NeuralNetwork`` per rate is created on the ``load_higgs_train()``
    data. It is trained ``n_iterations`` times with ``train_epoch=1``, and
    ``estimate_error()`` is queried after every training call.

    :param n_iterations: number of train/measure rounds per network
    :param learning_rate_range: iterable of values passed as the second
        positional argument of ``NeuralNetwork``
    :return: DataFrame with the columns learning_rate, iteration,
        training_error, test_error, training_accuracy and test_accuracy.
        The index is not reset after concatenation, so labels repeat per
        network.
    """
    higgs = load_higgs_train()
    header = ['learning_rate', 'iteration', 'training_error', 'test_error']

    def error_history(net):
        # Train ``net`` one round at a time and return one row per round.
        def one_round():
            net.train(train_epoch=1)
            epochs, train_err, test_err = net.estimate_error()
            return [net.learning_rate, epochs, train_err, test_err]

        history = [one_round() for _ in range(n_iterations)]
        return pd.DataFrame.from_records(history, columns=header)

    frames = []
    for rate in learning_rate_range:
        frames.append(error_history(NeuralNetwork(higgs, rate)))

    combined = pd.concat(frames)
    combined['training_accuracy'] = 1 - combined['training_error'] / 100
    combined['test_accuracy'] = 1 - combined['test_error'] / 100
    return combined
def estimate_hidden_units(hidden_units_range=xrange(1, 100), sample_size=None):
    """
    Measure neural-network error as a function of the number of hidden units.

    A ``NeuralNetwork`` is created for each value in ``hidden_units_range``
    (passed as ``n_hidden_units``) and trained with a single ``train()``
    call. The values returned by ``estimate_error()`` then fill the
    iteration, training_error and test_error columns.

    :param hidden_units_range: iterable of hidden-unit counts to try
    :param sample_size: forwarded to ``load_higgs_train``
    :return: DataFrame with the columns hidden_units, iteration,
        training_error, test_error, training_accuracy and test_accuracy
    """
    higgs = load_higgs_train(sample_size=sample_size)

    def trained_row(net):
        # Train once, then report the network size with its error figures.
        net.train()
        epochs, train_err, test_err = net.estimate_error()
        return [net.n_hidden_units, epochs, train_err, test_err]

    rows = []
    for size in hidden_units_range:
        rows.append(trained_row(NeuralNetwork(higgs, n_hidden_units=size)))

    table = pd.DataFrame.from_records(
        rows,
        columns=['hidden_units', 'iteration', 'training_error', 'test_error'])
    table['training_accuracy'] = 1 - table['training_error'] / 100
    table['test_accuracy'] = 1 - table['test_error'] / 100
    return table