def test_split_X_y_output():
    """split_X_y(-1) must return every column but the last as X and the
    last column as y."""
    rows = [[10000, 50, 10, 100], [5000, 100, 20, 35], [1000, 10, 0, 12]]
    manager = DataManager(rows)
    frame = pd.DataFrame(rows)
    X, y = manager.split_X_y(-1)
    assert np.array_equal(X, frame.iloc[:, :-1].values)
    assert np.array_equal(y, frame.iloc[:, 3].values)
def test_shuffle_dataset_result():
    """shuffle_dataset must change the row order of the managed dataset."""
    # NOTE(review): rows have mixed lengths (4 vs 2), which pandas pads
    # with NaN, and a shuffle can in principle reproduce the original
    # order, making this test flaky. Preserved as-is for now.
    rows = [
        [10000, 50, 10, 100], [5000, 100, 20, 35], [1000, 10, 0, 12],
        [10000, 50, 10, 100], [5000, 100, 20, 35], [1000, 10, 0, 12],
        [1000, 10], [4234, 12], [12113, 9],
    ]
    manager = DataManager(rows)
    manager.shuffle_dataset()
    assert not np.array_equal(manager.dataset, pd.DataFrame(rows))
def test_create_encoder_output():
    """create_encoder must behave like a LabelEncoder fitted on the same
    labels."""
    labels = ['a', 'b', 'c', 'c']
    reference = LabelEncoder().fit(labels)
    encoder = DataManager([]).create_encoder(labels)
    assert np.array_equal(reference.transform(labels),
                          encoder.transform(labels))
def test_add_column_result():
    """add_column must append the given values as a new rightmost column."""
    rows = [[10000, 50], [5000, 100], [1000, 10]]
    expected = pd.DataFrame([[10000, 50, 1], [5000, 100, 2], [1000, 10, 3]])
    manager = DataManager(rows)
    manager.add_column([1, 2, 3], "Prediction")
    assert np.array_equal(expected.values, manager.dataset.values)
def test_preprocess_data_output():
    """preprocess_data must standard-scale X and return y unchanged."""
    rows = [[10000.0, 50.0, 10.0, 100.0],
            [5000.0, 100.0, 20.0, 35.0],
            [1000.0, 10.0, 0.0, 12.0]]
    manager = DataManager(rows)
    frame = pd.DataFrame(rows)
    expected_X = StandardScaler().fit_transform(frame.iloc[:, :-1].values)
    X, y = manager.preprocess_data(-1)
    assert np.array_equal(X, expected_X)
    assert np.array_equal(y, frame.iloc[:, 3].values)
def test_split_train_test_output():
    """With test_size=0.25 the first row goes to the test split and the
    remaining rows go to the training split."""
    X = [[10000, 50], [5000, 100], [1000, 10], [4234, 12], [12113, 9]]
    y = [2, 3, 4, 5, 6]
    splits = DataManager([]).split_train_test(X, y, test_size=0.25)
    X_train, y_train, X_test, y_test = splits
    assert np.array_equal(X[1:], X_train)
    assert np.array_equal(y[1:], y_train)
    assert np.array_equal([X[0]], X_test)
    assert np.array_equal([y[0]], y_test)
def test_split_train_trest_value_error():
    """split_train_test must reject test sizes outside [0, 1]."""
    # NOTE(review): "trest" in the function name is a typo; renaming would
    # change the collected test id, so the name is kept as-is.
    manager = DataManager([])
    for bad_size in (-1, 2):
        with pytest.raises(ValueError):
            manager.split_train_test([], [], bad_size)
def test_preprocess_data_empty_array_exception():
    """preprocess_data on an empty dataset must raise EmptyDataset."""
    with pytest.raises(EmptyDataset):
        DataManager([]).preprocess_data(0)
def __init__(self):
    # NOTE(review): this top-level __init__ appears to be a stray
    # duplicate of Program.__init__ below — presumably a copy/paste
    # fragment; verify whether it belongs to another class or should
    # be deleted.
    self.data_manager = DataManager([])
    self.X, self.y = None, None
class Program:
    """CLI driver: loads a CSV dataset, trains either a random forest or a
    neural network (chosen via the parsed arguments), appends the model's
    predictions to the dataset and writes it back to disk.
    """

    def __init__(self):
        # X / y stay None until main() loads and preprocesses a dataset.
        self.data_manager = DataManager([])
        self.X = None
        self.y = None

    def main(self, args):
        """Run the selected model over the dataset named by ``args``.

        Expects on ``args``: ``arbol`` / ``red_neuronal`` (mutually
        exclusive model flags), ``prefijo`` (dataset file name),
        ``porcentaje_pruebas``, ``indice_columna_y``, ``umbral_poda`` and
        the neural-network hyperparameters used below.

        Raises:
            ValueError: if zero or both model flags are set.
        """
        # XOR enforces that exactly one of the two models was requested.
        if not (args.arbol ^ args.red_neuronal):
            raise ValueError("solo se puede elegir un modelo a la vez")
        print(args.arbol, args.red_neuronal)

        dataset_path = DATASETS_DIRECTORY + '/' + args.prefijo
        print('opening ' + dataset_path)
        dataset = csv_to_dataset(dataset_path)
        self.data_manager.dataset = dataset
        self.data_manager.shuffle_dataset()
        test_size = args.porcentaje_pruebas
        self.X, self.y = self.data_manager.preprocess_data(
            args.indice_columna_y)

        err_t = 0
        err_v = 0
        if args.arbol:
            err_t, err_v = self.create_random_forest(args.umbral_poda,
                                                     test_size)
            prediction_path = DATASETS_DIRECTORY + \
                '/random_forest_predictions_' + args.prefijo
            dataset_to_csv(prediction_path, self.data_manager.dataset)
        elif args.red_neuronal:
            # Neural-network hyperparameters from the command line.
            layers = args.numero_capas
            neurons_hidden_layer = args.unidades_por_capa
            activation_func = args.funcion_activacion
            epochs = args.iteraciones_optimizador
            output_activation_func = args.funcion_activacion_salida
            err_t, err_v = self.process_neural_network(
                layers, neurons_hidden_layer, activation_func,
                output_activation_func, test_size, epochs)
            prediction_path = DATASETS_DIRECTORY + \
                '/neural_network_predictions_' + args.prefijo
            dataset_to_csv(prediction_path, self.data_manager.dataset)

        print('Error de entrenamiento: ' + str(err_t) + '\n' +
              'Error de pruebas: ' + str(err_v) + '\n')

    def create_random_forest(self, prune_gain, test_size):
        """Train a cross-validated random forest, optionally prune it, log
        the run and append its predictions to the managed dataset.

        Args:
            prune_gain: pruning threshold; pruning runs only when > 0.0.
            test_size: fraction of the data held out for testing.

        Returns:
            (err_t, err_v): training and test error rates.
        """
        binning_manager = BinningManager()
        self.X = self.X.tolist()
        self.y = self.y.tolist()
        binning_manager.binning_data(self.X)
        random_forest = RandomForest(0)
        X_train, y_train, X_test, y_test = self.data_manager.split_train_test(
            self.X, self.y, test_size)
        cvm = CrossValidationManager(random_forest, X_train, y_train,
                                     l0_1_loss)
        random_forest = cvm.cross_validation_wrapper()
        err_t = cvm.err_t[cvm.learner.size]
        print("Size del árbol: " + str(cvm.learner.size))
        print("Error de entrenamiento(CV): " + str(err_t))
        err_v = cvm.error_rate(X_test, y_test)
        print("Error de pruebas(CV): " + str(err_v))
        if prune_gain > 0.0:
            for tree in random_forest.trees:
                tree.prune(prune_gain)
            # Re-measure both error rates after pruning.
            err_t = cvm.error_rate(X_train, y_train)
            err_v = cvm.error_rate(X_test, y_test)
        # BUGFIX: use a context manager so the log file is closed even if
        # the write raises (original open/write/close leaked on error).
        with open('logs/log_random_forest.txt', 'w') as log_file:
            log_file.write(cvm.log)
        predictions = self.predictions_list(cvm.learner, self.X)
        self.data_manager.add_column(predictions, column_name='predictions')
        return err_t, err_v

    def process_neural_network(self, layers, neurons_hidden_layer,
                               activation_func, output_activation_func,
                               test_size, epochs):
        """Train a cross-validated neural network and append its decoded
        predictions to the managed dataset.

        Args:
            layers: number of hidden layers.
            neurons_hidden_layer: units per hidden layer.
            activation_func: hidden-layer activation name (see
                tf_activation_function for valid names).
            output_activation_func: output-layer activation name.
            test_size: fraction of the data held out for testing.
            epochs: optimizer iterations.

        Returns:
            (err_t, err_v): training and test error rates.
        """
        label_encoder = self.data_manager.create_encoder(self.y)
        self.y = label_encoder.transform(self.y)
        X_train, y_train, X_test, y_test = self.data_manager.split_train_test(
            self.X, self.y, test_size)
        activation_func = self.tf_activation_function(activation_func)
        output_activation_func = self.tf_activation_function(
            output_activation_func)
        # One output unit per distinct class label.
        neurons_output_layer = len(label_encoder.classes_)
        neural_network = NeuralNetwork(layers, neurons_hidden_layer,
                                       neurons_output_layer, activation_func,
                                       output_activation_func, epochs=epochs)
        cvm = CrossValidationManager(neural_network, X_train, y_train,
                                     l0_1_loss)
        err_t, _ = cvm.cross_validation()
        err_v = cvm.error_rate(X_test, y_test)
        predictions = self.predictions_list(cvm.learner, self.X)
        # Map encoded class indices back to the original labels.
        predictions = label_encoder.inverse_transform(predictions)
        self.data_manager.add_column(predictions, column_name='predictions')
        return err_t, err_v

    def tf_activation_function(self, activation_func):
        """Resolve an activation-function name to its tf.nn callable.

        Raises:
            ValueError: when the name is unknown or not implemented.
        """
        if activation_func == 'relu':
            return tf.nn.relu
        if activation_func == 'softmax':
            return tf.nn.softmax
        if activation_func == 'softplus':
            return tf.nn.softplus
        if activation_func == 'sigmoid':
            return tf.nn.sigmoid
        # BUGFIX: the original message was missing the separating space
        # ("reluis not an activation function ...").
        raise ValueError(
            activation_func +
            ' is not an activation function or is not implemented yet')

    def predictions_list(self, learner, X):
        """Return the learner's prediction for every row of X, in order."""
        return [learner.predict(x) for x in X]