def test_split_X_y_output():
    """split_X_y(-1) should yield all but the last column as X, last as y."""
    rows = [[10000, 50, 10, 100], [5000, 100, 20, 35], [1000, 10, 0, 12]]
    manager = DataManager(rows)
    frame = pd.DataFrame(rows)
    X, y = manager.split_X_y(-1)
    assert np.array_equal(X, frame.iloc[:, :-1].values)
    assert np.array_equal(y, frame.iloc[:, 3].values)
def test_shuffle_dataset_result():
    """shuffle_dataset should reorder the rows away from the input order.

    NOTE(review): a shuffle can, with small probability, reproduce the
    original ordering, which would make this assertion fail spuriously —
    consider seeding the shuffle if that becomes a problem.
    """
    rows = [[10000, 50, 10, 100], [5000, 100, 20, 35], [1000, 10, 0, 12],
            [10000, 50, 10, 100], [5000, 100, 20, 35], [1000, 10, 0, 12],
            [1000, 10], [4234, 12], [12113, 9]]
    manager = DataManager(rows)
    manager.shuffle_dataset()
    assert not np.array_equal(manager.dataset, pd.DataFrame(rows))
def test_create_encoder_output():
    """create_encoder should encode labels like a fitted sklearn LabelEncoder."""
    labels = ['a', 'b', 'c', 'c']
    reference = LabelEncoder().fit(labels)
    encoder = DataManager([]).create_encoder(labels)
    assert np.array_equal(reference.transform(labels),
                          encoder.transform(labels))
def test_add_column_result():
    """add_column should append the given values as a new rightmost column."""
    rows = [[10000, 50], [5000, 100], [1000, 10]]
    new_column = [1, 2, 3]
    expected = pd.DataFrame([[10000, 50, 1], [5000, 100, 2], [1000, 10, 3]])

    manager = DataManager(rows)
    manager.add_column(new_column, "Prediction")
    assert np.array_equal(expected.values, manager.dataset.values)
def test_preprocess_data_output():
    """preprocess_data(-1) should standard-scale X and return y unscaled."""
    rows = [[10000.0, 50.0, 10.0, 100.0], [5000.0, 100.0, 20.0, 35.0],
            [1000.0, 10.0, 0.0, 12.0]]
    manager = DataManager(rows)
    frame = pd.DataFrame(rows)
    expected_X = StandardScaler().fit_transform(frame.iloc[:, :-1].values)
    X, y = manager.preprocess_data(-1)
    assert np.array_equal(X, expected_X)
    assert np.array_equal(y, frame.iloc[:, 3].values)
def test_split_train_test_output():
    """With test_size=0.25, the first row lands in test and the rest in train."""
    X = [[10000, 50], [5000, 100], [1000, 10], [4234, 12], [12113, 9]]
    y = [2, 3, 4, 5, 6]

    manager = DataManager([])
    X_train, y_train, X_test, y_test = manager.split_train_test(
        X, y, test_size=0.25)

    assert np.array_equal(X[1:], X_train)
    assert np.array_equal(y[1:], y_train)
    assert np.array_equal([X[0]], X_test)
    assert np.array_equal([y[0]], y_test)
def test_split_train_trest_value_error():
    """split_train_test must reject a test_size outside the [0, 1] range.

    NOTE(review): 'trest' in this function's name looks like a typo for
    'test'; left unchanged so test selection/CI references keep working.
    """
    manager = DataManager([])
    for bad_size in (-1, 2):
        with pytest.raises(ValueError):
            manager.split_train_test([], [], bad_size)
def test_preprocess_data_empty_array_exception():
    """preprocess_data on an empty dataset should raise EmptyDataset."""
    with pytest.raises(EmptyDataset):
        DataManager([]).preprocess_data(0)
 # NOTE(review): removed a stray, mis-indented duplicate of Program.__init__
 # that sat here at module level. A one-space indent at the top level is a
 # SyntaxError, and the identical initializer already exists on the Program
 # class defined below.
class Program:
    """Command-line driver: loads a CSV dataset, trains either a random
    forest or a neural network (chosen via CLI flags), and writes the
    model's predictions back out as a CSV.
    """

    def __init__(self):
        # Dataset handling is delegated to DataManager; the real dataset
        # is loaded later, in main().
        self.data_manager = DataManager([])
        self.X = None  # feature matrix, populated by main()
        self.y = None  # target vector, populated by main()

    def main(self, args):
        """Run the full training pipeline described by the parsed CLI args.

        Args:
            args: parsed argument namespace; uses arbol, red_neuronal,
                prefijo, porcentaje_pruebas, indice_columna_y and the
                model-specific options.

        Raises:
            ValueError: if not exactly one of args.arbol / args.red_neuronal
                is selected (XOR check below).
        """
        # Exactly one model flag must be set.
        if not (args.arbol ^ args.red_neuronal):
            raise ValueError("solo se puede elegir un modelo a la vez")
        print(args.arbol, args.red_neuronal)
        dataset_path = DATASETS_DIRECTORY + '/' + args.prefijo
        print('opening ' + dataset_path)
        dataset = csv_to_dataset(dataset_path)
        self.data_manager.dataset = dataset
        self.data_manager.shuffle_dataset()

        test_size = args.porcentaje_pruebas
        self.X, self.y = self.data_manager.preprocess_data(
            args.indice_columna_y)

        err_t = 0
        err_v = 0

        if args.arbol:
            err_t, err_v = self.create_random_forest(args.umbral_poda,
                                                     test_size)
            prediction_path = DATASETS_DIRECTORY + \
                '/random_forest_predictions_' + args.prefijo
            dataset_to_csv(prediction_path, self.data_manager.dataset)

        elif args.red_neuronal:  # neural network
            layers = args.numero_capas
            neurons_hidden_layer = args.unidades_por_capa
            activation_func = args.funcion_activacion
            epochs = args.iteraciones_optimizador
            output_activation_func = args.funcion_activacion_salida
            err_t, err_v = self.process_neural_network(layers,
                                                       neurons_hidden_layer,
                                                       activation_func,
                                                       output_activation_func,
                                                       test_size, epochs)
            prediction_path = DATASETS_DIRECTORY + \
                '/neural_network_predictions_' + args.prefijo
            dataset_to_csv(prediction_path, self.data_manager.dataset)

        print('Error de entrenamiento: ' + str(err_t) + '\n' +
              'Error de pruebas: ' + str(err_v) + '\n')

    def create_random_forest(self, prune_gain, test_size):
        """Train a random forest with cross-validation and record predictions.

        Args:
            prune_gain: pruning threshold; trees are pruned only when > 0.
            test_size: fraction of the data reserved for the test split.

        Returns:
            (err_t, err_v): training and test error rates measured AFTER
            any pruning (the pre-pruning CV errors are only printed).
        """
        BM = BinningManager()
        self.X = self.X.tolist()
        self.y = self.y.tolist()
        BM.binning_data(self.X)
        random_forest = RandomForest(0)
        X_train, y_train, X_test, y_test = self.data_manager.split_train_test(
            self.X, self.y, test_size)
        cvm = CrossValidationManager(random_forest, X_train, y_train,
                                     l0_1_loss)
        random_forest = cvm.cross_validation_wrapper()
        err_t = cvm.err_t[cvm.learner.size]
        print("Size del árbol: " + str(cvm.learner.size))
        print("Error de entrenamiento(CV): " + str(err_t))
        err_v = cvm.error_rate(X_test, y_test)
        print("Error de pruebas(CV): " + str(err_v))
        if prune_gain > 0.0:
            for tree in random_forest.trees:
                tree.prune(prune_gain)
        # Re-measure after (optional) pruning; these overwrite the CV errors.
        err_t = cvm.error_rate(X_train, y_train)
        err_v = cvm.error_rate(X_test, y_test)
        # 'with' guarantees the log file is closed even if write() raises
        # (the previous open()/write()/close() sequence leaked the handle
        # on error).
        with open('logs/log_random_forest.txt', 'w') as log_file:
            log_file.write(cvm.log)

        predictions = self.predictions_list(cvm.learner, self.X)
        self.data_manager.add_column(predictions, column_name='predictions')

        return err_t, err_v

    def process_neural_network(self, layers, neurons_hidden_layer,
                               activation_func, output_activation_func,
                               test_size, epochs):
        """Train a neural network with cross-validation and record predictions.

        Args:
            layers: number of layers.
            neurons_hidden_layer: units per hidden layer.
            activation_func: hidden-layer activation name (see
                tf_activation_function for the accepted names).
            output_activation_func: output-layer activation name.
            test_size: fraction of the data reserved for the test split.
            epochs: optimizer iterations.

        Returns:
            (err_t, err_v): cross-validation training error and test error.
        """
        # Targets are label-encoded for training and decoded back before
        # being written into the dataset.
        label_encoder = self.data_manager.create_encoder(self.y)
        self.y = label_encoder.transform(self.y)
        X_train, y_train, X_test, y_test = self.data_manager.split_train_test(
            self.X, self.y, test_size)
        activation_func = self.tf_activation_function(activation_func)
        output_activation_func = self.tf_activation_function(
            output_activation_func)

        # One output unit per distinct class.
        neurons_output_layer = len(label_encoder.classes_)
        neural_network = NeuralNetwork(layers,
                                       neurons_hidden_layer,
                                       neurons_output_layer,
                                       activation_func,
                                       output_activation_func,
                                       epochs=epochs)
        cvm = CrossValidationManager(neural_network, X_train, y_train,
                                     l0_1_loss)
        err_t, _ = cvm.cross_validation()
        err_v = cvm.error_rate(X_test, y_test)

        predictions = self.predictions_list(cvm.learner, self.X)
        predictions = label_encoder.inverse_transform(predictions)
        self.data_manager.add_column(predictions, column_name='predictions')

        return err_t, err_v

    def tf_activation_function(self, activation_func):
        """Map an activation-function name to the TensorFlow callable.

        Args:
            activation_func: one of 'relu', 'softmax', 'softplus', 'sigmoid'.

        Returns:
            The corresponding tf.nn activation function.

        Raises:
            ValueError: for any unrecognized name.
        """
        if activation_func == 'relu':
            return tf.nn.relu
        if activation_func == 'softmax':
            return tf.nn.softmax
        if activation_func == 'softplus':
            return tf.nn.softplus
        if activation_func == 'sigmoid':
            return tf.nn.sigmoid
        # Fixed missing leading space: the message previously rendered as
        # e.g. "tanhis not an activation function...".
        raise ValueError(
            activation_func +
            ' is not an activation function or is not implemented yet')

    def predictions_list(self, learner, X):
        """Return learner.predict(x) for every row x of X, in input order."""
        return [learner.predict(x) for x in X]