Example no. 1
from utils.DataFrameUtils import DataFrameUtils
from methodology.supervised.classificators.RandomForestModel import RandomForestModel
from methodology.supervised.classificators.ADABoostingModel import ADABoostingModel
from methodology.supervised.classificators.XGBoostingModel import XGBoostingModel
# NOTE: the import path for QualityModel is not shown in the original snippet.


def ejercicion_1():

    # 1. Data loading
    data_voces = DataFrameUtils.read_cvs(
        "/mnt/c/Users/rtsz/Learning/promidat_ml/module_3/week_3/homework/src/data/voces.csv",
        delimiter=",",
        decimal=".",
        index_col=None)
    qualityModel = QualityModel(data_voces)
    qualityModel.distribution_predict_variable("genero")

    # 2. Random Forest
    randomForestModel = RandomForestModel(data_voces)
    randomForestModel.build_model(variable_predict="genero",
                                  train_size=0.8)
    randomForestModel.train_model(metadata={})

    # 3. ADA Boosting
    adaBoostingModel = ADABoostingModel(data_voces)
    adaBoostingModel.build_model(variable_predict="genero", train_size=0.8)
    adaBoostingModel.train_model(metadata={})

    # 4. XG Boosting
    xgBoostingModel = XGBoostingModel(data_voces)
    xgBoostingModel.build_model(variable_predict="genero", train_size=0.8)
    xgBoostingModel.train_model(metadata={})

    randomForestModel.print_testing_info()
    randomForestModel.print_indexes_info()
    adaBoostingModel.print_indexes_info()
    xgBoostingModel.print_indexes_info()

    # 5. Comparison matrix
    print("Comparison matrix")
    indexes = DataFrameUtils.create_dataframe_from_dict(
        [
            randomForestModel.indexes.get_indexes_dictionary(),
            adaBoostingModel.indexes.get_indexes_dictionary(),
            xgBoostingModel.indexes.get_indexes_dictionary()
        ],
        columns=["Random Forest", "ADA Boosting", "XG Boosting"])
    print(indexes)
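
The comparison step above goes through the project's DataFrameUtils helper. For reference, the same matrix can be built with plain pandas; a minimal sketch, assuming each get_indexes_dictionary() returns a flat metric-name-to-value dict (the metric names and values below are made up for illustration):

import pandas as pd

# Hypothetical stand-ins for model.indexes.get_indexes_dictionary().
rf = {"accuracy": 0.98, "error": 0.02}
ada = {"accuracy": 0.97, "error": 0.03}
xgb = {"accuracy": 0.98, "error": 0.02}

# One row per metric, one column per model.
comparison = pd.DataFrame(
    [rf, ada, xgb],
    index=["Random Forest", "ADA Boosting", "XG Boosting"]).T
print(comparison)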
Example no. 2
from sklearn.model_selection import train_test_split

from utils.DataFrameUtils import DataFrameUtils
# NOTE: ConcensoPropio is a project-specific class; its import path is not
# shown in the original snippet.


def ejercicio_4():
    data = DataFrameUtils.read_cvs(
        "/mnt/c/Users/rtsz/Learning/promidat_ml/module_3/week_3/homework/src/data/voces.csv",
        delimiter=",",
        decimal=".",
        index_col=None)
    X = data.loc[:, data.columns != "genero"]
    Y = data.loc[:, data.columns == "genero"]
    print("\nPredictor variables:\n")
    print(X.head())
    print("\nTarget variable:\n")
    print(Y.head())
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        train_size=0.7,
                                                        random_state=0)
    model = ConcensoPropio()
    model.fit(X_train=X_train, y_train=y_train)
    model.predict(X_test, y_test)
    model.print_indexes_info()
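
ConcensoPropio itself is not shown here; the name suggests a hand-rolled consensus (majority-vote) ensemble. A minimal sketch of that idea using scikit-learn's VotingClassifier, purely as an assumption about what the class does:

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Hard voting: each base classifier votes and the majority label wins.
consensus = VotingClassifier(estimators=[
    ("tree", DecisionTreeClassifier(random_state=0)),
    ("knn", KNeighborsClassifier()),
    ("forest", RandomForestClassifier(random_state=0)),
], voting="hard")
# consensus.fit(X_train, y_train.values.ravel())
# y_pred = consensus.predict(X_test)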
Example no. 3
from utils.DataFrameUtils import DataFrameUtils
from methodology.supervised.classificators.MLPerceptronModel import MLPerceptronModel
from methodology.supervised.classificators.KNeighborsModel import KNeighborsModel
from methodology.supervised.classificators.ADABoostingModel import ADABoostingModel
from methodology.supervised.classificators.XGBoostingModel import XGBoostingModel
from methodology.supervised.classificators.RandomForestModel import RandomForestModel
from methodology.supervised.classificators.SVMModel import SVMModel
from methodology.supervised.classificators.DecisionTreeModel import DecisionTreeModel
from methodology.supervised.classificators.KerasModel import KerasModel

tumores_path = "/Users/rsalazar/Development/learning/machine_learning/promidat/src/data/tumores.csv"
data_tumores = DataFrameUtils.read_cvs(tumores_path,
                                       delimiter=",",
                                       decimal=".",
                                       index_col=0)
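# Cast the target column 'tipo' to a pandas categorical dtype so it is
# treated as a nominal class label.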
data_tumores['tipo'] = data_tumores['tipo'].astype('category')
print(data_tumores)

# 1. Using the MLPClassifier model
mlpModel = MLPerceptronModel(data_tumores)
mlpModel.build_model(variable_predict="tipo", train_size=0.7)
mlpModel.print_testing_info()
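# metadata presumably forwards hyper-parameters to scikit-learn's
# MLPClassifier: hidden_layer_sizes=(1000, 500) requests two hidden layers
# with 1000 and 500 units respectively.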
mlpModel.train_model(metadata={"hidden_layer_sizes": (1000, 500)})
mlpModel.print_indexes_info()

# 2. Using the TensorFlow/Keras model

kerasModel = KerasModel(data_tumores)
kerasModel.build_model(variable_predict="tipo", train_size=0.7)
kerasModel.print_testing_info()
kerasModel.train_model(metadata={})
Example no. 4
import datetime
from unittest import TestCase

from pyspark.sql import Row
from pyspark.sql.types import StructType

# NOTE: `spark`, `sc`, `sqlContext`, and the PySpark-based DataFrameUtils
# class are assumed to be provided by the surrounding test module; their
# import paths are not shown in the original snippet.


class DataFrameUtilsTest(TestCase):
    def setUp(self):
        self.dataFrame = DataFrameUtils(spark, sc=sc)
        self.path = "/data/master/pctk/data/t_pctk_rcc_balance"
        self.process_date = "2020-07-12"

    def test__search_extension(self):
        file = "data/raw/part-00001-59364dc0-fc95-4467-851a-d1667a8be31d.c000.avro"
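        # Python name mangling: the private method __search_extension is
        # reachable from outside the class as _DataFrameUtils__search_extension.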
        extension = self.dataFrame._DataFrameUtils__search_extension(file)
        expected_extension = "avro"

        self.assertEqual(extension, expected_extension)

    def test__extract_extension(self):
        files = [
            "data/raw/part-00001-59364dc0-fc95-4467-851a-d1667a8be31d.c000.avro",
            "data/raw/part-00001-59364dc0-fc95-4467-851a-d1667a8be31d.c000.avro"
        ]
        extension = self.dataFrame._DataFrameUtils__extract_extension(files)
        expected_extension = "avro"

        self.assertEqual(extension, expected_extension)

    def test__extract_extension_with_invalid_extension(self):
        files = [
            "data/raw/part-00001-59364dc0-fc95-4467-851a-d1667a8be31d.c000.avro",
            "data/raw/part-00001-59364dc0-fc95-4467-851a-d1667a8be31d.c000.txt"
        ]

        with self.assertRaises(Exception) as error:
            self.dataFrame._DataFrameUtils__extract_extension(files)

        exception = error.exception
        self.assertEqual(exception.args[0], "Extension doesn't recognized!")

    def test_get_format_type_from_path(self):
        path = "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-04-30/"
        extension = self.dataFrame.get_format_type_from_path(path)
        expected_extension = "parquet"

        self.assertEqual(extension, expected_extension)

    def test__get_format_type(self):
        path = "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-04-30/"
        extension = self.dataFrame._DataFrameUtils__get_format_type(path)
        expected_extension = "parquet"

        self.assertEqual(extension, expected_extension)

    def test__extract_path(self):
        path = self.dataFrame._DataFrameUtils__extract_path(self.path)
        expected_path = "hdfs://pedaaswork.scmx2p100.isi/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-07-31"

        self.assertEqual(path, expected_path)

    def test__extract_path_with_partition_name(self):
        path = self.dataFrame._DataFrameUtils__extract_path(self.path + "/cutoff_date=2020-04-30/")
        expected_path = "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-04-30/"

        self.assertEqual(path, expected_path)

    def test__extract_path_with_paths(self):
        paths = [
            "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-04-30/",
            "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-05-31/"
        ]
        path = self.dataFrame._DataFrameUtils__extract_path(paths=paths)
        expected_path = "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-04-30/"

        self.assertEqual(path, expected_path)

    def test__concat_path_with_partition_name(self):
        date_partitions = [
            "2020-04-30"
        ]
        path = "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date="

        partitions = self.dataFrame._DataFrameUtils__concat_path_with_partition_name(date_partitions, path)

        expected_partition = ["/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-04-30"]

        self.assertEqual(partitions, expected_partition)

    def test__add_partition_name(self):
        path = "/data/master/pctk/data/t_pctk_rcc_balance/"
        path = self.dataFrame._DataFrameUtils__add_partition_name(path)

        expected_path = "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date="

        self.assertEqual(path, expected_path)

    def test__get_paths_with_process_name(self):
        paths = self.dataFrame._DataFrameUtils__get_paths_with_process_name(self.path, partition_number=2)
        expected_paths = [
            "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-07-31",
            "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-06-30"
        ]

        self.assertEqual(paths, expected_paths)

    def test_read_dataframe(self):
        dataframe = self.dataFrame.read_dataframe(self.path)
        empty_dataframe = sqlContext.createDataFrame([], StructType([]))

        self.assertNotEqual(dataframe, empty_dataframe)

    def test_read_dataframe_with_path(self):
        paths = ["/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-06-30",
                 "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-07-31"]

        dataframe = self.dataFrame.read_dataframe(paths=paths)
        empty_dataframe = sqlContext.createDataFrame([], StructType([]))

        self.assertNotEqual(dataframe, empty_dataframe)

    def test_read_dataframe_with_path_retrieving_partition_name(self):
        paths = ["/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-06-30",
                 "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-07-31"]

        dataframe = self.dataFrame.read_dataframe(paths=paths,
                                                  options={'basePath': self.path})
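        # Spark's `basePath` read option anchors partition discovery, so the
        # `cutoff_date` partition column is preserved in the resulting schema
        # even though leaf partition directories are listed explicitly.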

        empty_dataframe = sqlContext.createDataFrame([], StructType([]))

        self.assertNotEqual(dataframe, empty_dataframe)
        self.assertTrue("cutoff_date" in dataframe.schema.names)

    def test_read_dataframes(self):
        dataframe = self.dataFrame.read_dataframes(self.path, partition_number=1)
        empty_dataframe = sqlContext.createDataFrame([], StructType([]))

        self.assertNotEqual(dataframe, empty_dataframe)

    def test_read_dataframes_with_date_range(self):
        dataframe = self.dataFrame.read_dataframes(self.path, process_date=["2020-05-31", "2020-07-31"],
                                                   options={"basePath": self.path})

        empty_dataframe = sqlContext.createDataFrame([], StructType([]))
        dates = dataframe.select("cutoff_date").dropDuplicates().collect()
        expected_dates = [Row(cutoff_date=datetime.date(2020, 7, 31)),
                          Row(cutoff_date=datetime.date(2020, 5, 31)),
                          Row(cutoff_date=datetime.date(2020, 6, 30))]

        self.assertNotEqual(dataframe, empty_dataframe)
        self.assertEqual(dates, expected_dates)
Example no. 6
import math
import numpy as np
from utils.DataFrameUtils import DataFrameUtils

data = DataFrameUtils.create_dataframe_from_dict({
    "x1": [1, 1, 1, 1],
    "x2": [0, 0, 1, 1],
    "x3": [0, 1, 0, 1],
    "z": [1, 1, 1, 0]
})
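
# Truth table: x1 is constant (it acts like a bias input) and z = NAND(x2, x3).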


def verify_values(array1, array2):
    return (array1 == array2).all()


def perceptron(x1, x2, x3, w1, w2, w3, t):
    return (x1 * w1 + x2 * w2 + x3 * w3 - t)


def tangente_hiperbolica(x1, x2, x3, w1, w2, w3, t):
    val = (2 / (1 + math.exp(-2 * perceptron(x1, x2, x3, w1, w2, w3, t)))) - 1
    return 1 if val >= 0 else 0
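
# Note: 2 / (1 + exp(-2 * x)) - 1 is exactly tanh(x); a quick check of the
# identity used by tangente_hiperbolica above.
assert abs(math.tanh(0.3) - ((2 / (1 + math.exp(-2 * 0.3))) - 1)) < 1e-12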


weights = [x * 0.1 for x in range(-10, 11)]
thetas = [x * 0.1 for x in range(0, 11)]

success_values = list()

for t in thetas:
    # NOTE: the original snippet is truncated here; the loop body below
    # reconstructs the implied exhaustive search over the weight grid for
    # each threshold t, collecting the combinations that reproduce z.
    for w1 in weights:
        for w2 in weights:
            for w3 in weights:
                col = data.apply(lambda row: tangente_hiperbolica(
                    row["x1"], row["x2"], row["x3"], w1, w2, w3, t), axis=1)
                if verify_values(data["z"], col):
                    success_values.append((w1, w2, w3, t))
Example no. 7
import math
import random

from utils.DataFrameUtils import DataFrameUtils


def get_random_numbers():
    # NOTE: the original snippet is truncated; the imports, signature, and
    # first two draws are reconstructed from the return value below.
    w1 = random.randint(-5, 5)
    w2 = random.randint(-5, 5)
    t = random.randint(-5, 5)
    return (w1, w2, t)


def verify_values(array1, array2):
    return (array1 == array2).all()


def sigmoidea(x1, x2, w1, w2, t):
    percept = 1 / (1 + math.exp(-1 * (x1 * w1 + x2 * w2 - t)))
    return 1 if percept >= 0.5 else 0
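
# The logistic function satisfies 1 / (1 + exp(-z)) >= 0.5 exactly when
# z >= 0, so the 0.5 cut-off in sigmoidea is equivalent to testing the sign
# of the pre-activation x1 * w1 + x2 * w2 - t.
assert 1 / (1 + math.exp(-0.0)) == 0.5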


data = DataFrameUtils.create_dataframe_from_dict({
    "x1": [0, 1, 0, 1],
    "x2": [0, 0, 1, 1],
    "y": [1, 1, 1, 0]
})
print(data)

while True:
    w1, w2, t = get_random_numbers()
    col = data.apply(lambda row: sigmoidea(row["x1"], row["x2"], w1, w2, t),
                     axis=1)
    print(f"Testing: w1 = {w1}, w2 = {w2}, t = {t}")
    if verify_values(data["y"], col):
        print("Success...")
        break

print("\nRequired values: ")
print(f"\tw1 = {w1}\n\tw2 = {w2}\n\tt  = {t}")