def ejercicio_1():
    # 1. Load the data
    data_voces = DataFrameUtils.read_cvs(
        "/mnt/c/Users/rtsz/Learning/promidat_ml/module_3/week_3/homework/src/data/voces.csv",
        delimiter=",",
        decimal=".",
        index_col=None)

    qualityModel = QualityModel(data_voces)
    qualityModel.distribution_predict_variable("genero")

    # 2. Random Forest
    randomForestModel = RandomForestModel(data_voces)
    randomForestModel.build_model(variable_predict="genero", train_size=0.8)
    randomForestModel.train_model(metadata={})

    # 3. ADA Boosting
    adaBoostingModel = ADABoostingModel(data_voces)
    adaBoostingModel.build_model(variable_predict="genero", train_size=0.8)
    adaBoostingModel.train_model(metadata={})

    # 4. XG Boosting
    xgBoostingModel = XGBoostingModel(data_voces)
    xgBoostingModel.build_model(variable_predict="genero", train_size=0.8)
    xgBoostingModel.train_model(metadata={})

    randomForestModel.print_testing_info()
    randomForestModel.print_indexes_info()
    adaBoostingModel.print_indexes_info()
    xgBoostingModel.print_indexes_info()

    # 5. Comparison matrix
    print("Comparison matrix")
    indexes = DataFrameUtils.create_dataframe_from_dict(
        [
            randomForestModel.indexes.get_indexes_dictionary(),
            adaBoostingModel.indexes.get_indexes_dictionary(),
            xgBoostingModel.indexes.get_indexes_dictionary()
        ],
        columns=["Random Forest", "ADA Boosting", "XG Boosting"])
    print(indexes)
def ejercicio_4():
    data = DataFrameUtils.read_cvs(
        "/mnt/c/Users/rtsz/Learning/promidat_ml/module_3/week_3/homework/src/data/voces.csv",
        delimiter=",",
        decimal=".",
        index_col=None)

    X = data.loc[:, data.columns != "genero"]
    Y = data.loc[:, data.columns == "genero"]

    print("\nPredictor variables:\n")
    print(X.head())
    print("\nTarget variable:\n")
    print(Y.head())

    X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, random_state=0)

    model = ConcensoPropio()
    model.fit(X_train=X_train, y_train=y_train)
    model.predict(X_test, y_test)
    model.print_indexes_info()
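# ConcensoPropio is not defined in this file. The sketch below is a minimal,
# hypothetical majority-vote consensus classifier with the same call pattern
# as above (fit / predict / print_indexes_info). The choice of base
# estimators and the accuracy-only report are assumptions, not the actual
# implementation.
from collections import Counter

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


class ConsensusSketch:
    def __init__(self):
        self.estimators = [
            RandomForestClassifier(random_state=0),
            KNeighborsClassifier(),
            DecisionTreeClassifier(random_state=0)
        ]
        self.accuracy = None

    def fit(self, X_train, y_train):
        # Train every base estimator on the same training split.
        for estimator in self.estimators:
            estimator.fit(X_train, y_train.values.ravel())

    def predict(self, X_test, y_test):
        # Majority vote: for each test sample, keep the most common label
        # across the base estimators' predictions.
        votes = [estimator.predict(X_test) for estimator in self.estimators]
        consensus = [Counter(sample).most_common(1)[0][0] for sample in zip(*votes)]
        self.accuracy = accuracy_score(y_test.values.ravel(), consensus)
        return consensus

    def print_indexes_info(self):
        print(f"Consensus accuracy: {self.accuracy:.4f}")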
from utils.DataFrameUtils import DataFrameUtils
from methodology.supervised.classificators.MLPerceptronModel import MLPerceptronModel
from methodology.supervised.classificators.KNeighborsModel import KNeighborsModel
from methodology.supervised.classificators.ADABoostingModel import ADABoostingModel
from methodology.supervised.classificators.XGBoostingModel import XGBoostingModel
from methodology.supervised.classificators.RandomForestModel import RandomForestModel
from methodology.supervised.classificators.SVMModel import SVMModel
from methodology.supervised.classificators.DecisionTreeModel import DecisionTreeModel
from methodology.supervised.classificators.KerasModel import KerasModel

tumores_path = "/Users/rsalazar/Development/learning/machine_learning/promidat/src/data/tumores.csv"
data_tumores = DataFrameUtils.read_cvs(tumores_path, delimiter=",", decimal=".", index_col=0)
data_tumores['tipo'] = data_tumores['tipo'].astype('category')
print(data_tumores)

# 1. Using the MLPClassifier model
mlpModel = MLPerceptronModel(data_tumores)
mlpModel.build_model(variable_predict="tipo", train_size=0.7)
mlpModel.print_testing_info()
mlpModel.train_model(metadata={"hidden_layer_sizes": (1000, 500)})
mlpModel.print_indexes_info()

# 1. Using the TensorFlow/Keras model
kerasModel = KerasModel(data_tumores)
kerasModel.build_model(variable_predict="tipo", train_size=0.7)
kerasModel.print_testing_info()
kerasModel.train_model(metadata={})
import datetime
from unittest import TestCase

from pyspark.sql import Row
from pyspark.sql.types import StructType

# DataFrameUtils, spark, sc and sqlContext are assumed to be provided by the
# project layout and the test session, respectively.


class DataFrameUtilsTest(TestCase):

    def setUp(self):
        self.dataFrame = DataFrameUtils(spark, sc=sc)
        self.path = "/data/master/pctk/data/t_pctk_rcc_balance"
        self.process_date = "2020-07-12"

    def test__search_extension(self):
        file = "data/raw/part-00001-59364dc0-fc95-4467-851a-d1667a8be31d.c000.avro"
        extension = self.dataFrame._DataFrameUtils__search_extension(file)
        expected_extension = "avro"
        self.assertEqual(extension, expected_extension)

    def test__extract_extension(self):
        files = [
            "data/raw/part-00001-59364dc0-fc95-4467-851a-d1667a8be31d.c000.avro",
            "data/raw/part-00001-59364dc0-fc95-4467-851a-d1667a8be31d.c000.avro"
        ]
        extension = self.dataFrame._DataFrameUtils__extract_extension(files)
        expected_extension = "avro"
        self.assertEqual(extension, expected_extension)

    def test__extract_extension_with_invalid_extension(self):
        files = [
            "data/raw/part-00001-59364dc0-fc95-4467-851a-d1667a8be31d.c000.avro",
            "data/raw/part-00001-59364dc0-fc95-4467-851a-d1667a8be31d.c000.txt"
        ]
        with self.assertRaises(Exception) as error:
            self.dataFrame._DataFrameUtils__extract_extension(files)
        exception = error.exception
        # Message kept verbatim: it must match the string raised by DataFrameUtils.
        self.assertEqual(exception.args[0], "Extension doesn't recognized!")

    def test_get_format_type_from_path(self):
        path = "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-04-30/"
        extension = self.dataFrame.get_format_type_from_path(path)
        expected_extension = "parquet"
        self.assertEqual(extension, expected_extension)

    def test__get_format_type(self):
        path = "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-04-30/"
        extension = self.dataFrame._DataFrameUtils__get_format_type(path)
        expected_extension = "parquet"
        self.assertEqual(extension, expected_extension)

    def test__extract_path(self):
        path = self.dataFrame._DataFrameUtils__extract_path(self.path)
        expected_path = "hdfs://pedaaswork.scmx2p100.isi/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-07-31"
        self.assertEqual(path, expected_path)

    def test__extract_path_with_partition_name(self):
        path = self.dataFrame._DataFrameUtils__extract_path(self.path + "/cutoff_date=2020-04-30/")
        expected_path = "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-04-30/"
        self.assertEqual(path, expected_path)

    def test__extract_path_with_paths(self):
        paths = [
            "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-04-30/",
            "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-05-31/"
        ]
        path = self.dataFrame._DataFrameUtils__extract_path(paths=paths)
        expected_path = "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-04-30/"
        self.assertEqual(path, expected_path)

    def test__concat_path_with_partition_name(self):
        date_partitions = ["2020-04-30"]
        path = "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date="
        partitions = self.dataFrame._DataFrameUtils__concat_path_with_partition_name(date_partitions, path)
        expected_partition = ["/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-04-30"]
        self.assertEqual(partitions, expected_partition)

    def test__add_partition_name(self):
        path = "/data/master/pctk/data/t_pctk_rcc_balance/"
        path = self.dataFrame._DataFrameUtils__add_partition_name(path)
        expected_path = "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date="
        self.assertEqual(path, expected_path)

    def test__get_paths_with_process_name(self):
        paths = self.dataFrame._DataFrameUtils__get_paths_with_process_name(self.path, partition_number=2)
        expected_paths = [
            "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-07-31",
            "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-06-30"
        ]
        self.assertEqual(paths, expected_paths)

    def test_read_dataframe(self):
        dataframe = self.dataFrame.read_dataframe(self.path)
        empty_dataframe = sqlContext.createDataFrame([], StructType([]))
        self.assertNotEqual(dataframe, empty_dataframe)

    def test_read_dataframe_with_path(self):
        paths = [
            "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-06-30",
            "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-07-31"
        ]
        dataframe = self.dataFrame.read_dataframe(paths=paths)
        empty_dataframe = sqlContext.createDataFrame([], StructType([]))
        self.assertNotEqual(dataframe, empty_dataframe)

    def test_read_dataframe_with_path_retrieving_partition_name(self):
        paths = [
            "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-06-30",
            "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-07-31"
        ]
        dataframe = self.dataFrame.read_dataframe(paths=paths, options={'basePath': self.path})
        empty_dataframe = sqlContext.createDataFrame([], StructType([]))
        self.assertNotEqual(dataframe, empty_dataframe)
        self.assertTrue("cutoff_date" in dataframe.schema.names)

    def test_read_dataframes(self):
        dataframe = self.dataFrame.read_dataframes(self.path, partition_number=1)
        empty_dataframe = sqlContext.createDataFrame([], StructType([]))
        self.assertNotEqual(dataframe, empty_dataframe)

    def test_read_dataframes_with_date_range(self):
        dataframe = self.dataFrame.read_dataframes(self.path,
                                                   process_date=["2020-05-31", "2020-07-31"],
                                                   options={"basePath": self.path})
        empty_dataframe = sqlContext.createDataFrame([], StructType([]))
        dates = dataframe.select("cutoff_date").dropDuplicates().collect()
        expected_dates = [Row(cutoff_date=datetime.date(2020, 7, 31)),
                          Row(cutoff_date=datetime.date(2020, 5, 31)),
                          Row(cutoff_date=datetime.date(2020, 6, 30))]
        self.assertNotEqual(dataframe, empty_dataframe)
        self.assertEqual(dates, expected_dates)
import math

from utils.DataFrameUtils import DataFrameUtils

data = DataFrameUtils.create_dataframe_from_dict({
    "x1": [1, 1, 1, 1],
    "x2": [0, 0, 1, 1],
    "x3": [0, 1, 0, 1],
    "z":  [1, 1, 1, 0]
})


def verify_values(array1, array2):
    return (array1 == array2).all()


def perceptron(x1, x2, x3, w1, w2, w3, t):
    return x1 * w1 + x2 * w2 + x3 * w3 - t


def tangente_hiperbolica(x1, x2, x3, w1, w2, w3, t):
    # 2 / (1 + e^(-2x)) - 1 is the hyperbolic tangent of x,
    # thresholded here into a binary output.
    val = (2 / (1 + math.exp(-2 * perceptron(x1, x2, x3, w1, w2, w3, t)))) - 1
    return 1 if val >= 0 else 0


weights = [x * 0.1 for x in range(-10, 11)]
thetas = [x * 0.1 for x in range(0, 11)]
success_values = list()

for t in thetas:
    # The original loop body was cut off here. A plausible completion,
    # assumed from the grids defined above: brute-force every
    # (w1, w2, w3, t) combination and record those that reproduce z.
    for w1 in weights:
        for w2 in weights:
            for w3 in weights:
                col = data.apply(
                    lambda row: tangente_hiperbolica(row["x1"], row["x2"], row["x3"], w1, w2, w3, t),
                    axis=1)
                if verify_values(data["z"], col):
                    success_values.append((w1, w2, w3, t))

print(f"Combinations found: {len(success_values)}")
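# Sanity check for the activation above (a minimal standalone sketch): the
# expression 2 / (1 + e^(-2x)) - 1 is algebraically identical to tanh(x),
# so the hand-rolled formula can be validated against math.tanh. The sample
# points and tolerance below are arbitrary choices.
import math

for x in [-2.0, -0.5, 0.0, 0.5, 2.0]:
    manual = (2 / (1 + math.exp(-2 * x))) - 1
    assert abs(manual - math.tanh(x)) < 1e-12, x
print("Custom activation matches math.tanh")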
import math
import random

from utils.DataFrameUtils import DataFrameUtils


def get_random_numbers():
    # The start of this function was cut off; assumed completion:
    # draw the weights from the same integer range as the threshold t.
    w1 = random.randint(-5, 5)
    w2 = random.randint(-5, 5)
    t = random.randint(-5, 5)
    return (w1, w2, t)


def verify_values(array1, array2):
    return (array1 == array2).all()


def sigmoidea(x1, x2, w1, w2, t):
    # Sigmoid of the perceptron output, thresholded at 0.5.
    percept = 1 / (1 + math.exp(-1 * (x1 * w1 + x2 * w2 - t)))
    return 1 if percept >= 0.5 else 0


data = DataFrameUtils.create_dataframe_from_dict({
    "x1": [0, 1, 0, 1],
    "x2": [0, 0, 1, 1],
    "y":  [1, 1, 1, 0]
})
print(data)

while True:
    w1, w2, t = get_random_numbers()
    col = data.apply(lambda row: sigmoidea(row["x1"], row["x2"], w1, w2, t), axis=1)
    print(f"Testing: w1 = {w1}, w2 = {w2}, t = {t}")
    if verify_values(data["y"], col):
        print("Success...")
        break

print("\nRequired values: ")
print(f"\tw1 = {w1}\n\tw2 = {w2}\n\tt = {t}")
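# The truth table above is NAND, which is linearly separable, so the random
# search will eventually hit a solution; w1 = w2 = -1, t = -1 is one such
# solution (picked by hand for this sketch). With those values the
# pre-activation is -x1 - x2 + 1, so only x1 = x2 = 1 falls below the 0.5
# sigmoid threshold. A quick standalone check:
import math


def sigmoidea(x1, x2, w1, w2, t):
    percept = 1 / (1 + math.exp(-1 * (x1 * w1 + x2 * w2 - t)))
    return 1 if percept >= 0.5 else 0


table = [((0, 0), 1), ((1, 0), 1), ((0, 1), 1), ((1, 1), 0)]
assert all(sigmoidea(x1, x2, -1, -1, -1) == y for (x1, x2), y in table)
print("w1 = -1, w2 = -1, t = -1 reproduces NAND")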