def test_split_data(self):
    """Verify that split_data() leaves all four train/test attributes set.

    Loads the CSV at ``self.csv_path``, adds the two derived columns
    ("antiguedad" and "flujo") that the feature list refers to, configures
    the label and features, and then checks that none of ``x_train``,
    ``x_test``, ``y_train`` or ``y_test`` is ``None`` after the split.
    """
    engine = DataEngineering()
    engine.load_data(self.csv_path)
    frame = engine.get_data()

    # "antiguedad": column maximum of "año" minus each row's "año" value.
    newest = frame["año"].max()
    engine.add_column("antiguedad", newest - frame["año"])

    # "flujo": integer category codes derived from the "E_FLUJO" column.
    flow_codes = frame["E_FLUJO"].copy().astype("category").cat.codes
    engine.add_column("flujo", flow_codes)

    engine.set_label("BBPD")
    engine.set_features(
        [
            "flujo",
            "NU_COORD_UTM ESTE",
            "NU_COORD_UTM NORTE",
            "°API",
            "antiguedad",
        ]
    )
    engine.split_data()

    # Checked in the same order as before: x_train, x_test, y_train, y_test.
    for attr_name in ("x_train", "x_test", "y_train", "y_test"):
        assert getattr(engine, attr_name) is not None
def test_set_get_features(self):
    """Verify the get_features()/set_features() round trip.

    A freshly constructed ``DataEngineering`` must report ``None`` for its
    features; after loading the CSV and calling ``set_features`` the getter
    must return a value equal to the list that was passed in.
    """
    engine = DataEngineering()

    # Nothing has been configured on a new instance.
    assert engine.get_features() is None

    engine.load_data(self.csv_path)
    expected = ["POZO", "mes", "BBPD"]
    engine.set_features(expected)

    assert engine.get_features() == expected
data_e.clean_data()

# --- Derived columns -------------------------------------------------------
# "age": column maximum of "año" minus each row's "año" value.
max_date = data_e.get_data()["año"].max()
age = max_date - data_e.get_data()["año"]
data_e.add_column("age", age)

# "flow": integer category codes derived from the "E_FLUJO" column.
flow_source = data_e.get_data()["E_FLUJO"].copy()
flow_data = flow_source.astype("category").cat.codes
data_e.add_column("flow", flow_data)

# --- Model inputs ----------------------------------------------------------
features = [
    "flow",
    "NU_COORD_UTM ESTE",
    "NU_COORD_UTM NORTE",
    "°API",
    "age",
]
label = "BBPD"
data_e.set_features(features)
data_e.set_label(label)

# --- Train / test partition ------------------------------------------------
data_e.split_data()

# --- Fit the regression model and report its score -------------------------
model = Regression(data_e)
model.train()
print(f"------------------------------\nMean score: {model.score()}")

# --- Single prediction on the first held-out row ---------------------------
first_sample = data_e.x_test.iloc[0]
first_target = data_e.y_test.iloc[0]
model.predict(first_sample, first_target)