# DEALINGS IN THE SOFTWARE. import numpy as np import pandas as pd import getml.models.aggregations as aggregations import getml.datasets as datasets import getml.engine as engine import getml.models.loss_functions as loss_functions import getml.models as models import getml.data as data import getml.predictors as predictors # ---------------- engine.set_project("examples") # ---------------- # Generate artificial dataset # The problem we create looks like this: # # SELECT COUNT( * ) # FROM POPULATION t1 # LEFT JOIN PERIPHERAL t2 # ON t1.join_key = t2.join_key # WHERE ( # ( t1.time_stamp - t2.time_stamp <= 0.5 ) # ) AND t2.time_stamp <= t1.time_stamp # GROUP BY t1.join_key, # t1.time_stamp;
def test_relboost_same_units():
    """Verify that a RelboostModel produces identical results no matter
    where the units are declared.

    Two otherwise identical pipelines are fitted on the same artificial
    dataset (same ``random_state``): one assigns the unit to the data
    frame columns via ``set_unit``, the other passes the unit mapping to
    the model instead.  Scores and predictions must match exactly.

    The artificial dataset corresponds to the query:

        SELECT COUNT( * )
        FROM POPULATION t1
        LEFT JOIN PERIPHERAL t2
        ON t1.join_key = t2.join_key
        WHERE (
            ( t1.time_stamp - t2.time_stamp <= 0.5 )
        ) AND t2.time_stamp <= t1.time_stamp
        GROUP BY t1.join_key,
                 t1.time_stamp;
    """
    engine.set_project("examples")

    seed = 33231
    units = {"column_01": "column_01"}

    # ------------------------------------------------------------
    # Variant 1: declare the unit directly on the columns.

    pop_df_cols, peri_df_cols = datasets.make_same_units_numerical(
        random_state=seed)

    pop_df_cols.set_unit("column_01", "column_01")
    peri_df_cols.set_unit("column_01", "column_01")

    pop_ph_cols = pop_df_cols.to_placeholder()
    peri_ph_cols = peri_df_cols.to_placeholder()
    pop_ph_cols.join(peri_ph_cols, "join_key", "time_stamp")

    # Model and features are not needed for the comparison below.
    _, _, yhat_cols, scores_cols = _fit_model(
        pop_df_cols,
        peri_df_cols,
        pop_ph_cols,
        peri_ph_cols,
        seed,
        {})

    # ------------------------------------------------------------
    # Variant 2: declare the unit on the model instead.

    pop_df_model, peri_df_model = datasets.make_same_units_numerical(
        random_state=seed)

    pop_ph_model = pop_df_model.to_placeholder()
    peri_ph_model = peri_df_model.to_placeholder()
    pop_ph_model.join(peri_ph_model, "join_key", "time_stamp")

    _, _, yhat_model, scores_model = _fit_model(
        pop_df_model,
        peri_df_model,
        pop_ph_model,
        peri_ph_model,
        seed,
        units)

    # ------------------------------------------------------------
    # Both variants must yield identical scores and predictions.

    assert scores_model == scores_cols
    assert (yhat_model == yhat_cols).all()

    # ------------------------------------------------------------

    engine.delete_project("examples")
    port=3306,
    dbname="mydb",
    user="******",
    password="******",
    time_formats=['%Y/%m/%d']
)

# -----------------------------------------------------------------------------
# Set up folders - you need to insert folders on your computer

# The folder that contains expd151.csv
# NOTE(review): os.getenv("HOME") returns None when HOME is unset, which would
# raise a TypeError here — consider os.path.expanduser("~"). Verify on target OS.
RAW_DATA_FOLDER = os.getenv("HOME") + "/Downloads/diary15/"

# -----------------------------------------------------------------------------

# Open (or create) the getML project for the Consumer Expenditure example.
engine.set_project("CE")

# -----------------------------------------------------------------------------
# Begin timing

begin = time.time()

# #############################################################################
# Load data.

# -----------------------------------------------------------------------------
# Load EXPD

expd_fnames = [
    RAW_DATA_FOLDER + "expd151.csv",
    RAW_DATA_FOLDER + "expd152.csv",
# ``` # # ## Staging the data # # We will start by starting a new project in the getML engine and loading # the prepared data tables into the Python environment. # In[ ]: import os import pandas as pd import getml.engine as engine engine.set_project("gettingStarted") # Location inside this repository the data is kept. source_path = os.path.join(os.getcwd(), "../../../data/consumer_expenditures/") CE_population_training = pd.read_csv(os.path.join(source_path, "CE_population_training.csv")) CE_population_validation = pd.read_csv(os.path.join(source_path, "CE_population_validation.csv")) CE_peripheral = pd.read_csv(os.path.join(source_path, "CE_peripheral.csv")) # In order for the automated feature engineering to get the most out of # the data, we have to provided some additional information about its # content. If a column contains e.g. the type of a product encoded in # integers, operations like comparisons, summation, or the extraction # the maximum would most probably make no sense. It, therefore, needs to # be of recognized as *categorical* instead of *discrete*.