Ejemplo n.º 1
0
# DEALINGS IN THE SOFTWARE.

import numpy as np
import pandas as pd

import getml.models.aggregations as aggregations
import getml.datasets as datasets
import getml.engine as engine
import getml.models.loss_functions as loss_functions
import getml.models as models
import getml.data as data
import getml.predictors as predictors

# ----------------

# Open (or create) the getML project that will hold this example's
# data frames and models.
engine.set_project("examples")

# ----------------
# Generate artificial dataset
# The problem we create looks like this:
#
# SELECT COUNT( * )
# FROM POPULATION t1
# LEFT JOIN PERIPHERAL t2
# ON t1.join_key = t2.join_key
# WHERE (
#    ( t1.time_stamp - t2.time_stamp <= 0.5 )
# ) AND t2.time_stamp <= t1.time_stamp
# GROUP BY t1.join_key,
#          t1.time_stamp;
Ejemplo n.º 2
0
def test_relboost_same_units():
    """Verify that unit assignment is equivalent across mechanisms.

    The same scores and predictions must be obtained regardless of
    whether the units are attached to the columns of the data frames
    or handed to the RelboostModel directly.
    """

    engine.set_project("examples")

    seed = 33231

    units = {"column_01": "column_01"}

    def _make_tables(assign_units_to_columns):
        """Build the artificial population/peripheral pair plus placeholders.

        The generated problem corresponds to:

            SELECT COUNT( * )
            FROM POPULATION t1
            LEFT JOIN PERIPHERAL t2
            ON t1.join_key = t2.join_key
            WHERE (
               ( t1.time_stamp - t2.time_stamp <= 0.5 )
            ) AND t2.time_stamp <= t1.time_stamp
            GROUP BY t1.join_key,
                     t1.time_stamp;
        """
        population, peripheral = datasets.make_same_units_numerical(
            random_state=seed)

        # Units must be set on the data frames *before* the placeholders
        # are derived, matching the original statement order.
        if assign_units_to_columns:
            population.set_unit("column_01", "column_01")
            peripheral.set_unit("column_01", "column_01")

        population_ph = population.to_placeholder()
        peripheral_ph = peripheral.to_placeholder()
        population_ph.join(peripheral_ph, "join_key", "time_stamp")

        return population, peripheral, population_ph, peripheral_ph

    # Variant 1: the units live on the columns; the model receives none.
    pop_tbl, per_tbl, pop_ph, per_ph = _make_tables(True)
    _, _, yhat_columns, scores_columns = _fit_model(
        pop_tbl, per_tbl, pop_ph, per_ph, seed, dict())

    # Variant 2: the columns stay unit-less; the model gets the mapping.
    pop_tbl, per_tbl, pop_ph, per_ph = _make_tables(False)
    _, _, yhat_model, scores_model = _fit_model(
        pop_tbl, per_tbl, pop_ph, per_ph, seed, units)

    # Both routes must yield identical results.
    assert scores_model == scores_columns

    assert (yhat_model == yhat_columns).all()

    engine.delete_project("examples")
    port=3306,
    dbname="mydb",
    user="******",
    password="******",
    time_formats=['%Y/%m/%d']
)

# -----------------------------------------------------------------------------
# Set up folders - you need to insert folders on your computer

# The folder that contains expd151.csv
# NOTE(review): assumes the HOME environment variable is set;
# os.getenv("HOME") returns None otherwise, making the "+" raise TypeError.
RAW_DATA_FOLDER = os.getenv("HOME") + "/Downloads/diary15/"

# -----------------------------------------------------------------------------

# Open (or create) the getML project for the Consumer Expenditure example.
engine.set_project("CE")

# -----------------------------------------------------------------------------
# Begin timing

# Wall-clock start time; presumably read later to report total runtime —
# the consumer is outside this excerpt.
begin = time.time()

# #############################################################################
# Load data.

# -----------------------------------------------------------------------------
# Load EXPD

expd_fnames = [
    RAW_DATA_FOLDER + "expd151.csv",
    RAW_DATA_FOLDER + "expd152.csv",
Ejemplo n.º 4
0
# ```

# 
# ## Staging the data
# 
# We will begin by creating a new project in the getML engine and loading
# the prepared data tables into the Python environment.

# In[ ]:


import os
import pandas as pd
import getml.engine as engine

# Open (or create) the getML project for this getting-started example.
engine.set_project("gettingStarted")

# Location inside this repository where the data is kept,
# resolved relative to the current working directory.
source_path = os.path.join(os.getcwd(), "../../../data/consumer_expenditures/")

# Load the prepared population (training/validation) and peripheral tables.
CE_population_training = pd.read_csv(os.path.join(source_path, "CE_population_training.csv"))
CE_population_validation = pd.read_csv(os.path.join(source_path, "CE_population_validation.csv"))
CE_peripheral = pd.read_csv(os.path.join(source_path, "CE_peripheral.csv"))


# In order for the automated feature engineering to get the most out of
# the data, we have to provide some additional information about its
# content. If a column contains e.g. the type of a product encoded in
# integers, operations like comparisons, summation, or the extraction of
# the maximum would most probably make no sense. It, therefore, needs to
# be recognized as *categorical* instead of *discrete*.