Example #1
0
def import_item_info():
    """Load static item metadata into the `item_info` table.

    Reads sales_train_validation.csv, keeps only the identifying /
    categorical columns, and de-duplicates them (the raw file repeats
    the metadata once per sales row).  The target table is dropped and
    recreated from its DDL file before the bulk load.
    """
    start = time.time()

    df = pd.read_csv(raw_data_folder() / 'sales_train_validation.csv')
    df = df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']]
    df.drop_duplicates(inplace=True)

    # Recreate the table from scratch so reruns are idempotent.
    execute_sql('drop table if exists item_info')
    execute_sql_from_file('item_info')

    copy_from_options = {'columns': ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']}
    dataframe_to_table_bulk(df,
                            'item_info',
                            copy_from_options=copy_from_options)

    end = time.time()
    # Bug fix: previously printed 'import_sales' (copy-paste from import_sales).
    print(f'import_item_info took {round(end - start)} seconds')
Example #2
0
def import_sales():
    """Load daily sales into the `sales` table.

    The wide per-day sales file is melted into long (id, variable,
    value) form, bulk-loaded into a `sales_raw` staging table, and a
    DDL file then builds the final `sales` table from it.  The staging
    table is dropped once `sales` has been created.
    """
    t0 = time.time()

    wide = pd.read_csv(raw_data_folder() / 'sales_train_validation.csv')
    wide.drop(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
              inplace=True, axis='columns')
    long_sales = pd.melt(wide, id_vars=['id'])

    # Tear down dependent objects so reruns start clean.
    for statement in ('drop view if exists sales_ext',
                      'drop table if exists sales_raw',
                      'drop table if exists sales'):
        execute_sql(statement)

    execute_sql_from_file('sales_raw')
    dataframe_to_table_bulk(long_sales, 'sales_raw')

    execute_sql_from_file('sales')
    execute_sql('drop table sales_raw')

    print(f'import_sales took {round(time.time() - t0)} seconds')
Example #3
0
def import_sell_prices():
    """Recreate the `prices` table and bulk-load sell_prices.csv into it
    via the DB cursor's COPY support.

    The CSV header line is skipped so the column names are not loaded
    as a data row.
    """
    start = time.time()

    execute_sql('drop table if exists prices')
    execute_sql_from_file('prices')

    conn = get_connection()
    try:
        cur = conn.cursor()
        try:
            with open(raw_data_folder() / 'sell_prices.csv', 'r') as f:
                f.readline()  # don't read the column names as a row in copy_from
                cur.copy_from(file=f, table='prices', sep=',')
            # NOTE(review): no explicit commit here — presumably
            # get_connection() returns an autocommit connection; confirm,
            # otherwise the COPY would be rolled back on close.
        finally:
            # Bug fix: cursor/connection previously leaked if open() or
            # copy_from() raised; close them unconditionally.
            cur.close()
    finally:
        conn.close()

    end = time.time()
    print(f'import_sell_prices took {round(end - start)} seconds')
Example #4
0
def write_validation_results_to_db(model,
                                   model_name: str,
                                   params: str,
                                   train_size=np.nan,
                                   validation_size=VALIDATION_SIZE,
                                   numeric_only=False):
    """Score `model` on the validation set and persist the results.

    Writes one row per prediction to the `validation` table and a single
    summary row (rmse, params, git commit, ...) to `model_info`.  Any
    existing `validation` rows for `model_name` are deleted first, so
    rerunning a model replaces its previous results.

    Args:
        model: fitted estimator exposing `.predict(X)`.
        model_name: identifier used as the key in both tables.
        params: stringified hyper-parameters, stored for bookkeeping.
        train_size: number of training rows (recorded only).
        validation_size: number of validation rows to score.
        numeric_only: forwarded to collect_features.
    """
    execute_sql_from_file('validation')
    # Escape single quotes so a model name cannot break (or inject into)
    # the statement; execute_sql shows no bound-parameter support here.
    safe_name = model_name.replace("'", "''")
    execute_sql(f"DELETE FROM validation where model_name = '{safe_name}'")

    [val_x, val_y, ids] = collect_features(data_set='validation',
                                           size=validation_size,
                                           numeric_only=numeric_only)
    pred_values = model.predict(val_x)
    predictions = pd.DataFrame(
        data={
            'numeric_id': ids['numeric_id'],
            'predicted': pred_values,
            'date': ids['date'],
            'target': val_y,
            'cum_mse': ids['cum_mse']
        })
    predictions['model_name'] = model_name

    dataframe_to_table_bulk(df=predictions[[
        'model_name', 'date', 'numeric_id', 'predicted', 'cum_mse', 'target'
    ]],
                            table='validation')

    # Summary row: rmse is recomputed from the rows just written.
    execute_sql_from_file('model_info')
    model_info = pd.DataFrame(
        data={
            'model_name': [model_name],
            'rmse': [get_rmse(model_name)],
            'created_at': [f"{datetime.now():%Y-%m-%d %H:%M}"],
            'params': [str(params)],
            'train_size': [train_size],
            'features': [", ".join(list(val_x))],
            'git_commit': [get_git_commit()]
        })
    dataframe_to_table(model_info, table='model_info')
Example #5
0
def import_calendar():
    """Recreate the `calendar` table and bulk-load calendar.csv into it,
    parsing the `date` column as datetimes."""
    calendar_df = pd.read_csv(raw_data_folder() / 'calendar.csv',
                              parse_dates=['date'])
    execute_sql('drop table if exists calendar')
    execute_sql_from_file('calendar')
    dataframe_to_table_bulk(calendar_df, 'calendar')
import optuna
from catboost import CatBoostRegressor

from generics.postgres import dataframe_from_sql, execute_sql, execute_sql_from_file, create_sa_string

7
from datetime import datetime
from model_utilities import collect_features, write_validation_results_to_db, eval_lgbm_model

execute_sql_from_file('validation')
sql = "DELETE FROM validation where model_name = 'CatBoost'"
execute_sql(sql)

train_size = 800000

[test_x, test_y, ids] = collect_features(data_set='test',
                                         size=100000,
                                         numeric_only=False)
[x, y, ids] = collect_features(data_set='train',
                               size=train_size,
                               numeric_only=False)

params = dict(
    cat_features=['wday', 'dept_id', 'state_id', 'store_id', 'snap_status'],
    loss_function='RMSE',
    learning_rate=0.002,
    iterations=5000,
    random_strength=2,
    min_data_in_leaf=20)

model = CatBoostRegressor(verbose=True, **params)