def import_item_info():
    """Rebuild the `item_info` table from the static item columns of
    sales_train_validation.csv.

    Reads the raw CSV, keeps only the per-item identifier columns,
    de-duplicates them (the CSV repeats item attributes once per row),
    then drops and recreates the table before bulk-loading.
    """
    start = time.time()
    df = pd.read_csv(raw_data_folder() / 'sales_train_validation.csv')
    # Keep only the item-attribute columns; the day columns are handled by import_sales.
    df = df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']]
    df.drop_duplicates(inplace=True)
    execute_sql('drop table if exists item_info')
    execute_sql_from_file('item_info')
    copy_from_options = {'columns': ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']}
    dataframe_to_table_bulk(df, 'item_info', copy_from_options=copy_from_options)
    end = time.time()
    # Bug fix: the message previously said 'import_sales' (copy-paste error).
    print(f'import_item_info took {round(end - start)} seconds')
def import_sales():
    """Rebuild the `sales` table from sales_train_validation.csv.

    The wide per-day columns are melted into long format, staged in a
    temporary `sales_raw` table, transformed into `sales` via SQL, and
    the staging table is dropped afterwards.
    """
    started_at = time.time()

    wide = pd.read_csv(raw_data_folder() / 'sales_train_validation.csv')
    # The item-attribute columns live in item_info; only id + day columns are needed here.
    wide.drop(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
              inplace=True, axis='columns')
    long_format = pd.melt(wide, id_vars=['id'])

    # Clear out any previous run before recreating the staging table.
    for statement in ('drop view if exists sales_ext',
                      'drop table if exists sales_raw',
                      'drop table if exists sales'):
        execute_sql(statement)

    execute_sql_from_file('sales_raw')
    dataframe_to_table_bulk(long_format, 'sales_raw')
    execute_sql_from_file('sales')
    execute_sql('drop table sales_raw')

    finished_at = time.time()
    print(f'import_sales took {round(finished_at - started_at)} seconds')
def import_sell_prices():
    """Rebuild the `prices` table from sell_prices.csv via Postgres COPY.

    Drops and recreates the table, then streams the CSV (minus its
    header row) straight into the table with `cursor.copy_from`.
    """
    start = time.time()
    execute_sql('drop table if exists prices')
    execute_sql_from_file('prices')
    conn = get_connection()
    # Fix: the cursor and connection were leaked if copy_from raised;
    # try/finally guarantees both are closed on every path.
    try:
        cur = conn.cursor()
        try:
            with open(raw_data_folder() / 'sell_prices.csv', 'r') as f:
                f.readline()  # skip the header so column names aren't copied as data
                cur.copy_from(file=f, table='prices', sep=',')
        finally:
            cur.close()
    finally:
        conn.close()
    end = time.time()
    print(f'import_sell_prices took {round(end - start)} seconds')
def write_validation_results_to_db(model, model_name: str, params: str, train_size=np.nan, validation_size=VALIDATION_SIZE, numeric_only=False):
    """Score `model` on the validation set and persist the results.

    Writes per-row predictions into the `validation` table (replacing any
    previous rows for `model_name`) and appends one summary row to
    `model_info` (RMSE, timestamp, params, feature list, git commit).

    :param model: fitted estimator exposing `.predict`
    :param model_name: label used to key rows in `validation`/`model_info`
    :param params: stringified hyper-parameters to record
    :param train_size: number of training rows used (NaN when unknown)
    :param validation_size: rows to draw for the validation feature set
    :param numeric_only: forwarded to collect_features
    """
    execute_sql_from_file('validation')
    # Fix: escape single quotes so a model_name containing ' cannot break
    # the statement (or inject SQL). Parameterized queries would be better
    # if execute_sql supported them — NOTE(review): confirm.
    safe_name = model_name.replace("'", "''")
    sql = f"DELETE FROM validation where model_name = '{safe_name}'"
    execute_sql(sql)
    [val_x, val_y, ids] = collect_features(data_set='validation', size=validation_size, numeric_only=numeric_only)
    pred_values = model.predict(val_x)
    predictions = pd.DataFrame(
        data={
            'numeric_id': ids['numeric_id'],
            'predicted': pred_values,
            'date': ids['date'],
            'target': val_y,
            'cum_mse': ids['cum_mse']
        })
    predictions['model_name'] = model_name
    dataframe_to_table_bulk(df=predictions[[
        'model_name', 'date', 'numeric_id', 'predicted', 'cum_mse', 'target'
    ]], table='validation')
    execute_sql_from_file('model_info')
    # One summary row per run; list(val_x) yields the feature column names.
    model_info = pd.DataFrame(
        data={
            'model_name': [model_name],
            'rmse': [get_rmse(model_name)],
            'created_at': [f"{datetime.now():%Y-%m-%d %H:%M}"],
            'params': [str(params)],
            'train_size': [train_size],
            'features': [", ".join(list(val_x))],
            'git_commit': [get_git_commit()]
        })
    dataframe_to_table(model_info, table='model_info')
def import_calendar():
    """Recreate the `calendar` table and bulk-load it from calendar.csv."""
    # Recreate the target table first, then load the parsed CSV into it.
    execute_sql('drop table if exists calendar')
    execute_sql_from_file('calendar')
    calendar_df = pd.read_csv(raw_data_folder() / 'calendar.csv', parse_dates=['date'])
    dataframe_to_table_bulk(calendar_df, 'calendar')
"""Train a CatBoost regressor on the prepared feature set and clear any
previous 'CatBoost' rows from the validation table."""
import optuna
from catboost import CatBoostRegressor
from datetime import datetime

from generics.postgres import dataframe_from_sql, execute_sql, execute_sql_from_file, create_sa_string
from model_utilities import collect_features, write_validation_results_to_db, eval_lgbm_model

# Fix: a stray literal `7` (leftover line number from a paste) sat between
# the import lines and made the module a syntax error; it is removed here.

execute_sql_from_file('validation')
sql = "DELETE FROM validation where model_name = 'CatBoost'"
execute_sql(sql)

train_size = 800000
# NOTE(review): `ids` from the test split is immediately overwritten by the
# train split below — confirm the test ids are intentionally unused here.
[test_x, test_y, ids] = collect_features(data_set='test', size=100000, numeric_only=False)
[x, y, ids] = collect_features(data_set='train', size=train_size, numeric_only=False)

params = dict(
    cat_features=['wday', 'dept_id', 'state_id', 'store_id', 'snap_status'],
    loss_function='RMSE',
    learning_rate=0.002,
    iterations=5000,
    random_strength=2,
    min_data_in_leaf=20)
model = CatBoostRegressor(verbose=True, **params)