def train_model():
    """Interactively train, evaluate and save a multi-output CatBoost model.

    Steps, in order:

    1. Ask for the run mode (``menu.mode_selection``) and for the
       multi-output wrapper: ``m`` selects ``MultiOutputRegressor``; any
       other answer selects ``RegressorChain``.
    2. Load the local training set with ``data.dataset`` and replace missing
       values in the weather and weather-cluster columns with ``'Unknown'``.
    3. Fit the wrapped ``CatBoost`` estimator on the whole dataset.
    4. Evaluate with ``inout.evaluate`` on the last 20% of the rows
       (unshuffled split) and print both returned scores.
    5. Ask for a name suffix and save the model under ``saved_models``.

    Takes no arguments and returns nothing; it reads from stdin, prints to
    stdout and writes one model file.
    """
    print()
    mode = menu.mode_selection()

    answer = input(
        'Choose the chain mode (m: multioutput / c: regressorchain): '
    ).strip().lower()
    # A single flag drives both the wrapper class and the label used in the
    # saved filename, so the two can never disagree (previously any answer
    # other than 'm'/'c' trained a RegressorChain but was saved as 'multiout').
    use_multioutput = answer == 'm'
    wrapper_cls = MultiOutputRegressor if use_multioutput else RegressorChain

    # X, Y = data.dataset_with_features('train', onehot=False, drop_index_columns=True)
    X, Y = data.dataset('local', 'train', onehot=False)
    print(X.shape, Y.shape)

    # mask_not_all_null = np.any(X[['SPEED_AVG_-4','SPEED_AVG_-3','SPEED_AVG_-2','SPEED_AVG_-1']].notnull(),axis=1)
    # X = X[mask_not_all_null]
    # Y = Y[mask_not_all_null]
    # print('\nAfter cleaning nan')
    # print(X.shape, Y.shape)

    # Missing values in the categorical weather columns become an explicit
    # 'Unknown' category.
    lagged_weather_cols = ['WEATHER_-4', 'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1']
    X[lagged_weather_cols] = X[lagged_weather_cols].fillna('Unknown')

    weather_clusters_cols = [
        'WEATHER_-4_CL', 'WEATHER_-3_CL', 'WEATHER_-2_CL', 'WEATHER_-1_CL'
    ]
    X[weather_clusters_cols] = X[weather_clusters_cols].fillna('Unknown')

    # Every column whose name starts with 'WEATHER_' is categorical; this
    # prefix match also covers the '*_CL' cluster columns filled above.
    all_weather_cols = [col for col in X.columns if col.startswith('WEATHER_')]
    categorical_cols = [
        'EMERGENCY_LANE', 'ROAD_TYPE', 'EVENT_DETAIL', 'EVENT_TYPE'
    ] + all_weather_cols
    categorical_cols.extend(['WEEK_DAY', 'IS_WEEKEND'])

    catboost = CatBoost({
        'X': X,
        'mode': mode,
        'loss_function': 'MAE',
        'eval_metric': 'MAE',
        'n_estimators': 5000,
        'depth': 6,
        'learning_rate': 0.1,
        'early_stopping_rounds': 100,
        'cat_features': categorical_cols
    })

    model = wrapper_cls(catboost)
    model.fit(X, Y)

    # NOTE(review): the model above is fitted on ALL rows, and the split below
    # only carves the evaluation slice out of that same data. The reported MAE
    # is therefore measured on rows seen during training - confirm this is
    # intended before comparing it with validation scores from other runs.
    _, X_test, _, y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)
    mae, mae_4 = inout.evaluate(model, X_test, y_test, intermediate=True)
    print()
    print(mae)
    print(mae_4)

    # save the model
    mae = round(mae, 5)
    suffix = input('Insert model name suffix: ')
    model_folder = 'saved_models'
    folder.create_if_does_not_exist(model_folder)

    chain_label = 'multiout' if use_multioutput else 'chain'
    filename = f'catboost_{chain_label}_{mae}_{suffix}.jl'
    inout.save(model, os.path.join(model_folder, filename))
# NOTE(review): this chunk starts in the middle of a definition whose header
# is not visible here. The statements below are the original tokens, only
# re-flowed onto separate lines; the 8-space indent assumes a method body -
# TODO confirm against the full file.
        # NOTE(review): the result of this call is not used in the visible
        # text. pandas' DataFrame.append returns a new frame and leaves `f`
        # unchanged - confirm whether an assignment precedes this statement
        # in the truncated part; if not, the all-NaN row is never added.
        f.append(pd.DataFrame([[np.nan, np.nan]],
                              columns=['WEATHER', 'WEATHER_CL']),
                 ignore_index=True)
        # Positions of the columns of `df` whose name contains "WEATHER_-".
        matching = [
            i for i in range(len(df.columns.values))
            if "WEATHER_-" in df.columns.values[i]
        ]
        for i in matching:
            # Rename the lookup frame so its key column carries the name of
            # the current df column and its cluster column becomes
            # '<that name>_CL'.
            f_ = f.rename(
                columns={
                    'WEATHER': df.columns.values[i],
                    'WEATHER_CL': '{}_CL'.format(df.columns.values[i])
                })
            # Left merge on that column: every row of df is kept and gains
            # the matching '<name>_CL' value.
            df = pd.merge(df,
                          f_,
                          left_on=[df.columns.values[i]],
                          right_on=df.columns.values[i],
                          how='left')
        return df


# Script entry point: ask for the mode, build the feature object, save the
# feature, then print what read_feature() returns.
if __name__ == '__main__':
    from src.utils.menu import mode_selection
    mode = mode_selection()
    c = Weather_clusters(mode)
    print('Creating {}'.format(c.name))
    c.save_feature()
    print(c.read_feature())
# NOTE(review): this chunk starts in the middle of a definition (it uses
# `joined_df`, `mode` and `t`, none of which are defined in the visible
# text). The statements below are the original tokens, only re-flowed onto
# separate lines; the 4-space indent assumes a top-level function body -
# TODO confirm against the full file.
    })
    # The triple-quoted string below is a bare expression statement with no
    # effect; it holds disabled code for a random validation split.
    """ if mode == 'train': # take random validation rows # random_indices = random.shuffle(joined_df.index) # validation_indices = random_indices[0: int(len(random_indices) * validation_split)] # train_df = joined_df.drop(validation_indices) # valid_df = joined_df.loc[validation_indices] """
    # save the base dataset
    filepath = data.get_path_preprocessed(mode, t, 'base_dataset.csv.gz')
    print('Saving base dataframe to {}'.format(filepath))
    # Written as gzip-compressed CSV without the index column.
    joined_df.to_csv(filepath, index=False, compression='gzip')
    # Drop the local reference to the frame once it has been written.
    del joined_df
    print('Done')


def print_memory_usage():
    """Print the current process's resident set size to stdout.

    The RSS reported by ``psutil`` (in bytes) is divided by 2**20 and
    truncated to an integer; the printed message labels the value "MB".
    """
    process = psutil.Process(os.getpid())
    print(
        f'Current memory usage: {int(process.memory_info().rss / float(2 ** 20))}MB'
    )


# Script entry point: ask for the mode, then build the base dataset with
# 4 steps before and 3 steps after each event.
if __name__ == '__main__':
    mode = menu.mode_selection()
    create_base_dataset(mode, steps_behind_event=4, steps_after_event=3)