    return y_pred

# Impute the missing entries of the current column with the tuned booster's
# predictions, then persist the imputed column to GCS.
y_pred = predict_float_missing_value(study.user_attrs['best_booster'],
                                     x_train_full, y_train_full, x_test_full)
timer(start_time)

imputed_df = pd.DataFrame()
imputed_df[the_col + '_imp'] = data.loc[:, the_col].copy()
imputed_df.loc[idx_missing, the_col + '_imp'] = y_pred.flatten()

os.chdir('/kaggle/working')
pickle.dump(imputed_df, open('3imputed_df.pkl', 'wb'))
sshColab.upload_to_gcs(project, bucket_name,
                       'tps-apr-2021-label/3imputed_df.pkl',
                       '/kaggle/working/3imputed_df.pkl')

# ANCHOR Fare
sshColab.download_to_colab(project, bucket_name,
                           destination_directory='/kaggle/working',
                           remote_blob_path='tps-apr-2021-label/1clean_data.pkl',
                           local_file_name='1clean_data.pkl')
data = pickle.load(open('/kaggle/working/1clean_data.pkl', 'rb'))
sshColab.download_to_colab(project, bucket_name,
                           destination_directory='/kaggle/working',
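# A minimal sketch, assuming the callback pattern from the Optuna issue cited
# below: the objective is expected to store its fitted model on the trial via
# trial.set_user_attr(key="best_booster", value=model), and this callback
# promotes that model to study.user_attrs whenever the finished trial is the
# new best, so it can be retrieved after study.optimize() returns.
def save_best(study, trial):
    if study.best_trial.number == trial.number:
        study.set_user_attr(key="best_booster", value=trial.user_attrs["best_booster"])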
# SOURCE retrieve the best number of estimators - https://github.com/optuna/optuna/issues/1169
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=2021, multivariate=True),
    # pruner=optuna.pruners.MedianPruner(n_warmup_steps=3)
)
study.optimize(
    lambda trial: objective(trial,
                            x_pca_transformed[:train_rows, :],
                            train_label.iloc[:train_rows].values,
                            params),
    n_trials=N_TRIALS,
    timeout=TIMEOUT,
    n_jobs=1,
    callbacks=[save_best]  # needed so study.user_attrs['best_booster'] is populated
)

hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")

best_model = study.user_attrs["best_booster"]  # set by the save_best callback
pickle_to_save = '10linearsvc6hrs.pkl'
os.chdir('/kaggle/working')
pickle.dump(best_model, open(pickle_to_save, 'wb'))
sshColab.upload_to_gcs(project, bucket_name,
                       f'tps-apr-2021-label/{pickle_to_save}',
                       f'/kaggle/working/{pickle_to_save}')

# %%
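# Hypothetical reconstruction (not from this excerpt) of how EmbarkedCount was
# presumably built in an earlier cell: frequency-encode Embarked so each row
# carries the count of its embarkation port. Under that assumption the rows
# with missing Embarked all share a count of 527, which is why that value is
# remapped to -1 below.
# data['EmbarkedCount'] = data['Embarked'].map(data['Embarked'].value_counts(dropna=False))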
# data.loc[data['Embarked'] == '<NA>', 'EmbarkedCount']
data['EmbarkedCount'] = data['EmbarkedCount'].replace(527, -1)
del data['Embarked']

# Age and Fare
# SOURCE dealing with outliers (makes splitting easier even for tree-based algorithms) -
# https://www.kdnuggets.com/2018/08/make-machine-learning-models-robust-outliers.html
# TODO apply a rank-Gauss transform (see the rank_gauss sketch at the end of this section):
# data.loc[data['Age'].notnull(), 'Age_rg'] = rank_gauss(data.loc[data['Age'].notnull(), 'Age'].values)
# data.loc[data['Fare'].notnull(), 'Fare_rg'] = rank_gauss(data.loc[data['Fare'].notnull(), 'Fare'].values)

# DONE!
os.chdir('/kaggle/working')
pickle.dump(data, open('1parsed_data.pkl', 'wb'))
sshColab.upload_to_gcs(project, bucket_name,
                       'tps-apr-2021-label/1parsed_data.pkl',
                       '/kaggle/working/1parsed_data.pkl')

# %%
# ------------------------------------ Age -----------------------------------
sshColab.download_to_colab(project, bucket_name,
                           destination_directory='/kaggle/working',
                           remote_blob_path='tps-apr-2021-label/1parsed_data.pkl',
                           local_file_name='1parsed_data.pkl')
parsed_data = pickle.load(open('/kaggle/working/1parsed_data.pkl', 'rb'))
data = parsed_data.copy()
data = data.fillna(-1)
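# A minimal sketch of the rank-Gauss transform referenced in the TODOs above.
# rank_gauss is not defined in this excerpt; the erfinv-based version below is
# the common implementation of the technique, an assumption rather than the
# original helper.
from scipy.special import erfinv
import numpy as np

def rank_gauss(x):
    # Rank the values, rescale the ranks to the open interval (-1, 1), then
    # push them through the inverse error function so the output is roughly
    # standard normal, which tames heavy tails and outliers before modeling.
    ranks = np.argsort(np.argsort(x)).astype(np.float64)
    scaled = ranks / (len(x) - 1) * 2 - 1            # map ranks to [-1, 1]
    scaled = np.clip(scaled, -1 + 1e-9, 1 - 1e-9)    # keep erfinv finite
    return erfinv(scaled)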