    return y_pred


y_pred = predict_float_missing_value(study.user_attrs['best_booster'],
                                     x_train_full, y_train_full, x_test_full)

timer(start_time)

# assemble the imputed column: keep observed values and fill the missing rows
# with the model's predictions
imputed_df = pd.DataFrame()
imputed_df[the_col + '_imp'] = data.loc[:, the_col].copy()
imputed_df.loc[idx_missing, the_col + '_imp'] = y_pred.flatten()

os.chdir('/kaggle/working')
pickle.dump(imputed_df, open('3imputed_df.pkl', 'wb'))
sshColab.upload_to_gcs(project, bucket_name,
                       'tps-apr-2021-label/3imputed_df.pkl',
                       '/kaggle/working/3imputed_df.pkl')
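
# The pickle-then-upload pattern above recurs throughout this notebook; a
# small helper built only from the calls already in use (a sketch, not part
# of the original code) could cut the repetition:
def checkpoint_to_gcs(obj, filename, prefix='tps-apr-2021-label',
                      workdir='/kaggle/working'):
    # persist locally, then mirror the pickle to GCS so it survives the session
    local_path = os.path.join(workdir, filename)
    with open(local_path, 'wb') as f:
        pickle.dump(obj, f)
    sshColab.upload_to_gcs(project, bucket_name, f'{prefix}/{filename}',
                           local_path)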

# ANCHOR Fare
sshColab.download_to_colab(
    project,
    bucket_name,
    destination_directory='/kaggle/working',
    remote_blob_path='tps-apr-2021-label/1clean_data.pkl',
    local_file_name='1clean_data.pkl')
data = pickle.load(open('/kaggle/working/1clean_data.pkl', 'rb'))

sshColab.download_to_colab(
    project,
    bucket_name,
    destination_directory='/kaggle/working',
    remote_blob_path='tps-apr-2021-label/...',  # path truncated in the source snippet
    local_file_name='...')

# %%
# study.set_user_attr(key="best_booster", value=trial.user_attrs["best_booster"])
# SOURCE retrieve the best number of estimators: https://github.com/optuna/optuna/issues/1169
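
# The save_best callback referenced below is not defined in this excerpt; a
# minimal sketch, assuming the objective stores each trial's fitted booster
# via trial.set_user_attr("best_booster", model) as in the issue linked above:
def save_best(study, trial):
    # when the finished trial is the new best one, copy its booster onto the
    # study so it can be read back later as study.user_attrs["best_booster"]
    if study.best_trial.number == trial.number:
        study.set_user_attr(key="best_booster",
                            value=trial.user_attrs["best_booster"])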

study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=2021, multivariate=True),
    # pruner=optuna.pruners.MedianPruner(n_warmup_steps=3),
)
study.optimize(
    lambda trial: objective(trial, x_pca_transformed[:train_rows, :],
                            train_label.iloc[:train_rows].values, params),
    n_trials=N_TRIALS,
    timeout=TIMEOUT,
    n_jobs=1,
    # callbacks=[save_best],
)

hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")
best_model = study.user_attrs["best_booster"]  # set by the save_best callback when enabled

pickle_to_save = '10linearsvc6hrs.pkl'
os.chdir('/kaggle/working')
pickle.dump(best_model, open(pickle_to_save, 'wb'))
sshColab.upload_to_gcs(project, bucket_name,
                       f'tps-apr-2021-label/{pickle_to_save}',
                       f'/kaggle/working/{pickle_to_save}')
# %%
# data.loc[data['Embarked']=='<NA>', 'EmbarkedCount']
# 527 is presumably the EmbarkedCount shared by the rows whose Embarked was
# missing, so replace it with the -1 missing sentinel
data['EmbarkedCount'] = data['EmbarkedCount'].replace(527, -1)

del data['Embarked']

# Age and Fare
# SOURCE dealing with outliers even for tree-based algorithm easier for splitting - https://www.kdnuggets.com/2018/08/make-machine-learning-models-robust-outliers.html

# TODO data.loc[data['Age'].notnull(), 'Age_rg'] = rank_gauss(data.loc[data['Age'].notnull(), 'Age'].values)
# TODO data.loc[data['Fare'].notnull(), 'Fare_rg'] = rank_gauss(data.loc[data['Fare'].notnull(), 'Fare'].values)
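
# rank_gauss is not defined in this excerpt; a minimal sketch of the usual
# erfinv-based rank-Gauss transform (an assumption, not necessarily the
# author's version):
import numpy as np
from scipy.special import erfinv

def rank_gauss(x):
    # rank the values, rescale the ranks into (-1, 1), then map them through
    # the inverse error function so the result is approximately Gaussian
    ranks = np.argsort(np.argsort(x)).astype(np.float64)
    scaled = ranks / ranks.max() * 2 - 1
    scaled = np.clip(scaled, -1 + 1e-6, 1 - 1e-6)  # keep erfinv finite
    return erfinv(scaled)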

#DONE!
os.chdir('/kaggle/working')
pickle.dump(data, open('1parsed_data.pkl', 'wb'))
sshColab.upload_to_gcs(project, bucket_name,
                       'tps-apr-2021-label/1parsed_data.pkl',
                       '/kaggle/working/1parsed_data.pkl')

#%%
# ------------------------------------ Age ----------------------------------- #

sshColab.download_to_colab(
    project,
    bucket_name,
    destination_directory='/kaggle/working',
    remote_blob_path='tps-apr-2021-label/1parsed_data.pkl',
    local_file_name='1parsed_data.pkl')
parsed_data = pickle.load(open('/kaggle/working/1parsed_data.pkl', 'rb'))
data = parsed_data.copy()
data = data.fillna(-1)  # use -1 as the missing-value sentinel
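
# Hypothetical illustration (not in the original): with -1 as the sentinel,
# the Age split for imputation presumably mirrors the earlier pattern:
# idx_missing = data.index[data['Age'] == -1]
# x_train_full = data.loc[data['Age'] != -1].drop(columns=['Age'])
# y_train_full = data.loc[data['Age'] != -1, 'Age'].values
# x_test_full = data.loc[idx_missing].drop(columns=['Age'])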