def GCStoLocal(filename):
    """Download a pickled object from the GCS bucket and return it unpickled.

    The blob ``tps-apr-2021-label/<filename>`` is fetched with
    ``sshColab.download_to_colab`` into ``/kaggle/working/<filename>`` and
    then loaded with ``pickle``.

    Args:
        filename: Name of the blob inside ``tps-apr-2021-label/``; the same
            name is used for the local copy.

    Returns:
        The object stored in the downloaded pickle file.
    """
    sshColab.download_to_colab(
        project,
        bucket_name,
        destination_directory='/kaggle/working',
        remote_blob_path=f'tps-apr-2021-label/{filename}',
        local_file_name=f'{filename}')
    # Use a context manager so the handle is closed deterministically instead
    # of whenever the anonymous file object happens to be collected.
    # NOTE(review): pickle.load executes code embedded in the file; only use
    # this with a bucket whose contents are trusted.
    with open(f'/kaggle/working/{filename}', 'rb') as fh:
        return pickle.load(fh)
# TODO data.loc[data['Age'].notnull(), 'Age_rg'] = rank_gauss(data.loc[data['Age'].notnull(), 'Age'].values)
# TODO data.loc[data['Fare'].notnull(), 'Fare_rg'] = rank_gauss(data.loc[data['Fare'].notnull(), 'Fare'].values)
# NOTE(review): the two TODO statements above are left disabled, as they read
# like planned steps; confirm whether they were meant to be live code.

#DONE!
# Persist the parsed frame locally and push it to the bucket.
os.chdir('/kaggle/working')
# Close (and therefore flush) the pickle before uploading it; the previous
# anonymous open() relied on garbage collection to flush the file.
with open('1parsed_data.pkl', 'wb') as fh:
    pickle.dump(data, fh)
sshColab.upload_to_gcs(project, bucket_name,
                       'tps-apr-2021-label/1parsed_data.pkl',
                       '/kaggle/working/1parsed_data.pkl')

#%%
# ------------------------------------ Age ----------------------------------- #
sshColab.download_to_colab(
    project,
    bucket_name,
    destination_directory='/kaggle/working',
    remote_blob_path='tps-apr-2021-label/1parsed_data.pkl',
    local_file_name='1parsed_data.pkl')
with open('/kaggle/working/1parsed_data.pkl', 'rb') as fh:
    parsed_data = pickle.load(fh)

data = parsed_data.copy()
# Every NaN in the frame becomes -1; only `the_col` is restored to NaN below,
# so the remaining columns keep -1 as their fill value.
data = data.fillna(-1)

the_col = 'Age'
# Rows are split on the -1 sentinel, so a genuine value of -1 in `the_col`
# would also be treated as missing.
idx_present = data.loc[data[the_col] != -1, :].index.tolist()
idx_missing = data.loc[data[the_col] == -1, :].index.tolist()
data[the_col] = data[the_col].replace(-1, np.nan)

# Rows with a known target form the training set; rows without it are the
# ones to be predicted.
x_train_full = data.loc[idx_present, :].drop(the_col, axis=1)
y_train_full = data.loc[idx_present, :][the_col]
x_test_full = data.loc[idx_missing, :].drop(the_col, axis=1)

# NOTE(review): the statement below is cut off at the end of this fragment and
# is kept verbatim as a comment rather than guessed at.
# cat_features = [
# Load the raw competition CSVs.
train_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-apr-2021/train.csv')
test_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-apr-2021/test.csv')

# Keep the label and id columns aside, then remove them from the frames.
train_label = train_df['Survived']
train_id = train_df['PassengerId']
test_id = test_df['PassengerId']
del train_df['Survived'], train_df['PassengerId']
del test_df['PassengerId']

train_rows = train_df.shape[0]

# Fetch the cleaned data set from the bucket and unpickle it.
sshColab.download_to_colab(
    project,
    bucket_name,
    destination_directory='/kaggle/working',
    remote_blob_path='tps-apr-2021-label/1clean_data.pkl',
    local_file_name='1clean_data.pkl')
# Context managers close the handles deterministically; the previous
# pickle.load(open(...)) calls left them to the garbage collector.
with open('/kaggle/working/1clean_data.pkl', 'rb') as fh:
    data = pickle.load(fh)

# Fetch the missing-code map from the bucket and unpickle it.
sshColab.download_to_colab(
    project,
    bucket_name,
    destination_directory='/kaggle/working',
    remote_blob_path='tps-apr-2021-label/2missing_code_map.pkl',
    local_file_name='2missing_code_map.pkl')
with open('/kaggle/working/2missing_code_map.pkl', 'rb') as fh:
    missing_code_map = pickle.load(fh)

# NOTE(review): the call below is cut off at the end of this fragment and is
# kept verbatim as a comment rather than guessed at.
# sshColab.download_to_colab( project,
train_rows = train_df.shape[0]


def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Args:
        start_time: A ``datetime`` previously returned by this function, or a
            falsy value (the default) to start timing.

    Returns:
        The current ``datetime`` when ``start_time`` is falsy; otherwise
        ``None`` after printing the elapsed hours, minutes and seconds.
    """
    if not start_time:
        return datetime.now()
    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, temp_sec = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(temp_sec, 60)
    print('Time taken : %i hours %i minutes and %s seconds.' %
          (thour, tmin, round(tsec, 2)))


# Fetch the trimmed data frame from the bucket and unpickle it.
file_to_load = '11dataframe_xgboost_based_trim.pkl'
sshColab.download_to_colab(project,
                           bucket_name,
                           destination_directory='/kaggle/working',
                           remote_blob_path=f'tps-apr-2021-label/{file_to_load}',
                           local_file_name=file_to_load)
# Context managers close the handles deterministically; the previous
# pickle.load(open(...)) calls left them to the garbage collector.
with open(f'/kaggle/working/{file_to_load}', 'rb') as fh:
    df = pickle.load(fh)

# Fetch the (categorical, numerical) column-name pair the same way.
file_to_load = '11cols_tuple.pkl'
sshColab.download_to_colab(project,
                           bucket_name,
                           destination_directory='/kaggle/working',
                           remote_blob_path=f'tps-apr-2021-label/{file_to_load}',
                           local_file_name=file_to_load)
with open(f'/kaggle/working/{file_to_load}', 'rb') as fh:
    cat_cols, num_cols = pickle.load(fh)


# NOTE(review): this function may continue beyond the visible fragment; the
# visible part is reproduced unchanged.
def feature_distribution():
    """Open a 16x32 figure and add one titled subplot per column of ``df``.

    Subplots are placed on a 10x2 grid in column order.
    """
    plt.figure(figsize=(16, 32))
    for i, col in enumerate(df.columns.tolist()):
        ax = plt.subplot(10, 2, i + 1)
        ax.set_title(col)