def create_features_full(df):
    """Add the full grid of molecule-level aggregate features to ``df``.

    Columns are written into ``df`` in place; the frame is then handed to
    ``artgor_utils.reduce_mem_usage`` and that call's result is returned.

    Columns read from ``df``: ``molecule_name``, ``id``, ``dist``,
    ``dist_x``, ``dist_y``, ``dist_z``, ``x_1``, ``y_1``, ``z_1``,
    ``atom_index_0``, ``atom_index_1``, ``atom_1``, ``type``, ``type_0``.

    Columns created, in this order:

    * ``molecule_couples`` and ``molecule_dist_{mean,min,max,std}`` --
      statistics over all rows sharing a ``molecule_name``.
    * ``atom_{0,1}_couples_count`` -- row counts per
      (``molecule_name``, ``atom_index_{0,1}``) group.
    * ``molecule_{col}_count`` -- per-molecule ``count`` of every
      categorical column.
    * ``molecule_{cat}_{num}_{agg}`` plus its ``_diff`` and ``_div``
      companions for every categorical / numeric / aggregate combination.

    The ``_div`` columns divide by the raw numeric column; nothing here
    guards against zeros in that denominator.
    """

    def group_stat(keys, column, how):
        # Group statistic broadcast back onto every row of ``df``.
        return df.groupby(keys)[column].transform(how)

    # Whole-molecule statistics.
    df['molecule_couples'] = group_stat('molecule_name', 'id', 'count')
    for how in ('mean', 'min', 'max', 'std'):
        df[f'molecule_dist_{how}'] = group_stat('molecule_name', 'dist', how)

    # Number of rows each atom takes part in within its molecule.
    for position in (0, 1):
        df[f'atom_{position}_couples_count'] = group_stat(
            ['molecule_name', f'atom_index_{position}'], 'id', 'count')

    numeric_columns = ['x_1', 'y_1', 'z_1', 'dist', 'dist_x', 'dist_y',
                       'dist_z']
    categorical_columns = ['atom_index_0', 'atom_index_1', 'type', 'atom_1',
                           'type_0']
    statistics = ['mean', 'max', 'std', 'min']

    for cat_name in categorical_columns:
        df[f'molecule_{cat_name}_count'] = group_stat(
            'molecule_name', cat_name, 'count')

    for cat_name in categorical_columns:
        keys = ['molecule_name', cat_name]
        for num_name in numeric_columns:
            for how in statistics:
                base = f'molecule_{cat_name}_{num_name}_{how}'
                df[base] = group_stat(keys, num_name, how)
                # Offset and ratio of the group statistic against the
                # row's own value.
                df[f'{base}_diff'] = df[base] - df[num_name]
                df[f'{base}_div'] = df[base] / df[num_name]

    return artgor_utils.reduce_mem_usage(df)
def oof_features(df):
    """Add grouped statistics of the ``fc``/``sd``/``pso``/``dso`` columns.

    For each grouping column (``type``, ``atom_index_0``, ``atom_index_1``),
    each of the four value columns and each statistic (mean, max, std, min),
    two columns are written into ``df`` in place:

    * ``molecule_{group}_{value}_{stat}`` -- the statistic over rows that
      share ``molecule_name`` and the grouping column;
    * ``molecule_{group}_{value}_{stat}_diff`` -- that statistic minus the
      row's own value.

    Returns whatever ``artgor_utils.reduce_mem_usage`` returns for the
    augmented frame.
    """
    value_columns = ('fc', 'sd', 'pso', 'dso')
    grouping_columns = ('type', 'atom_index_0', 'atom_index_1')
    statistics = ('mean', 'max', 'std', 'min')

    for group_name in grouping_columns:
        for value_name in value_columns:
            for how in statistics:
                base = f'molecule_{group_name}_{value_name}_{how}'
                df[base] = df.groupby(
                    ['molecule_name', group_name])[value_name].transform(how)
                df[f'{base}_diff'] = df[base] - df[value_name]

    return artgor_utils.reduce_mem_usage(df)
# Convert the listed columns to the pandas "category" dtype in both frames.
for f in ['type', 'atom_0', 'atom_1']:
    train[f] = train[f].astype("category")
    test[f] = test[f].astype("category")

print('reading acsf...')
# Read only the first `use_acsf_cols` descriptor columns plus
# `molecule_name` and `atom_index`.
# NOTE(review): the last two are presumably the join keys used by
# map_acsf_info -- confirm against its definition.
acsf_cols = train_utils.acsf_cols[:use_acsf_cols] + [
    'molecule_name', 'atom_index', ]
acsf_descr = pd.read_csv(f"{file_folder}/structure_with_acsf.csv",
                         nrows=nrows, index_col=False, usecols=acsf_cols)
acsf_descr = artgor_utils.reduce_mem_usage(acsf_descr)

print('mapping acsf...')
# map_acsf_info is called once with 0 and once with 1 for each frame.
# NOTE(review): the index presumably selects atom_index_0 / atom_index_1 --
# verify in map_acsf_info.
for i in range(2):
    train = map_acsf_info(train, acsf_descr, i)
    test = map_acsf_info(test, acsf_descr, i)

# The columns to discard are determined from `train` only and then removed
# from both frames so they keep the same set of columns dropped.
useless_cols = train_utils.find_useless_cols(train)
train = train.drop(useless_cols, axis=1)
test = test.drop(useless_cols, axis=1)
print(f'dropped {len(useless_cols)} cols from train...')

# Release the descriptor table before re-compacting the frames.
del acsf_descr, useless_cols
train = artgor_utils.reduce_mem_usage(train)
test = artgor_utils.reduce_mem_usage(test)
gc.collect()
def create_features(df):
    """Add a hand-picked subset of molecule-level aggregate features.

    Columns are written into ``df`` in place; the frame is then handed to
    ``artgor_utils.reduce_mem_usage`` and that call's result is returned.

    Columns read from ``df``: ``molecule_name``, ``id``, ``dist``, ``x_1``,
    ``y_1``, ``z_1``, ``atom_index_0``, ``atom_index_1``, ``atom_1``,
    ``type``, ``type_0``.

    Every feature follows the naming scheme
    ``molecule_{group}_{value}_{stat}``; where selected, a ``_diff`` column
    (statistic minus the row's own value) and a ``_div`` column (statistic
    divided by the row's own value) are added right after it. Nothing here
    guards against zeros in the ``_div`` denominator.
    """

    def group_stat(keys, column, how):
        # Group statistic broadcast back onto every row of ``df``.
        return df.groupby(keys)[column].transform(how)

    # Whole-molecule statistics.
    df['molecule_couples'] = group_stat('molecule_name', 'id', 'count')
    for how in ('mean', 'min', 'max'):
        df[f'molecule_dist_{how}'] = group_stat('molecule_name', 'dist', how)

    # Number of rows each atom takes part in within its molecule.
    for position in (0, 1):
        df[f'atom_{position}_couples_count'] = group_stat(
            ['molecule_name', f'atom_index_{position}'], 'id', 'count')

    plain = ()
    diff_only = ('diff',)
    diff_and_div = ('diff', 'div')

    # (grouping column, value column, statistic, derived columns).
    # The order of this table is the order in which columns are created.
    selected = (
        ('atom_index_0', 'x_1', 'std', plain),
        ('atom_index_0', 'y_1', 'mean', diff_and_div),
        ('atom_index_0', 'y_1', 'max', diff_only),
        ('atom_index_0', 'y_1', 'std', plain),
        ('atom_index_0', 'z_1', 'std', plain),
        ('atom_index_0', 'dist', 'mean', diff_and_div),
        ('atom_index_0', 'dist', 'max', diff_and_div),
        ('atom_index_0', 'dist', 'min', diff_and_div),
        ('atom_index_0', 'dist', 'std', diff_and_div),
        ('atom_index_1', 'dist', 'mean', diff_and_div),
        ('atom_index_1', 'dist', 'max', diff_and_div),
        ('atom_index_1', 'dist', 'min', diff_and_div),
        ('atom_index_1', 'dist', 'std', diff_and_div),
        ('atom_1', 'dist', 'mean', plain),
        ('atom_1', 'dist', 'min', diff_and_div),
        ('atom_1', 'dist', 'std', diff_only),
        ('type_0', 'dist', 'std', diff_only),
        ('type', 'dist', 'mean', diff_and_div),
        ('type', 'dist', 'max', plain),
        ('type', 'dist', 'min', plain),
        ('type', 'dist', 'std', diff_only),
    )

    for group_name, value_name, how, derived in selected:
        base = f'molecule_{group_name}_{value_name}_{how}'
        df[base] = group_stat(['molecule_name', group_name], value_name, how)
        if 'diff' in derived:
            df[f'{base}_diff'] = df[base] - df[value_name]
        if 'div' in derived:
            df[f'{base}_div'] = df[base] / df[value_name]

    return artgor_utils.reduce_mem_usage(df)
# Coordinate arrays for the two atoms of every test row.
# NOTE(review): train_p_0 / train_p_1 are used below but defined outside
# this section -- presumably built the same way from `train`; confirm.
test_p_0 = test[['x_0', 'y_0', 'z_0']].values
test_p_1 = test[['x_1', 'y_1', 'z_1']].values

# Row-wise norm of the coordinate difference between the two atoms.
train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)

# Per-axis components are the *squared* coordinate differences.
train['dist_x'] = (train['x_0'] - train['x_1'])**2
test['dist_x'] = (test['x_0'] - test['x_1'])**2
train['dist_y'] = (train['y_0'] - train['y_1'])**2
test['dist_y'] = (test['y_0'] - test['y_1'])**2
train['dist_z'] = (train['z_0'] - train['z_1'])**2
test['dist_z'] = (test['z_0'] - test['z_1'])**2

# `type_0` is the first element (index 0) of each `type` value.
train['type_0'] = train['type'].apply(lambda x: x[0])
test['type_0'] = test['type'].apply(lambda x: x[0])

train = artgor_utils.reduce_mem_usage(train)
test = artgor_utils.reduce_mem_usage(test)

train = create_features(train)
test = create_features(test)

print("label encoding...")
# Only columns listed in `good_columns` (defined outside this section) are
# encoded. The encoder is fitted on train and test values together so both
# frames share one mapping.
for f in ['atom_index_0', 'atom_index_1', 'atom_1', 'type_0', 'type']:
    if f in good_columns:
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

print("creating folds...")
n_folds = 10
'num_leaves': 512, 'objective': 'regression', 'learning_rate': 0.01, "boosting_type": "gbdt", "subsample_freq": 1, "subsample": 0.9, "bagging_seed": 11, "metric": 'mae', 'reg_alpha': 0.1302650970728192, 'reg_lambda': 0.3603427518866501, 'colsample_bytree': 0.9, 'device': 'gpu', 'gpu_device_id': 0 } X = artgor_utils.reduce_mem_usage(X) X_test = artgor_utils.reduce_mem_usage(X_test) gc.collect() print("training models...") result_dict_lgb = artgor_utils.train_model_regression(X=X, X_test=X_test, y=y, params=params, folds=folds, model_type='lgb', eval_metric='group_mae', plot_feature_importance=True, verbose=100, early_stopping_rounds=1000, n_estimators=n_estimators, res_filename=result_filename
f'../data/{type}_test_distance_based_feats.csv') atoms_categorical_cols = [f'atom_{i}' for i in range(2, 10)] for df in [type_dist_test, type_dist_train]: df[atoms_categorical_cols] = df[atoms_categorical_cols].astype( 'category') if type_num == 0: best_features = best_features + list(type_dist_train.columns) print('concating distance features...') type_train = train_utils.concat_stupidly(type_train, type_dist_train) type_test = train_utils.concat_stupidly(type_test, type_dist_test) del type_dist_train, type_dist_test gc.collect() type_train = artgor_utils.reduce_mem_usage(type_train) type_test = artgor_utils.reduce_mem_usage(type_test) gc.collect() params = types_config[type] result_filename = result_filename_prefix / f'100k_iters_distance_feats_{use_best_columns}_feats_types_split_models_{type}.npy' print('X_test_t.shape ', type_test.shape) print('X_t.shape ', type_train.shape) print('Training...') result_dict_lgb = artgor_utils.train_model_regression( X=type_train, y=type_y, X_test=type_test, params=params, columns=best_features,