usecols=mapping['use_cols']) if mapping['drop_columns']: train_data.drop(mapping['drop_columns'], axis=1, inplace=True) gc.collect() if mapping['concat_type'] == 'concat': print('concating...') train_data = train_data[mask] gc.collect() fold_data = train_utils.concat_stupidly(fold_data, train_data) assert len(fold_data) == len(train_data) elif mapping['concat_type'] == 'merge': print('merging...') for i in range(2): fold_data = train_utils.map_atom_info( fold_data, train_data, i) else: raise KeyError() del train_data gc.collect() assert len(fold_data) == mask.sum() print('saving...') fold_data.to_csv(f'../../data/oof_tables/{fold}_fold_oof_tables.csv', index=False) del fold_data gc.collect()
# Load the raw competition tables, attach per-atom coordinates to every
# atom pair, and derive distance features for the train and test sets.

# None tells pandas.read_csv to load every row of each file.
# NOTE(review): the original read `nrows = No`, an undefined name that raised
# NameError; `None` is the evident intent -- confirm.
nrows = None

if not debug:
    # Refuse to overwrite the outputs of a previous full run.  These are
    # explicit raises rather than `assert False, ...` because assertions are
    # stripped under `python -O`, which would silently disable the guard.
    if os.path.isfile(result_filename):
        raise AssertionError("Result file exists!")
    if os.path.isfile(sub_filename):
        raise AssertionError("Submission file exists!")

print("reading data...")
train = pd.read_csv('../data/train.csv', nrows=nrows)
test = pd.read_csv('../data/test.csv', nrows=nrows)
structures = pd.read_csv('../data/structures.csv', nrows=nrows)
sub = pd.read_csv('../data/sample_submission.csv', nrows=nrows)

# map_atom_info is defined elsewhere; presumably it joins the structure row
# of atom `atom_idx` onto each pair, yielding the x_/y_/z_ columns suffixed
# with that index which are read below -- TODO confirm against the helper.
for atom_idx in (0, 1):
    train = map_atom_info(train, structures, atom_idx)
for atom_idx in (0, 1):
    test = map_atom_info(test, structures, atom_idx)

# Coordinates of the two atoms of every pair as plain arrays.
train_p_0 = train[['x_0', 'y_0', 'z_0']].values
train_p_1 = train[['x_1', 'y_1', 'z_1']].values
test_p_0 = test[['x_0', 'y_0', 'z_0']].values
test_p_1 = test[['x_1', 'y_1', 'z_1']].values

# Euclidean distance between the two atoms of each pair.
train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)

# Squared per-axis coordinate differences.
train['dist_x'] = (train['x_0'] - train['x_1'])**2
test['dist_x'] = (test['x_0'] - test['x_1'])**2
train['dist_y'] = (train['y_0'] - train['y_1'])**2
# Load the training pairs together with their precomputed statistical
# features, verify both tables are row-aligned, merge them, and prepare a
# deterministically ordered copy for fold creation.

# Refuse to overwrite an existing result file on a full (non-debug) run.
# An explicit raise is used instead of `assert False, ...` because
# assertions are stripped under `python -O`, which would disable the guard.
if not debug and os.path.isfile(result_filename):
    raise AssertionError("Result file exists!")

train = pd.read_csv('../../data/train.csv', nrows=nrows)
train_feats = pd.read_csv('../../data/train_stat_features.csv', nrows=nrows)

print('assert checking...')
# Both tables must describe the same atom pair on every row before they are
# combined; checked explicitly so the validation also runs under `-O`.
for col in ['molecule_name', 'atom_index_0', 'atom_index_1']:
    if (train[col] == train_feats[col]).sum() != len(train):
        raise AssertionError(
            f"train and train_feats disagree on column {col!r}")

structures = pd.read_csv('../../data/structures.csv')
# map_atom_info is defined elsewhere; presumably it attaches the structure
# information of atom `i` to each pair -- TODO confirm against the helper.
for i in range(2):
    train = map_atom_info(train, structures, i)

train = concat_dataframes(train, train_feats)

if debug:
    # Dump the merged table so the merge can be inspected by hand.
    train.to_csv('../../data/debug_data/stat_merge.csv', index=False)

train = artgor_utils.reduce_mem_usage(train)

print("creating folds...")
sorted_train = train.sort_values([
    "scalar_coupling_constant",
    "type",
    "dist",
])
# Replace the shuffled index with consecutive positions in sorted order.
sorted_train.index = range(0, len(sorted_train))
left_on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'], right_on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type']) print("mapping info about atoms...") debug = False if debug: train = train[:100] test = test[:100] n_estimators = 50 n_folds = 3 else: n_folds = 10 n_estimators = 30000 train = train_utils.map_atom_info(train, structures, 0) train = train_utils.map_atom_info(train, structures, 1) test = train_utils.map_atom_info(test, structures, 0) test = train_utils.map_atom_info(test, structures, 1) train_p_0 = train[['x_0', 'y_0', 'z_0']].values train_p_1 = train[['x_1', 'y_1', 'z_1']].values test_p_0 = test[['x_0', 'y_0', 'z_0']].values test_p_1 = test[['x_1', 'y_1', 'z_1']].values train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1) test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1) train['dist_x'] = (train['x_0'] - train['x_1'])**2 test['dist_x'] = (test['x_0'] - test['x_1'])**2 train['dist_y'] = (train['y_0'] - train['y_1'])**2
if debug: nrows = 1000 n_folds = 3 n_estimators = 50 result_filename = None else: nrows = 2 * 10 ** 6 n_folds = 5 n_estimators = 2000 result_filename = '../../results/feature_selection/acsf_features_result.csv' train = pd.read_csv('../../data/train.csv', nrows=nrows) acsf_structures = pd.read_csv('../../data/structure_with_acsf.csv', index_col=0) for i in range(2): train = map_atom_info(train, acsf_structures, i) train_p_0 = train[['x_0', 'y_0', 'z_0']].values train_p_1 = train[['x_1', 'y_1', 'z_1']].values train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1) if debug: train.to_csv('../../data/debug_data/acsf_merge.csv', index=False) train = artgor_utils.reduce_mem_usage(train) print("creating folds...") sorted_train = train.sort_values([ "scalar_coupling_constant", "type", "dist",