test_graphs = list()
test_targets = list()

print('preprocess test molecules ...')
for mole in test_moles:
    test_graphs.append(Graph(structures_groups.get_group(mole), list_atoms))
    test_targets.append(test_gp.get_group(mole))


# In[10]:


display(valid_gp.get_group(valid_moles[0]))
display(structures_groups.get_group(valid_moles[0]))


# In[7]:


train_dataset = DictDataset(graphs=train_graphs, targets=train_targets)
valid_dataset = DictDataset(graphs=valid_graphs, targets=valid_targets)
test_dataset = DictDataset(graphs=test_graphs, targets=test_targets)


# In[8]:


class SchNetUpdateBN(SchNetUpdate):
    """SchNet update block with graph batch normalization added."""

    def __init__(self, *args, **kwargs):
        super(SchNetUpdateBN, self).__init__(*args, **kwargs)
        with self.init_scope():
            # args[0] is the hidden dimension of the update block
            self.bn = GraphBatchNormalization(args[0])

    def __call__(self, h, adj, **kwargs):
        v = self.linear[0](h)
        v = self.cfconv(v, adj)
        v = self.linear[1](v)
        # NOTE: the original cell breaks off after the line above. A plausible
        # completion, applying the added batch normalization before the
        # activation and residual connection (assumes
        # `from chainer import functions`):
        v = self.bn(v)
        return h + functions.relu(v)
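# In[ ]:


# Illustrative smoke test for SchNetUpdateBN (a sketch, not part of the
# original notebook). The hidden size 64 and the random inputs are
# assumptions; `adj` stands in for the interatomic distance matrix that
# chainer-chemistry's CFConv consumes.
import numpy as np

update = SchNetUpdateBN(64)
h = np.random.randn(4, 9, 64).astype(np.float32)    # (minibatch, atoms, hidden)
adj = np.random.rand(4, 9, 9).astype(np.float32)    # pairwise distances
out = update(h, adj)
assert out.shape == h.shape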
def run(dir_dataset: Path, batch_size: int, epochs: int,
        alpha: float, seed: int, debug: bool):

    tic = time.time()
    logger = getLogger('root')

    np.random.seed(seed)
    random.seed(seed)

    model = EdgeUpdateNet()
    model.to_gpu(device=0)

    train_ids, valid_ids, test_ids = load_dataset(dir_dataset)
    logger.info(f'train_ids: {train_ids[:5]} ... {train_ids[-5:]}')
    logger.info(f'valid_ids: {valid_ids[:5]} ... {valid_ids[-5:]}')
    logger.info(f' test_ids: {test_ids[:5]} ... {test_ids[-5:]}')

    train_scores = pd.read_csv(dir_dataset / 'train_scores.csv')
    train_scores.index = train_scores['Id']

    target_cols = [
        'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2'
    ]
    train_target = train_scores.loc[train_ids][target_cols].values.astype(np.float32)
    valid_target = train_scores.loc[valid_ids][target_cols].values.astype(np.float32)
    test_target = np.zeros((len(test_ids), len(target_cols)), dtype=np.float32)

    loading = pd.read_csv(dir_dataset / 'loading.csv')
    loading.index = loading['Id']
    loading_train = loading.loc[train_ids].iloc[:, 1:].values.astype(np.float32)
    loading_valid = loading.loc[valid_ids].iloc[:, 1:].values.astype(np.float32)
    loading_test = loading.loc[test_ids].iloc[:, 1:].values.astype(np.float32)

    fnc_train, fnc_valid, fnc_test = get_fnc(dir_dataset, train_ids,
                                             valid_ids, test_ids, alpha)
    logger.info(f'fnc train: {fnc_train.shape}')
    logger.info(f'fnc valid: {fnc_valid.shape}')
    logger.info(f'fnc test: {fnc_test.shape}')

    icn_numbers = pd.read_csv('../../input/ICN_numbers.csv')
    feature = np.zeros((53, len(icn_numbers['net_type'].unique())),
                       dtype=np.float32)
    feature[range(len(feature)), icn_numbers['net_type_code']] = 1.0

    net_type_train = np.tile(np.expand_dims(feature, 0), (len(train_ids), 1, 1))
    net_type_valid = np.tile(np.expand_dims(feature, 0), (len(valid_ids), 1, 1))
    net_type_test = np.tile(np.expand_dims(feature, 0), (len(test_ids), 1, 1))

    spatial_map_train, spatial_map_valid = load_spatial_map(train_ids, valid_ids)
    spatial_map_test = np.load('../../input/spatial_map_test.npy')

    train_dataset = DictDataset(loading=loading_train, fnc=fnc_train,
                                net_type=net_type_train,
                                spatial_map=spatial_map_train,
                                targets=train_target, Id=train_ids)
    valid_dataset = DictDataset(loading=loading_valid, fnc=fnc_valid,
                                net_type=net_type_valid,
                                spatial_map=spatial_map_valid,
                                targets=valid_target, Id=valid_ids)
    test_dataset = DictDataset(loading=loading_test, fnc=fnc_test,
                               net_type=net_type_test,
                               spatial_map=spatial_map_test,
                               targets=test_target, Id=test_ids)

    train_iter = chainer.iterators.SerialIterator(train_dataset, batch_size,
                                                  shuffle=True)
    valid_iter = chainer.iterators.SerialIterator(valid_dataset, batch_size,
                                                  shuffle=False, repeat=False)
    test_iter = chainer.iterators.SerialIterator(test_dataset, batch_size,
                                                 shuffle=False, repeat=False)

    optimizer = optimizers.Adam(alpha=1e-3)
    optimizer.setup(model)

    updater = training.StandardUpdater(train_iter, optimizer, device=0)
    trainer = training.Trainer(updater, (epochs, 'epoch'), out="result")

    trainer.extend(training.extensions.LogReport(filename=f'seed{seed}.log'))
    trainer.extend(training.extensions.ExponentialShift('alpha', 0.99999))
    trainer.extend(
        training.extensions.observe_value(
            'alpha', lambda tr: tr.updater.get_optimizer('main').alpha))

    def stop_train_mode(trigger):
        @make_extension(trigger=trigger)
        def _stop_train_mode(_):
            logger.debug('turn off training mode')
            chainer.config.train = False
        return _stop_train_mode

    trainer.extend(stop_train_mode(trigger=(1, 'epoch')))

    trainer.extend(
        training.extensions.PrintReport(
            ['epoch', 'elapsed_time', 'main/loss', 'valid/main/All', 'alpha']))

    trainer.extend(
        TreNDSEvaluator(iterator=valid_iter, target=model, name='valid',
                        device=0, is_validate=True))
    trainer.extend(
        TreNDSEvaluator(iterator=test_iter, target=model, name='test',
                        device=0, is_submit=True,
                        submission_name=f'submit_seed{seed}.csv'),
        trigger=triggers.MinValueTrigger('valid/main/All'))

    chainer.config.train = True
    trainer.run()

    trained_result = pd.DataFrame(trainer.get_extension('LogReport').log)
    best_score = np.min(trained_result['valid/main/All'])
    logger.info(f'validation score: {best_score: .4f} (seed: {seed})')

    elapsed_time = time.time() - tic
    logger.info(f'elapsed time: {elapsed_time / 60.0: .1f} [min]')
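# Hypothetical command-line entry point for run() (a sketch, not part of the
# original script). The flag names mirror run()'s signature; the default
# values are assumptions.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dir-dataset', type=Path, default=Path('../../input'))
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=30)
    parser.add_argument('--alpha', type=float, default=0.5)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args()

    run(args.dir_dataset, args.batch_size, args.epochs,
        args.alpha, args.seed, args.debug)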
def main():

    #%% Load datasets
    train, valid, test, train_moles, valid_moles, test_moles = load_dataset(CTYPE)

    train_gp = train.groupby('molecule_name')
    valid_gp = valid.groupby('molecule_name')
    test_gp = test.groupby('molecule_name')

    #%%
    structures = pd.read_csv(DATA_PATH / 'structures.csv')
    giba_features = pd.read_csv(DATA_PATH / 'unified-features' / 'giba_features.csv',
                                index_col=0)
    structures = pd.merge(structures,
                          giba_features.drop(['atom_name', 'x', 'y', 'z'], axis=1),
                          on=['molecule_name', 'atom_index'])

    norm_col = [
        col for col in structures.columns
        if col not in ['molecule_name', 'atom_index', 'atom', 'x', 'y', 'z']
    ]
    structures[norm_col] = (structures[norm_col] - structures[norm_col].mean()) \
        / structures[norm_col].std()
    structures = structures.fillna(0)
    structures_groups = structures.groupby('molecule_name')

    #%%
    if CTYPE != 'all':
        train_couple = pd.read_csv(DATA_PATH / 'typewise-dataset' / 'kuma_dataset'
                                   / 'kuma_dataset' / 'train'
                                   / '{}_full.csv'.format(CTYPE), index_col=0)
    else:
        train_couple = pd.read_csv(DATA_PATH / 'typewise-dataset' / 'kuma_dataset'
                                   / 'kuma_dataset' / 'train_all.csv', index_col=0)
    train_couple = reduce_mem_usage(train_couple)
    train_couple = train_couple.drop(['id', 'scalar_coupling_constant', 'type'],
                                     axis=1)

    if CTYPE != 'all':
        test_couple = pd.read_csv(DATA_PATH / 'typewise-dataset' / 'kuma_dataset'
                                  / 'kuma_dataset' / 'test'
                                  / '{}_full.csv'.format(CTYPE), index_col=0)
    else:
        test_couple = pd.read_csv(DATA_PATH / 'typewise-dataset' / 'kuma_dataset'
                                  / 'kuma_dataset' / 'test_all.csv', index_col=0)
    test_couple = reduce_mem_usage(test_couple)
    test_couple = test_couple.drop(['id', 'type'], axis=1)

    couples = pd.concat([train_couple, test_couple])
    del train_couple, test_couple

    couples_norm_col = [
        col for col in couples.columns
        if col not in ['atom_index_0', 'atom_index_1', 'molecule_name', 'type']
    ]
    for col in couples_norm_col:
        if couples[col].dtype == np.dtype('O'):
            couples = pd.get_dummies(couples, columns=[col])
        else:
            couples[col] = (couples[col] - couples[col].mean()) / couples[col].std()

    couples = couples.fillna(0)
    couples = couples.replace(np.inf, 0)
    couples = couples.replace(-np.inf, 0)
    couples_groups = couples.groupby('molecule_name')

    #%% Make graphs
    feature_col = [
        col for col in structures.columns
        if col not in ['molecule_name', 'atom_index', 'atom']
    ]

    list_atoms = list(set(structures['atom']))
    print('list of atoms')
    print(list_atoms)

    train_graphs = list()
    train_targets = list()
    train_couples = list()
    print('preprocess training molecules ...')
    for mole in tqdm(train_moles):
        train_graphs.append(
            Graph(structures_groups.get_group(mole), list_atoms, feature_col))
        train_targets.append(train_gp.get_group(mole))
        train_couples.append(couples_groups.get_group(mole))

    valid_graphs = list()
    valid_targets = list()
    valid_couples = list()
    print('preprocess validation molecules ...')
    for mole in tqdm(valid_moles):
        valid_graphs.append(
            Graph(structures_groups.get_group(mole), list_atoms, feature_col))
        valid_targets.append(valid_gp.get_group(mole))
        valid_couples.append(couples_groups.get_group(mole))

    test_graphs = list()
    test_targets = list()
    test_couples = list()
    print('preprocess test molecules ...')
    for mole in tqdm(test_moles):
        test_graphs.append(
            Graph(structures_groups.get_group(mole), list_atoms, feature_col))
        test_targets.append(test_gp.get_group(mole))
        test_couples.append(couples_groups.get_group(mole))

    #%% Make datasets
    train_dataset = DictDataset(graphs=train_graphs, targets=train_targets,
                                couples=train_couples)
    valid_dataset = DictDataset(graphs=valid_graphs, targets=valid_targets,
                                couples=valid_couples)
    test_dataset = DictDataset(graphs=test_graphs, targets=test_targets,
                               couples=test_couples)

    #%% Build Model
    model = SchNet(num_layer=NUM_LAYER)
    model.to_gpu(device=0)

    #%% Sampler
    train_sampler = SameSizeSampler(structures_groups, train_moles, BATCH_SIZE)
    valid_sampler = SameSizeSampler(structures_groups, valid_moles, BATCH_SIZE,
                                    use_remainder=True)
    test_sampler = SameSizeSampler(structures_groups, test_moles, BATCH_SIZE,
                                   use_remainder=True)

    #%% Iterator, Optimizer
    train_iter = chainer.iterators.SerialIterator(train_dataset, BATCH_SIZE,
                                                  order_sampler=train_sampler)
    valid_iter = chainer.iterators.SerialIterator(valid_dataset, BATCH_SIZE,
                                                  repeat=False,
                                                  order_sampler=valid_sampler)
    test_iter = chainer.iterators.SerialIterator(test_dataset, BATCH_SIZE,
                                                 repeat=False,
                                                 order_sampler=test_sampler)

    optimizer = optimizers.Adam(alpha=1e-3)
    optimizer.setup(model)

    #%% Updater
    if opt.multi_gpu:
        updater = training.updaters.ParallelUpdater(
            train_iter,
            optimizer,
            # The device named 'main' is used as a "master", while the others
            # are used as slaves. Names other than 'main' are arbitrary.
            devices={'main': 0, 'sub1': 1, 'sub2': 2, 'sub3': 3},
        )
    else:
        updater = training.StandardUpdater(train_iter, optimizer,
                                           converter=coupling_converter,
                                           device=0)

    # early stopping
    stop_trigger = triggers.EarlyStoppingTrigger(
        patients=EARLY_STOPPING_ROUNDS,
        monitor='valid/main/ALL_LogMAE',
        max_trigger=(EPOCH, 'epoch'))
    trainer = training.Trainer(updater, stop_trigger, out=RESULT_PATH)
    # trainer = training.Trainer(updater, (100, 'epoch'), out=RESULT_PATH)

    #%% Evaluator
    trainer.extend(
        TypeWiseEvaluator(iterator=valid_iter, target=model,
                          converter=coupling_converter, name='valid',
                          device=0, is_validate=True))
    trainer.extend(
        TypeWiseEvaluator(iterator=test_iter, target=model,
                          converter=coupling_converter, name='test',
                          device=0, is_submit=True))

    #%% Other extensions
    trainer.extend(training.extensions.ExponentialShift('alpha', 0.99999))
    trainer.extend(stop_train_mode(trigger=(1, 'epoch')))
    trainer.extend(
        training.extensions.observe_value(
            'alpha', lambda tr: tr.updater.get_optimizer('main').alpha))
    trainer.extend(training.extensions.LogReport(log_name=f'log_{CTYPE}'))
    trainer.extend(
        training.extensions.PrintReport([
            'epoch', 'elapsed_time', 'main/loss', 'valid/main/ALL_LogMAE', 'alpha'
        ]))
    # trainer.extend(extensions.snapshot(filename='snapshot_epoch-{.updater.epoch}'))
    trainer.extend(SaveRestore(filename=f'best_epoch_{CTYPE}'),
                   trigger=triggers.MinValueTrigger('valid/main/ALL_LogMAE'))

    #%% Train
    if not opt.test:
        chainer.config.train = True
        trainer.run()
    else:
        chainer.config.train = False
        snapshot_path = f'results/chainer/best_epoch_{CTYPE}'
        chainer.serializers.npz.load_npz(snapshot_path, model, 'updater/model:main/')
        oof = predict_iter(valid_iter, model)
        oof.to_csv(f'schnet_{CTYPE}_oof.csv', index=False)

    #%% Final Evaluation
    chainer.config.train = False
    prediction = predict_iter(test_iter, model)
    prediction.to_csv(f'schnet_{CTYPE}.csv', index=False)
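# For reference, a self-contained sketch of the stop_train_mode extension used
# above (assumed to live in a shared utilities module; the body mirrors the
# nested definition inside the TReNDS run() script earlier in this section).
from chainer.training import make_extension

def stop_train_mode(trigger):
    @make_extension(trigger=trigger)
    def _stop_train_mode(_):
        # Switch Chainer to test mode once the trigger fires, so layers such
        # as batch normalization use their moving statistics from the second
        # epoch onward.
        chainer.config.train = False
    return _stop_train_mode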
def main():

    #%% Load datasets
    train, valid, test, train_moles, valid_moles, test_moles = load_dataset(CTYPE)

    train_gp = train.groupby('molecule_name')
    valid_gp = valid.groupby('molecule_name')
    test_gp = test.groupby('molecule_name')

    #%%
    structures = pd.read_csv(DATA_PATH / 'structures.csv')
    giba_features = pd.read_csv(DATA_PATH / 'unified-features' / 'giba_features.csv',
                                index_col=0)
    structures = pd.merge(structures,
                          giba_features.drop(['atom_name', 'x', 'y', 'z'], axis=1),
                          on=['molecule_name', 'atom_index'])

    norm_col = [
        col for col in structures.columns
        if col not in ['molecule_name', 'atom_index', 'atom', 'x', 'y', 'z']
    ]
    structures[norm_col] = (structures[norm_col] - structures[norm_col].mean()) \
        / structures[norm_col].std()
    structures = structures.fillna(0)
    structures_groups = structures.groupby('molecule_name')

    #%%
    if CTYPE != 'all':
        train_couple = pd.read_csv(DATA_PATH / 'typewise-dataset' / 'kuma_dataset'
                                   / 'kuma_dataset' / 'train'
                                   / '{}_full.csv'.format(CTYPE), index_col=0)
    else:
        train_couple = pd.read_csv(DATA_PATH / 'typewise-dataset' / 'kuma_dataset'
                                   / 'kuma_dataset' / 'train_all.csv', index_col=0)
    train_couple = reduce_mem_usage(train_couple)
    train_couple = train_couple.drop(['id', 'scalar_coupling_constant', 'type'],
                                     axis=1)

    if CTYPE != 'all':
        test_couple = pd.read_csv(DATA_PATH / 'typewise-dataset' / 'kuma_dataset'
                                  / 'kuma_dataset' / 'test'
                                  / '{}_full.csv'.format(CTYPE), index_col=0)
    else:
        test_couple = pd.read_csv(DATA_PATH / 'typewise-dataset' / 'kuma_dataset'
                                  / 'kuma_dataset' / 'test_all.csv', index_col=0)
    test_couple = reduce_mem_usage(test_couple)
    test_couple = test_couple.drop(['id', 'type'], axis=1)

    couples = pd.concat([train_couple, test_couple])
    del train_couple, test_couple

    couples_norm_col = [
        col for col in couples.columns
        if col not in ['atom_index_0', 'atom_index_1', 'molecule_name', 'type']
    ]
    for col in couples_norm_col:
        if couples[col].dtype == np.dtype('O'):
            couples = pd.get_dummies(couples, columns=[col])
        else:
            couples[col] = (couples[col] - couples[col].mean()) / couples[col].std()

    couples = couples.fillna(0)
    couples = couples.replace(np.inf, 0)
    couples = couples.replace(-np.inf, 0)
    couples_groups = couples.groupby('molecule_name')

    #%% Make graphs
    feature_col = [
        col for col in structures.columns
        if col not in ['molecule_name', 'atom_index', 'atom']
    ]

    list_atoms = list(set(structures['atom']))
    print('list of atoms')
    print(list_atoms)

    train_graphs = list()
    train_targets = list()
    train_couples = list()
    print('preprocess training molecules ...')
    for mole in tqdm(train_moles):
        train_graphs.append(
            Graph(structures_groups.get_group(mole), list_atoms, feature_col, mole))
        train_targets.append(train_gp.get_group(mole))
        train_couples.append(couples_groups.get_group(mole))

    valid_graphs = list()
    valid_targets = list()
    valid_couples = list()
    print('preprocess validation molecules ...')
    for mole in tqdm(valid_moles):
        valid_graphs.append(
            Graph(structures_groups.get_group(mole), list_atoms, feature_col, mole))
        valid_targets.append(valid_gp.get_group(mole))
        valid_couples.append(couples_groups.get_group(mole))

    test_graphs = list()
    test_targets = list()
    test_couples = list()
    print('preprocess test molecules ...')
    for mole in tqdm(test_moles):
        test_graphs.append(
            Graph(structures_groups.get_group(mole), list_atoms, feature_col, mole))
        test_targets.append(test_gp.get_group(mole))
        test_couples.append(couples_groups.get_group(mole))

    #%% Make datasets
    train_dataset = DictDataset(graphs=train_graphs, targets=train_targets,
                                couples=train_couples)
    valid_dataset = DictDataset(graphs=valid_graphs, targets=valid_targets,
                                couples=valid_couples)
    test_dataset = DictDataset(graphs=test_graphs, targets=test_targets,
                               couples=test_couples)

    #%% Build Model
    model = WeaveNet(n_sub_layer=3)
    model.to_gpu(device=0)

    #%% Sampler
    train_sampler = SameSizeSampler(structures_groups, train_moles, BATCH_SIZE)
    valid_sampler = SameSizeSampler(structures_groups, valid_moles, BATCH_SIZE,
                                    use_remainder=True)
    test_sampler = SameSizeSampler(structures_groups, test_moles, BATCH_SIZE,
                                   use_remainder=True)

    #%% Iterator, Optimizer
    train_iter = chainer.iterators.SerialIterator(
        train_dataset, BATCH_SIZE, order_sampler=train_sampler)
    valid_iter = chainer.iterators.SerialIterator(
        valid_dataset, BATCH_SIZE, repeat=False, order_sampler=valid_sampler)
    test_iter = chainer.iterators.SerialIterator(
        test_dataset, BATCH_SIZE, repeat=False, order_sampler=test_sampler)

    #%% Predict
    chainer.config.train = False
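    # The original file breaks off at this point. A plausible continuation,
    # mirroring the final-evaluation step of the SchNet script above
    # (predict_iter and the output file name are assumptions):
    prediction = predict_iter(test_iter, model)
    prediction.to_csv(f'weavenet_{CTYPE}.csv', index=False)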