def main():
    # Parse the arguments.
    args = parse_arguments()
    augment = False if args.augment == 'False' else True
    multi_gpu = False if args.multi_gpu == 'False' else True
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        label_arr = np.asarray(label_list, dtype=np.int32)
        return label_arr

    # Apply a preprocessor to the dataset.
    logging.info('Preprocessing train and test datasets...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParserForPair(preprocessor,
                                  postprocess_label=postprocess_label,
                                  labels=labels,
                                  smiles_cols=['smiles_1', 'smiles_2'])
    train = parser.parse(args.train_datafile)['dataset']
    test = parser.parse(args.valid_datafile)['dataset']

    if augment:
        logging.info('Utilizing data augmentation in train set')
        train = augment_dataset(train)

    num_train = train.get_datasets()[0].shape[0]
    num_test = test.get_datasets()[0].shape[0]
    logging.info('Train/test split: {}/{}'.format(num_train, num_test))

    if len(args.net_hidden_dims):
        net_hidden_dims = tuple([
            int(net_hidden_dim)
            for net_hidden_dim in args.net_hidden_dims.split(',')
        ])
    else:
        net_hidden_dims = ()

    weight_tying = False if args.weight_tying == 'False' else True
    fp_batch_normalization = True if args.fp_bn == 'True' else False

    predictor = set_up_predictor(
        method=args.method,
        fp_hidden_dim=args.fp_hidden_dim,
        fp_out_dim=args.fp_out_dim,
        conv_layers=args.conv_layers,
        concat_hidden=args.concat_hidden,
        fp_dropout_rate=args.fp_dropout_rate,
        fp_batch_normalization=fp_batch_normalization,
        net_hidden_dims=net_hidden_dims,
        class_num=class_num,
        sim_method=args.sim_method,
        weight_typing=weight_tying,  # (sic: keyword name in set_up_predictor)
        symmetric=args.symmetric,
        attn_model=args.attn,
    )

    if args.train_pos_neg_ratio != -1.:
        # Rebalance the training set: keep all negatives and subsample
        # positives so that #pos == int(ratio * #neg).
        train_dataset = train.get_datasets()
        atoms1_train, adjs1_train, atoms2_train, adjs2_train, labels_train = \
            train_dataset
        labels_train = np.squeeze(labels_train)
        train_dataset_arr = np.concatenate([
            item[:, None] if len(item.shape) == 1 else item
            for item in list(train_dataset)
        ], axis=1)
        pos_train_dataset_arr = train_dataset_arr[labels_train == 1]
        num_pos_train = pos_train_dataset_arr.shape[0]
        pos_train_indices = np.arange(0, num_pos_train)
        neg_train_dataset_arr = train_dataset_arr[labels_train == 0]
        num_neg_train = neg_train_dataset_arr.shape[0]
        pos_neg_train_ratio = args.train_pos_neg_ratio
        num_pos_train = int(pos_neg_train_ratio * num_neg_train)
        np.random.seed(777)
        np.random.shuffle(pos_train_indices)
        pos_train_indices = pos_train_indices[:num_pos_train]
        pos_train_dataset_arr = pos_train_dataset_arr[pos_train_indices]
        new_train_dataset_arr = np.concatenate(
            (pos_train_dataset_arr, neg_train_dataset_arr), axis=0)
        atoms1_train = new_train_dataset_arr[:, 0]
        adjs1_train = new_train_dataset_arr[:, 1]
        atoms2_train = new_train_dataset_arr[:, 2]
        adjs2_train = new_train_dataset_arr[:, 3]
        labels_train = new_train_dataset_arr[:, 4].astype(np.int32)
        labels_train = np.expand_dims(labels_train, axis=1)
        train = NumpyTupleDataset(atoms1_train, adjs1_train,
                                  atoms2_train, adjs2_train, labels_train)
        num_train = train.get_datasets()[0].shape[0]
        num_test = test.get_datasets()[0].shape[0]
        logging.info('Train pos-neg ratio is {:.4f}'.format(
            args.train_pos_neg_ratio))
        logging.info('Train/test numbers: {}/{}'.format(num_train, num_test))

    # if args.loss_func == 'hinge':
    #     modify_dataset_for_hinge(train)

    # Set up the iterator.
    train_iter = SerialIterator(train, args.batchsize)
    test_iter = SerialIterator(test, args.batchsize,
                               repeat=False, shuffle=False)

    metrics_fun = {'accuracy': F.binary_accuracy}
    loss_func = F.sigmoid_cross_entropy
    if args.loss_func == 'hinge':
        logging.info('Loss function is {}'.format(args.loss_func))
        loss_func = F.hinge
        metrics_fun = {'accuracy': F.accuracy}

    classifier = Classifier(predictor,
                            lossfun=loss_func,
                            metrics_fun=metrics_fun,
                            device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam(alpha=args.learning_rate,
                                weight_decay_rate=args.weight_decay_rate)
    # optimizer = optimizers.Adam()
    # optimizer = optimizers.SGD(lr=args.learning_rate)
    optimizer.setup(classifier)
    # Add regularization hooks.
    if args.max_norm > 0:
        optimizer.add_hook(
            chainer.optimizer.GradientClipping(threshold=args.max_norm))
    if args.l2_rate > 0:
        optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.l2_rate))
    if args.l1_rate > 0:
        optimizer.add_hook(chainer.optimizer.Lasso(rate=args.l1_rate))

    # Set up the updater.
    if multi_gpu:
        logging.info('Using multiple GPUs')
        updater = training.ParallelUpdater(train_iter, optimizer,
                                           devices={'main': 0, 'second': 1},
                                           converter=concat_mols)
    else:
        logging.info('Using single GPU')
        updater = training.StandardUpdater(train_iter, optimizer,
                                           device=args.gpu,
                                           converter=concat_mols)

    # Set up the trainer.
    logging.info('Training...')
    # Stop early when the validation loss plateaus.
    early_stop = triggers.EarlyStoppingTrigger(
        monitor='validation/main/loss',
        patients=10,  # (sic: Chainer's kwarg name)
        max_trigger=(500, 'epoch'))
    out = os.path.join('output', args.out)
    trainer = training.Trainer(updater, stop_trigger=early_stop, out=out)
    # trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(
        E.Evaluator(test_iter, classifier, device=args.gpu,
                    converter=concat_mols))
    train_eval_iter = SerialIterator(train, args.batchsize,
                                     repeat=False, shuffle=False)
    trainer.extend(
        AccuracyEvaluator(train_eval_iter, classifier, eval_func=predictor,
                          device=args.gpu, converter=concat_mols,
                          name='train_acc', pos_labels=1, ignore_labels=-1,
                          raise_value_error=False))
    # The extension name 'validation' is already taken by `Evaluator`,
    # so the name 'val' is used instead (same for the evaluators below).
    trainer.extend(
        AccuracyEvaluator(test_iter, classifier, eval_func=predictor,
                          device=args.gpu, converter=concat_mols,
                          name='val_acc', pos_labels=1, ignore_labels=-1))
    trainer.extend(
        ROCAUCEvaluator(train_eval_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='train_roc', pos_labels=1, ignore_labels=-1,
                        raise_value_error=False))
    trainer.extend(
        ROCAUCEvaluator(test_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='val_roc', pos_labels=1, ignore_labels=-1))
    trainer.extend(
        PRCAUCEvaluator(train_eval_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='train_prc', pos_labels=1, ignore_labels=-1,
                        raise_value_error=False))
    trainer.extend(
        PRCAUCEvaluator(test_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='val_prc', pos_labels=1, ignore_labels=-1))
    # trainer.extend(PrecisionEvaluator(
    #     train_eval_iter, classifier, eval_func=predictor,
    #     device=args.gpu, converter=concat_mols, name='train_p',
    #     pos_labels=1, ignore_labels=-1, raise_value_error=False))
    # trainer.extend(PrecisionEvaluator(
    #     val_iter, classifier, eval_func=predictor,
    #     device=args.gpu, converter=concat_mols, name='val_p',
    #     pos_labels=1, ignore_labels=-1))
    # trainer.extend(RecallEvaluator(
    #     train_eval_iter, classifier, eval_func=predictor,
    #     device=args.gpu, converter=concat_mols, name='train_r',
    #     pos_labels=1, ignore_labels=-1, raise_value_error=False))
    # trainer.extend(RecallEvaluator(
    #     val_iter, classifier, eval_func=predictor,
    #     device=args.gpu, converter=concat_mols, name='val_r',
    #     pos_labels=1, ignore_labels=-1))
    trainer.extend(
        F1Evaluator(train_eval_iter, classifier, eval_func=predictor,
                    device=args.gpu, converter=concat_mols,
                    name='train_f', pos_labels=1, ignore_labels=-1,
                    raise_value_error=False))
    trainer.extend(
        F1Evaluator(test_iter, classifier, eval_func=predictor,
                    device=args.gpu, converter=concat_mols,
                    name='val_f', pos_labels=1, ignore_labels=-1))

    # Decay the learning rate on a fixed epoch schedule.
    # trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
    #                trigger=(10, 'epoch'))
    if args.exp_shift_strategy == 1:
        trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
                       trigger=triggers.ManualScheduleTrigger(
                           [10, 20, 30, 40, 50, 60], 'epoch'))
    elif args.exp_shift_strategy == 2:
        trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
                       trigger=triggers.ManualScheduleTrigger(
                           [5, 10, 15, 20, 25, 30], 'epoch'))
    elif args.exp_shift_strategy == 3:
        trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
                       trigger=triggers.ManualScheduleTrigger(
                           [5, 10, 15, 20, 25, 30, 40, 50, 60, 70], 'epoch'))
    else:
        raise ValueError('No such strategy to adapt the learning rate')

    # Observe the learning rate every iteration.
    trainer.extend(E.observe_lr(), trigger=(1, 'iteration'))

    entries = [
        'epoch',
        'main/loss', 'train_acc/main/accuracy', 'train_roc/main/roc_auc',
        'train_prc/main/prc_auc',
        # 'train_p/main/precision', 'train_r/main/recall',
        'train_f/main/f1',
        'validation/main/loss', 'val_acc/main/accuracy',
        'val_roc/main/roc_auc', 'val_prc/main/prc_auc',
        # 'val_p/main/precision', 'val_r/main/recall',
        'val_f/main/f1',
        'lr', 'elapsed_time'
    ]
    trainer.extend(E.PrintReport(entries=entries))
    # Snapshot frequency changed from 10 to 2 epochs on Mar. 1, 2019.
    trainer.extend(E.snapshot(), trigger=(2, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.ProgressBar())
    trainer.extend(
        E.PlotReport(['main/loss', 'validation/main/loss'],
                     'epoch', file_name='loss.png'))
    trainer.extend(
        E.PlotReport(['train_acc/main/accuracy', 'val_acc/main/accuracy'],
                     'epoch', file_name='accuracy.png'))

    if args.resume:
        resume_path = os.path.join(out, args.resume)
        logging.info(
            'Resuming training from the snapshot in {}'.format(resume_path))
        chainer.serializers.load_npz(resume_path, trainer)

    trainer.run()

    # Save the classifier's parameters.
    model_path = os.path.join(out, args.model_filename)
    logging.info('Saving the trained model to {}...'.format(model_path))
    classifier.save_pickle(model_path, protocol=args.protocol)
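
# --- Illustration only; not called by the scripts in this file. ---
# The rebalancing block in the first main() keeps every negative pair and
# randomly subsamples positives so that #pos == int(ratio * #neg). The same
# idea as a self-contained NumPy sketch (names here are hypothetical):
def _subsample_positives_sketch(rows, labels, pos_neg_ratio, seed=777):
    import numpy as np
    pos, neg = rows[labels == 1], rows[labels == 0]
    rng = np.random.RandomState(seed)  # fixed seed, mirroring np.random.seed(777)
    keep = rng.permutation(len(pos))[:int(pos_neg_ratio * len(neg))]
    return np.concatenate([pos[keep], neg], axis=0)
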
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers

    if method == 'node2vec':
        # Recover the walk parameters r, p, q from the model file name.
        fname = args.modelpath.split('/')[-1].rsplit('.', 1)[0]
        r, p, q = fname.split('-')[-3:]
        r, p, q = int(r[1:]), float(p[1:]), float(q[1:])
        print(args.modelpath)
        print(f'r={r}, p={p}, q={q}')

    task_type = molnet_default_config[dataset_name]['task_type']
    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }

    print('Using dataset: {}...'.format(dataset_name))

    if args.label:
        labels = args.label
        if method == 'node2vec':
            cache_dir = os.path.join(
                args.datadir,
                '{}_{}_r{}_p{}_q{}_{}'.format(dataset_name, method,
                                              r, p, q, labels))
        else:
            cache_dir = os.path.join(
                args.datadir, '{}_{}_{}'.format(dataset_name, method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        if method == 'node2vec':
            cache_dir = os.path.join(
                args.datadir,
                '{}_{}_r{}_p{}_q{}_all'.format(dataset_name, method, r, p, q))
        else:
            cache_dir = os.path.join(
                args.datadir, '{}_{}_all'.format(dataset_name, method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Load the train and validation parts of the dataset.
    filenames = [dataset_part_filename(p, num_data)
                 for p in ['train', 'valid']]
    paths = [os.path.join(cache_dir, f) for f in filenames]
    if all(os.path.exists(path) for path in paths):
        dataset_parts = []
        for path in paths:
            print('Loading cached dataset from {}.'.format(path))
            dataset_parts.append(NumpyTupleDataset.load(path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data,
                                                labels, method, cache_dir,
                                                modelpath=args.modelpath)

    # Scale the label values, if necessary.
    scaler = None
    if args.scale == 'standardize':
        if task_type == 'regression':
            print('Applying standard scaling to the labels.')
            scaler, dataset_parts = fit_scaler(dataset_parts)
        else:
            print('Label scaling is not available for classification tasks.')
    else:
        print('No label scaling was selected.')
    train, valid = dataset_parts[0], dataset_parts[1]

    # Set up the predictor.
    if method == 'node2vec':
        predictor = MLP(class_num, n_unit)
    else:
        predictor = set_up_predictor(method, n_unit, conv_layers, class_num,
                                     label_scaler=scaler)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize,
                                          repeat=False, shuffle=False)

    # Load metrics for the current dataset.
    metrics = molnet_default_config[dataset_name]['metrics']
    metrics_fun = {
        k: v
        for k, v in metrics.items() if isinstance(v, types.FunctionType)
    }
    loss_fun = molnet_default_config[dataset_name]['loss']

    device = chainer.get_device(args.device)
    if task_type == 'regression':
        model = Regressor(predictor, lossfun=loss_fun,
                          metrics_fun=metrics_fun, device=device)
    elif task_type == 'classification':
        model = Classifier(predictor, lossfun=loss_fun,
                           metrics_fun=metrics_fun, device=device)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Set up the optimizer.
    optimizer = optimizers.Adam(0.0005)
    optimizer.setup(model)

    # Save model-related output to this directory.
    if not os.path.exists(args.out):
        os.makedirs(args.out)
    save_json(os.path.join(args.out, 'args.json'), vars(args))
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Save the label scaler for later use at inference time.
    if args.scale == 'standardize' and task_type == 'regression':
        with open(os.path.join(cache_dir, 'standardize_scaler.pkl'),
                  'wb') as f:
            pkl.dump(scaler, f)

    # Set up the updater.
    if method == 'node2vec':
        converter = converter_method_dict['nfp']  # concat_mols
    else:
        converter = converter_method_dict[method]
    updater = training.StandardUpdater(train_iter, optimizer, device=device,
                                       converter=converter)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)
    trainer.extend(
        E.Evaluator(valid_iter, model, device=device, converter=converter))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    # TODO: consider go/no-go of the following block:
    # (i) more reporting for validation/evaluation
    # (ii) best validation score snapshot
    if task_type == 'regression':
        metric_name_list = list(metrics.keys())
        if 'RMSE' in metric_name_list:
            trainer.extend(E.snapshot_object(
                model, 'best_val_' + model_filename[task_type]),
                trigger=training.triggers.MinValueTrigger(
                    'validation/main/RMSE'))
        elif 'MAE' in metric_name_list:
            trainer.extend(E.snapshot_object(
                model, 'best_val_' + model_filename[task_type]),
                trigger=training.triggers.MinValueTrigger(
                    'validation/main/MAE'))
        else:
            print('[WARNING] No validation metric defined?')
    elif task_type == 'classification':
        train_eval_iter = iterators.SerialIterator(train, args.batchsize,
                                                   repeat=False,
                                                   shuffle=False)
        # MUV and PCBA are highly imbalanced, so PRC-AUC is tracked instead
        # of ROC-AUC.
        if dataset_name in ['muv', 'pcba']:
            trainer.extend(
                PRCAUCEvaluator(train_eval_iter, predictor,
                                eval_func=predictor, device=device,
                                converter=converter, name='train',
                                pos_labels=1, ignore_labels=-1,
                                raise_value_error=False))
            # The extension name 'validation' is already taken by
            # `Evaluator`, so the name 'val' is used instead.
            trainer.extend(
                PRCAUCEvaluator(valid_iter, predictor, eval_func=predictor,
                                device=device, converter=converter,
                                name='val', pos_labels=1, ignore_labels=-1,
                                raise_value_error=False))
            trainer.extend(
                E.snapshot_object(model,
                                  'best_val_' + model_filename[task_type]),
                trigger=training.triggers.MaxValueTrigger(
                    'val/main/prc_auc'))
        else:
            trainer.extend(
                ROCAUCEvaluator(train_eval_iter, predictor,
                                eval_func=predictor, device=device,
                                converter=converter, name='train',
                                pos_labels=1, ignore_labels=-1,
                                raise_value_error=False))
            trainer.extend(
                ROCAUCEvaluator(valid_iter, predictor, eval_func=predictor,
                                device=device, converter=converter,
                                name='val', pos_labels=1, ignore_labels=-1,
                                raise_value_error=False))
            trainer.extend(
                E.snapshot_object(model,
                                  'best_val_' + model_filename[task_type]),
                trigger=training.triggers.MaxValueTrigger(
                    'val/main/roc_auc'))
    else:
        raise NotImplementedError(
            'Not implemented task_type = {}'.format(task_type))

    trainer.extend(AutoPrintReport())
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)
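
# --- Illustration only; a hedged sketch of restoring the scaler pickled in
# the main() above at inference time. The filename mirrors the dump;
# `cache_dir` is whatever directory the training run used.
def _load_label_scaler_sketch(cache_dir):
    import os
    import pickle as pkl
    with open(os.path.join(cache_dir, 'standardize_scaler.pkl'), 'rb') as f:
        return pkl.load(f)
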
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = D.get_tox21_label_names()
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp',
                        help='graph convolution model to use as a predictor.')
    parser.add_argument('--label', '-l', type=str, choices=label_names,
                        default='',
                        help='target label for logistic regression. '
                        'Use all labels if this option is not specified.')
    parser.add_argument('--iterator-type', type=str, choices=iterator_type,
                        default='serial',
                        help='iterator type. If `balanced` is specified, '
                        'data is sampled to take the same number of '
                        'positive/negative labels during training.')
    parser.add_argument('--eval-mode', type=int, default=1,
                        help='Evaluation mode. '
                        '0: only binary_accuracy is calculated. '
                        '1: binary_accuracy and ROC-AUC score are '
                        'calculated.')
    parser.add_argument('--conv-layers', '-c', type=int, default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='batch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID to use. A negative value indicates '
                        'not to use the GPU and to run the code on the CPU.')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to the output directory')
    parser.add_argument('--epoch', '-e', type=int, default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume', '-r', type=str, default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='frequency of taking a snapshot')
    parser.add_argument('--protocol', type=int, default=2,
                        help='protocol version for pickle')
    parser.add_argument('--model-filename', type=str,
                        default='classifier.pkl',
                        help='file name for the pickled model')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='Number of data to be parsed from parser. '
                        '-1 indicates to parse all data.')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, _ = data.load_dataset(method, labels, num_data=args.num_data)

    # Network
    predictor_ = predictor.build_predictor(method, args.unit_num,
                                           args.conv_layers, class_num)

    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can be used with only '
                             'one-label classification; please specify the '
                             'label to be predicted by the --label option.')
        train_iter = BalancedSerialIterator(train, args.batchsize,
                                            train.features[:, -1],
                                            ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))

    val_iter = I.SerialIterator(val, args.batchsize,
                                repeat=False, shuffle=False)
    classifier = Classifier(predictor_,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=args.gpu)

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(
        E.Evaluator(val_iter, classifier, device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.LogReport())

    eval_mode = args.eval_mode
    if eval_mode == 0:
        trainer.extend(
            E.PrintReport([
                'epoch', 'main/loss', 'main/accuracy',
                'validation/main/loss', 'validation/main/accuracy',
                'elapsed_time'
            ]))
    elif eval_mode == 1:
        train_eval_iter = I.SerialIterator(train, args.batchsize,
                                           repeat=False, shuffle=False)
        trainer.extend(
            ROCAUCEvaluator(train_eval_iter, classifier,
                            eval_func=predictor_, device=args.gpu,
                            converter=concat_mols, name='train',
                            pos_labels=1, ignore_labels=-1))
        # The extension name 'validation' is already taken by `Evaluator`,
        # so the name 'val' is used instead.
        trainer.extend(
            ROCAUCEvaluator(val_iter, classifier, eval_func=predictor_,
                            device=args.gpu, converter=concat_mols,
                            name='val', pos_labels=1, ignore_labels=-1))
        trainer.extend(
            E.PrintReport([
                'epoch', 'main/loss', 'main/accuracy',
                'train/main/roc_auc', 'validation/main/loss',
                'validation/main/accuracy', 'val/main/roc_auc',
                'elapsed_time'
            ]))
    else:
        raise ValueError('Invalid eval_mode {}'.format(eval_mode))

    trainer.extend(E.ProgressBar(update_interval=10))
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    config = {
        'method': args.method,
        'conv_layers': args.conv_layers,
        'unit_num': args.unit_num,
        'labels': args.label
    }
    with open(os.path.join(args.out, 'config.json'), 'w') as o:
        o.write(json.dumps(config))

    classifier.save_pickle(os.path.join(args.out, args.model_filename),
                           protocol=args.protocol)
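
# --- Illustration only; a hedged sketch of restoring what the main() above
# saves: config.json plus the pickled classifier. `Classifier.load_pickle`
# is the counterpart of the `save_pickle` calls used throughout these
# scripts; device=-1 keeps the model on the CPU.
def _load_trained_classifier_sketch(out_dir, model_filename='classifier.pkl'):
    import json
    import os
    with open(os.path.join(out_dir, 'config.json')) as f:
        config = json.load(f)
    classifier = Classifier.load_pickle(
        os.path.join(out_dir, model_filename), device=-1)
    return config, classifier
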
def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        label_arr = np.asarray(label_list, dtype=np.int32)
        return label_arr

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParserForPair(preprocessor,
                                  postprocess_label=postprocess_label,
                                  labels=labels,
                                  smiles_cols=['smiles_1', 'smiles_2'])
    dataset = parser.parse(args.datafile)['dataset']

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    # predictor = set_up_predictor(args.method, args.unit_num,
    #                              args.conv_layers, class_num)
    if len(args.net_hidden_dims):
        net_hidden_dims = tuple([
            int(net_hidden_dim)
            for net_hidden_dim in args.net_hidden_dims.split(',')
        ])
    else:
        net_hidden_dims = ()
    predictor = set_up_predictor(method=args.method,
                                 fp_hidden_dim=args.fp_hidden_dim,
                                 fp_out_dim=args.fp_out_dim,
                                 conv_layers=args.conv_layers,
                                 concat_hidden=args.concat_hidden,
                                 fp_dropout_rate=args.fp_dropout_rate,
                                 net_hidden_dims=net_hidden_dims,
                                 class_num=class_num,
                                 sim_method=args.sim_method)

    # Set up the iterator.
    train_iter = SerialIterator(train, args.batchsize)
    val_iter = SerialIterator(val, args.batchsize,
                              repeat=False, shuffle=False)

    metrics_fun = {'accuracy': F.binary_accuracy}
    classifier = Classifier(predictor,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=metrics_fun,
                            device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam(alpha=args.learning_rate,
                                weight_decay_rate=args.weight_decay_rate)
    # optimizer = optimizers.Adam()
    # optimizer = optimizers.SGD(lr=args.learning_rate)
    optimizer.setup(classifier)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    print('Training...')
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter, classifier, device=args.gpu,
                    converter=concat_mols))
    train_eval_iter = SerialIterator(train, args.batchsize,
                                     repeat=False, shuffle=False)
    trainer.extend(
        AccuracyEvaluator(train_eval_iter, classifier, eval_func=predictor,
                          device=args.gpu, converter=concat_mols,
                          name='train_acc', pos_labels=1, ignore_labels=-1,
                          raise_value_error=False))
    # The extension name 'validation' is already taken by `Evaluator`,
    # so the name 'val' is used instead (same for the evaluators below).
    trainer.extend(
        AccuracyEvaluator(val_iter, classifier, eval_func=predictor,
                          device=args.gpu, converter=concat_mols,
                          name='val_acc', pos_labels=1, ignore_labels=-1))
    trainer.extend(
        ROCAUCEvaluator(train_eval_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='train_roc', pos_labels=1, ignore_labels=-1,
                        raise_value_error=False))
    trainer.extend(
        ROCAUCEvaluator(val_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='val_roc', pos_labels=1, ignore_labels=-1))
    trainer.extend(
        PRCAUCEvaluator(train_eval_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='train_prc', pos_labels=1, ignore_labels=-1,
                        raise_value_error=False))
    trainer.extend(
        PRCAUCEvaluator(val_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='val_prc', pos_labels=1, ignore_labels=-1))
    # trainer.extend(PrecisionEvaluator(
    #     train_eval_iter, classifier, eval_func=predictor,
    #     device=args.gpu, converter=concat_mols, name='train_p',
    #     pos_labels=1, ignore_labels=-1, raise_value_error=False))
    # trainer.extend(PrecisionEvaluator(
    #     val_iter, classifier, eval_func=predictor,
    #     device=args.gpu, converter=concat_mols, name='val_p',
    #     pos_labels=1, ignore_labels=-1))
    # trainer.extend(RecallEvaluator(
    #     train_eval_iter, classifier, eval_func=predictor,
    #     device=args.gpu, converter=concat_mols, name='train_r',
    #     pos_labels=1, ignore_labels=-1, raise_value_error=False))
    # trainer.extend(RecallEvaluator(
    #     val_iter, classifier, eval_func=predictor,
    #     device=args.gpu, converter=concat_mols, name='val_r',
    #     pos_labels=1, ignore_labels=-1))
    trainer.extend(
        F1Evaluator(train_eval_iter, classifier, eval_func=predictor,
                    device=args.gpu, converter=concat_mols,
                    name='train_f', pos_labels=1, ignore_labels=-1,
                    raise_value_error=False))
    trainer.extend(
        F1Evaluator(val_iter, classifier, eval_func=predictor,
                    device=args.gpu, converter=concat_mols,
                    name='val_f', pos_labels=1, ignore_labels=-1))

    # Decay the learning rate on a fixed epoch schedule.
    # trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
    #                trigger=(10, 'epoch'))
    trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
                   trigger=triggers.ManualScheduleTrigger(
                       [10, 20, 30, 40, 50], 'epoch'))

    # Observe the learning rate every iteration.
    trainer.extend(E.observe_lr(), trigger=(1, 'iteration'))

    entries = [
        'epoch',
        'main/loss', 'train_acc/main/accuracy', 'train_roc/main/roc_auc',
        'train_prc/main/prc_auc',
        # 'train_p/main/precision', 'train_r/main/recall',
        'train_f/main/f1',
        'validation/main/loss', 'val_acc/main/accuracy',
        'val_roc/main/roc_auc', 'val_prc/main/prc_auc',
        # 'val_p/main/precision', 'val_r/main/recall',
        'val_f/main/f1',
        'lr', 'elapsed_time'
    ]
    trainer.extend(E.PrintReport(entries=entries))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.ProgressBar())

    if args.resume:
        resume_path = os.path.join(args.out, args.resume)
        logging.info(
            'Resuming training from the snapshot in {}'.format(resume_path))
        chainer.serializers.load_npz(resume_path, trainer)

    trainer.run()

    # Save the classifier's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    classifier.save_pickle(model_path, protocol=args.protocol)
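
# --- Illustration only; the comma-separated hidden-dim parsing repeated in
# several main() functions above could be factored into one helper:
def _parse_hidden_dims_sketch(spec):
    """'128,64' -> (128, 64); '' -> ()."""
    return tuple(int(dim) for dim in spec.split(',')) if spec else ()
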
def main():
    # Parse the arguments.
    args = parse_arguments()

    if args['label']:
        labels = args['label']
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        label_arr = np.asarray(label_list, dtype=np.int32)
        return label_arr

    # Apply a preprocessor to the dataset.
    logging.info('Preprocessing train and valid datasets...')
    if args['method'] == 'relgcn':
        preprocessor = preprocess_method_dict['ggnn']()
        # preprocessor = preprocess_method_dict['rsgcn']()
    elif args['method'] == 'mpnn':
        preprocessor = preprocess_method_dict['ggnn']()
    else:
        preprocessor = preprocess_method_dict[args['method']]()

    parser = CSVFileParserForPair(preprocessor,
                                  postprocess_label=postprocess_label,
                                  labels=labels,
                                  smiles_cols=['smiles_1', 'smiles_2'])
    train = parser.parse(args['train_datafile'])['dataset']
    valid = parser.parse(args['valid_datafile'])['dataset']

    if args['augment']:
        logging.info('Utilizing data augmentation in train set')
        train = augment_dataset(train)

    num_train = train.get_datasets()[0].shape[0]
    num_valid = valid.get_datasets()[0].shape[0]
    logging.info('Train/valid split: {}/{}'.format(num_train, num_valid))

    if len(args['net_hidden_dims']):
        net_hidden_dims = tuple([
            int(net_hidden_dim)
            for net_hidden_dim in args['net_hidden_dims'].split(',')
        ])
    else:
        net_hidden_dims = ()

    predictor = set_up_predictor(
        method=args['method'],
        fp_hidden_dim=args['fp_hidden_dim'],
        fp_out_dim=args['fp_out_dim'],
        conv_layers=args['conv_layers'],
        concat_hidden=args['concat_hidden'],
        fp_dropout_rate=args['fp_dropout_rate'],
        fp_batch_normalization=args['fp_batch_normalization'],
        net_hidden_dims=net_hidden_dims,
        class_num=class_num,
        sim_method=args['sim_method'],
        weight_typing=args['weight_tying'],  # (sic: keyword name in set_up_predictor)
        symmetric=args['symmetric'],
    )

    train_iter = SerialIterator(train, args['batchsize'])
    test_iter = SerialIterator(valid, args['batchsize'],
                               repeat=False, shuffle=False)

    metrics_fun = {'accuracy': F.binary_accuracy}
    classifier = Classifier(predictor,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=metrics_fun,
                            device=args['gpu'])

    # Set up the optimizer.
    optimizer = optimizers.Adam(alpha=args['learning_rate'],
                                weight_decay_rate=args['weight_decay_rate'])
    # optimizer = optimizers.Adam()
    # optimizer = optimizers.SGD(lr=args.learning_rate)
    optimizer.setup(classifier)
    # Add regularization hooks.
    if args['max_norm'] > 0:
        optimizer.add_hook(
            chainer.optimizer.GradientClipping(threshold=args['max_norm']))
    if args['l2_rate'] > 0:
        optimizer.add_hook(
            chainer.optimizer.WeightDecay(rate=args['l2_rate']))
    if args['l1_rate'] > 0:
        optimizer.add_hook(chainer.optimizer.Lasso(rate=args['l1_rate']))

    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args['gpu'],
                                       converter=concat_mols)

    # Set up the trainer.
    logging.info('Training...')
    # add stop_trigger parameter
    early_stop = triggers.EarlyStoppingTrigger(
        monitor='validation/main/loss',
        patients=10,
        max_trigger=(500, 'epoch'))
    out = os.path.join('output', args['out'])
    trainer = training.Trainer(updater, stop_trigger=early_stop, out=out)

    trainer.extend(E.Evaluator(test_iter, classifier, device=args['gpu'],
                               converter=concat_mols))
    train_eval_iter = SerialIterator(train, args['batchsize'],
                                     repeat=False, shuffle=False)
    trainer.extend(AccuracyEvaluator(
        train_eval_iter, classifier, eval_func=predictor,
        device=args['gpu'], converter=concat_mols, name='train_acc',
        pos_labels=1, ignore_labels=-1, raise_value_error=False))
    # extension name='validation' is already used by `Evaluator`,
    # instead extension name `val` is used.
    trainer.extend(AccuracyEvaluator(
        test_iter, classifier, eval_func=predictor,
        device=args['gpu'], converter=concat_mols, name='val_acc',
        pos_labels=1, ignore_labels=-1))
    trainer.extend(ROCAUCEvaluator(
        train_eval_iter, classifier, eval_func=predictor,
        device=args['gpu'], converter=concat_mols, name='train_roc',
        pos_labels=1, ignore_labels=-1, raise_value_error=False))
    trainer.extend(ROCAUCEvaluator(
        test_iter, classifier, eval_func=predictor,
        device=args['gpu'], converter=concat_mols, name='val_roc',
        pos_labels=1, ignore_labels=-1))
    trainer.extend(PRCAUCEvaluator(
        train_eval_iter, classifier, eval_func=predictor,
        device=args['gpu'], converter=concat_mols, name='train_prc',
        pos_labels=1, ignore_labels=-1, raise_value_error=False))
    trainer.extend(PRCAUCEvaluator(
        test_iter, classifier, eval_func=predictor,
        device=args['gpu'], converter=concat_mols, name='val_prc',
        pos_labels=1, ignore_labels=-1))
    trainer.extend(F1Evaluator(
        train_eval_iter, classifier, eval_func=predictor,
        device=args['gpu'], converter=concat_mols, name='train_f',
        pos_labels=1, ignore_labels=-1, raise_value_error=False))
    trainer.extend(F1Evaluator(
        test_iter, classifier, eval_func=predictor,
        device=args['gpu'], converter=concat_mols, name='val_f',
        pos_labels=1, ignore_labels=-1))

    # apply shift strategy to learning rate every 10 epochs
    # trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
    #                trigger=(10, 'epoch'))
    if args['exp_shift_strategy'] == 1:
        trainer.extend(E.ExponentialShift('alpha', args['exp_shift_rate']),
                       trigger=triggers.ManualScheduleTrigger(
                           [10, 20, 30, 40, 50, 60], 'epoch'))
    elif args['exp_shift_strategy'] == 2:
        trainer.extend(E.ExponentialShift('alpha', args['exp_shift_rate']),
                       trigger=triggers.ManualScheduleTrigger(
                           [5, 10, 15, 20, 25, 30], 'epoch'))
    elif args['exp_shift_strategy'] == 3:
        trainer.extend(E.ExponentialShift('alpha', args['exp_shift_rate']),
                       trigger=triggers.ManualScheduleTrigger(
                           [5, 10, 15, 20, 25, 30, 40, 50, 60, 70], 'epoch'))
    else:
        raise ValueError('No such strategy to adapt learning rate')

    # observation of learning rate
    trainer.extend(E.observe_lr(), trigger=(1, 'iteration'))

    entries = [
        'epoch',
        'main/loss', 'train_acc/main/accuracy', 'train_roc/main/roc_auc',
        'train_prc/main/prc_auc',
        # 'train_p/main/precision', 'train_r/main/recall',
        'train_f/main/f1',
        'validation/main/loss', 'val_acc/main/accuracy',
        'val_roc/main/roc_auc', 'val_prc/main/prc_auc',
        # 'val_p/main/precision', 'val_r/main/recall',
        'val_f/main/f1',
        'lr', 'elapsed_time'
    ]
    trainer.extend(E.PrintReport(entries=entries))
    # change from 10 to 2 on Mar. 1 2019
    trainer.extend(E.snapshot(), trigger=(2, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.ProgressBar())
    trainer.extend(E.PlotReport(['main/loss', 'validation/main/loss'],
                                'epoch', file_name='loss.png'))
    trainer.extend(E.PlotReport(['train_acc/main/accuracy',
                                 'val_acc/main/accuracy'],
                                'epoch', file_name='accuracy.png'))

    if args['resume']:
        resume_path = os.path.join(out, args['resume'])
        logging.info(
            'Resume training according to snapshot in {}'.format(resume_path))
        chainer.serializers.load_npz(resume_path, trainer)

    trainer.run()

    # Save the classifier's parameters.
    model_path = os.path.join(out, args['model_filename'])
    logging.info('Saving the trained models to {}...'.format(model_path))
    classifier.save_pickle(model_path, protocol=args['protocol'])

def main():
    # Parse the arguments.
    args = parse_arguments()

    gpu = False
    if args.gpu >= 0:
        import cupy
        gpu = True

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        if gpu:
            return cupy.asarray(label_list, dtype=cupy.int32)
        return numpy.asarray(label_list, dtype=numpy.int32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = set_up_preprocessor(args.method, args.max_atoms)
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    train = parser.parse(args.train_datafile)['dataset']

    # Validation set; a fresh preprocessor avoids state carried over from
    # parsing the training data.
    preprocessor = set_up_preprocessor(args.method, args.max_atoms)
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    val = parser.parse(args.val_datafile)['dataset']

    # Set up the predictor.
    predictor = set_up_predictor(args.method, args.unit_num,
                                 args.conv_layers, class_num,
                                 max_atoms=args.max_atoms)
    if gpu:
        predictor = predictor.to_gpu()

    # Set up the iterators.
    train_iter = SerialIterator(train, args.batchsize)
    val_iter = SerialIterator(val, args.batchsize,
                              repeat=False, shuffle=False)

    # Set up the classifier.
    classifier = Classifier(predictor,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(classifier)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.l2reg))

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    print('Training...')
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Metrics
    trainer.extend(E.Evaluator(val_iter, classifier))
    train_eval_iter = SerialIterator(train, args.batchsize,
                                     repeat=False, shuffle=False)
    trainer.extend(
        ROCAUCEvaluator(train_eval_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='train', pos_labels=1, ignore_labels=-1,
                        raise_value_error=False))
    # The extension name 'validation' is already taken by `Evaluator`,
    # so the name 'val' is used instead.
    trainer.extend(
        ROCAUCEvaluator(val_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='val', pos_labels=1, ignore_labels=-1))

    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'train/main/roc_auc',
            'validation/main/loss', 'validation/main/accuracy',
            'val/main/roc_auc', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the classifier's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    classifier.save_pickle(model_path, protocol=args.protocol)
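
# --- Illustration only; a hedged sketch of the input the CSVFileParser call
# above expects: a 'SMILES' column plus one column per target label.
# Column name and rows below are hypothetical:
#
#   SMILES,activity
#   CCO,0
#   c1ccccc1,1
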
def main():
    parser = argparse.ArgumentParser(
        description='Imbalanced MNIST classification')
    parser.add_argument('--eval-mode', type=int, default=1,
                        help='Evaluation mode. '
                        '0: only binary_accuracy is calculated. '
                        '1: binary_accuracy and ROC-AUC score are '
                        'calculated.')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='batch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID to use. A negative value indicates '
                        'not to use the GPU and to run the code on the CPU.')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to the output directory')
    parser.add_argument('--epoch', '-e', type=int, default=10,
                        help='number of epochs')
    parser.add_argument('--resume', '-r', type=str, default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='frequency of taking a snapshot')
    parser.add_argument('--protocol', type=int, default=2,
                        help='protocol version for pickle')
    parser.add_argument('--model-filename', type=str,
                        default='classifier.pkl',
                        help='file name for the pickled model')
    parser.add_argument('--updater-type', type=str, default='standard')
    parser.add_argument('--sampling-size', type=int, default=32)
    parser.add_argument('--optimizer-type', type=str, default='Adam')
    # NOTE: originally declared as type=str; LRE's alpha is numeric.
    parser.add_argument('--alpha', type=float, default=0.001)
    args = parser.parse_args()

    # Dataset preparation
    train, train_val, val = get_binary_imbalanced_data()

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val, args.batchsize,
                                        repeat=False, shuffle=False)

    model = LeNet(n_class=1, binary=True)
    classifier = Classifier(model,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=args.gpu)

    if args.optimizer_type == 'Adam':
        optimizer = optimizers.Adam()
    else:
        optimizer = optimizers.SGD(lr=1e-3)
    optimizer.setup(classifier)

    updater_type = args.updater_type
    if updater_type == 'standard':
        updater = training.StandardUpdater(train_iter, optimizer,
                                           device=args.gpu)
    elif updater_type == 'proposed':
        updater = Proposed(train_iter, optimizer, device=args.gpu,
                           sampling_size=args.sampling_size)
    elif updater_type == 'LRE':
        # x, t = chainer.dataset.concat_examples(train)  # unused
        train_val_iter = iterators.SerialIterator(train_val, len(train_val))
        updater = LRE({'main': train_iter, 'val': train_val_iter},
                      optimizer, device=args.gpu, alpha=args.alpha)
    else:
        raise ValueError('Invalid updater type {}'.format(updater_type))

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.Evaluator(val_iter, classifier, device=args.gpu))
    trainer.extend(E.LogReport())

    eval_mode = args.eval_mode
    if eval_mode == 0:
        trainer.extend(
            E.PrintReport([
                'epoch', 'main/loss', 'main/accuracy',
                'validation/main/loss', 'validation/main/accuracy',
                'elapsed_time'
            ]))
    elif eval_mode == 1:
        train_eval_iter = iterators.SerialIterator(train, args.batchsize,
                                                   repeat=False,
                                                   shuffle=False)
        trainer.extend(
            ROCAUCEvaluator(train_eval_iter, classifier, eval_func=model,
                            device=args.gpu, name='train', pos_labels=1,
                            ignore_labels=-1, raise_value_error=False))
        # The extension name 'validation' is already taken by `Evaluator`,
        # so the name 'val' is used instead.
        trainer.extend(
            ROCAUCEvaluator(val_iter, classifier, eval_func=model,
                            device=args.gpu, name='val', pos_labels=1,
                            ignore_labels=-1))
        trainer.extend(
            E.PrintReport([
                'epoch', 'main/loss', 'main/accuracy',
                'train/main/roc_auc', 'validation/main/loss',
                'validation/main/accuracy', 'val/main/roc_auc',
                'elapsed_time'
            ]))
    else:
        raise ValueError('Invalid eval_mode {}'.format(eval_mode))

    trainer.extend(E.ProgressBar(update_interval=10))
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    classifier.save_pickle(os.path.join(args.out, args.model_filename),
                           protocol=args.protocol)

def main():
    # Parse the arguments.
    args = parse_arguments()
    args.out = os.path.join(args.out, args.method)
    save_args(args, args.out)

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label_float(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    def postprocess_label_int(label_list):
        return numpy.asarray(label_list, dtype=numpy.int64)

    # Apply a preprocessor to the dataset.
    if args.train:
        # Training data
        fn, ext = os.path.splitext(args.train)
        if ext == '.npz':
            print('Loading training dataset...')
            train = NumpyTupleDataset.load(args.train)
        else:
            print('Preprocessing training dataset...')
            preprocessor = preprocess_method_dict[args.method]()
            if args.classification:
                parser = CSVFileParser(
                    preprocessor, postprocess_label=postprocess_label_int,
                    labels=labels, smiles_col='SMILES')
            else:
                parser = CSVFileParser(
                    preprocessor, postprocess_label=postprocess_label_float,
                    labels=labels, smiles_col='SMILES')
            train = parser.parse(args.train)['dataset']
            NumpyTupleDataset.save(
                os.path.join(args.out, os.path.split(fn)[1]), train)

        # Scale the label values, if necessary.
        if args.scale == 'standardize':
            scaler = StandardScaler()
            scaler.fit(train.get_datasets()[-1])
        else:
            scaler = None

    # Test data
    fn, ext = os.path.splitext(args.val)
    if ext == '.npz':
        print('Loading test dataset...')
        test = NumpyTupleDataset.load(args.val)
    else:
        print('Preprocessing test dataset...')
        preprocessor = preprocess_method_dict[args.method]()
        if args.classification:
            parser = CSVFileParser(
                preprocessor, postprocess_label=postprocess_label_int,
                labels=labels, smiles_col='SMILES')
        else:
            parser = CSVFileParser(
                preprocessor, postprocess_label=postprocess_label_float,
                labels=labels, smiles_col='SMILES')
        test = parser.parse(args.val)['dataset']
        NumpyTupleDataset.save(
            os.path.join(args.out, os.path.split(fn)[1]), test)

    # Set up the model.
    device = chainer.get_device(args.device)
    converter = converter_method_dict[args.method]
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    if args.classification:
        if args.load_model:
            model = Classifier.load_pickle(args.load_model, device=device)
            print('model file loaded:', args.load_model)
        else:
            predictor = set_up_predictor(args.method, args.unit_num,
                                         args.conv_layers, class_num)
            model = Classifier(predictor,
                               lossfun=F.sigmoid_cross_entropy,
                               metrics_fun=F.binary_accuracy,
                               device=device)
    else:
        if args.load_model:
            model = Regressor.load_pickle(args.load_model, device=device)
            print('model file loaded:', args.load_model)
        else:
            predictor = set_up_predictor(
                args.method + args.method_suffix, args.unit_num,
                args.conv_layers, class_num, label_scaler=scaler)
            model = Regressor(predictor,
                              lossfun=F.mean_squared_error,
                              metrics_fun=metrics_fun,
                              device=device)

    if args.train:
        if args.balanced_iter:
            train = BalancedSerialIterator(train, args.batchsize,
                                           train.features[:, -1],
                                           ignore_labels=-1)
            train.show_label_stats()

        print('Training...')
        log_keys = ['main/mae', 'main/rmse', 'validation/main/mae',
                    'validation/main/rmse', 'validation/main/roc_auc']
        extensions_list = [
            extensions.PlotReport(log_keys, 'iteration',
                                  trigger=(100, 'iteration'),
                                  file_name='loss.png')
        ]
        if args.eval_roc and args.classification:
            extensions_list.append(ROCAUCEvaluator(
                test, model, eval_func=predictor, device=device,
                converter=converter, name='validation', pos_labels=1,
                ignore_labels=-1, raise_value_error=False))

        save_json(os.path.join(args.out, 'args.json'), vars(args))
        run_train(model, train, valid=test,
                  batch_size=args.batchsize, epoch=args.epoch,
                  out=args.out, extensions_list=extensions_list,
                  device=device, converter=converter)
        # , resume_path=args.resume)

        # Save the model's parameters.
        model_path = os.path.join(args.out, args.model_filename)
        print('Saving the trained model to {}...'.format(model_path))
        if hasattr(model.predictor.graph_conv, 'reset_state'):
            model.predictor.graph_conv.reset_state()
        model.save_pickle(model_path, protocol=args.protocol)

    # Prediction
    it = SerialIterator(test, args.batchsize, repeat=False, shuffle=False)
    result = []
    for batch in it:
        in_arrays = convert._call_converter(converter, batch, device)
        with chainer.using_config('train', False), \
                chainer.function.no_backprop_mode():
            if isinstance(in_arrays, tuple):
                res = model(*in_arrays)
            elif isinstance(in_arrays, dict):
                res = model(**in_arrays)
            else:
                res = model(in_arrays)
        # NOTE: `.get()` assumes the output lives on the GPU (a CuPy array).
        result.extend(model.y.array.get())
    numpy.savetxt(os.path.join(args.out, 'result.csv'), numpy.array(result))

    eval_result = Evaluator(it, model, converter=converter, device=device)()
    print('Evaluation result: ', eval_result)
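
# --- Illustration only; `model.y.array.get()` in the prediction loop above
# assumes a CuPy (GPU) array. A hedged, device-agnostic alternative uses
# chainer.backends.cuda.to_cpu, which is a no-op for NumPy arrays:
def _to_numpy_sketch(arr):
    from chainer.backends import cuda
    return cuda.to_cpu(arr)
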
def main():
    # Supported preprocessing/network list
    method_list = [
        'nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'relgat',
        'megnet'
    ]
    label_names = D.get_tox21_label_names()
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp',
                        help='graph convolution model to use as a predictor.')
    parser.add_argument('--label', '-l', type=str, choices=label_names,
                        default='',
                        help='target label for logistic regression. '
                        'Use all labels if this option is not specified.')
    parser.add_argument('--iterator-type', type=str, choices=iterator_type,
                        default='serial',
                        help='iterator type. If `balanced` is specified, '
                        'data is sampled to take the same number of '
                        'positive/negative labels during training.')
    parser.add_argument('--eval-mode', type=int, default=1,
                        help='Evaluation mode. '
                        '0: only binary_accuracy is calculated. '
                        '1: binary_accuracy and ROC-AUC score are '
                        'calculated.')
    parser.add_argument('--conv-layers', '-c', type=int, default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='batch size')
    parser.add_argument(
        '--device', type=str, default='-1',
        help='Device specifier. Either a ChainerX device specifier or an '
        'integer. If a non-negative integer, CuPy arrays with the specified '
        'device id are used. If a negative integer, NumPy arrays are used.')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to the output directory')
    parser.add_argument('--epoch', '-e', type=int, default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume', '-r', type=str, default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='frequency of taking a snapshot')
    parser.add_argument('--protocol', type=int, default=2,
                        help='protocol version for pickle')
    parser.add_argument('--model-filename', type=str,
                        default='classifier.pkl',
                        help='file name for the pickled model')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='Number of data to be parsed from parser. '
                        '-1 indicates to parse all data.')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, _ = data.load_dataset(method, labels, num_data=args.num_data)

    # Network
    predictor_ = set_up_predictor(method, args.unit_num, args.conv_layers,
                                  class_num)

    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can be used with only '
                             'one-label classification; please specify the '
                             'label to be predicted by the --label option.')
        train_iter = BalancedSerialIterator(train, args.batchsize,
                                            train.features[:, -1],
                                            ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))

    device = chainer.get_device(args.device)
    classifier = Classifier(predictor_,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=device)

    extensions_list = []
    eval_mode = args.eval_mode
    converter = converter_method_dict[method]
    if eval_mode == 1:
        train_eval_iter = I.SerialIterator(train, args.batchsize,
                                           repeat=False, shuffle=False)
        extensions_list.append(
            ROCAUCEvaluator(train_eval_iter, classifier,
                            eval_func=predictor_, device=device,
                            converter=converter, name='train',
                            pos_labels=1, ignore_labels=-1,
                            raise_value_error=False))
        # The extension name 'validation' is already taken by `Evaluator`,
        # so the name 'val' is used instead.
        val_iter = I.SerialIterator(val, args.batchsize,
                                    repeat=False, shuffle=False)
        extensions_list.append(
            ROCAUCEvaluator(val_iter, classifier, eval_func=predictor_,
                            device=device, converter=converter,
                            name='val', pos_labels=1, ignore_labels=-1))

    run_train(classifier, train_iter, valid=val,
              batch_size=args.batchsize, epoch=args.epoch, out=args.out,
              device=device, converter=converter,
              extensions_list=extensions_list, resume_path=args.resume)
    # frequency = args.epoch if args.frequency == -1 \
    #     else max(1, args.frequency)
    # trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))
    # trainer.run()

    config = {
        'method': args.method,
        'conv_layers': args.conv_layers,
        'unit_num': args.unit_num,
        'labels': args.label
    }
    save_json(os.path.join(args.out, 'config.json'), config)

    classifier.save_pickle(os.path.join(args.out, args.model_filename),
                           protocol=args.protocol)

def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers

    task_type = molnet_default_config[dataset_name]['task_type']
    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }

    print('Using dataset: {}...'.format(dataset_name))

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Load the train and validation parts of the dataset.
    filenames = [dataset_part_filename(p, num_data)
                 for p in ['train', 'valid']]
    paths = [os.path.join(cache_dir, f) for f in filenames]
    if all(os.path.exists(path) for path in paths):
        dataset_parts = []
        for path in paths:
            print('Loading cached dataset from {}.'.format(path))
            dataset_parts.append(NumpyTupleDataset.load(path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data,
                                                labels, method, cache_dir)
    train, valid = dataset_parts[0], dataset_parts[1]

    # # Scale the label values, if necessary.
    # if args.scale == 'standardize':
    #     if task_type == 'regression':
    #         print('Applying standard scaling to the labels.')
    #         datasets, scaler = standardize_dataset_labels(datasets)
    #     else:
    #         print('Label scaling is not available for classification '
    #               'tasks.')
    # else:
    #     print('No label scaling was selected.')
    #     scaler = None

    # Set up the predictor.
    predictor = set_up_predictor(method, n_unit, conv_layers, class_num)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize,
                                          repeat=False, shuffle=False)

    # Load metrics for the current dataset.
    metrics = molnet_default_config[dataset_name]['metrics']
    metrics_fun = {
        k: v
        for k, v in metrics.items() if isinstance(v, types.FunctionType)
    }
    loss_fun = molnet_default_config[dataset_name]['loss']

    if task_type == 'regression':
        model = Regressor(predictor, lossfun=loss_fun,
                          metrics_fun=metrics_fun, device=args.gpu)
        # TODO: use a standard scaler for the regression task
    elif task_type == 'classification':
        model = Classifier(predictor, lossfun=loss_fun,
                           metrics_fun=metrics_fun, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    # Save model-related output to this directory.
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)
    trainer.extend(
        E.Evaluator(valid_iter, model, device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    # Report various metrics.
    print_report_targets = ['epoch', 'main/loss', 'validation/main/loss']
    for metric_name, metric_fun in metrics.items():
        if isinstance(metric_fun, types.FunctionType):
            print_report_targets.append('main/' + metric_name)
            print_report_targets.append('validation/main/' + metric_name)
        elif issubclass(metric_fun, BatchEvaluator):
            trainer.extend(
                metric_fun(valid_iter, model, device=args.gpu,
                           eval_func=predictor, converter=concat_mols,
                           name='val', raise_value_error=False))
            print_report_targets.append('val/main/' + metric_name)
        else:
            raise TypeError('{} is not a supported metrics function.'.format(
                type(metric_fun)))
    print_report_targets.append('elapsed_time')

    # Augmented by Ishiguro
    # TODO: consider go/no-go of the following block:
    # (i) more reporting for validation/evaluation
    # (ii) best validation score snapshot
    if task_type == 'regression':
        metric_name_list = list(metrics.keys())
        if 'RMSE' in metric_name_list:
            trainer.extend(E.snapshot_object(
                model, 'best_val_' + model_filename[task_type]),
                trigger=training.triggers.MinValueTrigger(
                    'validation/main/RMSE'))
        elif 'MAE' in metric_name_list:
            trainer.extend(E.snapshot_object(
                model, 'best_val_' + model_filename[task_type]),
                trigger=training.triggers.MinValueTrigger(
                    'validation/main/MAE'))
        else:
            raise ValueError('No validation metric defined.')
    elif task_type == 'classification':
        train_eval_iter = iterators.SerialIterator(train, args.batchsize,
                                                   repeat=False,
                                                   shuffle=False)
        trainer.extend(
            ROCAUCEvaluator(train_eval_iter, predictor, eval_func=predictor,
                            device=args.gpu, converter=concat_mols,
                            name='train', pos_labels=1, ignore_labels=-1,
                            raise_value_error=False))
        # The extension name 'validation' is already taken by `Evaluator`,
        # so the name 'val' is used instead.
        trainer.extend(
            ROCAUCEvaluator(valid_iter, predictor, eval_func=predictor,
                            device=args.gpu, converter=concat_mols,
                            name='val', pos_labels=1, ignore_labels=-1))
        print_report_targets.append('train/main/roc_auc')
        print_report_targets.append('val/main/roc_auc')
        trainer.extend(
            E.snapshot_object(model,
                              'best_val_' + model_filename[task_type]),
            trigger=training.triggers.MaxValueTrigger('val/main/roc_auc'))
    else:
        raise NotImplementedError(
            'Not implemented task_type = {}'.format(task_type))

    trainer.extend(E.PrintReport(print_report_targets))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)