def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        scaler = StandardScaler()
        scaler.fit(dataset.get_datasets()[-1])
    else:
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, _ = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(
        args.method, args.unit_num, args.conv_layers, class_num,
        label_scaler=scaler)

    # Set up the regressor.
    device = chainer.get_device(args.device)
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=device)

    print('Training...')
    run_train(regressor, train, valid=None,
              batch_size=args.batchsize, epoch=args.epoch,
              out=args.out, extensions_list=None,
              device=device, converter=concat_mols,
              resume_path=None)

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))

    # TODO(nakago): ChainerX array cannot be sent to numpy array when internal
    # state has gradients.
    if hasattr(regressor.predictor.graph_conv, 'reset_state'):
        regressor.predictor.graph_conv.reset_state()

    regressor.save_pickle(model_path, protocol=args.protocol)
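# A minimal inference sketch for the model pickled above, assuming the same
# chainer-chemistry environment; `Regressor.load_pickle`, `concat_mols`,
# `preprocess_method_dict` and `get_input_features` all appear elsewhere in
# these snippets. The path, the 'nfp' method and the benzene SMILES are
# illustrative assumptions, not part of the original script.
import os

import chainer
from rdkit import Chem

from chainer_chemistry.dataset.converters import concat_mols
from chainer_chemistry.dataset.preprocessors import preprocess_method_dict
from chainer_chemistry.models import Regressor


def predict_single_smiles(model_path=os.path.join('result', 'model.pkl'),
                          method='nfp', smiles='c1ccccc1'):
    # Reload the pickled regressor on the CPU and featurize one molecule.
    regressor = Regressor.load_pickle(model_path, device=-1)
    preprocessor = preprocess_method_dict[method]()
    mol = Chem.MolFromSmiles(smiles)
    input_features = preprocessor.get_input_features(mol)
    atoms, adjs = concat_mols([input_features], device=-1)
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        prediction = regressor.predictor(atoms, adjs).array[0]
    return prediction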
def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (labels,)))
    else:
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, _ = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(args.method, args.unit_num, args.conv_layers,
                                 class_num)

    # Set up the iterator.
    train_iter = SerialIterator(train, args.batchsize)

    # Set up the regressor.
    metrics_fun = {'mean_abs_error': MeanAbsError(scaler=scaler),
                   'root_mean_sqr_error': RootMeanSqrError(scaler=scaler)}
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    print('Training...')
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.PrintReport([
        'epoch', 'main/loss', 'main/mean_abs_error',
        'main/root_mean_sqr_error', 'elapsed_time'
    ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)

    # Save the standard scaler's parameters.
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=args.protocol)
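# A short sketch following the script above: since the labels were
# standardized before training, raw predictor outputs live in the scaled
# space, and `scaler.inverse_transform` (used the same way further below)
# recovers the original units. The helper names are hypothetical.
import os
import pickle


def load_scaler(out_dir='result'):
    # Reload the label scaler pickled at the end of training.
    with open(os.path.join(out_dir, 'scaler.pkl'), 'rb') as f:
        return pickle.load(f)


def unscale_predictions(regressor, atoms, adjs, scaler):
    # Map model outputs from the standardized space back to label units.
    scaled = regressor.predictor(atoms, adjs).array
    return scaler.inverse_transform(scaled)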
def main():
    # Supported preprocessing/network methods.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('--datafile', type=str, default='dataset.csv')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', nargs='+',
                        default=['value1', 'value2'],
                        help='target label for regression')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize', help='label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--protocol', type=int, default=2)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        sys.exit('Error: No target label is specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    # Note: reuses the name `parser`; argument parsing is already finished.
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    if args.scale == 'standardize':
        # Standard scaler for the labels.
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (labels,)))
    else:
        # Do not use a scaler.
        scaler = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network setup.
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Training an NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Training a GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Training a SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit,
                   n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Training a WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Training an RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('Invalid method: {}'.format(method))

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val, args.batchsize, repeat=False,
                                        shuffle=False)

    regressor = Regressor(
        model, lossfun=F.mean_squared_error,
        metrics_fun={'abs_error': ScaledAbsError(scaler=scaler)},
        device=args.gpu)

    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.Evaluator(val_iter, regressor, device=args.gpu,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # Note that absolute errors on the original scale are reported in
    # (validation/)main/abs_error.
    trainer.extend(E.PrintReport([
        'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss',
        'validation/main/abs_error', 'elapsed_time'
    ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- Save the regressor's parameters ---
    protocol = args.protocol
    model_path = os.path.join(args.out, 'model.npz')
    print('Saving the trained model to {}...'.format(model_path))
    serializers.save_npz(model_path, regressor)
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=protocol)

    # Example of prediction using the trained model.
    smiles = 'c1ccccc1'
    mol = Chem.MolFromSmiles(smiles)
    preprocessor = preprocess_method_dict[method]()
    standardized_smiles, mol = preprocessor.prepare_smiles_and_mol(mol)
    input_features = preprocessor.get_input_features(mol)
    atoms, adjs = concat_mols([input_features], device=args.gpu)
    prediction = model(atoms, adjs).data[0]
    if scaler is not None:
        prediction = scaler.inverse_transform(prediction)
    print('Prediction for {}:'.format(smiles))
    for i, label in enumerate(args.label):
        print('{}: {}'.format(label, prediction[i]))
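# A restore sketch under the same assumptions as the script above: because
# `serializers.save_npz` stores parameters only, the identical `regressor`
# architecture has to be rebuilt before loading. The helper name is
# hypothetical.
import os
import pickle

from chainer import serializers


def restore(regressor, out_dir='result'):
    # Load saved weights and the label scaler back into a freshly built model.
    serializers.load_npz(os.path.join(out_dir, 'model.npz'), regressor)
    scaler = None
    scaler_path = os.path.join(out_dir, 'scaler.pkl')
    if os.path.exists(scaler_path):
        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)
    return regressor, scaler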
def model():
    # `links` (chainer.links), `Regressor` and `output_dim` are expected to
    # be defined in the enclosing module; Linear(None, ...) infers its input
    # size lazily on the first forward pass.
    return Regressor(links.Linear(None, output_dim))
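# A toy training-step sketch for the factory above, assuming Regressor's
# default mean-squared-error loss; the synthetic data, the input width of 4
# and the `output_dim` value are illustrative assumptions.
import numpy
from chainer import links, optimizers  # `links` is needed by model() above

output_dim = 1

reg = model()
optimizer = optimizers.Adam()
optimizer.setup(reg)

x = numpy.random.rand(8, 4).astype(numpy.float32)
t = numpy.random.rand(8, output_dim).astype(numpy.float32)

loss = reg(x, t)   # Regressor computes the loss from (input, target) pairs
reg.cleargrads()
loss.backward()
optimizer.update()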
def main():
    # Parse the arguments.
    args = parse_arguments()
    args.model_folder_name = os.path.join(theme_name, 'chainer')

    base_epoch = complexity_degree[high_low]
    args.epoch = int(base_epoch * 60 / method_complexity[method_name])
    args.epoch = max(args.epoch, 5)
    # args.epoch = int(float(t_epochs.get()))

    args.out = parent_path / 'models' / theme_name / method_name / high_low
    args.method = method_name

    if t_model_path.get() != "":
        args.source_transferlearning = Path(t_model_path.get())

    print(theme_name)

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    smiles_col_name = t_smiles.get()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col=smiles_col_name)
    args.datafile = t_csv_filepath.get()
    dataset = parser.parse(args.datafile)['dataset']

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        scaler = StandardScaler()
        scaler.fit(dataset.get_datasets()[-1])
    else:
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    trainset, testset = split_dataset_random(dataset, train_data_size,
                                             args.seed)

    print(args.source_transferlearning / method_name / high_low
          / 'regressor.pickle')
    print((args.source_transferlearning / method_name / high_low
           / 'regressor.pickle').exists())

    # Set up the predictor.
    if (Booleanvar_transfer_learning.get()
            and (args.source_transferlearning / method_name / high_low
                 / 'regressor.pickle').exists()):
        # See https://github.com/pfnet-research/chainer-chemistry/issues/407
        with open(args.source_transferlearning / method_name / high_low
                  / 'regressor.pickle', 'rb') as f:
            regressor = cloudpickle.loads(f.read())
        pre_predictor = regressor.predictor
        predictor = GraphConvPredictor(pre_predictor.graph_conv,
                                       MLP(out_dim=1, hidden_dim=16))
    else:
        predictor = set_up_predictor(args.method, args.unit_num,
                                     args.conv_layers, class_num,
                                     label_scaler=scaler)

    # Set up the regressor.
    device = chainer.get_device(args.device)
    metrics_fun = {'mae': functions.mean_absolute_error, 'rmse': rmse}
    regressor = Regressor(predictor, lossfun=functions.mean_squared_error,
                          metrics_fun=metrics_fun, device=device)

    print('Training... : ', method_name)
    run_train(regressor, trainset, valid=None,
              batch_size=args.batchsize, epoch=args.epoch,
              out=args.out, extensions_list=None,
              device=device, converter=concat_mols,
              resume_path=None)

    # Save the regressor's parameters.
    args.model_foldername = t_theme_name.get()
    model_path = os.path.join(args.out, args.model_foldername,
                              args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))

    # TODO(nakago): ChainerX array cannot be sent to numpy array when internal
    # state has gradients.
    if hasattr(regressor.predictor.graph_conv, 'reset_state'):
        regressor.predictor.graph_conv.reset_state()

    with open(parent_path / 'models' / theme_name / method_name / high_low
              / 'regressor.pickle', 'wb') as f:
        cloudpickle.dump(regressor, f)
    # with open(parent_path / 'models' / theme_name / method_name / high_low
    #           / 'predictor.pickle', 'wb') as f:
    #     cloudpickle.dump(predictor, f)

    print('Evaluating... : ', method_name)
    test_iterator = SerialIterator(testset, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor,
                            converter=concat_mols, device=device)()
    print('Evaluation result : ', method_name)
    print(eval_result)

    # Converter that drops the label column so regressor.predict sees only
    # the model inputs.
    @chainer.dataset.converter()
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    pred_train = regressor.predict(trainset, converter=extract_inputs)
    pred_train = [i[0] for i in pred_train]
    pred_test = regressor.predict(testset, converter=extract_inputs)
    pred_test = [i[0] for i in pred_test]
    y_train = [i[2][0] for i in trainset]
    y_test = [i[2][0] for i in testset]

    title = args.label
    save_path = (parent_path / 'results' / theme_name / method_name
                 / high_low / 'scatter.png')
    save_scatter(y_train, pred_train, y_test, pred_test, title, save_path)

    global image_score
    image_score_open = Image.open(parent_path / 'results' / theme_name
                                  / method_name / high_low / 'scatter.png')
    image_score = ImageTk.PhotoImage(image_score_open, master=frame1)
    canvas.create_image(200, 200, image=image_score)

    from sklearn.metrics import mean_squared_error, mean_absolute_error
    from sklearn.metrics import r2_score

    train_mse = mean_squared_error(y_train, pred_train)
    test_mse = mean_squared_error(y_test, pred_test)
    train_rmse = np.sqrt(train_mse)
    test_rmse = np.sqrt(test_mse)
    train_mae = mean_absolute_error(y_train, pred_train)
    test_mae = mean_absolute_error(y_test, pred_test)
    train_r2score = r2_score(y_train, pred_train)
    test_r2score = r2_score(y_test, pred_test)

    print('train_mse : ', train_mse)
    print('test_mse : ', test_mse)
    print('train_rmse : ', train_rmse)
    print('test_rmse : ', test_rmse)
    print('train_mae : ', train_mae)
    print('test_mae : ', test_mae)
    print('train_r2score : ', train_r2score)
    print('test_r2score : ', test_r2score)
def main():
    # Parse the arguments.
    args = parse_arguments()
    args.out = os.path.join(args.out, args.method)
    save_args(args, args.out)

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label_float(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    def postprocess_label_int(label_list):
        return numpy.asarray(label_list, dtype=numpy.int64)

    scaler = None  # replaced below if the training labels are standardized

    # Apply a preprocessor to the dataset.
    if args.train:
        # Training data.
        fn, ext = os.path.splitext(args.train)
        if ext == '.npz':
            print('Loading training dataset...')
            train = NumpyTupleDataset.load(args.train)
        else:
            print('Preprocessing training dataset...')
            preprocessor = preprocess_method_dict[args.method]()
            if args.classification:
                parser = CSVFileParser(
                    preprocessor, postprocess_label=postprocess_label_int,
                    labels=labels, smiles_col='SMILES')
            else:
                parser = CSVFileParser(
                    preprocessor, postprocess_label=postprocess_label_float,
                    labels=labels, smiles_col='SMILES')
            train = parser.parse(args.train)['dataset']
            NumpyTupleDataset.save(
                os.path.join(args.out, os.path.split(fn)[1]), train)

        # Scale the label values, if necessary.
        if args.scale == 'standardize':
            scaler = StandardScaler()
            scaler.fit(train.get_datasets()[-1])

    # Test data.
    fn, ext = os.path.splitext(args.val)
    if ext == '.npz':
        print('Loading test dataset...')
        test = NumpyTupleDataset.load(args.val)
    else:
        print('Preprocessing test dataset...')
        preprocessor = preprocess_method_dict[args.method]()
        if args.classification:
            parser = CSVFileParser(
                preprocessor, postprocess_label=postprocess_label_int,
                labels=labels, smiles_col='SMILES')
        else:
            parser = CSVFileParser(
                preprocessor, postprocess_label=postprocess_label_float,
                labels=labels, smiles_col='SMILES')
        test = parser.parse(args.val)['dataset']
        NumpyTupleDataset.save(os.path.join(args.out, os.path.split(fn)[1]),
                               test)

    # Set up the model.
    device = chainer.get_device(args.device)
    converter = converter_method_dict[args.method]
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    if args.classification:
        if args.load_model:
            model = Classifier.load_pickle(args.load_model, device=device)
            print('Model file loaded:', args.load_model)
        else:
            predictor = set_up_predictor(args.method, args.unit_num,
                                         args.conv_layers, class_num)
            model = Classifier(predictor, lossfun=F.sigmoid_cross_entropy,
                               metrics_fun=F.binary_accuracy, device=device)
    else:
        if args.load_model:
            model = Regressor.load_pickle(args.load_model, device=device)
            print('Model file loaded:', args.load_model)
        else:
            predictor = set_up_predictor(
                args.method + args.method_suffix, args.unit_num,
                args.conv_layers, class_num, label_scaler=scaler)
            model = Regressor(predictor, lossfun=F.mean_squared_error,
                              metrics_fun=metrics_fun, device=device)

    if args.train:
        if args.balanced_iter:
            train = BalancedSerialIterator(train, args.batchsize,
                                           train.features[:, -1],
                                           ignore_labels=-1)
            train.show_label_stats()

        print('Training...')
        log_keys = ['main/mae', 'main/rmse', 'validation/main/mae',
                    'validation/main/rmse', 'validation/main/roc_auc']
        extensions_list = [extensions.PlotReport(
            log_keys, 'iteration', trigger=(100, 'iteration'),
            file_name='loss.png')]
        if args.eval_roc and args.classification:
            extensions_list.append(ROCAUCEvaluator(
                test, model, eval_func=model.predictor,
                device=device, converter=converter, name='validation',
                pos_labels=1, ignore_labels=-1, raise_value_error=False))

        save_json(os.path.join(args.out, 'args.json'), vars(args))
        run_train(model, train, valid=test,
                  batch_size=args.batchsize, epoch=args.epoch,
                  out=args.out, extensions_list=extensions_list,
                  device=device, converter=converter)
        # , resume_path=args.resume)

        # Save the model's parameters.
        model_path = os.path.join(args.out, args.model_filename)
        print('Saving the trained model to {}...'.format(model_path))
        if hasattr(model.predictor.graph_conv, 'reset_state'):
            model.predictor.graph_conv.reset_state()
        model.save_pickle(model_path, protocol=args.protocol)

    # Prediction.
    it = SerialIterator(test, args.batchsize, repeat=False, shuffle=False)
    result = []
    for batch in it:
        in_arrays = convert._call_converter(converter, batch, device)
        with chainer.using_config('train', False), \
                chainer.function.no_backprop_mode():
            if isinstance(in_arrays, tuple):
                res = model(*in_arrays)
            elif isinstance(in_arrays, dict):
                res = model(**in_arrays)
            else:
                res = model(in_arrays)
        # `model.y` may live on the GPU; always bring it back as NumPy.
        result.extend(chainer.backends.cuda.to_cpu(model.y.array))
    numpy.savetxt(os.path.join(args.out, 'result.csv'), numpy.array(result))

    eval_result = Evaluator(it, model, converter=converter, device=device)()
    print('Evaluation result:', eval_result)
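# A typical invocation of the script above, assuming parse_arguments()
# exposes the flags referenced in the code (names inferred from the attribute
# accesses; the exact spellings may differ):
#
#   python train.py --train train.csv --val val.csv --method nfp \
#       --label value1 --epoch 20 --batchsize 32 --out result
#
# With --classification the model becomes a Classifier trained with sigmoid
# cross-entropy; otherwise a Regressor with mean-squared-error loss is used,
# and predictions on the validation set are written to result/<method>/result.csv.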