def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        scaler = StandardScaler()
        scaler.fit(dataset.get_datasets()[-1])
    else:
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, _ = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(
        args.method, args.unit_num, args.conv_layers, class_num,
        label_scaler=scaler)

    # Set up the regressor.
    device = chainer.get_device(args.device)
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=device)

    print('Training...')
    run_train(regressor, train, valid=None,
              batch_size=args.batchsize, epoch=args.epoch, out=args.out,
              extensions_list=None, device=device, converter=concat_mols,
              resume_path=None)

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))

    # TODO(nakago): ChainerX array cannot be sent to numpy array when internal
    # state has gradients.
    if hasattr(regressor.predictor.graph_conv, 'reset_state'):
        regressor.predictor.graph_conv.reset_state()

    regressor.save_pickle(model_path, protocol=args.protocol)
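# The training script above assumes a small `rmse` metric helper alongside
# Chainer's built-in mean_absolute_error. A minimal sketch of such a helper
# (the name `rmse` comes from the metrics_fun dict above; this exact
# implementation is an assumption, not the library's own code):
import chainer.functions as F


def rmse(x0, x1):
    # Root mean squared error on Chainer variables: the square root of the
    # mean squared error, so the metric stays a Chainer computation.
    return F.sqrt(F.mean_squared_error(x0, x1))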
def test_csv_file_parser_target_index(csv_file_invalid, mols, label_a):
    """Test the `target_index` option together with `return_smiles` and
    `return_is_successful`."""
    preprocessor = NFPPreprocessor()
    parser = CSVFileParser(preprocessor, labels='labelA', smiles_col='smiles')
    result = parser.parse(csv_file_invalid, return_smiles=True,
                          target_index=[1, 2, 4], return_is_successful=True)
    dataset = result['dataset']
    smiles = result['smiles']
    assert len(dataset) == 2
    is_successful = result['is_successful']
    assert numpy.array_equal(is_successful, numpy.array([True, False, True]))
    assert len(is_successful) == 3

    # As we want to test CSVFileParser, we assume
    # NFPPreprocessor works as documented.
    expect = preprocessor.get_input_features(mols[0])
    check_features(dataset[0], expect, label_a[0])

    expect = preprocessor.get_input_features(mols[2])
    check_features(dataset[1], expect, label_a[2])

    # Check the smiles array.
    assert isinstance(smiles, numpy.ndarray)
    assert smiles.ndim == 1
    assert len(smiles) == len(dataset)
    assert smiles[0] == 'CN=C=O'
    assert smiles[1] == 'CC1=CC2CC(CC1)O2'
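# The test above relies on a `csv_file_invalid` fixture with five rows, of
# which rows 0 and 2 fail preprocessing. A hypothetical sketch of such a
# fixture (the label values and the unparsable SMILES strings are invented
# for illustration; only the row layout is implied by the assertions above):
import pytest


@pytest.fixture
def csv_file_invalid(tmpdir):
    fname = tmpdir.join('test_invalid.csv')
    fname.write('smiles,labelA\n'
                'var,1.1\n'                # row 0: unparsable SMILES
                'CN=C=O,1.2\n'             # row 1: valid
                'hoge,1.3\n'               # row 2: unparsable SMILES
                'Cc1ccccc1,1.4\n'          # row 3: valid
                'CC1=CC2CC(CC1)O2,1.5\n')  # row 4: valid
    return str(fname)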
def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    # Load the standard scaler parameters, if necessary.
    if args.scale == 'standardize':
        with open(os.path.join(args.in_dir, 'scaler.pkl'), mode='rb') as f:
            scaler = pickle.load(f)
    else:
        scaler = None
    test = dataset

    print('Predicting...')
    # Set up the regressor.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)
    scaled_predictor = ScaledGraphConvPredictor(regressor.predictor)
    scaled_predictor.scaler = scaler
    regressor.predictor = scaled_predictor

    # Perform the prediction.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor,
                            converter=concat_mols, device=args.gpu)()

    # Prevents the reported loss from remaining a cupy.core.core.ndarray
    # object when using the GPU. This hack will be removed as soon as the
    # cause of the issue is found and properly fixed.
    loss = cuda.to_cpu(eval_result['main/loss']).item()
    eval_result['main/loss'] = loss
    print('Evaluation result: ', eval_result)

    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)
def test_csv_file_parser(csv_file, mols):
    preprocessor = NFPPreprocessor()
    parser = CSVFileParser(preprocessor, smiles_col='smiles')
    dataset = parser.parse(csv_file)
    assert len(dataset) == 2

    # As we want to test CSVFileParser, we assume
    # NFPPreprocessor works as documented.
    expect = preprocessor.get_input_features(mols[0])
    check_input_features(dataset[0], expect)

    expect = preprocessor.get_input_features(mols[1])
    check_input_features(dataset[1], expect)
def test_csv_file_parser_not_return_smiles(csv_file, mols):
    preprocessor = NFPPreprocessor()
    parser = CSVFileParser(preprocessor, smiles_col='smiles')
    # `parse` returns a dict; with `return_smiles=False`, the 'smiles'
    # entry is simply None.
    result = parser.parse(csv_file, return_smiles=False)
    dataset = result['dataset']
    smiles = result['smiles']
    assert len(dataset) == 3
    assert smiles is None

    # As we want to test CSVFileParser, we assume
    # NFPPreprocessor works as documented.
    for i in range(3):
        expect = preprocessor.get_input_features(mols[i])
        check_input_features(dataset[i], expect)
def test_csv_file_parser_not_return_smiles(csv_file, mols):
    preprocessor = NFPPreprocessor()
    parser = CSVFileParser(preprocessor, smiles_col='smiles')
    # `parse` returns a dict; the 'smiles' and 'is_successful' entries are
    # None when the corresponding options are left off.
    result = parser.parse(csv_file, return_smiles=False)
    dataset = result['dataset']
    smiles = result['smiles']
    is_successful = result['is_successful']
    assert len(dataset) == 3
    assert smiles is None
    assert is_successful is None

    # As we want to test CSVFileParser, we assume
    # NFPPreprocessor works as documented.
    for i in range(3):
        expect = preprocessor.get_input_features(mols[i])
        check_input_features(dataset[i], expect)
def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    # Load the standard scaler parameters, if necessary.
    if args.scale == 'standardize':
        with open(os.path.join(args.in_dir, 'scaler.pkl'), mode='rb') as f:
            scaler = pickle.load(f)
    else:
        scaler = None
    test = dataset

    print('Predicting...')
    # Set up the regressor.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)
    scaled_predictor = ScaledGraphConvPredictor(regressor.predictor)
    scaled_predictor.scaler = scaler
    regressor.predictor = scaled_predictor

    # Perform the prediction.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor,
                            converter=concat_mols, device=args.gpu)()
    print('Evaluation result: ', eval_result)

    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)
def test_csv_parser_return_is_successful(csv_file_invalid, mols, label_a):
    """Test the `return_is_successful` option."""
    preprocessor = NFPPreprocessor()
    parser = CSVFileParser(preprocessor, labels='labelA', smiles_col='smiles')
    result = parser.parse(csv_file_invalid, return_smiles=True,
                          return_is_successful=True)
    dataset = result['dataset']
    assert len(dataset) == 3
    is_successful = result['is_successful']
    assert len(is_successful) == 5
    # Rows 1, 3 and 4 parse successfully; rows 0 and 2 fail.
    assert numpy.all(is_successful[[1, 3, 4]])
    assert numpy.all(~is_successful[[0, 2]])

    # We assume NFPPreprocessor works as documented.
    for i in range(3):
        expect = preprocessor.get_input_features(mols[i])
        check_features(dataset[i], expect, label_a[i])
def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']
    test = dataset

    print('Predicting...')
    # Set up the regressor.
    device = chainer.get_device(args.device)
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=device)

    # Perform the prediction.
    print('Evaluating...')
    converter = converter_method_dict[args.method]
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor,
                            converter=converter, device=device)()
    print('Evaluation result: ', eval_result)

    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)
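# `converter_method_dict` above maps each method name to the batch converter
# it needs. A plausible stand-in, assuming every graph-convolution method
# used in these scripts batches molecules with `concat_mols` (the real
# library's dict may cover more methods):
from chainer_chemistry.dataset.converters import concat_mols

converter_method_dict = {
    'nfp': concat_mols,
    'ggnn': concat_mols,
    'schnet': concat_mols,
    'weavenet': concat_mols,
    'rsgcn': concat_mols,
}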
def test_csv_file_parser_return_smiles(csv_file, mols, label_a):
    """Test the `labels` option and `return_smiles=True`."""
    preprocessor = NFPPreprocessor()
    parser = CSVFileParser(preprocessor, labels='labelA', smiles_col='smiles')
    result = parser.parse(csv_file, return_smiles=True)
    dataset = result['dataset']
    smiles = result['smiles']
    assert len(dataset) == 3

    # As we want to test CSVFileParser, we assume
    # NFPPreprocessor works as documented.
    for i in range(3):
        expect = preprocessor.get_input_features(mols[i])
        check_features(dataset[i], expect, label_a[i])

    # Check the smiles array.
    assert isinstance(smiles, numpy.ndarray)
    assert smiles.ndim == 1
    assert len(smiles) == len(dataset)
    assert smiles[0] == 'CN=C=O'
    assert smiles[1] == 'Cc1ccccc1'
    assert smiles[2] == 'CC1=CC2CC(CC1)O2'
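# The three-row tests above imply a `csv_file` fixture shaped roughly like
# this sketch (the label values are invented for illustration; the SMILES
# strings and column names come from the assertions above):
import pytest


@pytest.fixture
def csv_file(tmpdir):
    fname = tmpdir.join('test.csv')
    fname.write('smiles,labelA\n'
                'CN=C=O,1.2\n'
                'Cc1ccccc1,1.4\n'
                'CC1=CC2CC(CC1)O2,1.5\n')
    return str(fname)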
def test_csv_file_parser_retain_smiles(csv_file, mols, label_a):
    """Test the `labels` option and `retain_smiles=True`."""
    preprocessor = NFPPreprocessor()
    parser = CSVFileParser(preprocessor, labels='labelA', smiles_col='smiles')
    dataset = parser.parse(csv_file, retain_smiles=True)
    smiles = parser.get_smiles()
    assert len(dataset) == 2

    # As we want to test CSVFileParser, we assume
    # NFPPreprocessor works as documented.
    expect0 = preprocessor.get_input_features(mols[0])
    check_features(dataset[0], expect0, label_a[0])

    expect1 = preprocessor.get_input_features(mols[1])
    check_features(dataset[1], expect1, label_a[1])

    # Check the smiles array.
    assert isinstance(smiles, numpy.ndarray)
    assert smiles.ndim == 1
    assert len(smiles) == len(dataset)
    assert smiles[0] == 'CN=C=O'
    assert smiles[1] == 'Cc1ccccc1'
def test_csv_file_parser_extract_total_num(csv_file):
    preprocessor = NFPPreprocessor()
    parser = CSVFileParser(preprocessor, labels='labelA', smiles_col='smiles')
    num = parser.extract_total_num(csv_file)
    assert num == 3
def main():
    # Parse the arguments.
    args = parse_arguments()
    args.out = os.path.join(args.out, args.method)
    save_args(args, args.out)

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label_float(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    def postprocess_label_int(label_list):
        return numpy.asarray(label_list, dtype=numpy.int64)

    # Apply a preprocessor to the dataset.
    scaler = None  # Stays None unless training data is given and scaling is on.
    if args.train:
        ## training data
        fn, ext = os.path.splitext(args.train)
        if ext == '.npz':
            print('Loading training dataset...')
            train = NumpyTupleDataset.load(args.train)
        else:
            print('Preprocessing training dataset...')
            preprocessor = preprocess_method_dict[args.method]()
            if args.classification:
                parser = CSVFileParser(
                    preprocessor, postprocess_label=postprocess_label_int,
                    labels=labels, smiles_col='SMILES')
            else:
                parser = CSVFileParser(
                    preprocessor, postprocess_label=postprocess_label_float,
                    labels=labels, smiles_col='SMILES')
            train = parser.parse(args.train)['dataset']
            NumpyTupleDataset.save(
                os.path.join(args.out, os.path.split(fn)[1]), train)

        # Scale the label values, if necessary.
        if args.scale == 'standardize':
            scaler = StandardScaler()
            scaler.fit(train.get_datasets()[-1])

    ## test data
    fn, ext = os.path.splitext(args.val)
    if ext == '.npz':
        print('Loading test dataset...')
        test = NumpyTupleDataset.load(args.val)
    else:
        print('Preprocessing test dataset...')
        preprocessor = preprocess_method_dict[args.method]()
        if args.classification:
            parser = CSVFileParser(
                preprocessor, postprocess_label=postprocess_label_int,
                labels=labels, smiles_col='SMILES')
        else:
            parser = CSVFileParser(
                preprocessor, postprocess_label=postprocess_label_float,
                labels=labels, smiles_col='SMILES')
        test = parser.parse(args.val)['dataset']
        NumpyTupleDataset.save(
            os.path.join(args.out, os.path.split(fn)[1]), test)

    # Set up the model.
    device = chainer.get_device(args.device)
    converter = converter_method_dict[args.method]
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    if args.classification:
        if args.load_model:
            model = Classifier.load_pickle(args.load_model, device=device)
            print('model file loaded: ', args.load_model)
        else:
            predictor = set_up_predictor(args.method, args.unit_num,
                                         args.conv_layers, class_num)
            model = Classifier(predictor, lossfun=F.sigmoid_cross_entropy,
                               metrics_fun=F.binary_accuracy, device=device)
    else:
        if args.load_model:
            model = Regressor.load_pickle(args.load_model, device=device)
            print('model file loaded: ', args.load_model)
        else:
            predictor = set_up_predictor(
                args.method + args.method_suffix, args.unit_num,
                args.conv_layers, class_num, label_scaler=scaler)
            model = Regressor(predictor, lossfun=F.mean_squared_error,
                              metrics_fun=metrics_fun, device=device)

    if args.train:
        if args.balanced_iter:
            train = BalancedSerialIterator(train, args.batchsize,
                                           train.features[:, -1],
                                           ignore_labels=-1)
            train.show_label_stats()

        print('Training...')
        log_keys = ['main/mae', 'main/rmse', 'validation/main/mae',
                    'validation/main/rmse', 'validation/main/roc_auc']
        extensions_list = [extensions.PlotReport(
            log_keys, 'iteration', trigger=(100, 'iteration'),
            file_name='loss.png')]
        if args.eval_roc and args.classification:
            extensions_list.append(ROCAUCEvaluator(
                test, model, eval_func=predictor,
                device=device, converter=converter, name='validation',
                pos_labels=1, ignore_labels=-1, raise_value_error=False))

        save_json(os.path.join(args.out, 'args.json'), vars(args))
        run_train(model, train, valid=test,
                  batch_size=args.batchsize, epoch=args.epoch,
                  out=args.out, extensions_list=extensions_list,
                  device=device, converter=converter)  # , resume_path=args.resume

        # Save the model's parameters.
        model_path = os.path.join(args.out, args.model_filename)
        print('Saving the trained model to {}...'.format(model_path))
        if hasattr(model.predictor.graph_conv, 'reset_state'):
            model.predictor.graph_conv.reset_state()
        model.save_pickle(model_path, protocol=args.protocol)

    ## prediction
    it = SerialIterator(test, args.batchsize, repeat=False, shuffle=False)
    result = []
    for batch in it:
        in_arrays = convert._call_converter(converter, batch, device)
        with chainer.using_config('train', False), \
                chainer.function.no_backprop_mode():
            if isinstance(in_arrays, tuple):
                res = model(*in_arrays)
            elif isinstance(in_arrays, dict):
                res = model(**in_arrays)
            else:
                res = model(in_arrays)
        # cuda.to_cpu works on both CPU and GPU arrays; the original
        # `.get()` call is cupy-only.
        result.extend(cuda.to_cpu(model.y.array))
    numpy.savetxt(os.path.join(args.out, 'result.csv'), numpy.array(result))

    eval_result = Evaluator(it, model, converter=converter, device=device)()
    print('Evaluation result: ', eval_result)
def main():
    # Supported preprocessing/network list.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('--datafile', type=str, default='dataset.csv')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', nargs='+',
                        default=['value1', 'value2'],
                        help='target label for regression')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize', help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--protocol', type=int, default=2)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        sys.exit('Error: No target label is specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    if args.scale == 'standardize':
        # Standard scaler for the labels.
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (labels,)))
    else:
        # Do not use a scaler.
        scaler = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network.
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit,
                   n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val, args.batchsize,
                                        repeat=False, shuffle=False)

    regressor = Regressor(
        model, lossfun=F.mean_squared_error,
        metrics_fun={'abs_error': ScaledAbsError(scaler=scaler)},
        device=args.gpu)

    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter, regressor, device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # Note that original-scale absolute errors are reported in
    # (validation/)main/abs_error.
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/abs_error',
            'validation/main/loss', 'validation/main/abs_error',
            'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save regressor's parameters ---
    protocol = args.protocol
    model_path = os.path.join(args.out, 'model.npz')
    print('saving trained model to {}'.format(model_path))
    serializers.save_npz(model_path, regressor)
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=protocol)

    # Example of prediction using the trained model.
    smiles = 'c1ccccc1'
    mol = Chem.MolFromSmiles(smiles)
    preprocessor = preprocess_method_dict[method]()
    standardized_smiles, mol = preprocessor.prepare_smiles_and_mol(mol)
    input_features = preprocessor.get_input_features(mol)
    atoms, adjs = concat_mols([input_features], device=args.gpu)
    prediction = model(atoms, adjs).data[0]
    if scaler is not None:
        prediction = scaler.inverse_transform(prediction)
    print('Prediction for {}:'.format(smiles))
    for i, label in enumerate(args.label):
        print('{}: {}'.format(label, prediction[i]))
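# The `ScaledAbsError` metric used above reports absolute errors in the
# original label scale. A minimal sketch, assuming the same inverse-transform
# logic as the `scaled_abs_error` function that appears later in this section
# (an assumption, not the library's own implementation):
import numpy
from chainer import Variable, cuda


class ScaledAbsError(object):

    def __init__(self, scaler=None):
        self.scaler = scaler

    def __call__(self, x0, x1):
        # Unwrap Chainer variables and move everything to the CPU.
        if isinstance(x0, Variable):
            x0 = x0.array
        if isinstance(x1, Variable):
            x1 = x1.array
        x0 = cuda.to_cpu(x0)
        x1 = cuda.to_cpu(x1)
        # Undo the label standardization before measuring the error.
        if self.scaler is not None:
            x0 = self.scaler.inverse_transform(x0)
            x1 = self.scaler.inverse_transform(x1)
        return numpy.mean(numpy.absolute(x0 - x1))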
def main():
    # Parse the arguments.
    args = parse_arguments()
    args.model_folder_name = os.path.join(theme_name, 'chainer')

    base_epoch = complexity_degree[high_low]
    args.epoch = int(base_epoch * 60 / method_complexity[method_name])
    args.epoch = max(args.epoch, 5)
    # args.epoch = int(float(t_epochs.get()))

    args.out = parent_path / 'models' / theme_name / method_name / high_low
    args.method = method_name

    if t_model_path != "":
        args.source_transferlearning = Path(t_model_path.get())

    print(theme_name)

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    smiles_col_name = t_smiles.get()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col=smiles_col_name)
    args.datafile = t_csv_filepath.get()
    dataset = parser.parse(args.datafile)['dataset']

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        scaler = StandardScaler()
        scaler.fit(dataset.get_datasets()[-1])
    else:
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    trainset, testset = split_dataset_random(dataset, train_data_size,
                                             args.seed)

    print(args.source_transferlearning / method_name / high_low
          / 'regressor.pickle')
    print((args.source_transferlearning / method_name / high_low
           / 'regressor.pickle').exists())

    # Set up the predictor.
    if Booleanvar_transfer_learning.get() \
            and (args.source_transferlearning / method_name / high_low
                 / 'regressor.pickle').exists():
        # See https://github.com/pfnet-research/chainer-chemistry/issues/407
        with open(args.source_transferlearning / method_name / high_low
                  / 'regressor.pickle', 'rb') as f:
            regressor = cloudpickle.loads(f.read())
        pre_predictor = regressor.predictor
        predictor = GraphConvPredictor(pre_predictor.graph_conv,
                                       MLP(out_dim=1, hidden_dim=16))
    else:
        predictor = set_up_predictor(args.method, args.unit_num,
                                     args.conv_layers, class_num,
                                     label_scaler=scaler)

    # Set up the regressor.
    device = chainer.get_device(args.device)
    metrics_fun = {'mae': functions.mean_absolute_error, 'rmse': rmse}
    regressor = Regressor(predictor, lossfun=functions.mean_squared_error,
                          metrics_fun=metrics_fun, device=device)

    print('Training... : ', method_name)
    run_train(regressor, trainset, valid=None,
              batch_size=args.batchsize, epoch=args.epoch, out=args.out,
              extensions_list=None, device=device, converter=concat_mols,
              resume_path=None)

    # Save the regressor's parameters.
    args.model_foldername = t_theme_name.get()
    model_path = os.path.join(args.out, args.model_foldername,
                              args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))

    # TODO(nakago): ChainerX array cannot be sent to numpy array when internal
    # state has gradients.
    if hasattr(regressor.predictor.graph_conv, 'reset_state'):
        regressor.predictor.graph_conv.reset_state()

    with open(parent_path / 'models' / theme_name / method_name / high_low
              / 'regressor.pickle', 'wb') as f:
        cloudpickle.dump(regressor, f)
    # with open(parent_path / 'models' / theme_name / method_name / high_low
    #           / 'predictor.pickle', 'wb') as f:
    #     cloudpickle.dump(predictor, f)

    print('Evaluating... : ', method_name)
    test_iterator = SerialIterator(testset, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor,
                            converter=concat_mols, device=device)()
    print('Evaluation result : ', method_name)
    print(eval_result)

    @chainer.dataset.converter()
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    pred_train = regressor.predict(trainset, converter=extract_inputs)
    pred_train = [i[0] for i in pred_train]
    pred_test = regressor.predict(testset, converter=extract_inputs)
    pred_test = [i[0] for i in pred_test]
    y_train = [i[2][0] for i in trainset]
    y_test = [i[2][0] for i in testset]

    title = args.label
    save_path = (parent_path / 'results' / theme_name / method_name
                 / high_low / 'scatter.png')
    save_scatter(y_train, pred_train, y_test, pred_test, title, save_path)

    global image_score
    image_score_open = Image.open(parent_path / 'results' / theme_name
                                  / method_name / high_low / 'scatter.png')
    image_score = ImageTk.PhotoImage(image_score_open, master=frame1)
    canvas.create_image(200, 200, image=image_score)

    from sklearn.metrics import mean_squared_error, mean_absolute_error
    from sklearn.metrics import r2_score
    train_mse = mean_squared_error(y_train, pred_train)
    test_mse = mean_squared_error(y_test, pred_test)
    train_rmse = np.sqrt(train_mse)
    test_rmse = np.sqrt(test_mse)
    train_mae = mean_absolute_error(y_train, pred_train)
    test_mae = mean_absolute_error(y_test, pred_test)
    train_r2score = r2_score(y_train, pred_train)
    test_r2score = r2_score(y_test, pred_test)
    print('train_mse : ', train_mse)
    print('test_mse : ', test_mse)
    print('train_rmse : ', train_rmse)
    print('test_rmse : ', test_rmse)
    print('train_mae : ', train_mae)
    print('test_mae : ', test_mae)
    print('train_r2score : ', train_r2score)
    print('test_r2score : ', test_r2score)
def main():
    # Supported preprocessing/network list.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('datafile', type=str)
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', nargs='+',
                        help='target label for regression')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize', help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        sys.exit('Error: No target label is specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    if args.scale == 'standardize':
        # Standard scaler for the labels.
        ss = StandardScaler()
        labels = ss.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (labels,)))

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network.
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit,
                   n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val, args.batchsize, repeat=False,
                                shuffle=False)

    def scaled_abs_error(x0, x1):
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        if args.scale == 'standardize':
            scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0))
            scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1))
            diff = scaled_x0 - scaled_x1
        elif args.scale == 'none':
            diff = cuda.to_cpu(x0) - cuda.to_cpu(x1)
        return numpy.mean(numpy.absolute(diff), axis=0)[0]

    classifier = L.Classifier(model, lossfun=F.mean_squared_error,
                              accfun=scaled_abs_error)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.Evaluator(val_iter, classifier, device=args.gpu,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # Note that scaled errors are reported as (validation/)main/accuracy.
    trainer.extend(E.PrintReport(['epoch', 'main/loss', 'main/accuracy',
                                  'validation/main/loss',
                                  'validation/main/accuracy',
                                  'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Example of prediction using the trained model.
    smiles = 'c1ccccc1'
    mol = Chem.MolFromSmiles(smiles)
    preprocessor = preprocess_method_dict[method]()
    standardized_smiles, mol = preprocessor.prepare_smiles_and_mol(mol)
    input_features = preprocessor.get_input_features(mol)
    atoms, adjs = concat_mols([input_features], device=args.gpu)
    prediction = model(atoms, adjs).data[0]
    print('Prediction for {}:'.format(smiles))
    for i, label in enumerate(args.label):
        print('{}: {}'.format(label, prediction[i]))
def main():
    # Parse the arguments.
    args = parse_arguments()
    theme_name = t_theme_name.get()
    args.model_folder_name = os.path.join(theme_name, 'chainer')
    # args.epoch = int(float(t_epochs.get()))
    args.out = parent_path / 'models' / theme_name / method_name
    args.method = method_name

    if args.label:
        labels = args.label
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    smiles_col_name = t_smiles.get()
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col=smiles_col_name)
    # args.datafile = parent_path / 'results' / theme_name / method_name \
    #     / high_low / 'brics_virtual' / 'virtual.csv'
    args.datafile = csv_path
    dataset = parser.parse(args.datafile)['dataset']

    @chainer.dataset.converter()
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    print('Predicting the virtual library')
    # Set up the regressor.
    device = chainer.get_device(args.device)
    model_path = os.path.join(args.out, args.model_foldername,
                              args.model_filename)
    with open(parent_path / 'models' / theme_name / method_name / high_low
              / 'regressor.pickle', 'rb') as f:
        regressor = cloudpickle.loads(f.read())

    # Perform the prediction.
    print('Evaluating...')
    converter = converter_method_dict[args.method]
    data_iterator = SerialIterator(dataset, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(data_iterator, regressor,
                            converter=converter, device=device)()
    print('Evaluation result: ', eval_result)

    predict_ = regressor.predict(dataset, converter=extract_inputs)
    predict_ = [i[0] for i in predict_]

    df_data = pd.read_csv(csv_path)
    df_predict = df_data
    df_predict[t_task.get()] = predict_
    df_predict = df_predict.dropna()
    PandasTools.AddMoleculeColumnToFrame(frame=df_predict,
                                         smilesCol=t_smiles.get())
    df_predict['sascore'] = df_predict.ROMol.map(sascorer.calculateScore)
    df_predict.to_csv(csv_path)

    png_generator = (parent_path / 'results' / theme_name / method_name
                     / high_low / data_name
                     / 'molecular-structure').glob('*.png')

    for i, png_path in enumerate(png_generator):
        # The molecule index is parsed from characters 4-10 of the file name.
        i = int(png_path.name[4:10])
        if i < len(df_predict[t_task.get()]):
            img = Image.open(png_path)
            draw = ImageDraw.Draw(img)
            font = ImageFont.truetype('arial.ttf', 26)
            draw.text((0, 0),
                      t_task.get() + ' : '
                      + str(round(df_predict[t_task.get()][i], 2)),
                      (0, 0, 0), font=font)
            draw.text((0, 30),
                      'sascore : ' + str(round(df_predict['sascore'][i], 2)),
                      (0, 0, 0), font=font)
            img.save(png_path)

    save_json(os.path.join(args.out, 'eval_result.json'), eval_result)
def main():
    # Parse the arguments.
    args = parse_arguments()

    gpu = False
    if args.gpu >= 0:
        import cupy
        gpu = True

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the classification
    # task.
    def postprocess_label(label_list):
        if gpu:
            return cupy.asarray(label_list, dtype=cupy.int32)
        return numpy.asarray(label_list, dtype=numpy.int32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = set_up_preprocessor(args.method, args.max_atoms)
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    train = parser.parse(args.train_datafile)['dataset']

    # Validation dataset.
    preprocessor = set_up_preprocessor(args.method, args.max_atoms)
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    val = parser.parse(args.val_datafile)['dataset']

    # Set up the predictor.
    predictor = set_up_predictor(args.method, args.unit_num,
                                 args.conv_layers, class_num,
                                 max_atoms=args.max_atoms)
    if gpu:
        predictor = predictor.to_gpu()

    # Set up the iterators.
    train_iter = SerialIterator(train, args.batchsize)
    val_iter = SerialIterator(val, args.batchsize,
                              repeat=False, shuffle=False)

    # Set up the classifier.
    classifier = Classifier(predictor, lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy, device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(classifier)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.l2reg))

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    print('Training...')
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Metrics. concat_mols is needed to batch variable-size molecules.
    trainer.extend(E.Evaluator(val_iter, classifier, device=args.gpu,
                               converter=concat_mols))
    train_eval_iter = SerialIterator(train, args.batchsize,
                                     repeat=False, shuffle=False)
    trainer.extend(
        ROCAUCEvaluator(train_eval_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='train', pos_labels=1, ignore_labels=-1,
                        raise_value_error=False))
    # The extension name 'validation' is already used by `Evaluator`,
    # so the name 'val' is used instead.
    trainer.extend(
        ROCAUCEvaluator(val_iter, classifier, eval_func=predictor,
                        device=args.gpu, converter=concat_mols,
                        name='val', pos_labels=1, ignore_labels=-1))

    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'train/main/roc_auc',
            'validation/main/loss', 'validation/main/accuracy',
            'val/main/roc_auc', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the classifier's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    classifier.save_pickle(model_path, protocol=args.protocol)
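# The classification script above calls a `set_up_preprocessor` helper
# instead of indexing `preprocess_method_dict` directly, presumably so that
# `max_atoms` can be threaded through. A hypothetical one-line sketch:
from chainer_chemistry.dataset.preprocessors import preprocess_method_dict


def set_up_preprocessor(method, max_atoms):
    # Instantiate the preprocessor for `method`, limiting each molecule to
    # `max_atoms` atoms (a keyword the common preprocessors accept).
    return preprocess_method_dict[method](max_atoms=max_atoms)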
def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (labels,)))
    else:
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, _ = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(args.method, args.unit_num,
                                 args.conv_layers, class_num)

    # Set up the iterator.
    train_iter = SerialIterator(train, args.batchsize)

    # Set up the regressor.
    metrics_fun = {
        'mean_abs_error': MeanAbsError(scaler=scaler),
        'root_mean_sqr_error': RootMeanSqrError(scaler=scaler)
    }
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    print('Training...')
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/mean_abs_error',
            'main/root_mean_sqr_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)

    # Save the standard scaler's parameters.
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=args.protocol)