def main(): # Parse the arguments. args = parse_arguments() # Set up some useful variables that will be used later on. method = args.method if args.label: labels = args.label cache_dir = os.path.join('input', '{}_{}'.format(method, labels)) else: labels = D.get_qm9_label_names() cache_dir = os.path.join('input', '{}_all'.format(method)) # Get the filename corresponding to the cached dataset, based on the amount # of data samples that need to be parsed from the original dataset. num_data = args.num_data if num_data >= 0: dataset_filename = 'data_{}.npz'.format(num_data) else: dataset_filename = 'data.npz' # Load the cached dataset. dataset_cache_path = os.path.join(cache_dir, dataset_filename) dataset = None if os.path.exists(dataset_cache_path): print('Loading cached data from {}.'.format(dataset_cache_path)) dataset = NumpyTupleDataset.load(dataset_cache_path) if dataset is None: print('Preprocessing dataset...') preprocessor = preprocess_method_dict[method]() dataset = D.get_qm9(preprocessor, labels=labels) # Cache the newly preprocessed dataset. if not os.path.exists(cache_dir): os.mkdir(cache_dir) NumpyTupleDataset.save(dataset_cache_path, dataset) # Load the standard scaler parameters, if necessary. if args.scale == 'standardize': scaler_path = os.path.join(args.in_dir, 'scaler.pkl') print('Loading scaler parameters from {}.'.format(scaler_path)) with open(scaler_path, mode='rb') as f: scaler = pickle.load(f) else: print('No standard scaling was selected.') scaler = None # Split the dataset into training and testing. train_data_size = int(len(dataset) * args.train_data_ratio) _, test = split_dataset_random(dataset, train_data_size, args.seed) # Use a predictor with scaled output labels. model_path = os.path.join(args.in_dir, args.model_filename) regressor = Regressor.load_pickle(model_path, device=args.gpu) # Replace the default predictor with one that scales the output labels. scaled_predictor = ScaledGraphConvPredictor(regressor.predictor) scaled_predictor.scaler = scaler regressor.predictor = scaled_predictor # This callback function extracts only the inputs and discards the labels. def extract_inputs(batch, device=None): return concat_mols(batch, device=device)[:-1] # Predict the output labels. print('Predicting...') y_pred = regressor.predict(test, converter=extract_inputs) # Extract the ground-truth labels. t = concat_mols(test, device=-1)[-1] n_eval = 10 # Construct dataframe. df_dict = {} for i, l in enumerate(labels): df_dict.update({ 'y_pred_{}'.format(l): y_pred[:, i], 't_{}'.format(l): t[:, i], }) df = pandas.DataFrame(df_dict) # Show a prediction/ground truth table with 5 random examples. print(df.sample(5)) for target_label in range(y_pred.shape[1]): diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label] print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format( target_label, y_pred[:n_eval, target_label], t[:n_eval, target_label], diff)) # Run an evaluator on the test dataset. print('Evaluating...') test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False) eval_result = Evaluator(test_iterator, regressor, converter=concat_mols, device=args.gpu)() # Prevents the loss function from becoming a cupy.core.core.ndarray object # when using the GPU. This hack will be removed as soon as the cause of # the issue is found and properly fixed. loss = numpy.asscalar(cuda.to_cpu(eval_result['main/loss'])) eval_result['main/loss'] = loss print('Evaluation result: ', eval_result) # Save the evaluation results. with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f: json.dump(eval_result, f)
def main(): # Parse the arguments. args = parse_arguments() # Set up some useful variables that will be used later on. method = args.method if args.label != 'all': label = args.label cache_dir = os.path.join('input', '{}_{}'.format(method, label)) labels = [label] else: labels = D.get_qm9_label_names() cache_dir = os.path.join('input', '{}_all'.format(method)) # Get the filename corresponding to the cached dataset, based on the amount # of data samples that need to be parsed from the original dataset. num_data = args.num_data if num_data >= 0: dataset_filename = 'data_{}.npz'.format(num_data) else: dataset_filename = 'data.npz' # Load the cached dataset. dataset_cache_path = os.path.join(cache_dir, dataset_filename) dataset = None if os.path.exists(dataset_cache_path): print('Loading cached data from {}.'.format(dataset_cache_path)) dataset = NumpyTupleDataset.load(dataset_cache_path) if dataset is None: print('Preprocessing dataset...') preprocessor = preprocess_method_dict[method]() dataset = D.get_qm9(preprocessor, labels=labels) # Cache the newly preprocessed dataset. if not os.path.exists(cache_dir): os.mkdir(cache_dir) NumpyTupleDataset.save(dataset_cache_path, dataset) # Use a predictor with scaled output labels. model_path = os.path.join(args.in_dir, args.model_filename) regressor = Regressor.load_pickle(model_path, device=args.gpu) scaler = regressor.predictor.scaler if scaler is not None: scaled_t = scaler.transform(dataset.get_datasets()[-1]) dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (scaled_t, ))) # Split the dataset into training and testing. train_data_size = int(len(dataset) * args.train_data_ratio) _, test = split_dataset_random(dataset, train_data_size, args.seed) # This callback function extracts only the inputs and discards the labels. def extract_inputs(batch, device=None): return concat_mols(batch, device=device)[:-1] def postprocess_fn(x): if scaler is not None: scaled_x = scaler.inverse_transform(x) return scaled_x else: return x # Predict the output labels. print('Predicting...') y_pred = regressor.predict(test, converter=extract_inputs, postprocess_fn=postprocess_fn) # Extract the ground-truth labels. t = concat_mols(test, device=-1)[-1] original_t = scaler.inverse_transform(t) # Construct dataframe. df_dict = {} for i, l in enumerate(labels): df_dict.update({ 'y_pred_{}'.format(l): y_pred[:, i], 't_{}'.format(l): original_t[:, i], }) df = pandas.DataFrame(df_dict) # Show a prediction/ground truth table with 5 random examples. print(df.sample(5)) n_eval = 10 for target_label in range(y_pred.shape[1]): label_name = labels[target_label] diff = y_pred[:n_eval, target_label] - original_t[:n_eval, target_label] print('label_name = {}, y_pred = {}, t = {}, diff = {}'.format( label_name, y_pred[:n_eval, target_label], original_t[:n_eval, target_label], diff)) # Run an evaluator on the test dataset. print('Evaluating...') test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False) eval_result = Evaluator(test_iterator, regressor, converter=concat_mols, device=args.gpu)() print('Evaluation result: ', eval_result) # Save the evaluation results. save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result) # Calculate mean abs error for each label mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0) eval_result = {} for i, l in enumerate(labels): eval_result.update({l: mae[i]}) save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
def main(): # Parse the arguments. args = parse_arguments() # Set up some useful variables that will be used later on. method = args.method if args.label != 'all': labels = args.label cache_dir = os.path.join('input', '{}_{}'.format(method, labels)) class_num = len(labels) if isinstance(labels, list) else 1 else: labels = None cache_dir = os.path.join('input', '{}_all'.format(method)) class_num = len(D.get_qm9_label_names()) # Get the filename corresponding to the cached dataset, based on the amount # of data samples that need to be parsed from the original dataset. num_data = args.num_data if num_data >= 0: dataset_filename = 'data_{}.npz'.format(num_data) else: dataset_filename = 'data.npz' # Load the cached dataset. dataset_cache_path = os.path.join(cache_dir, dataset_filename) dataset = None if os.path.exists(dataset_cache_path): print('Loading cached dataset from {}.'.format(dataset_cache_path)) dataset = NumpyTupleDataset.load(dataset_cache_path) if dataset is None: print('Preprocessing dataset...') preprocessor = preprocess_method_dict[method]() if num_data >= 0: # Select the first `num_data` samples from the dataset. target_index = numpy.arange(num_data) dataset = D.get_qm9(preprocessor, labels=labels, target_index=target_index) else: # Load the entire dataset. dataset = D.get_qm9(preprocessor, labels=labels) # Cache the laded dataset. if not os.path.exists(cache_dir): os.makedirs(cache_dir) NumpyTupleDataset.save(dataset_cache_path, dataset) # Scale the label values, if necessary. if args.scale == 'standardize': print('Applying standard scaling to the labels.') scaler = StandardScaler() scaled_t = scaler.fit_transform(dataset.get_datasets()[-1]) dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (scaled_t,))) else: print('No standard scaling was selected.') scaler = None # Split the dataset into training and validation. train_data_size = int(len(dataset) * args.train_data_ratio) train, valid = split_dataset_random(dataset, train_data_size, args.seed) # Set up the predictor. predictor = set_up_predictor(method, args.unit_num, args.conv_layers, class_num, scaler) # Set up the iterators. train_iter = iterators.SerialIterator(train, args.batchsize) valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False, shuffle=False) # Set up the regressor. device = args.gpu metrics_fun = {'mae': MeanAbsError(scaler=scaler), 'rmse': RootMeanSqrError(scaler=scaler)} regressor = Regressor(predictor, lossfun=F.mean_squared_error, metrics_fun=metrics_fun, device=device) # Set up the optimizer. optimizer = optimizers.Adam() optimizer.setup(regressor) # Set up the updater. updater = training.StandardUpdater(train_iter, optimizer, device=device, converter=concat_mols) # Set up the trainer. trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out) trainer.extend(E.Evaluator(valid_iter, regressor, device=device, converter=concat_mols)) trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch')) trainer.extend(E.LogReport()) trainer.extend(E.PrintReport([ 'epoch', 'main/loss', 'main/mae', 'main/rmse', 'validation/main/loss', 'validation/main/mae', 'validation/main/rmse', 'elapsed_time'])) trainer.extend(E.ProgressBar()) trainer.run() # Save the regressor's parameters. model_path = os.path.join(args.out, args.model_filename) print('Saving the trained model to {}...'.format(model_path)) regressor.save_pickle(model_path, protocol=args.protocol)
def main(): # Parse the arguments. args = parse_arguments() # Set up some useful variables that will be used later on. method = args.method if args.label != 'all': labels = args.label cache_dir = os.path.join('input', '{}_{}'.format(method, labels)) class_num = len(labels) if isinstance(labels, list) else 1 else: labels = None cache_dir = os.path.join('input', '{}_all'.format(method)) class_num = len(D.get_qm9_label_names()) # Get the filename corresponding to the cached dataset, based on the amount # of data samples that need to be parsed from the original dataset. num_data = args.num_data if num_data >= 0: dataset_filename = 'data_{}.npz'.format(num_data) else: dataset_filename = 'data.npz' # Load the cached dataset. dataset_cache_path = os.path.join(cache_dir, dataset_filename) dataset = None if os.path.exists(dataset_cache_path): print('Loading cached dataset from {}.'.format(dataset_cache_path)) dataset = NumpyTupleDataset.load(dataset_cache_path) if dataset is None: print('Preprocessing dataset...') preprocessor = preprocess_method_dict[method]() if num_data >= 0: # Select the first `num_data` samples from the dataset. target_index = numpy.arange(num_data) dataset = D.get_qm9(preprocessor, labels=labels, target_index=target_index) else: # Load the entire dataset. dataset = D.get_qm9(preprocessor, labels=labels) # Cache the laded dataset. if not os.path.exists(cache_dir): os.makedirs(cache_dir) NumpyTupleDataset.save(dataset_cache_path, dataset) # Scale the label values, if necessary. if args.scale == 'standardize': print('Fit StandardScaler to the labels.') scaler = StandardScaler() scaler.fit(dataset.get_datasets()[-1]) else: print('No standard scaling was selected.') scaler = None # Split the dataset into training and validation. train_data_size = int(len(dataset) * args.train_data_ratio) train, valid = split_dataset_random(dataset, train_data_size, args.seed) # Set up the predictor. predictor = set_up_predictor(method, args.unit_num, args.conv_layers, class_num, scaler) # Set up the regressor. device = chainer.get_device(args.device) metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse} regressor = Regressor(predictor, lossfun=F.mean_squared_error, metrics_fun=metrics_fun, device=device) print('Training...') run_train(regressor, train, valid=valid, batch_size=args.batchsize, epoch=args.epoch, out=args.out, extensions_list=None, device=device, converter=concat_mols, resume_path=None) # Save the regressor's parameters. model_path = os.path.join(args.out, args.model_filename) print('Saving the trained model to {}...'.format(model_path)) regressor.save_pickle(model_path, protocol=args.protocol)
def main(): # Supported preprocessing/network list method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn'] label_names = [ 'A', 'B', 'C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv' ] scale_list = ['standardize', 'none'] parser = argparse.ArgumentParser(description='Regression with QM9.') parser.add_argument('--method', '-m', type=str, choices=method_list, default='nfp') parser.add_argument('--label', '-l', type=str, choices=label_names, default='', help='target label for regression, ' 'empty string means to predict all ' 'property at once') parser.add_argument('--scale', type=str, choices=scale_list, default='standardize', help='Label scaling method') parser.add_argument('--batchsize', '-b', type=int, default=32) parser.add_argument('--gpu', '-g', type=int, default=-1) parser.add_argument('--in-dir', '-i', type=str, default='result') parser.add_argument('--seed', '-s', type=int, default=777) parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7) parser.add_argument('--model-filename', type=str, default='regressor.pkl') parser.add_argument('--num-data', type=int, default=-1, help='Number of data to be parsed from parser.' '-1 indicates to parse all data.') args = parser.parse_args() seed = args.seed train_data_ratio = args.train_data_ratio method = args.method if args.label: labels = args.label cache_dir = os.path.join('input', '{}_{}'.format(method, labels)) # class_num = len(labels) if isinstance(labels, list) else 1 else: labels = D.get_qm9_label_names() cache_dir = os.path.join('input', '{}_all'.format(method)) # class_num = len(labels) # Dataset preparation dataset = None num_data = args.num_data if num_data >= 0: dataset_filename = 'data_{}.npz'.format(num_data) else: dataset_filename = 'data.npz' dataset_cache_path = os.path.join(cache_dir, dataset_filename) if os.path.exists(dataset_cache_path): print('load from cache {}'.format(dataset_cache_path)) dataset = NumpyTupleDataset.load(dataset_cache_path) if dataset is None: print('preprocessing dataset...') preprocessor = preprocess_method_dict[method]() dataset = D.get_qm9(preprocessor, labels=labels) if not os.path.exists(cache_dir): os.mkdir(cache_dir) NumpyTupleDataset.save(dataset_cache_path, dataset) if args.scale == 'standardize': # Standard Scaler for labels with open(os.path.join(args.in_dir, 'ss.pkl'), mode='rb') as f: ss = pickle.load(f) else: ss = None train_data_size = int(len(dataset) * train_data_ratio) train, val = split_dataset_random(dataset, train_data_size, seed) regressor = Regressor.load_pickle(os.path.join(args.in_dir, args.model_filename), device=args.gpu) # type: Regressor # We need to feed only input features `x` to `predict`/`predict_proba`. # This converter extracts only inputs (x1, x2, ...) from the features which # consist of input `x` and label `t` (x1, x2, ..., t). def extract_inputs(batch, device=None): return concat_mols(batch, device=device)[:-1] def postprocess_fn(x): if ss is not None: # Model's output is scaled by StandardScaler, # so we need to rescale back. if isinstance(x, Variable): x = x.data scaled_x = ss.inverse_transform(cuda.to_cpu(x)) return scaled_x else: return x print('Predicting...') y_pred = regressor.predict(val, converter=extract_inputs, postprocess_fn=postprocess_fn) print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format( y_pred.shape, y_pred[:5, 0])) t = concat_mols(val, device=-1)[-1] n_eval = 10 # Construct dataframe df_dict = {} for i, l in enumerate(labels): df_dict.update({ 'y_pred_{}'.format(l): y_pred[:, i], 't_{}'.format(l): t[:, i], }) df = pandas.DataFrame(df_dict) # Show random 5 example's prediction/ground truth table print(df.sample(5)) for target_label in range(y_pred.shape[1]): diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label] print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format( target_label, y_pred[:n_eval, target_label], t[:n_eval, target_label], diff)) # --- evaluate --- # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator` print('Evaluating...') val_iterator = SerialIterator(val, 16, repeat=False, shuffle=False) eval_result = Evaluator(val_iterator, regressor, converter=concat_mols, device=args.gpu)() print('Evaluation result: ', eval_result)
def main(): # Supported preprocessing/network list method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn'] label_names = [ 'A', 'B', 'C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv' ] scale_list = ['standardize', 'none'] parser = argparse.ArgumentParser(description='Regression with QM9.') parser.add_argument('--method', '-m', type=str, choices=method_list, default='nfp') parser.add_argument('--label', '-l', type=str, choices=label_names, default='', help='target label for regression, ' 'empty string means to predict all ' 'property at once') parser.add_argument('--scale', type=str, choices=scale_list, default='standardize', help='Label scaling method') parser.add_argument('--conv-layers', '-c', type=int, default=4) parser.add_argument('--batchsize', '-b', type=int, default=32) parser.add_argument('--gpu', '-g', type=int, default=-1) parser.add_argument('--out', '-o', type=str, default='result') parser.add_argument('--epoch', '-e', type=int, default=20) parser.add_argument('--unit-num', '-u', type=int, default=16) parser.add_argument('--seed', '-s', type=int, default=777) parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7) args = parser.parse_args() seed = args.seed train_data_ratio = args.train_data_ratio method = args.method if args.label: labels = args.label cache_dir = os.path.join('input', '{}_{}'.format(method, labels)) class_num = len(labels) if isinstance(labels, list) else 1 else: labels = None cache_dir = os.path.join('input', '{}_all'.format(method)) class_num = len(D.get_qm9_label_names()) # Dataset preparation dataset = None if os.path.exists(cache_dir): print('load from cache {}'.format(cache_dir)) dataset = NumpyTupleDataset.load(os.path.join(cache_dir, 'data.npz')) if dataset is None: print('preprocessing dataset...') preprocessor = preprocess_method_dict[method]() dataset = D.get_qm9(preprocessor, labels=labels) os.makedirs(cache_dir) NumpyTupleDataset.save(os.path.join(cache_dir, 'data.npz'), dataset) if args.scale == 'standardize': # Standard Scaler for labels ss = StandardScaler() labels = ss.fit_transform(dataset.get_datasets()[-1]) dataset = NumpyTupleDataset(*dataset.get_datasets()[:-1], labels) train_data_size = int(len(dataset) * train_data_ratio) train, val = split_dataset_random(dataset, train_data_size, seed) # Network n_unit = args.unit_num conv_layers = args.conv_layers if method == 'nfp': print('Train NFP model...') model = GraphConvPredictor( NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers), MLP(out_dim=class_num, hidden_dim=n_unit)) elif method == 'ggnn': print('Train GGNN model...') model = GraphConvPredictor( GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers), MLP(out_dim=class_num, hidden_dim=n_unit)) elif method == 'schnet': print('Train SchNet model...') model = GraphConvPredictor( SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers), None) elif method == 'weavenet': print('Train WeaveNet model...') n_atom = 20 n_sub_layer = 1 weave_channels = [50] * conv_layers model = GraphConvPredictor( WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit, n_sub_layer=n_sub_layer, n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit)) elif method == 'rsgcn': print('Train RSGCN model...') model = GraphConvPredictor( RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers), MLP(out_dim=class_num, hidden_dim=n_unit)) else: raise ValueError('[ERROR] Invalid method {}'.format(method)) train_iter = I.SerialIterator(train, args.batchsize) val_iter = I.SerialIterator(val, args.batchsize, repeat=False, shuffle=False) def scaled_abs_error(x0, x1): if isinstance(x0, Variable): x0 = cuda.to_cpu(x0.data) if isinstance(x1, Variable): x1 = cuda.to_cpu(x1.data) if args.scale == 'standardize': scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0)) scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1)) diff = scaled_x0 - scaled_x1 elif args.scale == 'none': diff = cuda.to_cpu(x0) - cuda.to_cpu(x1) return numpy.mean(numpy.absolute(diff), axis=0)[0] classifier = L.Classifier(model, lossfun=F.mean_squared_error, accfun=scaled_abs_error) if args.gpu >= 0: chainer.cuda.get_device_from_id(args.gpu).use() classifier.to_gpu() optimizer = O.Adam() optimizer.setup(classifier) updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu, converter=concat_mols) trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out) trainer.extend( E.Evaluator(val_iter, classifier, device=args.gpu, converter=concat_mols)) trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch')) trainer.extend(E.LogReport()) trainer.extend( E.PrintReport([ 'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss', 'validation/main/accuracy', 'elapsed_time' ])) trainer.extend(E.ProgressBar()) trainer.run()
def main(): # Supported preprocessing/network list method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn'] label_names = [ 'A', 'B', 'C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv' ] scale_list = ['standardize', 'none'] parser = argparse.ArgumentParser(description='Regression with QM9.') parser.add_argument('--method', '-m', type=str, choices=method_list, default='nfp') parser.add_argument('--label', '-l', type=str, choices=label_names, default='', help='target label for regression, ' 'empty string means to predict all ' 'property at once') parser.add_argument('--scale', type=str, choices=scale_list, default='standardize', help='Label scaling method') parser.add_argument('--conv-layers', '-c', type=int, default=4) parser.add_argument('--batchsize', '-b', type=int, default=32) parser.add_argument('--gpu', '-g', type=int, default=-1) parser.add_argument('--out', '-o', type=str, default='result') parser.add_argument('--epoch', '-e', type=int, default=20) parser.add_argument('--unit-num', '-u', type=int, default=16) parser.add_argument('--seed', '-s', type=int, default=777) parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7) parser.add_argument('--protocol', type=int, default=2) parser.add_argument('--model-filename', type=str, default='regressor.pkl') parser.add_argument('--num-data', type=int, default=-1, help='Number of data to be parsed from parser.' '-1 indicates to parse all data.') args = parser.parse_args() seed = args.seed train_data_ratio = args.train_data_ratio method = args.method if args.label: labels = args.label cache_dir = os.path.join('input', '{}_{}'.format(method, labels)) class_num = len(labels) if isinstance(labels, list) else 1 else: labels = None cache_dir = os.path.join('input', '{}_all'.format(method)) class_num = len(D.get_qm9_label_names()) # Dataset preparation dataset = None num_data = args.num_data if num_data >= 0: dataset_filename = 'data_{}.npz'.format(num_data) else: dataset_filename = 'data.npz' dataset_cache_path = os.path.join(cache_dir, dataset_filename) if os.path.exists(dataset_cache_path): print('load from cache {}'.format(dataset_cache_path)) dataset = NumpyTupleDataset.load(dataset_cache_path) if dataset is None: print('preprocessing dataset...') preprocessor = preprocess_method_dict[method]() if num_data >= 0: # only use first 100 for debug target_index = numpy.arange(num_data) dataset = D.get_qm9(preprocessor, labels=labels, target_index=target_index) else: dataset = D.get_qm9(preprocessor, labels=labels) os.makedirs(cache_dir) NumpyTupleDataset.save(dataset_cache_path, dataset) if args.scale == 'standardize': # Standard Scaler for labels ss = StandardScaler() labels = ss.fit_transform(dataset.get_datasets()[-1]) else: ss = None dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (labels, ))) train_data_size = int(len(dataset) * train_data_ratio) train, val = split_dataset_random(dataset, train_data_size, seed) # Network n_unit = args.unit_num conv_layers = args.conv_layers if method == 'nfp': print('Train NFP model...') model = GraphConvPredictor( NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers), MLP(out_dim=class_num, hidden_dim=n_unit)) elif method == 'ggnn': print('Train GGNN model...') model = GraphConvPredictor( GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers), MLP(out_dim=class_num, hidden_dim=n_unit)) elif method == 'schnet': print('Train SchNet model...') model = GraphConvPredictor( SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers), None) elif method == 'weavenet': print('Train WeaveNet model...') n_atom = 20 n_sub_layer = 1 weave_channels = [50] * conv_layers model = GraphConvPredictor( WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit, n_sub_layer=n_sub_layer, n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit)) elif method == 'rsgcn': print('Train RSGCN model...') model = GraphConvPredictor( RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers), MLP(out_dim=class_num, hidden_dim=n_unit)) else: raise ValueError('[ERROR] Invalid method {}'.format(method)) train_iter = I.SerialIterator(train, args.batchsize) val_iter = I.SerialIterator(val, args.batchsize, repeat=False, shuffle=False) regressor = Regressor( model, lossfun=F.mean_squared_error, metrics_fun={'abs_error': ScaledAbsError(scale=args.scale, ss=ss)}, device=args.gpu) optimizer = O.Adam() optimizer.setup(regressor) updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu, converter=concat_mols) trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out) trainer.extend( E.Evaluator(val_iter, regressor, device=args.gpu, converter=concat_mols)) trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch')) trainer.extend(E.LogReport()) trainer.extend( E.PrintReport([ 'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss', 'validation/main/abs_error', 'elapsed_time' ])) trainer.extend(E.ProgressBar()) trainer.run() # --- save regressor & standardscaler --- protocol = args.protocol regressor.save_pickle(os.path.join(args.out, args.model_filename), protocol=protocol) if args.scale == 'standardize': with open(os.path.join(args.out, 'ss.pkl'), mode='wb') as f: pickle.dump(ss, f, protocol=protocol)
def main(): # Parse the arguments. args = parse_arguments() device = args.gpu # Set up some useful variables that will be used later on. method = args.method if args.label != 'all': label = args.label cache_dir = os.path.join('input', '{}_{}'.format(method, label)) labels = [label] else: labels = D.get_qm9_label_names() cache_dir = os.path.join('input', '{}_all'.format(method)) # Get the filename corresponding to the cached dataset, based on the amount # of data samples that need to be parsed from the original dataset. num_data = args.num_data if num_data >= 0: dataset_filename = 'data_{}.npz'.format(num_data) else: dataset_filename = 'data.npz' # Load the cached dataset. dataset_cache_path = os.path.join(cache_dir, dataset_filename) dataset = None if os.path.exists(dataset_cache_path): print('Loading cached data from {}.'.format(dataset_cache_path)) dataset = NumpyTupleDataset.load(dataset_cache_path) if dataset is None: print('Preprocessing dataset...') preprocessor = preprocess_method_dict[method]() dataset = D.get_qm9(preprocessor, labels=labels) # Cache the newly preprocessed dataset. if not os.path.exists(cache_dir): os.mkdir(cache_dir) NumpyTupleDataset.save(dataset_cache_path, dataset) # Use a predictor with scaled output labels. model_path = os.path.join(args.in_dir, args.model_filename) regressor = Regressor.load_pickle(model_path, device=device) scaler = regressor.predictor.scaler if scaler is not None: original_t = dataset.get_datasets()[-1] if args.gpu >= 0: scaled_t = cuda.to_cpu(scaler.transform( cuda.to_gpu(original_t))) else: scaled_t = scaler.transform(original_t) dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (scaled_t,))) # Split the dataset into training and testing. train_data_size = int(len(dataset) * args.train_data_ratio) _, test = split_dataset_random(dataset, train_data_size, args.seed) # This callback function extracts only the inputs and discards the labels. def extract_inputs(batch, device=None): return concat_mols(batch, device=device)[:-1] def postprocess_fn(x): if scaler is not None: scaled_x = scaler.inverse_transform(x) return scaled_x else: return x # Predict the output labels. print('Predicting...') y_pred = regressor.predict( test, converter=extract_inputs, postprocess_fn=postprocess_fn) # Extract the ground-truth labels. t = concat_mols(test, device=device)[-1] original_t = cuda.to_cpu(scaler.inverse_transform(t)) # Construct dataframe. df_dict = {} for i, l in enumerate(labels): df_dict.update({'y_pred_{}'.format(l): y_pred[:, i], 't_{}'.format(l): original_t[:, i], }) df = pandas.DataFrame(df_dict) # Show a prediction/ground truth table with 5 random examples. print(df.sample(5)) n_eval = 10 for target_label in range(y_pred.shape[1]): label_name = labels[target_label] diff = y_pred[:n_eval, target_label] - original_t[:n_eval, target_label] print('label_name = {}, y_pred = {}, t = {}, diff = {}' .format(label_name, y_pred[:n_eval, target_label], original_t[:n_eval, target_label], diff)) # Run an evaluator on the test dataset. print('Evaluating...') test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False) eval_result = Evaluator(test_iterator, regressor, converter=concat_mols, device=device)() print('Evaluation result: ', eval_result) # Save the evaluation results. save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result) # Calculate mean abs error for each label mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0) eval_result = {} for i, l in enumerate(labels): eval_result.update({l: mae[i]}) save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
def main(): # Parse the arguments. args = parse_arguments() # Set up some useful variables that will be used later on. method = args.method if args.label != 'all': labels = args.label cache_dir = os.path.join('input', '{}_{}'.format(method, labels)) class_num = len(labels) if isinstance(labels, list) else 1 else: labels = None cache_dir = os.path.join('input', '{}_all'.format(method)) class_num = len(D.get_qm9_label_names()) # Get the filename corresponding to the cached dataset, based on the amount # of data samples that need to be parsed from the original dataset. num_data = args.num_data if num_data >= 0: dataset_filename = 'data_{}.npz'.format(num_data) else: dataset_filename = 'data.npz' # Load the cached dataset. dataset_cache_path = os.path.join(cache_dir, dataset_filename) dataset = None if os.path.exists(dataset_cache_path): print('Loading cached dataset from {}.'.format(dataset_cache_path)) dataset = NumpyTupleDataset.load(dataset_cache_path) if dataset is None: print('Preprocessing dataset...') preprocessor = preprocess_method_dict[method]() if num_data >= 0: # Select the first `num_data` samples from the dataset. target_index = numpy.arange(num_data) dataset = D.get_qm9(preprocessor, labels=labels, target_index=target_index) else: # Load the entire dataset. dataset = D.get_qm9(preprocessor, labels=labels) # Cache the laded dataset. if not os.path.exists(cache_dir): os.makedirs(cache_dir) NumpyTupleDataset.save(dataset_cache_path, dataset) # Scale the label values, if necessary. if args.scale == 'standardize': print('Applying standard scaling to the labels.') scaler = StandardScaler() scaled_t = scaler.fit_transform(dataset.get_datasets()[-1]) dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (scaled_t,))) else: print('No standard scaling was selected.') scaler = None # Split the dataset into training and validation. train_data_size = int(len(dataset) * args.train_data_ratio) train, valid = split_dataset_random(dataset, train_data_size, args.seed) # Set up the predictor. predictor = set_up_predictor(method, args.unit_num, args.conv_layers, class_num, scaler) # Set up the iterators. train_iter = iterators.SerialIterator(train, args.batchsize) valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False, shuffle=False) # Set up the regressor. device = args.gpu metrics_fun = {'mae': MeanAbsError(scaler=scaler), 'rmse': RootMeanSqrError(scaler=scaler)} regressor = Regressor(predictor, lossfun=F.mean_squared_error, metrics_fun=metrics_fun, device=device) # Set up the optimizer. optimizer = optimizers.Adam() optimizer.setup(regressor) # Set up the updater. updater = training.StandardUpdater(train_iter, optimizer, device=device, converter=concat_mols) # Set up the trainer. trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out) trainer.extend(E.Evaluator(valid_iter, regressor, device=device, converter=concat_mols)) trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch')) trainer.extend(E.LogReport()) trainer.extend(E.PrintReport([ 'epoch', 'main/loss', 'main/mae', 'main/rmse', 'validation/main/loss', 'validation/main/mae', 'validation/main/rmse', 'elapsed_time'])) trainer.extend(E.ProgressBar()) trainer.run() # Save the regressor's parameters. model_path = os.path.join(args.out, args.model_filename) print('Saving the trained model to {}...'.format(model_path)) regressor.save_pickle(model_path, protocol=args.protocol)