def download_entire_dataset(dataset_name, num_data, labels, method, cache_dir):
    """Downloads the train/valid/test parts of a dataset and caches them.

    Args:
        dataset_name: Dataset to be downloaded.
        num_data: Amount of data samples to be parsed from the dataset.
            A negative value means the entire dataset.
        labels: Target labels for regression.
        method: Method name. See `parse_arguments`.
        cache_dir: Directory to store the dataset to.

    Returns:
        The list of train/valid/test dataset parts.
    """
    print('Downloading {}...'.format(dataset_name))
    preprocessor = preprocess_method_dict[method]()

    # Select the first `num_data` samples from the dataset.
    target_index = numpy.arange(num_data) if num_data >= 0 else None
    dataset_parts = D.molnet.get_molnet_dataset(dataset_name, preprocessor,
                                                labels=labels,
                                                target_index=target_index)
    dataset_parts = dataset_parts['dataset']

    # Cache the downloaded dataset. `exist_ok=True` avoids the race between
    # the original's existence check and the directory creation.
    os.makedirs(cache_dir, exist_ok=True)
    for i, part in enumerate(['train', 'valid', 'test']):
        filename = dataset_part_filename(part, num_data)
        path = os.path.join(cache_dir, filename)
        NumpyTupleDataset.save(path, dataset_parts[i])
    return dataset_parts
def test_save_load(self, data):
    """Round-trips a dataset through save/load and checks array equality.

    Bug fix: the original removed only ``tmp.npz`` and leaked one
    ``mkdtemp`` directory per run; the directory is now removed too, and
    cleanup happens even if save/load raises.
    """
    tmp_dir = tempfile.mkdtemp()
    tmp_cache_path = os.path.join(tmp_dir, 'tmp.npz')
    dataset = NumpyTupleDataset(*data)
    try:
        NumpyTupleDataset.save(tmp_cache_path, dataset)
        assert os.path.exists(tmp_cache_path)
        load_dataset = NumpyTupleDataset.load(tmp_cache_path)
    finally:
        if os.path.exists(tmp_cache_path):
            os.remove(tmp_cache_path)
        os.rmdir(tmp_dir)
    assert len(dataset._datasets) == len(load_dataset._datasets)
    for a, d in zip(dataset._datasets, load_dataset._datasets):
        numpy.testing.assert_array_equal(a, d)
def fit_scaler(datasets):
    """Standardizes (scales) the dataset labels.

    Args:
        datasets: Tuple containing the datasets. Each dataset's last
            component is assumed to hold the labels.

    Returns:
        Tuple of the fitted scaler object and the list of datasets with
        standardized labels.

    NOTE(review): at least one caller in this file assigns the whole return
    value to a single ``scaler`` variable; verify call sites unpack the
    ``(scaler, scaled_datasets)`` tuple.
    """
    scaler = StandardScaler()

    # Collect all labels in order to apply scaling over the entire dataset.
    # (The original also built an `offsets` list that was never used; that
    # dead code has been removed.)
    labels = None
    for dataset in datasets:
        part = dataset.get_datasets()[-1]
        labels = part if labels is None else numpy.vstack([labels, part])
    scaler.fit(labels)

    scaled_datasets = []
    for dataset in datasets:
        x, y = dataset.get_datasets()
        yy = scaler.transform(y)
        scaled_datasets.append(NumpyTupleDataset(x, yy))
    return scaler, scaled_datasets
def test_get_item_integer_index(self, data, index):
    """Integer indexing returns the `index`-th element of every array."""
    dataset = NumpyTupleDataset(*data)
    result = dataset[index]
    assert len(result) == len(data)
    for got, source in six.moves.zip(result, data):
        numpy.testing.assert_array_equal(got, source[index])
def augment_dataset(dataset):
    """Doubles the dataset by appending a copy with the two features swapped.

    Labels are duplicated unchanged, so the swapped copy keeps the same
    target as the original pair.
    """
    feat1, feat2, labels = dataset.get_datasets()
    swapped_first = np.concatenate((feat1, feat2), axis=0)
    swapped_second = np.concatenate((feat2, feat1), axis=0)
    doubled_labels = np.concatenate((labels, labels), axis=0)
    return NumpyTupleDataset(swapped_first, swapped_second, doubled_labels)
def augment_dataset(dataset):
    """Doubles the dataset by appending the molecule-pair-swapped copy.

    Atoms and adjacency arrays of the two molecules are exchanged in the
    appended half; labels are duplicated unchanged.
    """
    atoms1, adjs1, atoms2, adjs2, labels = dataset.get_datasets()
    swap = lambda a, b: (np.concatenate((a, b), axis=0),
                         np.concatenate((b, a), axis=0))
    aug_atoms1, aug_atoms2 = swap(atoms1, atoms2)
    aug_adjs1, aug_adjs2 = swap(adjs1, adjs2)
    aug_labels = np.concatenate((labels, labels), axis=0)
    return NumpyTupleDataset(aug_atoms1, aug_adjs1, aug_atoms2, aug_adjs2,
                             aug_labels)
def main():
    """Launcher: trains a small NFP regressor on the QM9 HOMO label."""
    preprocessor = preprocess_method_dict['nfp']()
    # Bug fix: the original label string was the mangled 'h**o'; the valid
    # QM9 label name (cf. the QM9 label list used elsewhere in this file:
    # ..., 'homo', 'lumo', 'gap', ...) is 'homo'.
    dataset = datasets.get_qm9(preprocessor, labels='homo')

    # Cache the preprocessed dataset (exist_ok avoids the check/create race;
    # os.path.join replaces fragile string concatenation).
    cache_dir = 'data/'
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, 'data.npz')
    NumpyTupleDataset.save(cache_path, dataset)
    dataset = NumpyTupleDataset.load(cache_path)

    # Split the dataset into train/validation parts (fixed seed 777).
    train_data_ratio = 0.7
    train_data_size = int(len(dataset) * train_data_ratio)
    train, validation = split_dataset_random(dataset, train_data_size, 777)
    print('train dataset size:', len(train))
    print('validation dataset size:', len(validation))

    # Build the NFP graph-convolution predictor with a 1-output MLP head.
    n_unit = 16
    conv_layers = 4
    model = GraphConvPredictor(NFP(n_unit, n_unit, conv_layers),
                               MLP(n_unit, 1))
def test_get_item_list_index(self, long_data, index):
    """List indexing returns one tuple per selected row across all arrays."""
    dataset = NumpyTupleDataset(*long_data)
    actual = dataset[index]
    columns = [d[index] for d in long_data]
    expect = [tuple(column[i] for column in columns)
              for i in six.moves.range(len(columns[0]))]
    assert len(actual) == len(expect)
    for tuple_a, tuple_e in six.moves.zip(actual, expect):
        assert len(tuple_a) == len(tuple_e)
        for a, e in six.moves.zip(tuple_a, tuple_e):
            numpy.testing.assert_array_equal(a, e)
def add_super_nodes(dataset, smiles_pairs):
    """Attaches precomputed per-molecule feature vectors ("super nodes").

    Args:
        dataset: NumpyTupleDataset of (atoms1, adjs1, atoms2, adjs2, labels).
        smiles_pairs: Array whose two columns are the SMILES strings of the
            two molecules of each example.

    Returns:
        New dataset with a molecule-level feature array inserted after each
        molecule's (atoms, adjs) pair.
    """
    atoms1, adjs1, atoms2, adjs2, labels = dataset.get_datasets()
    smiles1 = smiles_pairs[:, 0]
    smiles2 = smiles_pairs[:, 1]

    pickle_filename = 'smiles2mol_vec.pkl'
    pickle_filepath = os.path.join(SUPER_NODE_PATH, pickle_filename)
    # Bug fix: pickle files must be opened in *binary* mode; text mode 'r'
    # makes pickle.load fail on Python 3.
    with open(pickle_filepath, 'rb') as reader:
        smiles2atom_feat = pickle.load(reader)

    atom_feat1 = np.array([smiles2atom_feat[s] for s in smiles1],
                          dtype=np.float32)
    atom_feat2 = np.array([smiles2atom_feat[s] for s in smiles2],
                          dtype=np.float32)
    return NumpyTupleDataset(atoms1, adjs1, atom_feat1,
                             atoms2, adjs2, atom_feat2, labels)
def to_multi_hot_labels(dataset, enc=None):
    """Converts '||'-separated label strings into multi-hot label vectors.

    Args:
        dataset: NumpyTupleDataset of (atoms1, adjs1, atoms2, adjs2, labels).
        enc: Optional pre-fitted MultiLabelBinarizer. When None, a new
            encoder is fitted on this dataset's labels.

    Returns:
        Tuple of the transformed dataset and the encoder used.
    """
    atoms1, adjs1, atoms2, adjs2, labels = dataset.get_datasets()

    label_list = list()
    for label in labels:
        try:
            label = [int(label)]
        except ValueError:
            label_in_str = str(label[0])
            assert isinstance(label_in_str, str)
            # Bug fix: on Python 3 `map` is a one-shot iterator; the
            # binarizer's fit_transform iterates the rows more than once,
            # so an exhausted map yields empty label sets. Materialize it.
            label = list(map(int, label_in_str.split('||')))
        label_list.append(label)

    if enc is None:
        enc = MultiLabelBinarizer()
        transformed_labels = enc.fit_transform(label_list)
    else:
        transformed_labels = enc.transform(label_list)

    new_dataset = NumpyTupleDataset(
        atoms1, adjs1, atoms2, adjs2, transformed_labels
    )
    return new_dataset, enc
def to_one_hot_labels(dataset, is_training=True):
    """One-hot encodes the labels, persisting the fitted encoder to disk.

    On the first training call the encoder is fitted on the labels and
    dumped to ``onehot_enc.pkl`` under ``KAIST_PATH``; subsequent calls
    reload it. At inference time a missing encoder file is an error.
    """
    atoms1, adjs1, atoms2, adjs2, labels = dataset.get_datasets()

    onehot_enc = OneHotEncoder(handle_unknown='ignore')
    onehot_filepath = os.path.join(KAIST_PATH, 'onehot_enc.pkl')

    if os.path.exists(onehot_filepath):
        # Reuse the previously fitted encoder.
        onehot_enc = joblib.load(onehot_filepath)
    else:
        if not is_training:
            raise ValueError('The one-hot encoder file does not exist.')
        onehot_enc.fit(labels)
        joblib.dump(onehot_enc, onehot_filepath)
    transformed_labels = onehot_enc.transform(labels).toarray().astype(np.int32)

    return NumpyTupleDataset(
        atoms1, adjs1, atoms2, adjs2, transformed_labels
    )
def main():
    """Evaluates a pickled QM9 regressor on the held-out test split.

    Loads (or preprocesses and caches) QM9, restores a trained `Regressor`
    from `args.in_dir`, predicts on the test split, prints a
    prediction/ground-truth table, and saves evaluation metrics as JSON.
    """
    # Parse the arguments.
    args = parse_arguments()
    device = args.gpu

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # amount of data samples that need to be parsed from the original
    # dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=device)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        original_t = dataset.get_datasets()[-1]
        if args.gpu >= 0:
            # Transform on the GPU, then bring the result back to the host.
            scaled_t = cuda.to_cpu(scaler.transform(
                cuda.to_gpu(original_t)))
        else:
            scaled_t = scaler.transform(original_t)
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (scaled_t,)))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the
    # labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Undoes the label scaling on predictions so that the printed/saved
    # metrics are in the original label units.
    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(
        test, converter=extract_inputs, postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    # NOTE(review): this calls scaler.inverse_transform unconditionally, so
    # it raises when the loaded predictor has no scaler (scaler is None) —
    # confirm whether that case can occur here.
    t = concat_mols(test, device=device)[-1]
    original_t = cuda.to_cpu(scaler.inverse_transform(t))

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({'y_pred_{}'.format(l): y_pred[:, i],
                        't_{}'.format(l): original_t[:, i], })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,
                                                          target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'
              .format(label_name, y_pred[:n_eval, target_label],
                      original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor,
                            converter=concat_mols, device=device)()
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate mean abs error for each label.
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        eval_result.update({l: mae[i]})
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
def main():
    """Trains a QM9 regression model and pickles it to `args.out`.

    Loads (or preprocesses and caches) QM9, optionally fits a
    StandardScaler on the labels, trains a `Regressor` built by
    `set_up_predictor`, and saves the trained model with `save_pickle`.
    """
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    # NOTE(review): with a single label, `labels` here is a plain string and
    # class_num falls back to 1 — confirm downstream code accepts a string
    # where a list of labels might be expected.
    method = args.method
    if args.label != 'all':
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Get the filename corresponding to the cached dataset, based on the
    # amount of data samples that need to be parsed from the original
    # dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # Select the first `num_data` samples from the dataset.
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor, labels=labels,
                                target_index=target_index)
        else:
            # Load the entire dataset.
            dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the loaded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        print('Fit StandardScaler to the labels.')
        scaler = StandardScaler()
        scaler.fit(dataset.get_datasets()[-1])
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor (the scaler, possibly None, is baked into it).
    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,
                                 class_num, scaler)

    # Set up the regressor.
    device = chainer.get_device(args.device)
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=device)

    print('Training...')
    run_train(regressor, train, valid=valid,
              batch_size=args.batchsize, epoch=args.epoch, out=args.out,
              extensions_list=None, device=device, converter=concat_mols,
              resume_path=None)

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)
def main():
    """Trains a regression model from a SMILES CSV file.

    Parses the CSV with `CSVFileParser`, optionally standardizes the
    labels, trains a `Regressor`, and writes both the model pickle and —
    when scaling is enabled — the fitted scaler to `args.out`.
    """
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression
    # task: labels must be float32 for the MSE loss.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    # Scale the label values, if necessary. The scaled labels replace the
    # last component of the dataset tuple.
    if args.scale == 'standardize':
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (labels,)))
    else:
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, _ = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(args.method, args.unit_num,
                                 args.conv_layers, class_num)

    # Set up the iterator.
    train_iter = SerialIterator(train, args.batchsize)

    # Set up the regressor. Metrics are reported in original label units via
    # the scaler-aware metric objects.
    metrics_fun = {'mean_abs_error': MeanAbsError(scaler=scaler),
                   'root_mean_sqr_error': RootMeanSqrError(scaler=scaler)}
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    print('Training...')
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.PrintReport(['epoch', 'main/loss',
                                  'main/mean_abs_error',
                                  'main/root_mean_sqr_error',
                                  'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)

    # Save the standard scaler's parameters.
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=args.protocol)
def test_len(self, data):
    """The dataset length equals the fixture's row count (2)."""
    assert len(NumpyTupleDataset(*data)) == 2
def main():
    """Trains one of several graph-convolution models on QM9 (older script).

    Parses its own CLI arguments, loads (or preprocesses and caches) QM9,
    optionally standardizes labels, builds the selected network, and runs a
    chainer Trainer with evaluation, logging and snapshot extensions.
    """
    # Supported preprocessing/network list.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    # NOTE(review): 'h**o' below looks like a mangled 'homo' (QM9's HOMO
    # energy label, next to 'lumo' and 'gap') — confirm against the real
    # QM9 label names before relying on it.
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', type=str, choices=label_names,
                        default='',
                        help='target label for regression, '
                             'empty string means to predict all '
                             'property at once')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Dataset preparation: reuse the cached .npz when present.
    dataset = None
    if os.path.exists(cache_dir):
        print('load from cache {}'.format(cache_dir))
        dataset = NumpyTupleDataset.load(os.path.join(cache_dir, 'data.npz'))
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)
        os.makedirs(cache_dir)
        NumpyTupleDataset.save(os.path.join(cache_dir,
                                            'data.npz'), dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels; replaces the label component of the
        # dataset tuple with its standardized version.
        ss = StandardScaler()
        labels = ss.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*dataset.get_datasets()[:-1], labels)

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network: build the chosen graph convolution with an MLP head (SchNet
    # predicts class_num directly and uses no MLP).
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit,
                   n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val, args.batchsize, repeat=False,
                                shuffle=False)

    # Accuracy function: mean absolute error in the *original* label units
    # (inverse-transforming when labels were standardized).
    # NOTE(review): `diff` is unbound if args.scale were neither
    # 'standardize' nor 'none'; argparse `choices` currently prevents that.
    def scaled_abs_error(x0, x1):
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        if args.scale == 'standardize':
            scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0))
            scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1))
            diff = scaled_x0 - scaled_x1
        elif args.scale == 'none':
            diff = cuda.to_cpu(x0) - cuda.to_cpu(x1)
        return numpy.mean(numpy.absolute(diff), axis=0)[0]

    classifier = L.Classifier(model, lossfun=F.mean_squared_error,
                              accfun=scaled_abs_error)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter, classifier, device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()
def main(input_args=None):
    """Evaluates a trained reaction-yield classifier on a test CSV.

    Supports several coupling-reaction datasets; loads (or parses and
    caches) the test data, restores/builds the classifier, predicts class
    labels, prints a comparison table, evaluates, and dumps predictions,
    ground truth and example ids to `args.in_dir`.
    """
    # Parse the arguments.
    args = parse_arguments(input_args)
    device = args.gpu

    method = args.method
    # Per-dataset configuration: CSV path, number of classes, label-name ->
    # vocabulary-size dict, cache filename, and CSV label columns.
    if args.data_name == 'suzuki':
        datafile = 'data/suzuki_type_test_v2.csv'
        class_num = 119
        class_dict = {'M': 28, 'L': 23, 'B': 35, 'S': 10, 'A': 17}
        dataset_filename = 'test_data.npz'
        labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    elif args.data_name == 'CN':
        datafile = 'data/CN_coupling_test.csv'
        class_num = 206
        class_dict = {'M': 44, 'L': 47, 'B': 13, 'S': 22, 'A': 74}
        dataset_filename = 'test_CN_data.npz'
        labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    elif args.data_name == 'Negishi':
        datafile = 'data/Negishi_test.csv'
        class_num = 106
        class_dict = {'M': 32, 'L': 20, 'T': 8, 'S': 10, 'A': 30}
        dataset_filename = 'test_Negishi_data.npz'
        labels = ['Yield', 'M', 'L', 'T', 'S', 'A', 'id']
    elif args.data_name == 'PKR':
        datafile = 'data/PKR_test.csv'
        class_num = 83
        class_dict = {
            'M': 18,
            'L': 6,
            'T': 7,
            'S': 15,
            'A': 11,
            'G': 1,
            'O': 13,
            'P': 4,
            'other': 1
        }
        dataset_filename = 'test_PKR_data.npz'
        labels = [
            'Yield', 'M', 'L', 'T', 'S', 'A', 'G', 'O', 'P', 'other', 'id'
        ]
    else:
        raise ValueError('Unexpected dataset name')

    cache_dir = os.path.join('input', '{}_all'.format(method))

    # Dataset preparation: labels must be float32 arrays.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        # mpnn reuses the ggnn preprocessor.
        if args.method == 'mpnn':
            preprocessor = preprocess_method_dict['ggnn']()
        else:
            preprocessor = preprocess_method_dict[args.method]()
        parser = CSVFileParser(
            preprocessor,
            postprocess_label=postprocess_label,
            labels=labels,
            smiles_col=['Reactant1', 'Reactant2', 'Product'],
            label_dicts=class_dict)
        dataset = parser.parse(datafile)['dataset']

        # Cache the loaded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Rearrange the dataset tuple: split the last component into ids and
    # yields, and rebuild the dataset as (..., yields, labels).
    labels = dataset.get_datasets()[-2]
    ids = dataset.get_datasets()[-1][:, 1].reshape(-1, 1)
    yields = dataset.get_datasets()[-1][:, 0].reshape(-1, 1).astype(
        'float32')  # [:,0] added
    dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-2] + (
        yields,
        labels,
    )))

    # Load the standard scaler parameters, if necessary.
    scaler = None
    test = dataset

    print('Predicting...')
    # Set up the classifier: restore a pickled one when available,
    # otherwise build a fresh predictor.
    model_path = os.path.join(args.in_dir, args.model_filename)
    if os.path.exists(model_path):
        classifier = Classifier.load_pickle(model_path, device=args.gpu)
    else:
        predictor = set_up_predictor(args.method, args.unit_num,
                                     args.conv_layers, class_num)
        classifier = Classifier(predictor,
                                lossfun=F.sigmoid_cross_entropy,
                                metrics_fun=F.binary_accuracy,
                                device=args.gpu)

    if args.load_modelname:
        serializers.load_npz(args.load_modelname, classifier)
    # Wrap the graph convolution and MLP in the scaled predictor variant.
    scaled_predictor = ScaledGraphConvPredictor(
        graph_conv=classifier.predictor.graph_conv,
        mlp=classifier.predictor.mlp)
    classifier.predictor = scaled_predictor

    # This callback function extracts only the inputs and discards the
    # labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels.
    # Prediction function rewrite!!!
    y_pred = classifier.predict(test, converter=extract_inputs)
    y_pred_max = numpy.argmax(y_pred, axis=1)
    y_pred_max = y_pred_max.reshape(-1, 1)
    # y_pred_idx = y_pred.argsort(axis=1)  # ascending

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]  # device 11/14 memory issue
    original_t = cuda.to_cpu(t)
    t_idx = original_t.squeeze(1)
    t_idx = t_idx.argsort(axis=1)
    # gt_indx = numpy.where(original_t == 1)

    # Construct dataframe comparing the top prediction with the top
    # ground-truth index.
    df_dict = {}
    for i, l in enumerate(labels[:1]):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred_max[:, -1].tolist(),  # [:,-1]
            't_{}'.format(l): t_idx[:, -1].tolist(),
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred_max.shape[1]):
        label_name = labels[:1][0][target_label]
        print('label_name = {}, y_pred = {}, t = {}'.format(
            label_name, y_pred_max[:n_eval, target_label],
            t_idx[:n_eval, -1]))

    # Perform the evaluation.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            classifier,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)

    # Dump ids, raw predictions and ground truth for later analysis.
    # NOTE(review): the three dumps below open files without closing them;
    # consider `with` blocks.
    res_dic = {}
    for i in range(len(y_pred)):
        res_dic[i] = str(ids[i])
    json.dump(res_dic, open(os.path.join(args.in_dir, "test_ids.json"),
                            "w"))
    pickle.dump(y_pred, open(os.path.join(args.in_dir, "pred.pkl"), "wb"))
    pickle.dump(original_t, open(os.path.join(args.in_dir, "gt.pkl"), "wb"))
def main():
    """Evaluates a pickled QM9 regressor on the held-out test split.

    CPU-oriented variant of the evaluation script: loads (or preprocesses
    and caches) QM9, restores a trained `Regressor`, predicts on the test
    split, prints a comparison table, and saves metrics as JSON.
    """
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # amount of data samples that need to be parsed from the original
    # dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        scaled_t = scaler.transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (scaled_t, )))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the
    # labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Undoes the label scaling on predictions so metrics are reported in
    # the original label units.
    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test, converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    # NOTE(review): scaler.inverse_transform is called unconditionally here,
    # so this raises when the predictor has no scaler (scaler is None) —
    # confirm whether that case can occur.
    t = concat_mols(test, device=-1)[-1]
    original_t = scaler.inverse_transform(t)

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): original_t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,
                                                          target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'.format(
            label_name, y_pred[:n_eval, target_label],
            original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor,
                            converter=concat_mols, device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate mean abs error for each label.
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        eval_result.update({l: mae[i]})
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
def train():
    """Trains a GraphNVP generative model on QM9 or ZINC-250k.

    Builds the dataset-specific transform and hyperparameters, splits off a
    fixed validation set when one is defined, and runs a chainer Trainer
    with optional molecule-validity debugging, snapshots and logging.
    """
    parser = argparser.get_parser()
    args = parser.parse_args()

    device = -1
    if args.gpu >= 0:
        device = args.gpu
    debug = args.debug
    print('input args:\n',
          json.dumps(vars(args), indent=4,
                     separators=(',', ':')))  # pretty print args

    # Dataset-specific configuration.
    # NOTE(review): for any other data_name, transform_fn/atomic_num_list/
    # mlp_channels/gnn_channels/valid_idx stay unbound and the code below
    # raises NameError — confirm the argument parser restricts the choices.
    if args.data_name == 'qm9':
        from data import transform_qm9
        transform_fn = transform_qm9.transform_fn
        atomic_num_list = [6, 7, 8, 9, 0]
        mlp_channels = [256, 256]
        gnn_channels = {'gcn': [8, 64], 'hidden': [128, 64]}
        valid_idx = transform_qm9.get_val_ids()
    elif args.data_name == 'zinc250k':
        transform_fn = transform_fn_zinc250k
        atomic_num_list = zinc250_atomic_num_list
        mlp_channels = [1024, 512]
        gnn_channels = {'gcn': [16, 128], 'hidden': [256, 64]}
        valid_idx = transform_zinc250k.get_val_ids()

    dataset = NumpyTupleDataset.load(
        os.path.join(args.data_dir, args.data_file))
    dataset = TransformDataset(dataset, transform_fn)

    if len(valid_idx) > 0:
        # Deterministic split: everything outside valid_idx is training.
        # NOTE(review): `t not in valid_idx` scans a list per element —
        # O(n*m); a set would make this linear.
        train_idx = [t for t in range(len(dataset)) if t not in valid_idx]
        n_train = len(train_idx)
        train_idx.extend(valid_idx)
        train, test = chainer.datasets.split_dataset(dataset, n_train,
                                                     train_idx)
    else:
        # No fixed validation ids: fall back to a random 80/20 split.
        train, test = chainer.datasets.split_dataset_random(
            dataset, int(len(dataset) * 0.8), seed=args.seed)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size)

    # Collect the flow hyperparameters from the CLI arguments.
    num_masks = {
        'node': args.num_node_masks,
        'channel': args.num_channel_masks
    }
    mask_size = {
        'node': args.node_mask_size,
        'channel': args.channel_mask_size
    }
    num_coupling = {
        'node': args.num_node_coupling,
        'channel': args.num_channel_coupling
    }
    model_params = Hyperparameters(
        args.num_atoms,
        args.num_rels,
        len(atomic_num_list),
        num_masks=num_masks,
        mask_size=mask_size,
        num_coupling=num_coupling,
        batch_norm=args.apply_batch_norm,
        additive_transformations=args.additive_transformations,
        learn_dist=args.learn_dist,
        mlp_channels=mlp_channels,
        gnn_channels=gnn_channels)
    model = GraphNvpModel(model_params)

    if device >= 0:
        chainer.cuda.get_device(device).use()
        model.to_gpu(device)
    print('==========================================')
    if device >= 0:
        print('Using GPUs')
    print('Num Minibatch-size: {}'.format(args.batch_size))
    print('Num epoch: {}'.format(args.max_epochs))
    print('==========================================')

    os.makedirs(args.save_dir, exist_ok=True)
    model.save_hyperparams(
        os.path.join(args.save_dir, 'graphnvp-params.json'))
    opt = chainer.optimizers.Adam()
    opt.setup(model)
    updater = MolNvpUpdater(train_iter, opt, device=device, loss_func=None)
    trainer = training.Trainer(updater, (args.max_epochs, 'epoch'),
                               out=args.save_dir)
    # trainer.extend(extensions.dump_graph('log_likelihood'))

    # Debug helper: samples 100 molecules from the current model and saves
    # PNGs of the chemically valid ones per epoch.
    def print_validity(t):
        adj, x = generate_mols(model, batch_size=100, gpu=device)
        valid_mols = check_validity(adj, x, atomic_num_list,
                                    device)['valid_mols']
        mol_dir = os.path.join(args.save_dir,
                               'generated_{}'.format(t.updater.epoch))
        # mol_dir = os.path.join(args.save_dir,
        #                        'generated_{}'.format(t.updater.iteration))
        os.makedirs(mol_dir, exist_ok=True)
        for ind, mol in enumerate(valid_mols):
            save_mol_png(mol, os.path.join(mol_dir, '{}.png'.format(ind)))

    if debug:
        # trainer.extend(print_validity, trigger=(1, 'epoch'))
        trainer.extend(print_validity, trigger=(100, 'iteration'))

    # Snapshot every `save_epochs` epochs (-1 means only at the end).
    save_epochs = args.save_epochs
    if save_epochs == -1:
        save_epochs = args.max_epochs
    trainer.extend(extensions.snapshot(), trigger=(save_epochs, 'epoch'))
    # trainer.extend(extensions.PlotReport(['log_likelihood'], 'epoch',
    #                                      file_name='qm9.png'),
    #                trigger=(100, 'iteration'))
    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'log_likelihood', 'nll_x', 'nll_adj',
             'elapsed_time']))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.ProgressBar())

    # Optionally resume from a snapshot before running.
    if args.load_params == 1:
        chainer.serializers.load_npz(args.load_snapshot, trainer)
    trainer.run()
    chainer.serializers.save_npz(
        os.path.join(args.save_dir, 'graph-nvp-final.npz'), model)
def test_invalid_datasets(self):
    """Arrays of differing lengths must be rejected with ``ValueError``."""
    shorter = numpy.array([1, 2])
    longer = numpy.array([1, 2, 3])
    with pytest.raises(ValueError):
        NumpyTupleDataset(shorter, longer)
def modify_dataset_for_hinge(dataset):
    """Return a copy of ``dataset`` with the label column squeezed.

    The last component is reshaped from ``(N, 1)`` to ``(N,)`` (as expected
    by hinge-style losses); the four feature arrays are passed through
    unchanged.
    """
    atoms1, adjs1, atoms2, adjs2, labels = dataset.get_datasets()
    squeezed_labels = np.squeeze(labels, axis=1)
    return NumpyTupleDataset(atoms1, adjs1, atoms2, adjs2, squeezed_labels)
def main():
    """Train a classifier/regressor on a MoleculeNet dataset.

    Loads (or downloads) the train/valid splits, optionally standardizes
    regression labels, builds the predictor selected by ``--method``, and
    runs the chainer training loop, saving the pickled model at the end.
    """
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers

    task_type = molnet_default_config[dataset_name]['task_type']
    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }

    print('Using dataset: {}...'.format(dataset_name))

    # Cache directory and output size depend on the requested labels.
    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Load the train and validation parts of the dataset.
    filenames = [
        dataset_part_filename(p, num_data) for p in ['train', 'valid']
    ]
    paths = [os.path.join(cache_dir, f) for f in filenames]
    if all(os.path.exists(path) for path in paths):
        dataset_parts = []
        for path in paths:
            print('Loading cached dataset from {}.'.format(path))
            dataset_parts.append(NumpyTupleDataset.load(path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data,
                                                labels, method, cache_dir)

    # Scale the label values, if necessary.
    scaler = None
    if args.scale == 'standardize':
        if task_type == 'regression':
            print('Applying standard scaling to the labels.')
            # BUG FIX: fit_scaler returns (scaler, scaled_datasets). The old
            # code assigned the whole tuple to `scaler` and silently discarded
            # the standardized datasets, so training ran on unscaled labels
            # and the "scaler" passed to the predictor was a tuple.
            scaler, dataset_parts = fit_scaler(dataset_parts)
        else:
            print('Label scaling is not available for classification tasks.')
    else:
        print('No label scaling was selected.')

    # Split (possibly rescaled) parts into train/valid after scaling so the
    # standardized labels are actually used.
    train, valid = dataset_parts[0], dataset_parts[1]

    # Set up the predictor.
    predictor = set_up_predictor(method, n_unit, conv_layers, class_num,
                                 label_scaler=scaler)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize,
                                          repeat=False, shuffle=False)

    # Load metrics for the current dataset; only plain functions can be used
    # as chainer metrics here.
    metrics = molnet_default_config[dataset_name]['metrics']
    metrics_fun = {
        k: v for k, v in metrics.items()
        if isinstance(v, types.FunctionType)
    }
    loss_fun = molnet_default_config[dataset_name]['loss']

    device = chainer.get_device(args.device)
    if task_type == 'regression':
        model = Regressor(predictor, lossfun=loss_fun,
                          metrics_fun=metrics_fun, device=device)
    elif task_type == 'classification':
        model = Classifier(predictor, lossfun=loss_fun,
                           metrics_fun=metrics_fun, device=device)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    # Save model-related output to this directory.
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer, device=device,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)
    trainer.extend(
        E.Evaluator(valid_iter, model, device=device, converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # TODO: consider adding best-validation-score snapshots
    # (MinValueTrigger on RMSE/MAE for regression, MaxValueTrigger on
    # val/main/roc_auc via ROCAUCEvaluator for classification), as done in
    # the corresponding evaluation script.
    trainer.extend(AutoPrintReport())
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)
def main():
    """Train a QM9 regressor with optional label standardization.

    Loads (or preprocesses and caches) QM9, optionally standardizes the
    target labels, splits into train/validation, and runs the chainer
    training loop, pickling the trained regressor at the end.
    """
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Get the filename corresponding to the cached dataset, based on the
    # amount of data samples that need to be parsed from the original
    # dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # Select the first `num_data` samples from the dataset.
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor, labels=labels,
                                target_index=target_index)
        else:
            # Load the entire dataset.
            dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the loaded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Scale the label values, if necessary. The last component of the tuple
    # dataset holds the labels; it is replaced by its standardized version.
    if args.scale == 'standardize':
        print('Applying standard scaling to the labels.')
        scaler = StandardScaler()
        scaled_t = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (scaled_t,)))
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor (the scaler is forwarded so predictions can be
    # transformed back to the original label scale).
    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,
                                 class_num, scaler)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize,
                                          repeat=False, shuffle=False)

    # Set up the regressor. MAE/RMSE are computed on the original label
    # scale via the scaler-aware metric objects.
    device = args.gpu
    metrics_fun = {'mae': MeanAbsError(scaler=scaler),
                   'rmse': RootMeanSqrError(scaler=scaler)}
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=device)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer, device=device,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.Evaluator(valid_iter, regressor, device=device,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.PrintReport([
        'epoch', 'main/loss', 'main/mae', 'main/rmse',
        'validation/main/loss', 'validation/main/mae',
        'validation/main/rmse', 'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)
def main():
    """Evaluate a previously trained QM9 regressor.

    Rebuilds the same train/validation split used at training time, loads
    the pickled ``Regressor`` from ``--in-dir``, predicts on the validation
    split (undoing label standardization when a scaler was saved), prints a
    sample of predictions vs. ground truth, and runs an ``Evaluator``.
    """
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    # BUG FIX: the HOMO label was garbled to 'h**o', which made
    # `--label homo` impossible to select. QM9's orbital-energy labels are
    # 'homo' and 'lumo'.
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', type=str, choices=label_names,
                        default='',
                        help='target label for regression, '
                             'empty string means to predict all '
                             'property at once')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--in-dir', '-i', type=str, default='result')
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='Number of data to be parsed from parser.'
                             '-1 indicates to parse all data.')
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Dataset preparation
    dataset = None
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    if os.path.exists(dataset_cache_path):
        print('load from cache {}'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)
        if not os.path.exists(cache_dir):
            # FIX: makedirs (not mkdir) so the parent 'input/' directory is
            # also created on a fresh checkout.
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels, saved by the training script.
        with open(os.path.join(args.in_dir, 'ss.pkl'), mode='rb') as f:
            ss = pickle.load(f)
    else:
        ss = None

    # Reproduce the training-time split (same ratio and seed) so `val`
    # really is held-out data.
    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    regressor = Regressor.load_pickle(
        os.path.join(args.in_dir, args.model_filename),
        device=args.gpu)  # type: Regressor

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features
    # which consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if ss is not None:
            # Model's output is scaled by StandardScaler,
            # so we need to rescale back.
            if isinstance(x, Variable):
                x = x.data
            scaled_x = ss.inverse_transform(cuda.to_cpu(x))
            return scaled_x
        else:
            return x

    print('Predicting...')
    y_pred = regressor.predict(val, converter=extract_inputs,
                               postprocess_fn=postprocess_fn)
    print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))

    # Ground-truth labels are the last component of the concatenated batch.
    t = concat_mols(val, device=-1)[-1]
    n_eval = 10

    # Construct dataframe
    # NOTE(review): when a single --label is given, `labels` is a plain
    # string, so enumerate() iterates its characters; confirm intended.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show random 5 example's prediction/ground truth table
    print(df.sample(5))
    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # --- evaluate ---
    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
    print('Evaluating...')
    val_iterator = SerialIterator(val, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(val_iterator, regressor, converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
# Download QM9 or ZINC-250k with the preprocessor matching `data_type` and
# save it as a kekulized .npz archive in the current directory.
# NOTE(review): `data_name` and `data_type` are defined outside this
# fragment (presumably parsed from the command line) — confirm at call site.
if data_name == 'qm9':
    max_atoms = 9  # QM9 molecules have at most 9 heavy atoms.
elif data_name == 'zinc250k':
    max_atoms = 38  # ZINC-250k molecules have at most 38 heavy atoms.
else:
    raise ValueError("[ERROR] Unexpected value data_name={}".format(data_name))

# Choose the preprocessor for the requested network type; both pad/truncate
# to `max_atoms` so every sample has the same size.
if data_type == 'gcn':
    preprocessor = RSGCNPreprocessor(out_size=max_atoms)
elif data_type == 'relgcn':
    # preprocessor = GGNNPreprocessor(out_size=max_atoms, kekulize=True, return_is_real_node=False)
    preprocessor = GGNNPreprocessor(out_size=max_atoms, kekulize=True)
else:
    raise ValueError("[ERROR] Unexpected value data_type={}".format(data_type))

data_dir = "."
os.makedirs(data_dir, exist_ok=True)

if data_name == 'qm9':
    dataset = datasets.get_qm9(preprocessor)
elif data_name == 'zinc250k':
    dataset = datasets.get_zinc250k(preprocessor)
else:
    raise ValueError("[ERROR] Unexpected value data_name={}".format(data_name))

# Persist the preprocessed dataset, e.g. 'qm9_relgcn_kekulized_ggnp.npz'.
NumpyTupleDataset.save(
    os.path.join(data_dir,
                 '{}_{}_kekulized_ggnp.npz'.format(data_name, data_type)),
    dataset)
def dataset():
    """Fixture: a NumpyTupleDataset of three random float arrays (10 rows)."""
    first = numpy.random.random((10, 10))
    second = numpy.random.random((10, 8))
    third = numpy.random.random((10, 1))
    return NumpyTupleDataset(first, second, third)
def main():
    """Evaluate a trained QM9 regressor on a held-out split.

    Loads (or preprocesses and caches) QM9, loads the saved scaler and the
    pickled regressor, wraps the predictor so outputs are rescaled to the
    original label range, prints sample predictions, runs an ``Evaluator``,
    and writes the evaluation result to ``eval_result.json``.
    """
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # amount of data samples that need to be parsed from the original
    # dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            # FIX: makedirs (not mkdir) so the parent 'input/' directory is
            # also created on a fresh checkout, matching the train script.
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Load the standard scaler parameters, if necessary.
    if args.scale == 'standardize':
        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
        print('Loading scaler parameters from {}.'.format(scaler_path))
        with open(scaler_path, mode='rb') as f:
            scaler = pickle.load(f)
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and testing (same ratio/seed as at
    # training time, so `test` is truly held out).
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)

    # Replace the default predictor with one that scales the output labels.
    scaled_predictor = ScaledGraphConvPredictor(regressor.predictor)
    scaled_predictor.scaler = scaler
    regressor.predictor = scaled_predictor

    # This callback function extracts only the inputs and discards the
    # labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test, converter=extract_inputs)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    n_eval = 10

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))
    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
                            device=args.gpu)()

    # Prevents the loss function from becoming a cupy.core.core.ndarray
    # object when using the GPU. This hack will be removed as soon as the
    # cause of the issue is found and properly fixed.
    # FIX: numpy.asscalar was deprecated in NumPy 1.16 and removed in 1.23;
    # ndarray.item() is the supported equivalent.
    loss = numpy.asarray(cuda.to_cpu(eval_result['main/loss'])).item()
    eval_result['main/loss'] = loss
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)
def main():
    """Evaluate a trained MoleculeNet model on the test split.

    Loads the cached (or freshly downloaded) test split, restores the
    pickled classifier/regressor matching the dataset's task type, runs an
    ``Evaluator`` (plus ROC-AUC for classification), and writes the results
    as JSON files next to the model.
    """
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    # The cache directory must match the one used by the training script.
    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)

    # Model-related data is stored in this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }

    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print("model_path=" + model_path)
    print('Loading model weights from {}...'.format(model_path))

    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Re-load the best-validation score snapshot
    # serializers.load_npz(os.path.join(
    #     model_dir, "best_val_" + model_filename[task_type]), model)

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, model, converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Add more stats
    if task_type == 'regression':
        # loss = cuda.to_cpu(numpy.array(eval_result['main/loss']))
        # eval_result['main/loss'] = loss
        # Convert metric values to native Python floats so they serialize
        # cleanly to JSON below.
        for k, v in eval_result.items():
            eval_result[k] = float(v)
    elif task_type == "classification":
        # For Classifier, we do not equip the model with a ROC-AUC
        # evaluation function, so use a separate ROC-AUC Evaluator here.
        rocauc_result = ROCAUCEvaluator(test_iterator, model,
                                        converter=concat_mols,
                                        device=args.gpu,
                                        eval_func=model.predictor,
                                        name='test', ignore_labels=-1)()
        print('ROCAUC Evaluation result: ', rocauc_result)
        save_json(os.path.join(model_dir, 'rocauc_result.json'),
                  rocauc_result)
    else:
        print('[WARNING] unknown task_type {}.'.format(task_type))

    # Save the evaluation results.
    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)
def main():
    """Train a regressor on a user-supplied CSV dataset.

    Parses SMILES + label columns from ``--datafile``, optionally
    standardizes labels, trains the selected graph-convolution model, and
    saves the pickled regressor (and scaler, if any) under ``--out``.
    """
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression
    # task: labels are converted to float32 arrays.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    # Scale the label values, if necessary.
    # NOTE(review): this rebinds `labels` from the label *names* to the
    # scaled label *array*; the names are not needed afterwards, but the
    # shadowing is easy to trip over when editing.
    if args.scale == 'standardize':
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (labels, )))
    else:
        scaler = None

    # Split the dataset into training and validation (validation part is
    # unused here; only the training split is consumed).
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, _ = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(args.method, args.unit_num,
                                 args.conv_layers, class_num)

    # Set up the iterator.
    train_iter = SerialIterator(train, args.batchsize)

    # Set up the regressor. Metrics report errors on the original label
    # scale via the scaler-aware metric objects.
    metrics_fun = {
        'mean_abs_error': MeanAbsError(scaler=scaler),
        'root_mean_sqr_error': RootMeanSqrError(scaler=scaler)
    }
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    print('Training...')
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/mean_abs_error',
            'main/root_mean_sqr_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)

    # Save the standard scaler's parameters.
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=args.protocol)
def main():
    """Train a regression model on a user-supplied CSV dataset.

    Older variant of the own-dataset training script: builds the network
    explicitly per method instead of via ``set_up_predictor``, saves the
    model with ``serializers.save_npz``, and finishes with an example
    prediction on benzene.
    """
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('--datafile', type=str, default='dataset.csv')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', nargs='+',
                        default=['value1', 'value2'],
                        help='target label for regression')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--protocol', type=int, default=2)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        sys.exit("Error: No target label is specified.")

    # Dataset preparation
    # Postprocess is required for regression task: labels become float32.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    # NOTE: rebinds `parser` (previously the argparse parser) — it is no
    # longer needed at this point.
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)["dataset"]

    if args.scale == 'standardize':
        # Standard Scaler for labels. NOTE: rebinds `labels` from the label
        # names to the scaled label array.
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (labels, )))
    else:
        # Not use scaler
        scaler = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network: build the graph-convolution predictor selected by --method.
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        # SchNet predicts the target directly, so no MLP head is attached.
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit,
                   n_layers=conv_layers), None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val, args.batchsize, repeat=False,
                                        shuffle=False)

    # The metric reports absolute error on the original label scale.
    regressor = Regressor(
        model, lossfun=F.mean_squared_error,
        metrics_fun={'abs_error': ScaledAbsError(scaler=scaler)},
        device=args.gpu)

    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(
        E.Evaluator(val_iter, regressor, device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # Note that original scale absolute errors are reported in
    # (validation/)main/abs_error
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/abs_error',
            'validation/main/loss', 'validation/main/abs_error',
            'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save regressor's parameters ---
    protocol = args.protocol
    model_path = os.path.join(args.out, 'model.npz')
    print('saving trained model to {}'.format(model_path))
    serializers.save_npz(model_path, regressor)
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=protocol)

    # Example of prediction using trained model: benzene.
    smiles = 'c1ccccc1'
    mol = Chem.MolFromSmiles(smiles)
    preprocessor = preprocess_method_dict[method]()
    standardized_smiles, mol = preprocessor.prepare_smiles_and_mol(mol)
    input_features = preprocessor.get_input_features(mol)
    atoms, adjs = concat_mols([input_features], device=args.gpu)
    prediction = model(atoms, adjs).data[0]
    # Undo label standardization so the printed values are in the original
    # units.
    if scaler is not None:
        prediction = scaler.inverse_transform(prediction)
    print('Prediction for {}:'.format(smiles))
    for i, label in enumerate(args.label):
        print('{}: {}'.format(label, prediction[i]))
def main(method, labels, unit_num, conv_layers, class_num, n_layers,
         dropout_ratio, model_path, save_path):
    """Compare saliency methods (VanillaGrad / SmoothGrad / BayesGrad) on
    pyridine-containing validation molecules.

    Loads a trained classifier, selects validation molecules that contain a
    pyridine substructure, computes atom-level saliency with the calculator
    chosen by the (module-level) ``args.calculator``, measures how well the
    top-saliency atoms recall the known pyridine atom positions, and plots
    a precision/recall curve for the three methods.

    NOTE(review): reads the global ``args`` (for ``args.calculator``) even
    though everything else is passed as parameters — confirm intended.
    """
    # Dataset preparation
    train, val, test, train_smiles, val_smiles, test_smiles = \
        data.load_dataset(method, labels)

    # --- model preparation ---
    model = predictor.build_predictor(
        method, unit_num, conv_layers, class_num, dropout_ratio, n_layers)
    classifier = L.Classifier(model, lossfun=F.sigmoid_cross_entropy,
                              accfun=F.binary_accuracy)

    print('Loading model parameter from ', model_path)
    serializers.load_npz(model_path, model)

    target_dataset = val
    target_smiles = val_smiles

    # Find validation molecules containing a pyridine ring.
    val_mols = [Chem.MolFromSmiles(smi) for smi in tqdm(val_smiles)]
    pyridine_mol = Chem.MolFromSmarts(PYRIDINE_SMILES)
    pyridine_index = np.where(np.array([mol.HasSubstructMatch(pyridine_mol)
                                        for mol in val_mols]) == True)
    val_pyridine_mols = np.array(val_mols)[pyridine_index]

    # It only extracts one substructure, not expected behavior
    # val_pyridine_pos = [set(mol.GetSubstructMatch(pi)) for mol in val_pyridine_mols]
    def flatten_tuple(x):
        # Flatten a tuple of match tuples into a single flat list of atom
        # indices.
        return [element for tupl in x for element in tupl]

    # Atom indices belonging to *all* pyridine matches in each molecule —
    # the "ground truth" positions the saliency should highlight.
    val_pyridine_pos = [flatten_tuple(mol.GetSubstructMatches(pyridine_mol))
                        for mol in val_pyridine_mols]
    # print('pyridine_index', pyridine_index)
    # print('val_pyridine_mols', val_pyridine_mols.shape)
    # print('val_pyridine_pos', val_pyridine_pos)
    # print('val_pyridine_pos length', [len(k) for k in val_pyridine_pos])

    # NOTE: 'pyrigine' spelling kept as in the original code.
    pyrigine_dataset = NumpyTupleDataset(
        *target_dataset.features[pyridine_index, :])
    pyrigine_smiles = target_smiles[pyridine_index]
    print('pyrigine_dataset', len(pyrigine_dataset), len(pyrigine_smiles))

    atoms = pyrigine_dataset.features[:, 0]
    num_atoms = [len(a) for a in atoms]

    def clip_original_size(saliency, num_atoms):
        """`saliency` array is 0 padded, this method align to have original
        molecule's length
        """
        assert len(saliency) == len(num_atoms)
        saliency_list = []
        for i in range(len(saliency)):
            saliency_list.append(saliency[i, :num_atoms[i]])
        return saliency_list

    def preprocess_fun(*inputs):
        # HACKING for now: embed atoms before the saliency calculator runs
        # so gradients can be taken w.r.t. the embedding.
        atom, adj, t = inputs
        atom_embed = classifier.predictor.graph_conv.embed(atom)
        return atom_embed, adj, t

    def eval_fun(*inputs):
        atom_embed, adj, t = inputs
        prob = classifier.predictor(atom_embed, adj)
        out = F.sum(prob)
        return out

    # Saliency calculator selection (from the module-level `args`).
    calculator_method = args.calculator
    print('calculator method', calculator_method)
    if calculator_method == 'gradient':
        # option1: Gradient
        calculator = GradientCalculator(
            classifier, eval_fun=eval_fun,
            # target_key='embed', eval_key='out',
            target_key=0,
            # multiply_target=True  # this will calculate grad * input
        )
    elif calculator_method == 'integrated_gradients':
        # option2: IntegratedGradients
        calculator = IntegratedGradientsCalculator(
            classifier, eval_fun=eval_fun,
            # target_key='embed', eval_key='out',
            target_key=0, steps=10
        )
    elif calculator_method == 'occlusion':
        # option3: Occlusion
        def eval_fun_occlusion(*inputs):
            atom_embed, adj, t = inputs
            prob = classifier.predictor(atom_embed, adj)
            # Do not take sum, instead return batch-wise score
            out = F.sigmoid(prob)
            return out
        calculator = OcclusionCalculator(
            classifier, eval_fun=eval_fun_occlusion,
            # target_key='embed', eval_key='out',
            target_key=0, slide_axis=1
        )
    else:
        raise ValueError("[ERROR] Unexpected value calculator_method={}"
                         .format(calculator_method))

    M = 100  # number of stochastic samples for Smooth/BayesGrad
    num = 20
    rates = np.linspace(0.1, 1, num=num)  # recall thresholds to sweep
    print('M', M)

    # --- VanillaGrad ---
    saliency_arrays = calculator.compute_vanilla(
        pyrigine_dataset, converter=concat_mols, preprocess_fn=preprocess_fun)
    saliency = calculator.transform(
        saliency_arrays, ch_axis=3, method='square')
    # saliency_arrays -> M, batch_size, max_atom, ch_dim
    # print('saliency_arrays', saliency_arrays.shape)
    # saliency -> batch_size, max_atom
    # print('saliency', saliency.shape)
    saliency_vanilla = clip_original_size(saliency, num_atoms)

    # recall & precision
    vanilla_recall, vanilla_precision = calc_recall_precision(
        saliency_vanilla, rates, val_pyridine_pos)
    print('vanilla_recall', vanilla_recall)
    print('vanilla_precision', vanilla_precision)

    # --- SmoothGrad ---
    saliency_arrays = calculator.compute_smooth(
        pyrigine_dataset, converter=concat_mols, preprocess_fn=preprocess_fun,
        M=M, mode='absolute', scale=0.15  # previous implementation
        # mode='relative', scale=0.05
    )
    saliency = calculator.transform(
        saliency_arrays, ch_axis=3, method='square')
    saliency_smooth = clip_original_size(saliency, num_atoms)

    # recall & precision
    smooth_recall, smooth_precision = calc_recall_precision(
        saliency_smooth, rates, val_pyridine_pos)
    print('smooth_recall', smooth_recall)
    print('smooth_precision', smooth_precision)

    # --- BayesGrad ---
    # bayes grad is calculated by compute_vanilla with train=True
    # (dropout stays active, giving samples from the posterior).
    saliency_arrays = calculator.compute_vanilla(
        pyrigine_dataset, converter=concat_mols,
        preprocess_fn=preprocess_fun, M=M, train=True)
    saliency = calculator.transform(
        saliency_arrays, ch_axis=3, method='square', lam=0)
    saliency_bayes = clip_original_size(saliency, num_atoms)

    bayes_recall, bayes_precision = calc_recall_precision(
        saliency_bayes, rates, val_pyridine_pos)
    print('bayes_recall', bayes_recall)
    print('bayes_precision', bayes_precision)

    # Plot the precision/recall curves for the three saliency methods.
    plt.figure(figsize=(7, 5), dpi=200)
    plt.plot(vanilla_recall, vanilla_precision, 'k-', color='blue',
             label='VanillaGrad')
    plt.plot(smooth_recall, smooth_precision, 'k-', color='green',
             label='SmoothGrad')
    plt.plot(bayes_recall, bayes_precision, 'k-', color='red',
             label='BayesGrad(Ours)')
    plt.axhline(y=vanilla_precision[-1], color='gray', linestyle='--')
    plt.legend()
    plt.xlabel("recall")
    plt.ylabel("precision")
    if save_path:
        print('saved to ', save_path)
        plt.savefig(save_path)
        # plt.savefig('artificial_pr.eps')
    else:
        plt.show()
def test_get_datasets(self, data):
    """get_datasets() must round-trip the arrays given to the constructor."""
    wrapped = NumpyTupleDataset(*data)
    returned = wrapped.get_datasets()
    assert len(returned) == len(data)
    # Element-wise comparison of every stored array against its input.
    for actual, expected in zip(returned, data):
        numpy.testing.assert_array_equal(actual, expected)
def main():
    """Train a predictor on a MoleculeNet dataset.

    Loads the train/valid parts of the dataset from the local cache (or
    downloads them via ``download_entire_dataset``), builds the predictor
    selected by ``--method``, wraps it in a ``Regressor`` or ``Classifier``
    according to the dataset's task type, runs the Chainer training loop,
    snapshots the best-validation-score model, and finally pickles the
    trained model into the output directory.

    Raises:
        ValueError: If the dataset's task type is unknown, or a regression
            dataset defines neither an RMSE nor an MAE validation metric.
        TypeError: If a configured metric is neither a plain function nor a
            ``BatchEvaluator`` subclass.
    """
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers

    task_type = molnet_default_config[dataset_name]['task_type']
    model_filename = {'classification': 'classifier.pkl',
                      'regression': 'regressor.pkl'}

    print('Using dataset: {}...'.format(dataset_name))

    # Cache location and output width depend on whether specific target
    # labels were requested or all tasks of the dataset are used.
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}_{}'.format(dataset_name,
                                                            method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_{}_all'.format(dataset_name,
                                                             method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Load the train and validation parts of the dataset, preferring the
    # on-disk cache; fall back to a (full) download on any cache miss.
    filenames = [dataset_part_filename(p, num_data)
                 for p in ['train', 'valid']]
    paths = [os.path.join(cache_dir, f) for f in filenames]
    if all(os.path.exists(path) for path in paths):
        dataset_parts = []
        for path in paths:
            print('Loading cached dataset from {}.'.format(path))
            dataset_parts.append(NumpyTupleDataset.load(path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data,
                                                labels, method, cache_dir)
    train, valid = dataset_parts[0], dataset_parts[1]

    # TODO: Standardize regression labels before training (see `fit_scaler`);
    # the label-scaling path is currently disabled.

    # Set up the predictor.
    predictor = set_up_predictor(method, n_unit, conv_layers, class_num)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize,
                                          repeat=False, shuffle=False)

    # Load metrics for the current dataset.  Plain functions are computed
    # batch-wise by the model itself; `BatchEvaluator` subclasses are
    # attached to the trainer as extensions further below.
    metrics = molnet_default_config[dataset_name]['metrics']
    metrics_fun = {k: v for k, v in metrics.items()
                   if isinstance(v, types.FunctionType)}
    loss_fun = molnet_default_config[dataset_name]['loss']

    if task_type == 'regression':
        model = Regressor(predictor, lossfun=loss_fun,
                          metrics_fun=metrics_fun, device=args.gpu)
        # TODO: Use standard scaler for regression task
    elif task_type == 'classification':
        model = Classifier(predictor, lossfun=loss_fun,
                           metrics_fun=metrics_fun, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    # Save model-related output to this directory.
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)
    trainer.extend(E.Evaluator(valid_iter, model, device=args.gpu,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    # Report various metrics.
    print_report_targets = ['epoch', 'main/loss', 'validation/main/loss']
    for metric_name, metric_fun in metrics.items():
        if isinstance(metric_fun, types.FunctionType):
            print_report_targets.append('main/' + metric_name)
            print_report_targets.append('validation/main/' + metric_name)
        elif issubclass(metric_fun, BatchEvaluator):
            trainer.extend(metric_fun(valid_iter, model, device=args.gpu,
                                      eval_func=predictor,
                                      converter=concat_mols, name='val',
                                      raise_value_error=False))
            print_report_targets.append('val/main/' + metric_name)
        else:
            # BUG FIX: report the type of the offending metric, not of the
            # whole `metrics_fun` dict as the original did.
            raise TypeError('{} is not a supported metrics function.'
                            .format(type(metric_fun)))
    print_report_targets.append('elapsed_time')

    # Snapshot the model with the best validation score.
    if task_type == 'regression':
        # BUG FIX: the original tested `'RMSE' in metric_name`, reusing
        # whichever key the loop above happened to visit last (dict-order
        # dependent).  Check all configured metric names instead.
        if any('RMSE' in name for name in metrics):
            trainer.extend(
                E.snapshot_object(model,
                                  "best_val_" + model_filename[task_type]),
                trigger=training.triggers.MinValueTrigger(
                    'validation/main/RMSE'))
        elif any('MAE' in name for name in metrics):
            trainer.extend(
                E.snapshot_object(model,
                                  "best_val_" + model_filename[task_type]),
                trigger=training.triggers.MinValueTrigger(
                    'validation/main/MAE'))
        else:
            # BUG FIX: `assert(False)` is stripped under `python -O`; raise
            # an explicit, descriptive error instead.
            raise ValueError('No RMSE/MAE validation metric defined for '
                             'dataset ({}).'.format(dataset_name))
    elif task_type == 'classification':
        train_eval_iter = iterators.SerialIterator(
            train, args.batchsize, repeat=False, shuffle=False)
        trainer.extend(ROCAUCEvaluator(
            train_eval_iter, predictor, eval_func=predictor,
            device=args.gpu, converter=concat_mols, name='train',
            pos_labels=1, ignore_labels=-1, raise_value_error=False))
        # extension name='validation' is already used by `Evaluator`,
        # instead extension name `val` is used.
        trainer.extend(ROCAUCEvaluator(
            valid_iter, predictor, eval_func=predictor,
            device=args.gpu, converter=concat_mols, name='val',
            pos_labels=1, ignore_labels=-1))
        print_report_targets.append('train/main/roc_auc')
        print_report_targets.append('validation/main/loss')
        print_report_targets.append('val/main/roc_auc')
        trainer.extend(
            E.snapshot_object(model,
                              "best_val_" + model_filename[task_type]),
            trigger=training.triggers.MaxValueTrigger('val/main/roc_auc'))
    else:
        # NOTE(review): unreachable in practice — an unknown task type
        # already raised ValueError when the model was constructed above.
        raise NotImplementedError(
            'Not implemented task_type = {}'.format(task_type))

    trainer.extend(E.PrintReport(print_report_targets))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)