def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Load the standard scaler parameters, if necessary.
    if args.scale == 'standardize':
        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
        print('Loading scaler parameters from {}.'.format(scaler_path))
        with open(scaler_path, mode='rb') as f:
            scaler = pickle.load(f)
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)

    # Replace the default predictor with one that scales the output labels.
    scaled_predictor = ScaledGraphConvPredictor(regressor.predictor)
    scaled_predictor.scaler = scaler
    regressor.predictor = scaled_predictor

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test, converter=extract_inputs)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    n_eval = 10

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))
    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()

    # Prevents the loss function from becoming a cupy.core.core.ndarray object
    # when using the GPU. This hack will be removed as soon as the cause of
    # the issue is found and properly fixed.
    loss = numpy.asscalar(cuda.to_cpu(eval_result['main/loss']))
    eval_result['main/loss'] = loss
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)
Ejemplo n.º 2
0
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        scaled_t = scaler.transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_t, )))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test,
                               converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    original_t = scaler.inverse_transform(t)

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): original_t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,
                                                          target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'.format(
            label_name, y_pred[:n_eval, target_label],
            original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate mean abs error for each label
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        eval_result.update({l: mae[i]})
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
Ejemplo n.º 3
0
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()

        if num_data >= 0:
            # Select the first `num_data` samples from the dataset.
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor, labels=labels,
                                target_index=target_index)
        else:
            # Load the entire dataset.
            dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the laded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        print('Applying standard scaling to the labels.')
        scaler = StandardScaler()
        scaled_t = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (scaled_t,)))
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,
                                 class_num, scaler)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False,
                                          shuffle=False)

    # Set up the regressor.
    device = args.gpu
    metrics_fun = {'mae': MeanAbsError(scaler=scaler),
                   'rmse': RootMeanSqrError(scaler=scaler)}
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=device)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer, device=device,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.Evaluator(valid_iter, regressor, device=device,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.PrintReport([
        'epoch', 'main/loss', 'main/mae', 'main/rmse', 'validation/main/loss',
        'validation/main/mae', 'validation/main/rmse', 'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)
Ejemplo n.º 4
0
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()

        if num_data >= 0:
            # Select the first `num_data` samples from the dataset.
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor,
                                labels=labels,
                                target_index=target_index)
        else:
            # Load the entire dataset.
            dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the laded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        print('Fit StandardScaler to the labels.')
        scaler = StandardScaler()
        scaler.fit(dataset.get_datasets()[-1])
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,
                                 class_num, scaler)

    # Set up the regressor.
    device = chainer.get_device(args.device)
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    regressor = Regressor(predictor,
                          lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun,
                          device=device)

    print('Training...')
    run_train(regressor,
              train,
              valid=valid,
              batch_size=args.batchsize,
              epoch=args.epoch,
              out=args.out,
              extensions_list=None,
              device=device,
              converter=concat_mols,
              resume_path=None)

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)
Ejemplo n.º 5
0
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for regression, '
                        'empty string means to predict all '
                        'property at once')
    parser.add_argument('--scale',
                        type=str,
                        choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--in-dir', '-i', type=str, default='result')
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data to be parsed from parser.'
                        '-1 indicates to parse all data.')
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        # class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))
        # class_num = len(labels)

    # Dataset preparation
    dataset = None

    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    if os.path.exists(dataset_cache_path):
        print('load from cache {}'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels
        with open(os.path.join(args.in_dir, 'ss.pkl'), mode='rb') as f:
            ss = pickle.load(f)
    else:
        ss = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    regressor = Regressor.load_pickle(os.path.join(args.in_dir,
                                                   args.model_filename),
                                      device=args.gpu)  # type: Regressor

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features which
    # consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if ss is not None:
            # Model's output is scaled by StandardScaler,
            # so we need to rescale back.
            if isinstance(x, Variable):
                x = x.data
                scaled_x = ss.inverse_transform(cuda.to_cpu(x))
                return scaled_x
        else:
            return x

    print('Predicting...')
    y_pred = regressor.predict(val,
                               converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))

    t = concat_mols(val, device=-1)[-1]
    n_eval = 10

    # Construct dataframe
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show random 5 example's prediction/ground truth table
    print(df.sample(5))

    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # --- evaluate ---
    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
    print('Evaluating...')
    val_iterator = SerialIterator(val, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(val_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
Ejemplo n.º 6
0
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for regression, '
                        'empty string means to predict all '
                        'property at once')
    parser.add_argument('--scale',
                        type=str,
                        choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Dataset preparation
    dataset = None

    if os.path.exists(cache_dir):
        print('load from cache {}'.format(cache_dir))
        dataset = NumpyTupleDataset.load(os.path.join(cache_dir, 'data.npz'))
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)
        os.makedirs(cache_dir)
        NumpyTupleDataset.save(os.path.join(cache_dir, 'data.npz'), dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels
        ss = StandardScaler()
        labels = ss.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*dataset.get_datasets()[:-1], labels)

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)

    def scaled_abs_error(x0, x1):
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        if args.scale == 'standardize':
            scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0))
            scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1))
            diff = scaled_x0 - scaled_x1
        elif args.scale == 'none':
            diff = cuda.to_cpu(x0) - cuda.to_cpu(x1)
        return numpy.mean(numpy.absolute(diff), axis=0)[0]

    classifier = L.Classifier(model,
                              lossfun=F.mean_squared_error,
                              accfun=scaled_abs_error)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter,
                    classifier,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()
Ejemplo n.º 7
0
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for regression, '
                        'empty string means to predict all '
                        'property at once')
    parser.add_argument('--scale',
                        type=str,
                        choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--protocol', type=int, default=2)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data to be parsed from parser.'
                        '-1 indicates to parse all data.')
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Dataset preparation
    dataset = None

    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    if os.path.exists(dataset_cache_path):
        print('load from cache {}'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # only use first 100 for debug
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor,
                                labels=labels,
                                target_index=target_index)
        else:
            dataset = D.get_qm9(preprocessor, labels=labels)
        os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels
        ss = StandardScaler()
        labels = ss.fit_transform(dataset.get_datasets()[-1])
    else:
        ss = None
    dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (labels, )))

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)

    regressor = Regressor(
        model,
        lossfun=F.mean_squared_error,
        metrics_fun={'abs_error': ScaledAbsError(scale=args.scale, ss=ss)},
        device=args.gpu)

    optimizer = O.Adam()
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter,
                    regressor,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss',
            'validation/main/abs_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save regressor & standardscaler ---
    protocol = args.protocol
    regressor.save_pickle(os.path.join(args.out, args.model_filename),
                          protocol=protocol)
    if args.scale == 'standardize':
        with open(os.path.join(args.out, 'ss.pkl'), mode='wb') as f:
            pickle.dump(ss, f, protocol=protocol)
Ejemplo n.º 8
0
def main():
    # Parse the arguments.
    args = parse_arguments()
    device = args.gpu

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=device)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        original_t = dataset.get_datasets()[-1]
        if args.gpu >= 0:
            scaled_t = cuda.to_cpu(scaler.transform(
                cuda.to_gpu(original_t)))
        else:
            scaled_t = scaler.transform(original_t)

        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_t,)))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(
        test, converter=extract_inputs,
        postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=device)[-1]
    original_t = cuda.to_cpu(scaler.inverse_transform(t))

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({'y_pred_{}'.format(l): y_pred[:, i],
                        't_{}'.format(l): original_t[:, i], })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,
                                                          target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'
              .format(label_name, y_pred[:n_eval, target_label],
                      original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
                            device=device)()
    print('Evaluation result: ', eval_result)
    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate mean abs error for each label
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        eval_result.update({l: mae[i]})
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
Ejemplo n.º 9
0
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()

        if num_data >= 0:
            # Select the first `num_data` samples from the dataset.
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor, labels=labels,
                                target_index=target_index)
        else:
            # Load the entire dataset.
            dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the laded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        print('Applying standard scaling to the labels.')
        scaler = StandardScaler()
        scaled_t = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (scaled_t,)))
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,
                                 class_num, scaler)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False,
                                          shuffle=False)

    # Set up the regressor.
    device = args.gpu
    metrics_fun = {'mae': MeanAbsError(scaler=scaler),
                   'rmse': RootMeanSqrError(scaler=scaler)}
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=device)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer, device=device,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.Evaluator(valid_iter, regressor, device=device,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.PrintReport([
        'epoch', 'main/loss', 'main/mae', 'main/rmse', 'validation/main/loss',
        'validation/main/mae', 'validation/main/rmse', 'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)