Example 1
def test_concat_mols_2d_gpu(data_2d, data_2d_expect):
    result = concat_mols(data_2d, device=0)
    assert chainer.cuda.get_device_from_array(result[0]).id == 0
    assert chainer.cuda.get_device_from_array(result[1]).id == 0
    assert numpy.array_equal(chainer.cuda.to_cpu(result[0]),
                             data_2d_expect[0])
    assert numpy.array_equal(chainer.cuda.to_cpu(result[1]),
                             data_2d_expect[1])
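The data_2d and data_2d_expect fixtures are not shown on this page. Below is a minimal sketch of what they might look like, assuming concat_mols zero-pads variable-sized per-molecule arrays and stacks them along a new batch axis; the names, shapes, and values are illustrative assumptions, not the original fixtures.

import numpy
import pytest


@pytest.fixture
def data_2d():
    # Two samples, each a tuple of per-molecule 2-D arrays of different sizes.
    return [
        (numpy.arange(6, dtype=numpy.float32).reshape(2, 3),
         numpy.ones((2, 2), dtype=numpy.float32)),
        (numpy.arange(12, dtype=numpy.float32).reshape(3, 4),
         numpy.ones((3, 3), dtype=numpy.float32)),
    ]


@pytest.fixture
def data_2d_expect(data_2d):
    # Zero-pad each array to the per-position maximum shape, then stack along
    # a new leading batch axis -- the layout concat_mols is expected to return.
    expects = []
    for pos in range(2):
        arrays = [sample[pos] for sample in data_2d]
        max_shape = numpy.max([a.shape for a in arrays], axis=0)
        padded = numpy.zeros((len(arrays),) + tuple(max_shape),
                             dtype=numpy.float32)
        for i, a in enumerate(arrays):
            padded[i, :a.shape[0], :a.shape[1]] = a
        expects.append(padded)
    return tuple(expects)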
Example 2
 def eval_func(atoms_1, adj_1, atoms_2, adj_2):
     sample = [
         (atoms_1, adj_1),
         (atoms_2, adj_2),
     ]
     sample = concat_mols(sample)
     atoms_1, adj_1 = sample[0]
     atoms_2, adj_2 = sample[1]
     print(atoms_1, adj_1)
     print('shape 1:', atoms_1.shape, adj_1.shape)
     print('shape 2:', atoms_2.shape, adj_2.shape)
     pred, _ = model.predictor.predict(atoms_1, adj_1, atoms_2, adj_2)
     return pred
Example 3
    def predict(self, *args, batchsize=32, device=-1):
        if device >= 0:
            chainer.cuda.get_device_from_id(device).use()
            self.to_gpu()  # Copy the model to the GPU

        # TODO: Not tested yet; check behavior.
        data = args[0]
        y_list = []
        for i in range(0, len(data), batchsize):
            atoms, adjs = concat_mols(data[i:i + batchsize], device=device)[:2]
            y = self._predict(atoms, adjs)
            y_list.append(cuda.to_cpu(y.data))
        y_array = numpy.concatenate(y_list, axis=0)
        return y_array
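A minimal usage sketch for the predict method above, assuming model is an instance of the class that defines it and dataset is a dataset of (atoms, adjs, ...) tuples such as a NumpyTupleDataset; the names and values here are illustrative assumptions.

# Run batched inference on GPU 0; predictions are returned as one CPU NumPy array.
y_array = model.predict(dataset, batchsize=64, device=0)
print(y_array.shape)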
Example 4
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # number of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        scaled_t = scaler.transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_t, )))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test,
                               converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    original_t = scaler.inverse_transform(t)

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): original_t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,
                                                          target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'.format(
            label_name, y_pred[:n_eval, target_label],
            original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate mean abs error for each label
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        eval_result.update({l: mae[i]})
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
Example 5
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # number of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Load the standard scaler parameters, if necessary.
    if args.scale == 'standardize':
        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
        print('Loading scaler parameters from {}.'.format(scaler_path))
        with open(scaler_path, mode='rb') as f:
            scaler = pickle.load(f)
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)

    # Replace the default predictor with one that scales the output labels.
    scaled_predictor = ScaledGraphConvPredictor(regressor.predictor)
    scaled_predictor.scaler = scaler
    regressor.predictor = scaled_predictor

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test, converter=extract_inputs)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    n_eval = 10

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))
    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()

    # Prevents the loss value from becoming a cupy.core.core.ndarray object
    # when using the GPU. This hack will be removed as soon as the cause of
    # the issue is found and properly fixed.
    loss = numpy.asscalar(cuda.to_cpu(eval_result['main/loss']))
    eval_result['main/loss'] = loss
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)
Example 6
                                   gwm=args.gwm)
    chainer.serializers.load_npz(args.g_action, g_action)

    # chainer.cuda.get_device_from_id(0).use()
    # g_stop.to_gpu()

    valid_raw = uspto_dataset.read_data(args.test_path)
    valid_dataset = uspto_dataset.USPTO_dataset(valid_raw)
    valid_iter = SerialIterator(valid_dataset, 20, repeat=False, shuffle=False)

    one_part_acc = []
    for batch in valid_iter:
        # get one batch of test data
        f_atoms, f_bonds, super_node_x, \
        atom_label, mask_reagents, mask_reactants_reagents, pair_label, mask_pair_select, \
        action, step_num, \
        stop_idx, \
        sample_index = concat_mols(batch, device=-1)

        atom_label -= 1
        mask_reagents -= 2
        mask_reactants_reagents -= 2
        action -= 1

        with chainer.using_config('train', False):
            inference(g_stop, g_atom, g_pair, g_action, f_atoms, f_bonds,
                      super_node_x, atom_label, mask_reagents,
                      mask_reactants_reagents, pair_label, mask_pair_select,
                      action, step_num, stop_idx, sample_index, valid_raw,
                      args.out)
Example 7
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('--datafile', type=str, default='dataset.csv')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        nargs='+',
                        default=['value1', 'value2'],
                        help='target label for regression')
    parser.add_argument('--scale',
                        type=str,
                        choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--protocol', type=int, default=2)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        sys.exit("Error: No target label is specified.")

    # Dataset preparation
    # Postprocessing is required for the regression task
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels,
                           smiles_col='SMILES')
    dataset = parser.parse(args.datafile)["dataset"]

    if args.scale == 'standardize':
        # Standard Scaler for labels
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (labels, )))
    else:
        # Do not use a scaler
        scaler = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val,
                                        args.batchsize,
                                        repeat=False,
                                        shuffle=False)

    regressor = Regressor(
        model,
        lossfun=F.mean_squared_error,
        metrics_fun={'abs_error': ScaledAbsError(scaler=scaler)},
        device=args.gpu)

    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter,
                    regressor,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # Note that absolute errors in the original scale are reported as
    # (validation/)main/abs_error
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss',
            'validation/main/abs_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save regressor's parameters ---
    protocol = args.protocol
    model_path = os.path.join(args.out, 'model.npz')
    print('saving trained model to {}'.format(model_path))
    serializers.save_npz(model_path, regressor)
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=protocol)

    # Example of prediction using trained model
    smiles = 'c1ccccc1'
    mol = Chem.MolFromSmiles(smiles)
    preprocessor = preprocess_method_dict[method]()
    standardized_smiles, mol = preprocessor.prepare_smiles_and_mol(mol)
    input_features = preprocessor.get_input_features(mol)
    atoms, adjs = concat_mols([input_features], device=args.gpu)
    prediction = model(atoms, adjs).data[0]
    if scaler is not None:
        prediction = scaler.inverse_transform(prediction)
    print('Prediction for {}:'.format(smiles))
    for i, label in enumerate(args.label):
        print('{}: {}'.format(label, prediction[i]))
Example 8
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for regression, '
                        'empty string means to predict all '
                        'properties at once')
    parser.add_argument('--scale',
                        type=str,
                        choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--in-dir', '-i', type=str, default='result')
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data samples to be parsed by the '
                        'parser. -1 indicates parsing all data.')
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        # class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))
        # class_num = len(labels)

    # Dataset preparation
    dataset = None

    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    if os.path.exists(dataset_cache_path):
        print('load from cache {}'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels
        with open(os.path.join(args.in_dir, 'ss.pkl'), mode='rb') as f:
            ss = pickle.load(f)
    else:
        ss = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    regressor = Regressor.load_pickle(os.path.join(args.in_dir,
                                                   args.model_filename),
                                      device=args.gpu)  # type: Regressor

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features which
    # consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if ss is not None:
            # Model's output is scaled by StandardScaler,
            # so we need to rescale back.
            if isinstance(x, Variable):
                x = x.data
            scaled_x = ss.inverse_transform(cuda.to_cpu(x))
            return scaled_x
        else:
            return x

    print('Predicting...')
    y_pred = regressor.predict(val,
                               converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))

    t = concat_mols(val, device=-1)[-1]
    n_eval = 10

    # Construct dataframe
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # --- evaluate ---
    # To calculate loss/accuracy, we can use `Evaluator` or `ROCAUCEvaluator`.
    print('Evaluating...')
    val_iterator = SerialIterator(val, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(val_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
Example 9
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('datafile', type=str)
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', nargs='+',
                        help='target label for regression')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize', help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        sys.exit("Error: No target label is specified.")

    # Dataset preparation
    # Postprocessing is required for the regression task
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)["dataset"]

    if args.scale == 'standardize':
        # Standard Scaler for labels
        ss = StandardScaler()
        labels = ss.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (labels,)))

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(NFP(out_dim=n_unit, hidden_dim=n_unit,
                                       n_layers=conv_layers),
                                   MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(GGNN(out_dim=n_unit, hidden_dim=n_unit,
                                        n_layers=conv_layers),
                                   MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val, args.batchsize,
                                repeat=False, shuffle=False)

    def scaled_abs_error(x0, x1):
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        if args.scale == 'standardize':
            scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0))
            scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1))
            diff = scaled_x0 - scaled_x1
        elif args.scale == 'none':
            diff = cuda.to_cpu(x0) - cuda.to_cpu(x1)
        return numpy.mean(numpy.absolute(diff), axis=0)[0]

    classifier = L.Classifier(model, lossfun=F.mean_squared_error,
                              accfun=scaled_abs_error)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.Evaluator(val_iter, classifier, device=args.gpu,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # Note that scaled errors are reported as (validation/)main/accuracy
    trainer.extend(E.PrintReport(['epoch', 'main/loss', 'main/accuracy',
                                  'validation/main/loss',
                                  'validation/main/accuracy',
                                  'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Example of prediction using trained model
    smiles = 'c1ccccc1'
    mol = Chem.MolFromSmiles(smiles)
    preprocessor = preprocess_method_dict[method]()
    standardized_smiles, mol = preprocessor.prepare_smiles_and_mol(mol)
    input_features = preprocessor.get_input_features(mol)
    atoms, adjs = concat_mols([input_features], device=args.gpu)
    prediction = model(atoms, adjs).data[0]
    print('Prediction for {}:'.format(smiles))
    for i, label in enumerate(args.label):
        print('{}: {}'.format(label, prediction[i]))
Example 10
def main():
    # Parse the arguments.
    args = parse_arguments()
    device = args.gpu

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # number of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=device)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        original_t = dataset.get_datasets()[-1]
        if args.gpu >= 0:
            scaled_t = cuda.to_cpu(scaler.transform(
                cuda.to_gpu(original_t)))
        else:
            scaled_t = scaler.transform(original_t)

        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_t,)))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(
        test, converter=extract_inputs,
        postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=device)[-1]
    original_t = cuda.to_cpu(scaler.inverse_transform(t))

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({'y_pred_{}'.format(l): y_pred[:, i],
                        't_{}'.format(l): original_t[:, i], })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,
                                                          target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'
              .format(label_name, y_pred[:n_eval, target_label],
                      original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
                            device=device)()
    print('Evaluation result: ', eval_result)
    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate mean abs error for each label
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        eval_result.update({l: mae[i]})
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
Example 11
 def extract_inputs(batch, device=None):
     return concat_mols(batch, device=device)[:-1]
Example 12
def test_concat_mols_2d_cpu(data_2d, data_2d_expect):
    result = concat_mols(data_2d, device=-1)
    assert numpy.array_equal(result[0], data_2d_expect[0])
    assert numpy.array_equal(result[1], data_2d_expect[1])
Example 13
        h_s = self.embed_super(super_node)

        for step in range(self.n_update_layers):
            h_new = self.update_layers[step](h=h, adj=adj)
            h_new, h_s = self.gwm(h, h_new, h_s, step)
            h = h_new

        return h

    def reset_state(self):
        if hasattr(self.update_layers[0], 'reset_state'):
            for update_layer in self.update_layers:
                update_layer.reset_state()

        self.gwm.reset_state()


if __name__ == '__main__':
    import uspto_pre
    from chainer.iterators import SerialIterator
    from chainer_chemistry.dataset.converters import concat_mols

    train_raw = uspto_pre.read_data('../train.txt.proc')
    train_dataset = uspto_pre.USPTO_pre(train_raw[:100], 'softmax')
    train_iter = SerialIterator(train_dataset, 3)

    model = ggnn_gwm()

    for b in train_iter:
        atom_feature, adjs, supernode_feature, label, ind = concat_mols(b, padding=-1)
        print(model(atom_feature, adjs, supernode_feature))
Example 14
 def converter(batch, device):
     return concat_mols(batch, device)[:-1]
Example 15
def train(gpu, method, epoch, batchsize, n_unit, conv_layers, dataset, smiles,
          M, n_split, split_idx, order):
    n = len(dataset)
    assert len(order) == n
    left_idx = (n // n_split) * split_idx
    is_right_most_split = (n_split == split_idx + 1)
    if is_right_most_split:
        test_order = order[left_idx:]
        train_order = order[:left_idx]
    else:
        right_idx = (n // n_split) * (split_idx + 1)
        test_order = order[left_idx:right_idx]
        train_order = np.concatenate([order[:left_idx], order[right_idx:]])

    new_order = np.concatenate([train_order, test_order])
    n_train = len(train_order)

    # Standard Scaler for labels
    ss = StandardScaler()
    labels = dataset.get_datasets()[-1]
    train_label = labels[new_order[:n_train]]
    ss = ss.fit(train_label)  # fit only by train
    labels = ss.transform(dataset.get_datasets()[-1])
    dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (labels, )))

    dataset_train = SubDataset(dataset, 0, n_train, new_order)
    dataset_test = SubDataset(dataset, n_train, n, new_order)

    # Network
    model = predictor.build_predictor(method,
                                      n_unit,
                                      conv_layers,
                                      1,
                                      dropout_ratio=0.25,
                                      n_layers=1)

    train_iter = I.SerialIterator(dataset_train, batchsize)
    val_iter = I.SerialIterator(dataset_test,
                                batchsize,
                                repeat=False,
                                shuffle=False)

    def scaled_abs_error(x0, x1):
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0))
        scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1))
        diff = scaled_x0 - scaled_x1
        return np.mean(np.absolute(diff), axis=0)[0]

    regressor = Regressor(model,
                          lossfun=F.mean_squared_error,
                          metrics_fun={'abs_error': scaled_abs_error},
                          device=gpu)

    optimizer = O.Adam(alpha=0.0005)
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=gpu,
                                       converter=concat_mols)

    dir_path = get_dir_path(batchsize, n_unit, conv_layers, M, method)
    dir_path = os.path.join(dir_path, str(split_idx) + "-" + str(n_split))
    os.makedirs(dir_path, exist_ok=True)
    print('creating ', dir_path)
    np.save(os.path.join(dir_path, "test_idx"), np.array(test_order))

    trainer = training.Trainer(updater, (epoch, 'epoch'), out=dir_path)
    trainer.extend(
        E.Evaluator(val_iter, regressor, device=gpu, converter=concat_mols))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss',
            'validation/main/abs_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- Plot regression evaluation result ---
    dataset_test = SubDataset(dataset, n_train, n, new_order)
    batch_all = concat_mols(dataset_test, device=gpu)
    serializers.save_npz(os.path.join(dir_path, "model.npz"), model)
    result = model(batch_all[0], batch_all[1])
    result = ss.inverse_transform(cuda.to_cpu(result.data))
    answer = ss.inverse_transform(cuda.to_cpu(batch_all[2]))
    plot_result(result,
                answer,
                save_filepath=os.path.join(dir_path, "result.png"))

    # --- Plot regression evaluation result end ---
    np.save(os.path.join(dir_path, "output.npy"), result)
    np.save(os.path.join(dir_path, "answer.npy"), answer)
    smiles_part = np.array(smiles)[test_order]
    np.save(os.path.join(dir_path, "smiles.npy"), smiles_part)

    # calculate saliency and save it.
    save_result(dataset, model, dir_path, M)
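A hypothetical single-fold invocation of the train function above; the dataset and smiles objects, the hyperparameter values, and M are assumptions chosen for illustration. order must be a permutation of the dataset indices and should be reused across folds so that every sample lands in exactly one test split.

import numpy as np

order = np.random.permutation(len(dataset))
train(gpu=-1, method='nfp', epoch=20, batchsize=32, n_unit=16, conv_layers=4,
      dataset=dataset, smiles=smiles, M=1, n_split=5, split_idx=0, order=order)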
Example 16
def main(input_args=None):
    # Parse the arguments.
    args = parse_arguments(input_args)
    device = args.gpu
    method = args.method

    if args.data_name == 'suzuki':
        datafile = 'data/suzuki_type_test_v2.csv'
        class_num = 119
        class_dict = {'M': 28, 'L': 23, 'B': 35, 'S': 10, 'A': 17}
        dataset_filename = 'test_data.npz'
        labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    elif args.data_name == 'CN':
        datafile = 'data/CN_coupling_test.csv'
        class_num = 206
        class_dict = {'M': 44, 'L': 47, 'B': 13, 'S': 22, 'A': 74}
        dataset_filename = 'test_CN_data.npz'
        labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    elif args.data_name == 'Negishi':
        datafile = 'data/Negishi_test.csv'
        class_num = 106
        class_dict = {'M': 32, 'L': 20, 'T': 8, 'S': 10, 'A': 30}
        dataset_filename = 'test_Negishi_data.npz'
        labels = ['Yield', 'M', 'L', 'T', 'S', 'A', 'id']
    elif args.data_name == 'PKR':
        datafile = 'data/PKR_test.csv'
        class_num = 83
        class_dict = {
            'M': 18,
            'L': 6,
            'T': 7,
            'S': 15,
            'A': 11,
            'G': 1,
            'O': 13,
            'P': 4,
            'other': 1
        }
        dataset_filename = 'test_PKR_data.npz'
        labels = [
            'Yield', 'M', 'L', 'T', 'S', 'A', 'G', 'O', 'P', 'other', 'id'
        ]
    else:
        raise ValueError('Unexpected dataset name')

    cache_dir = os.path.join('input', '{}_all'.format(method))

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        if args.method == 'mpnn':
            preprocessor = preprocess_method_dict['ggnn']()
        else:
            preprocessor = preprocess_method_dict[args.method]()
        parser = CSVFileParser(
            preprocessor,
            postprocess_label=postprocess_label,
            labels=labels,
            smiles_col=['Reactant1', 'Reactant2', 'Product'],
            label_dicts=class_dict)
        dataset = parser.parse(datafile)['dataset']

        # Cache the loaded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    labels = dataset.get_datasets()[-2]
    ids = dataset.get_datasets()[-1][:, 1].reshape(-1, 1)
    yields = dataset.get_datasets()[-1][:, 0].reshape(-1, 1).astype(
        'float32')  # [:,0] added
    dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-2] + (
        yields,
        labels,
    )))

    # Load the standard scaler parameters, if necessary.
    scaler = None
    test = dataset

    print('Predicting...')
    # Set up the classifier.
    model_path = os.path.join(args.in_dir, args.model_filename)

    if os.path.exists(model_path):
        classifier = Classifier.load_pickle(model_path, device=args.gpu)
    else:
        predictor = set_up_predictor(args.method, args.unit_num,
                                     args.conv_layers, class_num)
        classifier = Classifier(predictor,
                                lossfun=F.sigmoid_cross_entropy,
                                metrics_fun=F.binary_accuracy,
                                device=args.gpu)

    if args.load_modelname:
        serializers.load_npz(args.load_modelname, classifier)
    scaled_predictor = ScaledGraphConvPredictor(
        graph_conv=classifier.predictor.graph_conv,
        mlp=classifier.predictor.mlp)
    classifier.predictor = scaled_predictor

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels.
    # Prediction function rewrite!!!
    y_pred = classifier.predict(test, converter=extract_inputs)
    y_pred_max = numpy.argmax(y_pred, axis=1)
    y_pred_max = y_pred_max.reshape(-1, 1)
    # y_pred_idx = y_pred.argsort(axis=1) # ascending

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]  # device 11/14 memory issue
    original_t = cuda.to_cpu(t)
    t_idx = original_t.squeeze(1)
    t_idx = t_idx.argsort(axis=1)
    # gt_indx = numpy.where(original_t == 1)

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels[:1]):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred_max[:, -1].tolist(),  # [:,-1]
            't_{}'.format(l): t_idx[:, -1].tolist(),
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10

    for target_label in range(y_pred_max.shape[1]):
        label_name = labels[:1][0][target_label]
        print('label_name = {}, y_pred = {}, t = {}'.format(
            label_name, y_pred_max[:n_eval, target_label], t_idx[:n_eval, -1]))

    # Perform the prediction.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            classifier,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)

    res_dic = {}
    for i in range(len(y_pred)):
        res_dic[i] = str(ids[i])
    json.dump(res_dic, open(os.path.join(args.in_dir, "test_ids.json"), "w"))

    pickle.dump(y_pred, open(os.path.join(args.in_dir, "pred.pkl"), "wb"))
    pickle.dump(original_t, open(os.path.join(args.in_dir, "gt.pkl"), "wb"))
Example 17
 def extract_inputs(batch, device=None):
     return concat_mols(batch, device=device)[:-1]
Example 18
def inference():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--batch_size', type=int, default=16)
    # parser.add_argument('--epoch', type=int, default=10)
    # parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--out', default='result_debug/')
    # parser.add_argument('--frequency', type=int, default=-1)
    # parser.add_argument('--decay_iter', type=int, default=40000)
    parser.add_argument('--gnn_dim', type=int, default=300)
    parser.add_argument('--n_layers', type=int, default=3)
    parser.add_argument('--nn_dim', type=int, default=100)
    # parser.add_argument('--train_path', default='../train.txt.proc')
    parser.add_argument('--valid_path', default='../test.txt.proc')
    parser.add_argument('--communicator', type=str, default='pure_nccl')
    parser.add_argument('--type', default='debug')
    parser.add_argument('--rich', type=strtobool, default='false')

    parser.add_argument('--snapshot')

    args = parser.parse_args()

    assert args.type in ['debug', 'all']

    if args.gpu:
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        device = -1

    if comm.rank == 0:
        print(glob.glob(args.out + 'snapshot_*'))

    model = pair_matrix_model(gnn_dim=args.gnn_dim,
                              n_layers=args.n_layers,
                              nn_dim=args.nn_dim)
    chainer.serializers.load_npz(args.out + args.snapshot, model)
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    valid_raw = uspto_pre.read_data(args.valid_path)
    if comm.rank == 0:
        if args.type == 'debug':
            valid_dataset = uspto_pre.USPTO_pre(valid_raw[:40], args.rich)
        elif args.type == 'all':
            valid_dataset = uspto_pre.USPTO_pre(valid_raw, args.rich)
    else:
        valid_dataset = None, None
    valid_dataset = chainermn.scatter_dataset(valid_dataset,
                                              comm,
                                              shuffle=False)
    valid_iter = SerialIterator(valid_dataset,
                                args.batch_size,
                                repeat=False,
                                shuffle=False)

    for batch in valid_iter:
        in_arrays = concat_mols(batch=batch, device=device, padding=-1)
        assert isinstance(in_arrays, tuple)

        with chainer.using_config('train', False):
            h = model.ggnn_gwm(in_arrays[0], in_arrays[1], in_arrays[2])

            ind = in_arrays[4]
            adj = in_arrays[1][:, :4, :, :]

            batch_size = h.shape[0]
            graph_size = h.shape[1]
            hidden_size = h.shape[2]

            # Build pairwise node features by broadcast-adding every pair of
            # atom embeddings, then score each atom pair with nn1, giving 5
            # outputs per pair (interpreted below via argmax).
            h = h.reshape(batch_size, 1, -1, hidden_size) + h.reshape(
                batch_size, -1, 1, hidden_size)
            h = h.reshape(batch_size, -1, hidden_size)
            h1 = model.nn1(h)
            h1 = h1.reshape(batch_size, graph_size, graph_size, 5)

            for x in range(batch_size):
                index = ind[x]
                action_pred = []
                for i in range(graph_size):
                    for j in range(i + 1, graph_size):
                        if adj[x, 0, i, j] == -1:
                            break
                        else:
                            bt_pred = h1[x, i, j].data.argmax()
                            if bt_pred == 0:
                                if adj[x, :, i, j].sum() != 0:
                                    action_pred.append(
                                        str(i) + '-' + str(j) + '-' +
                                        str(bt_pred))
                            else:
                                if adj[x, bt_pred - 1, i, j] == 0:
                                    action_pred.append(
                                        str(i) + '-' + str(j) + '-' +
                                        str(bt_pred))
                with open(args.out + 'inf_' + args.snapshot + '.txt',
                          'a') as file:
                    file.write(str(index))
                    for a_p in action_pred:
                        file.write('\t' + a_p)
                    file.write('\n')