Example #1
def set_up_predictor(method, n_unit, conv_layers, class_num, scaler):
    """Sets up the predictor, consisting of a graph convolution network and
    a multilayer perceptron.

    Args:
        method (str): Method name.
        n_unit (int): Number of hidden units.
        conv_layers (int): Number of convolutional layers for the graph
            convolution network.
        class_num (int): Number of output classes.
        scaler (chainer.Link or None): Label scaler attached to the
            predictor; ``None`` disables label scaling.
    Returns:
        predictor (chainer.Chain): An instance of the selected predictor.
    """
    mlp = MLP(out_dim=class_num, hidden_dim=n_unit)

    if method == 'nfp':
        print('Training an NFP predictor...')
        nfp = NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        predictor = GraphConvPredictor(nfp, mlp, scaler)
    elif method == 'ggnn':
        print('Training a GGNN predictor...')
        ggnn = GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        predictor = GraphConvPredictor(ggnn, mlp, scaler)
    elif method == 'schnet':
        print('Training a SchNet predictor...')
        schnet = SchNet(out_dim=class_num,
                        hidden_dim=n_unit,
                        n_layers=conv_layers)
        predictor = GraphConvPredictor(schnet, None, scaler)
    elif method == 'weavenet':
        print('Training a WeaveNet predictor...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers

        weavenet = WeaveNet(weave_channels=weave_channels,
                            hidden_dim=n_unit,
                            n_sub_layer=n_sub_layer,
                            n_atom=n_atom)
        predictor = GraphConvPredictor(weavenet, mlp, scaler)
    elif method == 'rsgcn':
        print('Training an RSGCN predictor...')
        rsgcn = RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        predictor = GraphConvPredictor(rsgcn, mlp, scaler)
    elif method == 'relgcn':
        print('Training a Relational GCN predictor...')
        num_edge_type = 4
        relgcn = RelGCN(out_channels=n_unit,
                        num_edge_type=num_edge_type,
                        scale_adj=True)
        predictor = GraphConvPredictor(relgcn, mlp, scaler)
    elif method == 'relgat':
        print('Training a Relational GAT predictor...')
        relgat = RelGAT(out_dim=n_unit,
                        hidden_dim=n_unit,
                        n_layers=conv_layers)
        predictor = GraphConvPredictor(relgat, mlp, scaler)
    else:
        raise ValueError('[ERROR] Invalid method: {}'.format(method))
    return predictor
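As a usage sketch (not part of the source), the factory above might be invoked as follows; the hyperparameter values mirror the CLI defaults used by the training scripts later on this page, and the scaler argument may be None or a fitted label scaler as in Example #6:

# Hypothetical usage of set_up_predictor; all values are illustrative.
predictor = set_up_predictor(
    method='ggnn',    # any key handled by the if/elif chain above
    n_unit=16,        # hidden units (the --unit-num default)
    conv_layers=4,    # graph-convolution depth (the --conv-layers default)
    class_num=1,      # a single regression target
    scaler=None)      # or a fitted StandardScaler, as in Example #6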
Example #2
def build_predictor(method, n_unit, conv_layers, class_num):
    if method == 'nfp':
        print('Use NFP predictor...')
        predictor = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Use GGNN predictor...')
        predictor = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Use SchNet predictor...')
        predictor = SchNet(out_dim=class_num,
                           hidden_dim=n_unit,
                           n_layers=conv_layers,
                           readout_hidden_dim=n_unit)
    elif method == 'weavenet':
        print('Use WeaveNet predictor...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        predictor = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid predictor: method={}'.format(method))
    return predictor
Example #3
def set_up_predictor(method,
                     fp_hidden_dim,
                     fp_out_dim,
                     conv_layers,
                     concat_hidden,
                     fp_dropout_rate,
                     net_hidden_dims,
                     class_num,
                     sim_method='mlp'):
    if sim_method == 'mlp':
        mlp = MLP(out_dim=class_num, hidden_dims=net_hidden_dims)
    elif sim_method == 'cosine':
        mlp = Cosine()
    else:
        raise ValueError(
            '[ERROR] Invalid similarity method: {}'.format(sim_method))
    print('Using {} as similarity predictor with hidden_dims {}...'.format(
        sim_method, net_hidden_dims))

    if method == 'nfp':
        print('Training an NFP predictor...')
        nfp = NFP(out_dim=fp_out_dim,
                  hidden_dim=fp_hidden_dim,
                  n_layers=conv_layers,
                  concat_hidden=concat_hidden)
        predictor = GraphConvPredictorForPair(nfp, mlp)
    elif method == 'ggnn':
        print('Training a GGNN predictor...')
        ggnn = GGNN(out_dim=fp_out_dim,
                    hidden_dim=fp_hidden_dim,
                    n_layers=conv_layers,
                    concat_hidden=concat_hidden,
                    dropout_rate=fp_dropout_rate)
        predictor = GraphConvPredictorForPair(ggnn, mlp)
    elif method == 'schnet':
        print('Training a SchNet predictor...')
        schnet = SchNet(out_dim=class_num,
                        hidden_dim=fp_hidden_dim,
                        n_layers=conv_layers)
        predictor = GraphConvPredictorForPair(schnet, None)
    elif method == 'weavenet':
        print('Training a WeaveNet predictor...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers

        weavenet = WeaveNet(weave_channels=weave_channels,
                            hidden_dim=fp_hidden_dim,
                            n_sub_layer=n_sub_layer,
                            n_atom=n_atom)
        predictor = GraphConvPredictorForPair(weavenet, mlp)
    elif method == 'rsgcn':
        print('Training an RSGCN predictor...')
        rsgcn = RSGCN(out_dim=fp_out_dim,
                      hidden_dim=fp_hidden_dim,
                      n_layers=conv_layers)
        predictor = GraphConvPredictorForPair(rsgcn, mlp)
    else:
        raise ValueError('[ERROR] Invalid method: {}'.format(method))
    return predictor
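A hypothetical call to this pair-predictor factory; every value below is illustrative, and net_hidden_dims is assumed to be an iterable of MLP widths, as suggested by the print statement above:

# Illustrative call: a shared GGNN encoder scored by an MLP head.
pair_predictor = set_up_predictor(
    method='ggnn',
    fp_hidden_dim=16,
    fp_out_dim=16,
    conv_layers=4,
    concat_hidden=False,
    fp_dropout_rate=0.0,
    net_hidden_dims=(32, 32),  # hidden widths of the similarity MLP
    class_num=1,
    sim_method='mlp')          # or 'cosine' for the Cosine head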
Example #4
def set_up_predictor(method, n_unit, conv_layers, class_num):
    """Sets up the graph convolution network  predictor.

    Args:
        method: Method name. Currently, the supported ones are `nfp`, `ggnn`,
                `schnet`, `weavenet`, `rsgcn` and `relgcn`.
        n_unit: Number of hidden units.
        conv_layers: Number of convolutional layers for the graph convolution
                     network.
        class_num: Number of output classes.

    Returns:
        An instance of the selected predictor.
    """

    predictor = None
    mlp = MLP(out_dim=class_num, hidden_dim=n_unit)

    if method == 'nfp':
        print('Training an NFP predictor...')
        nfp = NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        predictor = GraphConvPredictor(nfp, mlp)
    elif method == 'ggnn':
        print('Training a GGNN predictor...')
        ggnn = GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        predictor = GraphConvPredictor(ggnn, mlp)
    elif method == 'schnet':
        print('Training a SchNet predictor...')
        schnet = SchNet(out_dim=class_num,
                        hidden_dim=n_unit,
                        n_layers=conv_layers)
        predictor = GraphConvPredictor(schnet, None)
    elif method == 'weavenet':
        print('Training a WeaveNet predictor...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers

        weavenet = WeaveNet(weave_channels=weave_channels,
                            hidden_dim=n_unit,
                            n_sub_layer=n_sub_layer,
                            n_atom=n_atom)
        predictor = GraphConvPredictor(weavenet, mlp)
    elif method == 'rsgcn':
        print('Training an RSGCN predictor...')
        rsgcn = RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        predictor = GraphConvPredictor(rsgcn, mlp)
    elif method == 'relgcn':
        print('Training a RelGCN predictor...')
        num_edge_type = 4
        relgcn = RelGCN(out_channels=class_num,
                        num_edge_type=num_edge_type,
                        scale_adj=True)
        predictor = GraphConvPredictor(relgcn, None)
    else:
        raise ValueError('[ERROR] Invalid method: {}'.format(method))
    return predictor
Example #5
def build_predictor(method, n_unit, conv_layers, class_num):
    if method == 'nfp':
        print('Use NFP predictor...')
        predictor = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Use GGNN predictor...')
        predictor = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Use SchNet predictor...')
        # MLP layer is not necessary for SchNet
        predictor = GraphConvPredictor(
            SchNet(out_dim=class_num,
                   hidden_dim=n_unit,
                   n_layers=conv_layers,
                   readout_hidden_dim=n_unit), None)
    elif method == 'weavenet':
        print('Use WeaveNet predictor...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        predictor = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Use RSGCN predictor...')
        predictor = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'relgcn':
        print('Use Relational GCN predictor...')
        num_edge_type = 4
        predictor = GraphConvPredictor(
            RelGCN(out_channels=n_unit,
                   num_edge_type=num_edge_type,
                   scale_adj=True), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'relgat':
        print('Use Relational GAT predictor...')
        predictor = GraphConvPredictor(
            RelGAT(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid predictor: method={}'.format(method))
    return predictor
Example #6
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('--datafile', type=str, default='dataset.csv')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        nargs='+',
                        default=['value1', 'value2'],
                        help='target label for regression')
    parser.add_argument('--scale',
                        type=str,
                        choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--protocol', type=int, default=2)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        sys.exit("Error: No target label is specified.")

    # Dataset preparation
    # Postprocess is required for regression task
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels,
                           smiles_col='SMILES')
    dataset = parser.parse(args.datafile)["dataset"]

    if args.scale == 'standardize':
        # Standard Scaler for labels
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (labels, )))
    else:
        # No scaler is used
        scaler = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val,
                                        args.batchsize,
                                        repeat=False,
                                        shuffle=False)

    regressor = Regressor(
        model,
        lossfun=F.mean_squared_error,
        metrics_fun={'abs_error': ScaledAbsError(scaler=scaler)},
        device=args.gpu)

    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter,
                    regressor,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # Note that original scale absolute errors are reported in
    # (validation/)main/abs_error
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss',
            'validation/main/abs_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save regressor's parameters ---
    protocol = args.protocol
    model_path = os.path.join(args.out, 'model.npz')
    print('saving trained model to {}'.format(model_path))
    serializers.save_npz(model_path, regressor)
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=protocol)

    # Example of prediction using trained model
    smiles = 'c1ccccc1'
    mol = Chem.MolFromSmiles(smiles)
    preprocessor = preprocess_method_dict[method]()
    standardized_smiles, mol = preprocessor.prepare_smiles_and_mol(mol)
    input_features = preprocessor.get_input_features(mol)
    atoms, adjs = concat_mols([input_features], device=args.gpu)
    prediction = model(atoms, adjs).data[0]
    if scaler is not None:
        prediction = scaler.inverse_transform(prediction)
    print('Prediction for {}:'.format(smiles))
    for i, label in enumerate(args.label):
        print('{}: {}'.format(label, prediction[i]))
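Example #6 hands the Regressor a ScaledAbsError metric without showing its definition. Below is a minimal sketch, assuming the same behavior as the scaled_abs_error helper in Example #8; the constructor signature is taken from the call above, while the body is an assumption:

import numpy
from chainer import Variable, cuda

class ScaledAbsError(object):
    """Sketch of the ScaledAbsError metric used above (definition not
    shown on this page); mirrors scaled_abs_error in Example #8."""

    def __init__(self, scaler=None):
        self.scaler = scaler

    def __call__(self, x0, x1):
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        if self.scaler is not None:
            # Report the error in the original label scale.
            x0 = self.scaler.inverse_transform(cuda.to_cpu(x0))
            x1 = self.scaler.inverse_transform(cuda.to_cpu(x1))
        # Mean absolute error over the batch and all targets.
        return numpy.mean(numpy.absolute(x0 - x1))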
Example #7
def set_up_predictor(method, n_unit, conv_layers, class_num):
    """Sets up the predictor, consisting of a graph convolution network and
    a multilayer perceptron.
    Args:
        method: Method name. See `parse_arguments`.
        n_unit: Number of hidden units.
        conv_layers: Number of convolutional layers for the graph convolution
                     network.
        class_num: Number of output classes.
    Returns:
        An instance of the selected predictor.
    """

    mlp = MLP(out_dim=class_num, hidden_dim=n_unit)

    if method == 'nfp':
        print('Training an NFP predictor...')
        nfp = NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        return GraphConvPredictor(nfp, mlp)
    elif method == 'nfp_gwm':
        print('Training an NFP+GWM predictor...')
        nfp_gwm = NFP_GWM(out_dim=n_unit,
                          hidden_dim=n_unit,
                          hidden_dim_super=n_unit,
                          n_layers=conv_layers,
                          dropout_ratio=0.5)
        return GraphConvPredictorForGWM(nfp_gwm, mlp)
    elif method == 'ggnn':
        print('Training a GGNN predictor...')
        ggnn = GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        return GraphConvPredictor(ggnn, mlp)
    elif method == 'ggnn_gwm':
        print('Training a GGNN+GWM predictor...')
        ggnn_gwm = GGNN_GWM(out_dim=n_unit,
                            hidden_dim=n_unit,
                            hidden_dim_super=n_unit,
                            n_layers=conv_layers,
                            dropout_ratio=0.5,
                            weight_tying=True)
        return GraphConvPredictorForGWM(ggnn_gwm, mlp)
    elif method == 'schnet':
        print('Training a SchNet predictor...')
        schnet = SchNet(out_dim=class_num,
                        hidden_dim=n_unit,
                        n_layers=conv_layers)
        return GraphConvPredictor(schnet, None)
    elif method == 'weavenet':
        print('Training a WeaveNet predictor...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers

        weavenet = WeaveNet(weave_channels=weave_channels,
                            hidden_dim=n_unit,
                            n_sub_layer=n_sub_layer,
                            n_atom=n_atom)
        return GraphConvPredictor(weavenet, mlp)
    elif method == 'rsgcn':
        print('Training an RSGCN predictor...')
        rsgcn = RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        return GraphConvPredictor(rsgcn, mlp)
    elif method == 'rsgcn_gwm':
        print('Training an RSGCN+GWM predictor...')
        rsgcn_gwm = RSGCN_GWM(out_dim=n_unit,
                              hidden_dim=n_unit,
                              hidden_dim_super=n_unit,
                              n_layers=conv_layers,
                              dropout_ratio=0.5)
        return GraphConvPredictorForGWM(rsgcn_gwm, mlp)
    elif method == 'relgcn':
        print('Training a RelGCN predictor...')
        num_edge_type = 4
        relgcn = RelGCN(out_channels=n_unit,
                        num_edge_type=num_edge_type,
                        scale_adj=True)
        return GraphConvPredictor(relgcn, mlp)
    elif method == 'relgat':
        print('Training a Relational GAT predictor...')
        relgat = RelGAT(out_dim=n_unit,
                        hidden_dim=n_unit,
                        n_layers=conv_layers)
        return GraphConvPredictor(relgat, mlp)
    elif method == 'gin':
        print('Training a GIN predictor...')
        gin = GIN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        return GraphConvPredictor(gin, mlp)
    elif method == 'gin_gwm':
        print('Training a GIN+GWM predictor...')
        gin_gwm = GIN_GWM(out_dim=n_unit,
                          hidden_dim=n_unit,
                          hidden_dim_super=n_unit,
                          n_layers=conv_layers,
                          dropout_ratio=0.5,
                          weight_tying=True)
        return GraphConvPredictorForGWM(gin_gwm, mlp)
    raise ValueError('[ERROR] Invalid method: {}'.format(method))
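The long if/elif chain above could equally be written as a table of builder functions; here is a sketch of that alternative design for a few of the branches (same behavior, illustrative only):

# Alternative design sketch: table-driven dispatch instead of if/elif.
def set_up_predictor_v2(method, n_unit, conv_layers, class_num):
    mlp = MLP(out_dim=class_num, hidden_dim=n_unit)
    builders = {
        'nfp': lambda: (NFP(out_dim=n_unit, hidden_dim=n_unit,
                            n_layers=conv_layers), mlp),
        'ggnn': lambda: (GGNN(out_dim=n_unit, hidden_dim=n_unit,
                              n_layers=conv_layers), mlp),
        # SchNet performs its own readout, so no MLP head is attached.
        'schnet': lambda: (SchNet(out_dim=class_num, hidden_dim=n_unit,
                                  n_layers=conv_layers), None),
    }
    if method not in builders:
        raise ValueError('[ERROR] Invalid method: {}'.format(method))
    graph_conv, head = builders[method]()
    return GraphConvPredictor(graph_conv, head)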
Example #8
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for regression; an empty string '
                        'means predicting all properties at once')
    parser.add_argument('--scale',
                        type=str,
                        choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Dataset preparation
    dataset = None

    if os.path.exists(cache_dir):
        print('load from cache {}'.format(cache_dir))
        dataset = NumpyTupleDataset.load(os.path.join(cache_dir, 'data.npz'))
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)
        os.makedirs(cache_dir)
        NumpyTupleDataset.save(os.path.join(cache_dir, 'data.npz'), dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels
        ss = StandardScaler()
        labels = ss.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*dataset.get_datasets()[:-1], labels)

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)

    def scaled_abs_error(x0, x1):
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        if args.scale == 'standardize':
            scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0))
            scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1))
            diff = scaled_x0 - scaled_x1
        elif args.scale == 'none':
            diff = cuda.to_cpu(x0) - cuda.to_cpu(x1)
        return numpy.mean(numpy.absolute(diff), axis=0)[0]

    classifier = L.Classifier(model,
                              lossfun=F.mean_squared_error,
                              accfun=scaled_abs_error)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter,
                    classifier,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()
Example #9
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for regression; an empty string '
                        'means predicting all properties at once')
    parser.add_argument('--scale',
                        type=str,
                        choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--protocol', type=int, default=2)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data points to parse; '
                        '-1 parses all data.')
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Dataset preparation
    dataset = None

    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    if os.path.exists(dataset_cache_path):
        print('load from cache {}'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # only use the first num_data entries (debugging aid)
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor,
                                labels=labels,
                                target_index=target_index)
        else:
            dataset = D.get_qm9(preprocessor, labels=labels)
        os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels
        ss = StandardScaler()
        labels = ss.fit_transform(dataset.get_datasets()[-1])
    else:
        ss = None
    dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (labels, )))

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)

    regressor = Regressor(
        model,
        lossfun=F.mean_squared_error,
        metrics_fun={'abs_error': ScaledAbsError(scale=args.scale, ss=ss)},
        device=args.gpu)

    optimizer = O.Adam()
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter,
                    regressor,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss',
            'validation/main/abs_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save regressor & standardscaler ---
    protocol = args.protocol
    regressor.save_pickle(os.path.join(args.out, args.model_filename),
                          protocol=protocol)
    if args.scale == 'standardize':
        with open(os.path.join(args.out, 'ss.pkl'), mode='wb') as f:
            pickle.dump(ss, f, protocol=protocol)
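Once training finishes, the pickled regressor and scaler can be restored for inference. A minimal sketch, assuming chainer-chemistry's Regressor exposes load_pickle as the counterpart of the save_pickle call above (the import path and file names follow this example's defaults):

import os
import pickle

from chainer_chemistry.models.prediction import Regressor  # path assumed

# Restore the trained regressor on CPU (device=-1).
regressor = Regressor.load_pickle(os.path.join('result', 'regressor.pkl'),
                                  device=-1)
# Restore the label scaler to map predictions back to the original scale.
with open(os.path.join('result', 'ss.pkl'), 'rb') as f:
    ss = pickle.load(f)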
Example #10
    # NOTE: the start of this example is truncated in the source; the
    # validation loader below is reconstructed from the surrounding names.
    loader_val = DataLoader(generator_val,
                            batch_size=BATCH_SIZE,
                            sampler=val_sampler)
    loader_test = DataLoader(generator_test, batch_size=BATCH_SIZE)

# create models
if model_name == "baseline":
    model = BaselineModel(input_len, drop_rate=dropout)
elif model_name == 'nfp':
    print('Training an NFP predictor...')
    mlp = MLP(out_dim=class_num, hidden_dim=n_unit)
    nfp = NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
    model = GraphConvPredictor(nfp, mlp)
elif model_name == 'ggnn':
    print('Training a GGNN predictor...')
    mlp = MLP(out_dim=class_num, hidden_dim=n_unit)
    ggnn = GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
    model = GraphConvPredictor(ggnn, mlp)
elif model_name == 'schnet':
    print('Training a SchNet predictor...')
    mlp = MLP(out_dim=class_num, hidden_dim=n_unit)
    schnet = SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers)
    model = GraphConvPredictor(schnet, None)
elif model_name == 'weavenet':
    print('Training a WeaveNet predictor...')
    mlp = MLP(out_dim=class_num, hidden_dim=n_unit)
    n_atom = 20
    n_sub_layer = 1
    weave_channels = [50] * conv_layers
    weavenet = WeaveNet(weave_channels=weave_channels,
                        hidden_dim=n_unit,
                        n_sub_layer=n_sub_layer,
                        n_atom=n_atom)
    model = GraphConvPredictor(weavenet, mlp)
Example #11
def main():
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    dataset_names = list(molnet_default_config.keys())

    parser = argparse.ArgumentParser(description='molnet example')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        default='',
                        help='target label for regression; an empty string '
                        'means predicting all properties at once')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--dataset',
                        '-d',
                        type=str,
                        choices=dataset_names,
                        default='bbbp')
    parser.add_argument('--protocol', type=int, default=2)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data points to parse; '
                        '-1 parses all data.')
    args = parser.parse_args()
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    print('Use {} dataset'.format(dataset_name))

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Dataset preparation
    def get_dataset_paths(cache_dir, num_data):
        filepaths = []
        for filetype in ['train', 'valid', 'test']:
            filename = filetype + '_data'
            if num_data >= 0:
                filename += '_' + str(num_data)
            filename += '.npz'
            filepath = os.path.join(cache_dir, filename)
            filepaths.append(filepath)
        return filepaths

    filepaths = get_dataset_paths(cache_dir, num_data)
    if all([os.path.exists(fpath) for fpath in filepaths]):
        datasets = []
        for fpath in filepaths:
            print('load from cache {}'.format(fpath))
            datasets.append(NumpyTupleDataset.load(fpath))
    else:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        # if num_data >= 0, only use the first num_data entries (debugging)
        target_index = numpy.arange(num_data) if num_data >= 0 else None
        datasets = D.molnet.get_molnet_dataset(dataset_name,
                                               preprocessor,
                                               labels=labels,
                                               target_index=target_index)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        datasets = datasets['dataset']
        for i, fpath in enumerate(filepaths):
            NumpyTupleDataset.save(fpath, datasets[i])

    train, val, _ = datasets

    # Network
    if method == 'nfp':
        print('Train NFP model...')
        predictor = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        predictor = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        predictor = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        predictor = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        predictor = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val,
                                        args.batchsize,
                                        repeat=False,
                                        shuffle=False)

    metrics_fun = molnet_default_config[dataset_name]['metrics']
    loss_fun = molnet_default_config[dataset_name]['loss']
    task_type = molnet_default_config[dataset_name]['task_type']
    if task_type == 'regression':
        model = Regressor(predictor,
                          lossfun=loss_fun,
                          metrics_fun=metrics_fun,
                          device=args.gpu)
        # TODO(nakago): Use standard scaler for regression task
    elif task_type == 'classification':
        model = Classifier(predictor,
                           lossfun=loss_fun,
                           metrics_fun=metrics_fun,
                           device=args.gpu)
    else:
        raise NotImplementedError(
            'Not implemented task_type = {}'.format(task_type))

    optimizer = optimizers.Adam()
    optimizer.setup(model)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter, model, device=args.gpu, converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    print_report_targets = ['epoch', 'main/loss', 'validation/main/loss']
    if isinstance(metrics_fun, dict):
        for m_k in metrics_fun.keys():
            print_report_targets.append('main/' + m_k)
            print_report_targets.append('validation/main/' + m_k)
    if task_type == 'classification':
        # Evaluation for train data takes time, skip for now.
        # trainer.extend(ROCAUCEvaluator(
        #     train_iter, model, device=args.gpu, eval_func=predictor,
        #     converter=concat_mols, name='train', raise_value_error=False))
        # print_report_targets.append('train/main/roc_auc')
        trainer.extend(
            ROCAUCEvaluator(val_iter,
                            model,
                            device=args.gpu,
                            eval_func=predictor,
                            converter=concat_mols,
                            name='val',
                            raise_value_error=False))
        print_report_targets.append('val/main/roc_auc')
    print_report_targets.append('elapsed_time')
    trainer.extend(E.PrintReport(print_report_targets))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save model ---
    protocol = args.protocol
    model.save_pickle(os.path.join(args.out, args.model_filename),
                      protocol=protocol)
Example #12
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('datafile', type=str)
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', nargs='+',
                        help='target label for regression')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize', help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        sys.exit("Error: No target label is specified.")

    # Dataset preparation
    # Postprocess is required for regression task
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)["dataset"]

    if args.scale == 'standardize':
        # Standard Scaler for labels
        ss = StandardScaler()
        labels = ss.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (labels,)))

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(NFP(out_dim=n_unit, hidden_dim=n_unit,
                                       n_layers=conv_layers),
                                   MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(GGNN(out_dim=n_unit, hidden_dim=n_unit,
                                        n_layers=conv_layers),
                                   MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val, args.batchsize,
                                repeat=False, shuffle=False)

    def scaled_abs_error(x0, x1):
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        if args.scale == 'standardize':
            scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0))
            scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1))
            diff = scaled_x0 - scaled_x1
        elif args.scale == 'none':
            diff = cuda.to_cpu(x0) - cuda.to_cpu(x1)
        return numpy.mean(numpy.absolute(diff), axis=0)[0]

    classifier = L.Classifier(model, lossfun=F.mean_squared_error,
                              accfun=scaled_abs_error)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.Evaluator(val_iter, classifier, device=args.gpu,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # Note that absolute errors in the original label scale are reported
    # as (validation/)main/accuracy
    trainer.extend(E.PrintReport(['epoch', 'main/loss', 'main/accuracy',
                                  'validation/main/loss',
                                  'validation/main/accuracy',
                                  'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Example of prediction using trained model
    smiles = 'c1ccccc1'
    mol = Chem.MolFromSmiles(smiles)
    preprocessor = preprocess_method_dict[method]()
    standardized_smiles, mol = preprocessor.prepare_smiles_and_mol(mol)
    input_features = preprocessor.get_input_features(mol)
    atoms, adjs = concat_mols([input_features], device=args.gpu)
    prediction = model(atoms, adjs).data[0]
    if args.scale == 'standardize':
        # Map the prediction back to the original label scale (cf. Example #6).
        prediction = ss.inverse_transform(prediction)
    print('Prediction for {}:'.format(smiles))
    for i, label in enumerate(args.label):
        print('{}: {}'.format(label, prediction[i]))