Code Example #1
def get_molnet_dataframe(dataset_name):
    """Downloads, caches and get the dataframe of MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            detail of MoleculeNet, please refer to
            `official site <http://moleculenet.ai/datasets-1>`_
            If you would like to know what dataset_name is available for
            chainer_chemistry, please refer to `molnet_config.py`.
    Returns (pandas.DataFrame or tuple):
        DataFrame of the dataset without any preprocessing. When the dataset
        files are separated, this function returns multiple DataFrames.

    """
    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))
    dataset_config = molnet_default_config[dataset_name]
    if dataset_config['dataset_type'] == 'one_file_csv':
        df = pandas.read_csv(get_molnet_filepath(dataset_name))
        return df
    elif dataset_config['dataset_type'] == 'separate_csv':
        train_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'train'))
        valid_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'valid'))
        test_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'test'))
        return train_df, valid_df, test_df
    else:
        raise ValueError('dataset_type={} is not supported'.format(
            dataset_config['dataset_type']))
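
A minimal usage sketch (not from the original source): it calls the function defined above and assumes 'bbbp' is registered as a 'one_file_csv' dataset in molnet_config.py, so a single DataFrame is returned; datasets stored as separate CSV files yield a (train, valid, test) tuple instead.

# 'bbbp' is assumed to be a one-file CSV dataset; the raw, unprocessed
# DataFrame is returned.
df = get_molnet_dataframe('bbbp')
print(df.shape)
print(df.columns.tolist())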
Code Example #2
def parse_arguments():
    # Lists of supported preprocessing methods/models.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'gin', 'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm']
#    scale_list = ['standardize', 'none']
    dataset_names = list(molnet_default_config.keys())

    # Set up the argument parser.
    parser = argparse.ArgumentParser(description='Prediction on Molnet.')
    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,
                        default='bbbp',
                        help='name of the dataset that training is run on')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        help='method name', default='nfp')
    parser.add_argument('--label', '-l', type=str, default='',
                        help='target label for regression; empty string means '
                        'predicting all properties at once')
#    parser.add_argument('--scale', type=str, choices=scale_list,
#                        help='label scaling method', default='standardize')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='id of gpu to use; negative value means running '
                        'the code on cpu')
    parser.add_argument('--in-dir', '-i', type=str, default='result',
                        help='directory to load model data from')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='amount of data to be parsed; -1 indicates '
                        'parsing all data.')
    return parser.parse_args()
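
A hypothetical way to exercise this parser without a real command line (not part of the original script): sys.argv is overridden before calling parse_arguments(). The script name 'predict_molnet.py' is only a placeholder.

import sys

# Simulates: python predict_molnet.py --dataset bbbp --method nfp -i result
sys.argv = ['predict_molnet.py', '--dataset', 'bbbp',
            '--method', 'nfp', '--in-dir', 'result']
args = parse_arguments()
print(args.dataset, args.method, args.gpu, args.in_dir, args.num_data)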
Code Example #3
def parse_arguments():
    # Lists of supported preprocessing methods/models.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',
                   'relgat', 'gin', 'gnnfilm', 'megnet',
                   'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm']
#    scale_list = ['standardize', 'none']
    dataset_names = list(molnet_default_config.keys())

    # Set up the argument parser.
    parser = argparse.ArgumentParser(description='Prediction on Molnet.')
    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,
                        default='bbbp',
                        help='name of the dataset that training is run on')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        help='method name', default='nfp')
    parser.add_argument('--label', '-l', type=str, default='',
                        help='target label for regression; empty string means '
                        'predicting all properties at once')
#    parser.add_argument('--scale', type=str, choices=scale_list,
#                        help='label scaling method', default='standardize')
    parser.add_argument(
        '--device', type=str, default='-1',
        help='Device specifier. Either ChainerX device specifier or an '
             'integer. If non-negative integer, CuPy arrays with specified '
             'device id are used. If negative integer, NumPy arrays are used')
    parser.add_argument('--in-dir', '-i', type=str, default='result',
                        help='directory to load model data from')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='amount of data to be parsed; -1 indicates '
                        'parsing all data.')
    return parser.parse_args()
Code Example #4
def parse_arguments():
    # Lists of supported preprocessing methods/models and datasets.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',
                   'relgat', 'gin', 'gnnfilm', 'megnet',
                   'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm']
    dataset_names = list(molnet_default_config.keys())
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='molnet example')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        help='method name', default='nfp')
    parser.add_argument('--label', '-l', type=str, default='',
                        help='target label for regression; empty string means '
                        'predicting all properties at once')
    parser.add_argument('--conv-layers', '-c', type=int, default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='batch size')
    parser.add_argument(
        '--device', type=str, default='-1',
        help='Device specifier. Either ChainerX device specifier or an '
             'integer. If non-negative integer, CuPy arrays with specified '
             'device id are used. If negative integer, NumPy arrays are used')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to save the computed model to')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,
                        default='bbbp',
                        help='name of the dataset that training is run on')
    parser.add_argument('--protocol', type=int, default=2,
                        help='pickle protocol version')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='amount of data to be parsed; -1 indicates '
                        'parsing all data.')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        help='label scaling method', default='standardize')
    return parser.parse_args()
Code Example #5
File: train_molnet.py  Project: ir5/chainer-chemistry
def parse_arguments():
    # Lists of supported preprocessing methods/models and datasets.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',
                   'relgat', 'gin', 'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm']
    dataset_names = list(molnet_default_config.keys())
#    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='molnet example')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        help='method name', default='nfp')
    parser.add_argument('--label', '-l', type=str, default='',
                        help='target label for regression; empty string means '
                        'predicting all properties at once')
    parser.add_argument('--conv-layers', '-c', type=int, default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='batch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='id of gpu to use; negative value means running '
                        'the code on cpu')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to save the computed model to')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,
                        default='bbbp',
                        help='name of the dataset that training is run on')
    parser.add_argument('--protocol', type=int, default=2,
                        help='pickle protocol version')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='amount of data to be parsed; -1 indicates '
                        'parsing all data.')
#    parser.add_argument('--scale', type=str, choices=scale_list,
#                        help='label scaling method', default='standardize')
    return parser.parse_args()
Code Example #6
File: molnet.py  Project: ir5/chainer-chemistry
def get_molnet_dataframe(dataset_name, pdbbind_subset=None):
    """Downloads, caches and get the dataframe of MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            detail of MoleculeNet, please refer to
            `official site <http://moleculenet.ai/datasets-1>`_
            If you would like to know what dataset_name is available for
            chainer_chemistry, please refer to `molnet_config.py`.
        pdbbind_subset (str): PDBbind dataset subset name. If you want to know
            the detail of subset, please refer to `official site
            <http://www.pdbbind.org.cn/download/pdbbind_2017_intro.pdf>`_
    Returns (pandas.DataFrame or tuple):
        DataFrame of the dataset without any preprocessing. When the dataset
        files are separated, this function returns multiple DataFrames.

    """
    if dataset_name not in molnet_default_config:
        raise ValueError("We don't support {} dataset. Please choose from {}".
                         format(dataset_name,
                                list(molnet_default_config.keys())))
    if dataset_name == 'pdbbind_grid':
        raise ValueError('pdbbind_grid dataset is not supported. Please '
                         'choose pdbbind_smiles dataset.')
    dataset_config = molnet_default_config[dataset_name]
    if dataset_config['dataset_type'] == 'one_file_csv':
        df = pandas.read_csv(get_molnet_filepath(
            dataset_name, pdbbind_subset=pdbbind_subset))
        return df
    elif dataset_config['dataset_type'] == 'separate_csv':
        train_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'train'))
        valid_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'valid'))
        test_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'test'))
        return train_df, valid_df, test_df
    else:
        raise ValueError('dataset_type={} is not supported'
                         .format(dataset_config['dataset_type']))
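
A minimal sketch (not from the original source) of the PDBbind-aware variant above. The subset name 'core' is an assumption; valid subset names are listed in the PDBbind documentation referenced in the docstring.

# pdbbind_subset is only forwarded for one-file CSV datasets such as
# 'pdbbind_smiles' (assumed here).
df = get_molnet_dataframe('pdbbind_smiles', pdbbind_subset='core')
print(len(df))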
Code Example #7
def parse_arguments():
    # Lists of supported preprocessing methods/models and datasets.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn']
    dataset_names = list(molnet_default_config.keys())
#    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='molnet example')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        help='method name', default='nfp')
    parser.add_argument('--label', '-l', type=str, default='',
                        help='target label for regression; empty string means '
                        'predicting all properties at once')
    parser.add_argument('--conv-layers', '-c', type=int, default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='batch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='id of gpu to use; negative value means running '
                        'the code on cpu')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to save the computed model to')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,
                        default='bbbp',
                        help='name of the dataset that training is run on')
    parser.add_argument('--protocol', type=int, default=2,
                        help='pickle protocol version')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='amount of data to be parsed; -1 indicates '
                        'parsing all data.')
#    parser.add_argument('--scale', type=str, choices=scale_list,
#                        help='label scaling method', default='standardize')
    return parser.parse_args()
Code Example #8
def get_molnet_dataset(dataset_name,
                       preprocessor=None,
                       labels=None,
                       split=None,
                       frac_train=.8,
                       frac_valid=.1,
                       frac_test=.1,
                       seed=777,
                       return_smiles=False,
                       target_index=None,
                       task_index=0,
                       **kwargs):
    """Downloads, caches and preprocess MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            detail of MoleculeNet, please refer to
            `official site <http://moleculenet.ai/datasets-1>`_
            If you would like to know what dataset_name is available for
            chainer_chemistry, please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        split (str or BaseSplitter or None): How to split dataset into train,
            validation and test. If `None`, this function uses the splitter
            that is recommended by MoleculeNet. Additionally, you can use an
            instance of BaseSplitter or choose it from 'random', 'stratified'
            and 'scaffold'.
        return_smiles (bool): If set to ``True``,
            smiles array is also returned.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
        task_index (int): Target task index in dataset for stratification.
            (Stratified Splitter only)
    Returns (dict):
        Dictionary that contains dataset that is already split into train,
        valid and test dataset and 1-d numpy array with dtype=object(string)
        which is a vector of smiles for each example or `None`.

    """
    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))
    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [
            labels,
        ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':

        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':

        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor,
                           labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        split = dataset_config['split'] if split is None else split

        if isinstance(split, str):
            splitter = split_method_dict[split]()
        elif isinstance(split, BaseSplitter):
            splitter = split
        else:
            raise TypeError("split must be None, str or instance of"
                            " BaseSplitter, but got {}".format(type(split)))

        if isinstance(splitter, ScaffoldSplitter):
            get_smiles = True
        else:
            get_smiles = return_smiles

        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=get_smiles,
                              target_index=target_index,
                              **kwargs)
        dataset = result['dataset']
        smiles = result['smiles']
        train_ind, valid_ind, test_ind = \
            splitter.train_valid_test_split(dataset, smiles_list=smiles,
                                            task_index=task_index,
                                            frac_train=frac_train,
                                            frac_valid=frac_valid,
                                            frac_test=frac_test, **kwargs)
        train = NumpyTupleDataset(*dataset.features[train_ind])
        valid = NumpyTupleDataset(*dataset.features[valid_ind])
        test = NumpyTupleDataset(*dataset.features[test_ind])

        result['dataset'] = (train, valid, test)
        if return_smiles:
            train_smiles = smiles[train_ind]
            valid_smiles = smiles[valid_ind]
            test_smiles = smiles[test_ind]
            result['smiles'] = (train_smiles, valid_smiles, test_smiles)
        else:
            result['smiles'] = None
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name, 'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'], valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'], valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise ValueError('dataset_type={} is not supported'.format(
            dataset_config['dataset_type']))
    return result
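
A minimal usage sketch (not from the original source): with preprocessor=None the default AtomicNumberPreprocessor is used, and 'scaffold' is one of the split names accepted per the docstring. The dataset name 'bbbp' is an assumption taken from molnet_config.py.

# A scaffold split forces SMILES extraction internally; return_smiles=True
# also hands the per-split SMILES arrays back to the caller.
result = get_molnet_dataset('bbbp', split='scaffold', return_smiles=True)
train, valid, test = result['dataset']
train_smiles, valid_smiles, test_smiles = result['smiles']
print(len(train), len(valid), len(test))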
Code Example #9
File: molnet.py  Project: kicchi/from_physalis
def get_molnet_dataset(dataset_name,
                       preprocessor=None,
                       labels=None,
                       split='random',
                       frac_train=.8,
                       frac_valid=.1,
                       frac_test=.1,
                       seed=777,
                       return_smiles=False,
                       target_index=None):
    """Downloads, caches and preprocesses MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            detail of MoleculeNet, please refer to
            `official site <http://moleculenet.ai/datasets-1>`_
            If you would like to know what dataset_name is available for
            chainer_chemistry, please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        return_smiles (bool): If set to ``True``,
            smiles array is also returned.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
    Returns (dict):
        Dictionary that contains dataset that is already split into train,
        valid and test dataset and 1-d numpy array with dtype=object(string)
        which is a vector of smiles for each example or `None`.

    """
    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))
    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [
            labels,
        ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':

        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':

        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor,
                           labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=return_smiles,
                              target_index=target_index)
        # TODO(motoki): splitting function or class
        dataset = result['dataset']
        if split == 'random':
            perm = numpy.random.permutation(len(dataset))
            dataset = NumpyTupleDataset(*dataset.features[perm])
            train_data_size = int(len(dataset) * frac_train)
            valid_data_size = int(len(dataset) * frac_valid)
            train = NumpyTupleDataset(*dataset.features[:train_data_size])
            valid = NumpyTupleDataset(
                *dataset.features[train_data_size:train_data_size +
                                  valid_data_size])
            test = NumpyTupleDataset(*dataset.features[train_data_size +
                                                       valid_data_size:])

            result['dataset'] = (train, valid, test)
            if return_smiles:
                smiles = result['smiles'][perm]
                train_smiles = smiles[:train_data_size]
                valid_smiles = smiles[train_data_size:train_data_size +
                                      valid_data_size]
                test_smiles = smiles[train_data_size + valid_data_size:]
                result['smiles'] = (train_smiles, valid_smiles, test_smiles)
            else:
                result['smiles'] = None
        else:
            raise NotImplementedError
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name, 'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'], valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'], valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise NotImplementedError
    return result
Code Example #10
def main():
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    dataset_names = list(molnet_default_config.keys())

    parser = argparse.ArgumentParser(description='molnet example')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        default='',
                        help='target label for regression, empty string means '
                        'to predict all properties at once')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--dataset',
                        '-d',
                        type=str,
                        choices=dataset_names,
                        default='bbbp')
    parser.add_argument('--protocol', type=int, default=2)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data to be parsed from parser. '
                        '-1 indicates to parse all data.')
    args = parser.parse_args()
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    print('Use {} dataset'.format(dataset_name))

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Dataset preparation
    def get_dataset_paths(cache_dir, num_data):
        filepaths = []
        for filetype in ['train', 'valid', 'test']:
            filename = filetype + '_data'
            if num_data >= 0:
                filename += '_' + str(num_data)
            filename += '.npz'
            filepath = os.path.join(cache_dir, filename)
            filepaths.append(filepath)
        return filepaths

    filepaths = get_dataset_paths(cache_dir, num_data)
    if all([os.path.exists(fpath) for fpath in filepaths]):
        datasets = []
        for fpath in filepaths:
            print('load from cache {}'.format(fpath))
            datasets.append(NumpyTupleDataset.load(fpath))
    else:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        # only use the first num_data examples (for debugging) if num_data >= 0
        target_index = numpy.arange(num_data) if num_data >= 0 else None
        datasets = D.molnet.get_molnet_dataset(dataset_name,
                                               preprocessor,
                                               labels=labels,
                                               target_index=target_index)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        datasets = datasets['dataset']
        for i, fpath in enumerate(filepaths):
            NumpyTupleDataset.save(fpath, datasets[i])

    train, val, _ = datasets

    # Network
    if method == 'nfp':
        print('Train NFP model...')
        predictor = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        predictor = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        predictor = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        predictor = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        predictor = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val,
                                        args.batchsize,
                                        repeat=False,
                                        shuffle=False)

    metrics_fun = molnet_default_config[dataset_name]['metrics']
    loss_fun = molnet_default_config[dataset_name]['loss']
    task_type = molnet_default_config[dataset_name]['task_type']
    if task_type == 'regression':
        model = Regressor(predictor,
                          lossfun=loss_fun,
                          metrics_fun=metrics_fun,
                          device=args.gpu)
        # TODO(nakago): Use standard scaler for regression task
    elif task_type == 'classification':
        model = Classifier(predictor,
                           lossfun=loss_fun,
                           metrics_fun=metrics_fun,
                           device=args.gpu)
    else:
        raise NotImplementedError(
            'Not implemented task_type = {}'.format(task_type))

    optimizer = optimizers.Adam()
    optimizer.setup(model)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter, model, device=args.gpu, converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    print_report_targets = ['epoch', 'main/loss', 'validation/main/loss']
    if isinstance(metrics_fun, dict):
        for m_k in metrics_fun.keys():
            print_report_targets.append('main/' + m_k)
            print_report_targets.append('validation/main/' + m_k)
    if task_type == 'classification':
        # Evaluation for train data takes time, skip for now.
        # trainer.extend(ROCAUCEvaluator(
        #     train_iter, model, device=args.gpu, eval_func=predictor,
        #     converter=concat_mols, name='train', raise_value_error=False))
        # print_report_targets.append('train/main/roc_auc')
        trainer.extend(
            ROCAUCEvaluator(val_iter,
                            model,
                            device=args.gpu,
                            eval_func=predictor,
                            converter=concat_mols,
                            name='val',
                            raise_value_error=False))
        print_report_targets.append('val/main/roc_auc')
    print_report_targets.append('elapsed_time')
    trainer.extend(E.PrintReport(print_report_targets))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save model ---
    protocol = args.protocol
    model.save_pickle(os.path.join(args.out, args.model_filename),
                      protocol=protocol)
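
A hedged follow-up sketch (not part of the original script): loading the model pickled by main() for later inference. It assumes Regressor (and Classifier) expose a load_pickle classmethod mirroring save_pickle, and that the import path below matches your installed chainer_chemistry version.

import os

from chainer_chemistry.models.prediction import Regressor

# Load the pickle written at the end of main(); device=-1 keeps it on CPU.
model = Regressor.load_pickle(os.path.join('result', 'regressor.pkl'),
                              device=-1)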
Code Example #11
File: molnet.py  Project: ir5/chainer-chemistry
def get_molnet_dataset(dataset_name, preprocessor=None, labels=None,
                       split=None, frac_train=.8, frac_valid=.1,
                       frac_test=.1, seed=777, return_smiles=False,
                       return_pdb_id=False, target_index=None, task_index=0,
                       **kwargs):
    """Downloads, caches and preprocess MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            detail of MoleculeNet, please refer to
            `official site <http://moleculenet.ai/datasets-1>`_
            If you would like to know what dataset_name is available for
            chainer_chemistry, please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        split (str or BaseSplitter or None): How to split dataset into train,
            validation and test. If `None`, this function uses the splitter
            that is recommended by MoleculeNet. Additionally, you can use an
            instance of BaseSplitter or choose it from 'random', 'stratified'
            and 'scaffold'.
        return_smiles (bool): If set to ``True``,
            smiles array is also returned.
        return_pdb_id (bool): If set to ``True``,
            PDB ID array is also returned.
            This argument is only used when you select 'pdbbind_smiles'.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
        task_index (int): Target task index in dataset for stratification.
            (Stratified Splitter only)
    Returns (dict):
        Dictionary that contains dataset that is already split into train,
        valid and test dataset and 1-d numpy array with dtype=object(string)
        which is a vector of smiles for each example or `None`.

    """
    if dataset_name not in molnet_default_config:
        raise ValueError("We don't support {} dataset. Please choose from {}".
                         format(dataset_name,
                                list(molnet_default_config.keys())))

    if dataset_name == 'pdbbind_grid':
        pdbbind_subset = kwargs.get('pdbbind_subset')
        return get_pdbbind_grid(pdbbind_subset, split=split,
                                frac_train=frac_train, frac_valid=frac_valid,
                                frac_test=frac_test, task_index=task_index)
    if dataset_name == 'pdbbind_smiles':
        pdbbind_subset = kwargs.get('pdbbind_subset')
        time_list = kwargs.get('time_list')
        return get_pdbbind_smiles(pdbbind_subset, preprocessor=preprocessor,
                                  labels=labels, split=split,
                                  frac_train=frac_train, frac_valid=frac_valid,
                                  frac_test=frac_test,
                                  return_smiles=return_smiles,
                                  return_pdb_id=return_pdb_id,
                                  target_index=target_index,
                                  task_index=task_index,
                                  time_list=time_list)

    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [labels, ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':
        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':
        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor, labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        split = dataset_config['split'] if split is None else split

        if isinstance(split, str):
            splitter = split_method_dict[split]()
        elif isinstance(split, BaseSplitter):
            splitter = split
        else:
            raise TypeError("split must be None, str or instance of"
                            " BaseSplitter, but got {}".format(type(split)))

        if isinstance(splitter, ScaffoldSplitter):
            get_smiles = True
        else:
            get_smiles = return_smiles

        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=get_smiles,
                              target_index=target_index, **kwargs)
        dataset = result['dataset']
        smiles = result['smiles']
        train_ind, valid_ind, test_ind = \
            splitter.train_valid_test_split(dataset, smiles_list=smiles,
                                            task_index=task_index,
                                            frac_train=frac_train,
                                            frac_valid=frac_valid,
                                            frac_test=frac_test, **kwargs)
        train = NumpyTupleDataset(*dataset.features[train_ind])
        valid = NumpyTupleDataset(*dataset.features[valid_ind])
        test = NumpyTupleDataset(*dataset.features[test_ind])

        result['dataset'] = (train, valid, test)
        if return_smiles:
            train_smiles = smiles[train_ind]
            valid_smiles = smiles[valid_ind]
            test_smiles = smiles[test_ind]
            result['smiles'] = (train_smiles, valid_smiles, test_smiles)
        else:
            result['smiles'] = None
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name, 'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'], valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'], valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise ValueError('dataset_type={} is not supported'
                         .format(dataset_config['dataset_type']))
    return result
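
A minimal sketch (not from the original source) of the PDBbind branch above: pdbbind_subset travels through **kwargs into get_pdbbind_smiles. The subset name 'core' is an assumption, and the returned dict is expected to follow the same {'dataset': ..., 'smiles': ...} layout as the other branches.

# return_pdb_id is only meaningful for 'pdbbind_smiles'.
result = get_molnet_dataset('pdbbind_smiles', pdbbind_subset='core',
                            return_smiles=True, return_pdb_id=True)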