Beispiel #1
0
 def check_invalid_key(self, gpu, label_key):
     link = Classifier(links.Linear(10, 3), label_key=label_key)
     if gpu:
         link.to_gpu()
     x = chainer.Variable(link.xp.asarray(self.x))
     with pytest.raises(ValueError):
         link(x)
Beispiel #2
0
    def check_predict_proba(self, device):
        clf = Classifier(self.predictor, device=device)
        actual_y = clf.predict_proba(self.x)
        assert actual_y.shape == (3, 2)
        assert actual_y.dtype == numpy.float32
        assert numpy.alltrue(0 <= actual_y)
        assert numpy.alltrue(actual_y <= 1.)

        actual_t = numpy.argmax(actual_y, axis=1)
        assert numpy.alltrue(actual_t == self.t)
Beispiel #3
0
    def check_call(self, gpu, label_key, args, kwargs, model_args,
                   model_kwargs, metrics_fun, compute_metrics):
        init_kwargs = {'label_key': label_key}
        if metrics_fun is not None:
            init_kwargs['metrics_fun'] = metrics_fun
        link = Classifier(chainer.Link(), **init_kwargs)

        if gpu:
            xp = cuda.cupy
            link.to_gpu()
        else:
            xp = numpy

        link.compute_metrics = compute_metrics

        y = chainer.Variable(self.y)
        link.predictor = mock.MagicMock(return_value=y)

        loss = link(*args, **kwargs)
        link.predictor.assert_called_with(*model_args, **model_kwargs)

        assert hasattr(link, 'y')
        assert link.y is not None

        assert hasattr(link, 'loss')
        xp.testing.assert_allclose(link.loss.data, loss.data)

        assert hasattr(link, 'metrics')
        if compute_metrics:
            assert link.metrics is not None
        else:
            assert link.metrics is None
Beispiel #4
0
    def test_report_key(self, metrics_fun, compute_metrics):
        repo = chainer.Reporter()

        link = Classifier(predictor=DummyPredictor(), metrics_fun=metrics_fun)
        link.compute_metrics = compute_metrics
        repo.add_observer('target', link)
        with repo:
            observation = {}
            with reporter.report_scope(observation):
                link(self.x, self.t)

        # print('observation ', observation)
        actual_keys = set(observation.keys())
        if compute_metrics:
            if metrics_fun is None:
                assert set(['target/loss']) == actual_keys
            elif isinstance(metrics_fun, dict):
                assert set(['target/loss', 'target/user_key']) == actual_keys
            elif callable(metrics_fun):
                assert set(['target/loss', 'target/accuracy']) == actual_keys
            else:
                raise TypeError()
        else:
            assert set(['target/loss']) == actual_keys
Beispiel #5
0
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers

    task_type = molnet_default_config[dataset_name]['task_type']
    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }

    print('Using dataset: {}...'.format(dataset_name))

    # Set up some useful variables that will be used later on.
    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Load the train and validation parts of the dataset.
    filenames = [
        dataset_part_filename(p, num_data) for p in ['train', 'valid']
    ]

    paths = [os.path.join(cache_dir, f) for f in filenames]
    if all([os.path.exists(path) for path in paths]):
        dataset_parts = []
        for path in paths:
            print('Loading cached dataset from {}.'.format(path))
            dataset_parts.append(NumpyTupleDataset.load(path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data, labels,
                                                method, cache_dir)
    train, valid = dataset_parts[0], dataset_parts[1]

    #    # Scale the label values, if necessary.
    #    if args.scale == 'standardize':
    #        if task_type == 'regression':
    #            print('Applying standard scaling to the labels.')
    #            datasets, scaler = standardize_dataset_labels(datasets)
    #        else:
    #            print('Label scaling is not available for classification tasks.')
    #    else:
    #        print('No label scaling was selected.')
    #        scaler = None

    # Set up the predictor.
    predictor = set_up_predictor(method, n_unit, conv_layers, class_num)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid,
                                          args.batchsize,
                                          repeat=False,
                                          shuffle=False)

    # Load metrics for the current dataset.
    metrics = molnet_default_config[dataset_name]['metrics']
    metrics_fun = {
        k: v
        for k, v in metrics.items() if isinstance(v, types.FunctionType)
    }
    loss_fun = molnet_default_config[dataset_name]['loss']

    if task_type == 'regression':
        model = Regressor(predictor,
                          lossfun=loss_fun,
                          metrics_fun=metrics_fun,
                          device=args.gpu)
        # TODO: Use standard scaler for regression task
    elif task_type == 'classification':
        model = Classifier(predictor,
                           lossfun=loss_fun,
                           metrics_fun=metrics_fun,
                           device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    # Save model-related output to this directory.
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)
    trainer.extend(
        E.Evaluator(valid_iter, model, device=args.gpu, converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    # Report various metrics.
    print_report_targets = ['epoch', 'main/loss', 'validation/main/loss']
    for metric_name, metric_fun in metrics.items():
        if isinstance(metric_fun, types.FunctionType):
            print_report_targets.append('main/' + metric_name)
            print_report_targets.append('validation/main/' + metric_name)
        elif issubclass(metric_fun, BatchEvaluator):
            trainer.extend(
                metric_fun(valid_iter,
                           model,
                           device=args.gpu,
                           eval_func=predictor,
                           converter=concat_mols,
                           name='val',
                           raise_value_error=False))
            print_report_targets.append('val/main/' + metric_name)
        else:
            raise TypeError('{} is not a supported metrics function.'.format(
                type(metrics_fun)))
    print_report_targets.append('elapsed_time')

    trainer.extend(E.PrintReport(print_report_targets))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)
Beispiel #6
0
def main():
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    dataset_names = list(molnet_default_config.keys())

    parser = argparse.ArgumentParser(description='molnet example')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        default='',
                        help='target label for regression, empty string means '
                        'to predict all property at once')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--dataset',
                        '-d',
                        type=str,
                        choices=dataset_names,
                        default='bbbp')
    parser.add_argument('--protocol', type=int, default=2)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data to be parsed from parser.'
                        '-1 indicates to parse all data.')
    args = parser.parse_args()
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    print('Use {} dataset'.format(dataset_name))

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Dataset preparation
    def get_dataset_paths(cache_dir, num_data):
        filepaths = []
        for filetype in ['train', 'valid', 'test']:
            filename = filetype + '_data'
            if num_data >= 0:
                filename += '_' + str(num_data)
            filename += '.npz'
            filepath = os.path.join(cache_dir, filename)
            filepaths.append(filepath)
        return filepaths

    filepaths = get_dataset_paths(cache_dir, num_data)
    if all([os.path.exists(fpath) for fpath in filepaths]):
        datasets = []
        for fpath in filepaths:
            print('load from cache {}'.format(fpath))
            datasets.append(NumpyTupleDataset.load(fpath))
    else:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        # only use first 100 for debug if num_data >= 0
        target_index = numpy.arange(num_data) if num_data >= 0 else None
        datasets = D.molnet.get_molnet_dataset(dataset_name,
                                               preprocessor,
                                               labels=labels,
                                               target_index=target_index)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        datasets = datasets['dataset']
        for i, fpath in enumerate(filepaths):
            NumpyTupleDataset.save(fpath, datasets[i])

    train, val, _ = datasets

    # Network
    if method == 'nfp':
        print('Train NFP model...')
        predictor = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        predictor = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        predictor = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        predictor = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        predictor = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val,
                                        args.batchsize,
                                        repeat=False,
                                        shuffle=False)

    metrics_fun = molnet_default_config[dataset_name]['metrics']
    loss_fun = molnet_default_config[dataset_name]['loss']
    task_type = molnet_default_config[dataset_name]['task_type']
    if task_type == 'regression':
        model = Regressor(predictor,
                          lossfun=loss_fun,
                          metrics_fun=metrics_fun,
                          device=args.gpu)
        # TODO(nakago): Use standard scaler for regression task
    elif task_type == 'classification':
        model = Classifier(predictor,
                           lossfun=loss_fun,
                           metrics_fun=metrics_fun,
                           device=args.gpu)
    else:
        raise NotImplementedError(
            'Not implemented task_type = {}'.format(task_type))

    optimizer = optimizers.Adam()
    optimizer.setup(model)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter, model, device=args.gpu, converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    print_report_targets = ['epoch', 'main/loss', 'validation/main/loss']
    if metrics_fun is not None and type(metrics_fun) == dict:
        for m_k in metrics_fun.keys():
            print_report_targets.append('main/' + m_k)
            print_report_targets.append('validation/main/' + m_k)
    if task_type == 'classification':
        # Evaluation for train data takes time, skip for now.
        # trainer.extend(ROCAUCEvaluator(
        #     train_iter, model, device=args.gpu, eval_func=predictor,
        #     converter=concat_mols, name='train', raise_value_error=False))
        # print_report_targets.append('train/main/roc_auc')
        trainer.extend(
            ROCAUCEvaluator(val_iter,
                            model,
                            device=args.gpu,
                            eval_func=predictor,
                            converter=concat_mols,
                            name='val',
                            raise_value_error=False))
        print_report_targets.append('val/main/roc_auc')
    print_report_targets.append('elapsed_time')
    trainer.extend(E.PrintReport(print_report_targets))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save model ---
    protocol = args.protocol
    model.save_pickle(os.path.join(args.out, args.model_filename),
                      protocol=protocol)
Beispiel #7
0
 def test_predict_gpu(self):
     clf = Classifier(self.predictor, device=0)
     actual_t = clf.predict(self.x)
     assert numpy.alltrue(actual_t == self.t)
def main():
    parser = argparse.ArgumentParser(
        description='Predict with a trained model.')
    parser.add_argument('--in-dir', '-i', type=str, default='result',
                        help='Path to the result directory of the training '
                        'script.')
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='batch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code in CPU.')
    parser.add_argument('--model-filename', type=str, default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='Number of data to be parsed from parser.'
                             '-1 indicates to parse all data.')
    args = parser.parse_args()

    with open(os.path.join(args.in_dir, 'config.json'), 'r') as i:
        config = json.loads(i.read())

    method = config['method']
    labels = config['labels']

    _, test, _ = data.load_dataset(method, labels, num_data=args.num_data)
    y_test = test.get_datasets()[-1]

    # Load pretrained model
    clf = Classifier.load_pickle(
        os.path.join(args.in_dir, args.model_filename),
        device=args.gpu)  # type: Classifier

    # ---- predict ---
    print('Predicting...')

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features which
    # consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_pred(x):
        x_array = cuda.to_cpu(x.data)
        return numpy.where(x_array > 0, 1, 0)
    y_pred = clf.predict(test, converter=extract_inputs,
                         postprocess_fn=postprocess_pred)
    y_proba = clf.predict_proba(test, converter=extract_inputs,
                                postprocess_fn=F.sigmoid)

    # `predict` method returns the prediction label (0: non-toxic, 1:toxic)
    print('y_pread.shape = {}, y_pred[:5, 0] = {}'
          .format(y_pred.shape, y_pred[:5, 0]))
    # `predict_proba` method returns the probability to be toxic
    print('y_proba.shape = {}, y_proba[:5, 0] = {}'
          .format(y_proba.shape, y_proba[:5, 0]))
    # --- predict end ---

    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]

    if y_pred.shape != y_test.shape:
        raise RuntimeError('The shape of the prediction result array and '
                           'that of the ground truth array do not match. '
                           'Contents of the input directory may be corrupted '
                           'or modified.')

    statistics = []
    for t, p in six.moves.zip(y_test.T, y_pred.T):
        idx = t != -1
        n_correct = (t[idx] == p[idx]).sum()
        n_total = len(t[idx])
        accuracy = float(n_correct) / n_total
        statistics.append([n_correct, n_total, accuracy])

    print('{:>6} {:>8} {:>8} {:>8}'
          .format('TaskID', 'Correct', 'Total', 'Accuracy'))
    for idx, (n_correct, n_total, accuracy) in enumerate(statistics):
        print('task{:>2} {:>8} {:>8} {:>8.4f}'
              .format(idx, n_correct, n_total, accuracy))

    prediction_result_file = 'prediction.npz'
    print('Save prediction result to {}'.format(prediction_result_file))
    numpy.savez_compressed(prediction_result_file, y_pred)

    # --- evaluate ---
    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(
        test_iterator, clf, converter=concat_mols, device=args.gpu)()
    print('Evaluation result: ', eval_result)
    rocauc_result = ROCAUCEvaluator(
        test_iterator, clf, converter=concat_mols, device=args.gpu,
        eval_func=clf.predictor, name='test', ignore_labels=-1)()
    print('ROCAUC Evaluation result: ', rocauc_result)
    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(rocauc_result, f)
Beispiel #9
0
def main():
    args = parse_arguments()
    generate_drug_list = True if args.generate_drug_list == 'True' else False

    if args.label:
        labels = args.label
        # class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        label_arr = np.asarray(label_list, dtype=np.int32)
        return label_arr

    # Apply a preprocessor to the dataset.
    logging.info('Preprocess test dataset...')
    preprocessor = preprocess_method_dict['ggnn']()
    parser = CSVFileParserForPair(preprocessor,
                                  postprocess_label=postprocess_label,
                                  labels=labels,
                                  smiles_cols=['smiles_1', 'smiles_2'])
    test_dict = parser.parse(args.test_datafile,
                             return_smiles_pair_original=True)
    test = test_dict['dataset']
    # test_smiles_pairs = test_dict['smiles_pair_original']
    from chainer.iterators import SerialIterator
    test_iter = SerialIterator(test, 64, repeat=False, shuffle=False)

    out = 'output' + '/' + args.out
    model_path = os.path.join(out, args.model_filename)
    # `load_pickle` is static method, call from Class to get an instance
    print('model_path: {}'.format(model_path))
    from chainer_chemistry.models.prediction import Classifier
    model = Classifier.load_pickle(model_path, args.gpu)

    if args.gpu >= 0:
        model.to_gpu(args.gpu)
    else:
        model.to_cpu()

    snapshot_path = os.path.join(out, args.snapshot)
    try:
        chainer.serializers.load_npz(snapshot_path, model)
    except KeyError as e:
        print(e)

    def eval_func(atoms_1, adj_1, atoms_2, adj_2):
        sample = [
            (atoms_1, adj_1),
            (atoms_2, adj_2),
        ]
        sample = concat_mols(sample)
        atoms_1, adj_1 = sample[0]
        atoms_2, adj_2 = sample[1]
        print(atoms_1, adj_1)
        print('shape 1:', atoms_1.shape, adj_1.shape)
        print('shape 2:', atoms_2.shape, adj_2.shape)
        pred, _ = model.predictor.predict(atoms_1, adj_1, atoms_2, adj_2)
        return pred

    evaluator = MyEvaluator(
        test_iter,
        model,
        converter=concat_mols,
        device=args.gpu,
        eval_func=model.predictor.predict,
        # eval_func=eval_func,
        # mediate_func=models.predictor.mediate_output,
        name='test',
        ignore_labels=-1)
    e1_total, e2_total = evaluator.generate_representations()
    y_total, t_total = evaluator.generate_y_and_t()

    # print('test_datafile: {}'.format(args.test_datafile))
    test_filename = os.path.basename(args.test_datafile).split('.')[0]
    # print('test_filename: {}'.format(test_filename))
    dst_repre_filename = test_filename + '_e' + '.csv'
    dst_repre_filepath = os.path.join(out, dst_repre_filename)
    add_representations(args.test_datafile,
                        dst_repre_filepath,
                        e1_total,
                        e2_total,
                        generate_drug_list=generate_drug_list)

    dst_filename = test_filename + '_e_y' + '.csv'
    dst_filepath = os.path.join(out, dst_filename)
    add_reprensentations_and_y(args.test_datafile, dst_filepath, e1_total,
                               e2_total, y_total)

    perf_dict = dict()
    for metric in [
            'roc_auc', 'prc_auc', 'accuracy', 'precision', 'recall', 'f1'
    ]:
        result = evaluator.compuate(metric=metric)
        perf_dict[metric] = result
        print('{}: {}'.format(metric, result))
    with open(os.path.join(ROOT_PATH, 'eval_result.json'), 'w') as f:
        json.dump(perf_dict, f)
def main():
    parser = argparse.ArgumentParser(
        description='Imbalanced MNIST classification')
    parser.add_argument('--eval-mode',
                        type=int,
                        default=1,
                        help='Evaluation mode.'
                        '0: only binary_accuracy is calculated.'
                        '1: binary_accuracy and ROC-AUC score is calculated')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code in CPU.')
    parser.add_argument('--out',
                        '-o',
                        type=str,
                        default='result',
                        help='path to output directory')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10,
                        help='number of epochs')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--protocol',
                        type=int,
                        default=2,
                        help='protocol version for pickle')
    parser.add_argument('--model-filename',
                        type=str,
                        default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--updater-type', type=str, default='standard')
    parser.add_argument('--sampling-size', type=int, default=32)
    parser.add_argument('--optimizer-type', type=str, default='Adam')
    parser.add_argument('--alpha', type=str, default='0.001')

    args = parser.parse_args()
    # Dataset preparation
    train, train_val, val = get_binary_imbalanced_data()

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val,
                                        args.batchsize,
                                        repeat=False,
                                        shuffle=False)

    model = LeNet(n_class=1, binary=True)
    classifier = Classifier(model,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=args.gpu)

    if args.optimizer_type == 'Adam':
        optimizer = optimizers.Adam()
    else:
        optimizer = optimizers.SGD(lr=1e-3)
    optimizer.setup(classifier)

    updater_type = args.updater_type
    if updater_type == 'standard':
        updater = training.StandardUpdater(train_iter,
                                           optimizer,
                                           device=args.gpu)
    elif updater_type == 'proposed':
        updater = Proposed(train_iter,
                           optimizer,
                           device=args.gpu,
                           sampling_size=args.sampling_size)
    elif updater_type == 'LRE':
        x, t = chainer.dataset.concat_examples(train)

        train_val_iter = iterators.SerialIterator(train_val, len(train_val))
        updater = LRE({
            'main': train_iter,
            'val': train_val_iter
        },
                      optimizer,
                      device=args.gpu,
                      alpha=args.alpha)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(E.Evaluator(val_iter, classifier, device=args.gpu))
    trainer.extend(E.LogReport())

    eval_mode = args.eval_mode
    if eval_mode == 0:
        trainer.extend(
            E.PrintReport([
                'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
                'validation/main/accuracy', 'elapsed_time'
            ]))
    elif eval_mode == 1:
        train_eval_iter = iterators.SerialIterator(train,
                                                   args.batchsize,
                                                   repeat=False,
                                                   shuffle=False)
        trainer.extend(
            ROCAUCEvaluator(train_eval_iter,
                            classifier,
                            eval_func=model,
                            device=args.gpu,
                            name='train',
                            pos_labels=1,
                            ignore_labels=-1,
                            raise_value_error=False))
        # extension name='validation' is already used by `Evaluator`,
        # instead extension name `val` is used.
        trainer.extend(
            ROCAUCEvaluator(val_iter,
                            classifier,
                            eval_func=model,
                            device=args.gpu,
                            name='val',
                            pos_labels=1,
                            ignore_labels=-1))
        trainer.extend(
            E.PrintReport([
                'epoch', 'main/loss', 'main/accuracy', 'train/main/roc_auc',
                'validation/main/loss', 'validation/main/accuracy',
                'val/main/roc_auc', 'elapsed_time'
            ]))
    else:
        raise ValueError('Invalid accfun_mode {}'.format(eval_mode))
    trainer.extend(E.ProgressBar(update_interval=10))
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
    classifier.save_pickle(os.path.join(args.out, args.model_filename),
                           protocol=args.protocol)
Beispiel #11
0
def main():
    parser = argparse.ArgumentParser(
        description='Predict with a trained model.')
    parser.add_argument('--in-dir',
                        '-i',
                        type=str,
                        default='result',
                        help='Path to the result directory of the training '
                        'script.')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code in CPU.')
    parser.add_argument('--model-filename',
                        type=str,
                        default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data to be parsed from parser.'
                        '-1 indicates to parse all data.')
    args = parser.parse_args()

    with open(os.path.join(args.in_dir, 'config.json'), 'r') as i:
        config = json.loads(i.read())

    method = config['method']
    labels = config['labels']

    _, test, _ = data.load_dataset(method, labels, num_data=args.num_data)
    y_test = test.get_datasets()[-1]

    # Load pretrained model
    clf = Classifier.load_pickle(os.path.join(args.in_dir,
                                              args.model_filename),
                                 device=args.gpu)  # type: Classifier

    # ---- predict ---
    print('Predicting...')

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features which
    # consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_pred(x):
        x_array = cuda.to_cpu(x.data)
        return numpy.where(x_array > 0, 1, 0)

    y_pred = clf.predict(test,
                         converter=extract_inputs,
                         postprocess_fn=postprocess_pred)
    y_proba = clf.predict_proba(test,
                                converter=extract_inputs,
                                postprocess_fn=F.sigmoid)

    # `predict` method returns the prediction label (0: non-toxic, 1:toxic)
    print('y_pread.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))
    # `predict_proba` method returns the probability to be toxic
    print('y_proba.shape = {}, y_proba[:5, 0] = {}'.format(
        y_proba.shape, y_proba[:5, 0]))
    # --- predict end ---

    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]

    if y_pred.shape != y_test.shape:
        raise RuntimeError('The shape of the prediction result array and '
                           'that of the ground truth array do not match. '
                           'Contents of the input directory may be corrupted '
                           'or modified.')

    statistics = []
    for t, p in six.moves.zip(y_test.T, y_pred.T):
        idx = t != -1
        n_correct = (t[idx] == p[idx]).sum()
        n_total = len(t[idx])
        accuracy = float(n_correct) / n_total
        statistics.append([n_correct, n_total, accuracy])

    print('{:>6} {:>8} {:>8} {:>8}'.format('TaskID', 'Correct', 'Total',
                                           'Accuracy'))
    for idx, (n_correct, n_total, accuracy) in enumerate(statistics):
        print('task{:>2} {:>8} {:>8} {:>8.4f}'.format(idx, n_correct, n_total,
                                                      accuracy))

    prediction_result_file = 'prediction.npz'
    print('Save prediction result to {}'.format(prediction_result_file))
    numpy.savez_compressed(prediction_result_file, y_pred)

    # --- evaluate ---
    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            clf,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    rocauc_result = ROCAUCEvaluator(test_iterator,
                                    clf,
                                    converter=concat_mols,
                                    device=args.gpu,
                                    eval_func=clf.predictor,
                                    name='test',
                                    ignore_labels=-1)()
    print('ROCAUC Evaluation result: ', rocauc_result)
    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(rocauc_result, f)
Beispiel #12
0
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)

    # Model-related data is stored this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print("model_path=" + model_path)
    print('Loading model weights from {}...'.format(model_path))

    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Re-load the best-validation score snapshot
    # serializers.load_npz(os.path.join(
    #     model_dir, "best_val_" + model_filename[task_type]), model)

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            model,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Add more stats
    if task_type == 'regression':
        # loss = cuda.to_cpu(numpy.array(eval_result['main/loss']))
        # eval_result['main/loss'] = loss

        # convert to native values..
        for k, v in eval_result.items():
            eval_result[k] = float(v)

    elif task_type == "classification":
        # For Classifier, we do not equip the model with ROC-AUC evalation function
        # use a seperate ROC-AUC Evaluator here
        rocauc_result = ROCAUCEvaluator(test_iterator,
                                        model,
                                        converter=concat_mols,
                                        device=args.gpu,
                                        eval_func=model.predictor,
                                        name='test',
                                        ignore_labels=-1)()
        print('ROCAUC Evaluation result: ', rocauc_result)
        save_json(os.path.join(model_dir, 'rocauc_result.json'), rocauc_result)
    else:
        print('[WARNING] unknown task_type {}.'.format(task_type))

    # Save the evaluation results.
    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)
def main():
    parser = argparse.ArgumentParser(
        description='Imbalanced MNIST classification')
    parser.add_argument('--eval-mode', type=int, default=1,
                        help='Evaluation mode.'
                        '0: only binary_accuracy is calculated.'
                        '1: binary_accuracy and ROC-AUC score is calculated')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='batch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code in CPU.')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to output directory')
    parser.add_argument('--epoch', '-e', type=int, default=10,
                        help='number of epochs')
    parser.add_argument('--resume', '-r', type=str, default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--protocol', type=int, default=2,
                        help='protocol version for pickle')
    parser.add_argument('--model-filename', type=str, default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--updater-type', type=str, default='standard')
    parser.add_argument('--sampling-size', type=int, default=32)
    parser.add_argument('--optimizer-type', type=str, default='Adam')
    parser.add_argument('--alpha', type=str, default='0.001')

    args = parser.parse_args()
    # Dataset preparation
    train, train_val, val = get_binary_imbalanced_data()

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val, args.batchsize,
                                        repeat=False, shuffle=False)

    model = LeNet(n_class=1, binary=True)
    classifier = Classifier(model,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=args.gpu)

    if args.optimizer_type == 'Adam':
        optimizer = optimizers.Adam()
    else:
        optimizer = optimizers.SGD(lr=1e-3)
    optimizer.setup(classifier)

    updater_type = args.updater_type
    if updater_type == 'standard':
        updater = training.StandardUpdater(
            train_iter, optimizer, device=args.gpu)
    elif updater_type == 'proposed':
        updater = Proposed(
            train_iter, optimizer, device=args.gpu,
            sampling_size=args.sampling_size)
    elif updater_type == 'LRE':
        x, t = chainer.dataset.concat_examples(train)

        train_val_iter = iterators.SerialIterator(train_val, len(train_val))
        updater = LRE(
            {'main': train_iter, 'val': train_val_iter}, optimizer,
            device=args.gpu, alpha=args.alpha)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(E.Evaluator(val_iter, classifier,
                               device=args.gpu))
    trainer.extend(E.LogReport())

    eval_mode = args.eval_mode
    if eval_mode == 0:
        trainer.extend(E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time']))
    elif eval_mode == 1:
        train_eval_iter = iterators.SerialIterator(train, args.batchsize,
                                                   repeat=False, shuffle=False)
        trainer.extend(ROCAUCEvaluator(
            train_eval_iter, classifier, eval_func=model,
            device=args.gpu, name='train',
            pos_labels=1, ignore_labels=-1, raise_value_error=False))
        # extension name='validation' is already used by `Evaluator`,
        # instead extension name `val` is used.
        trainer.extend(ROCAUCEvaluator(
            val_iter, classifier, eval_func=model,
            device=args.gpu, name='val',
            pos_labels=1, ignore_labels=-1))
        trainer.extend(E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'train/main/roc_auc',
            'validation/main/loss', 'validation/main/accuracy',
            'val/main/roc_auc', 'elapsed_time']))
    else:
        raise ValueError('Invalid accfun_mode {}'.format(eval_mode))
    trainer.extend(E.ProgressBar(update_interval=10))
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
    classifier.save_pickle(os.path.join(args.out, args.model_filename),
                           protocol=args.protocol)
Beispiel #14
0
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)

#    # Load the standard scaler parameters, if necessary.
#    if args.scale == 'standardize':
#        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
#        print('Loading scaler parameters from {}.'.format(scaler_path))
#        with open(scaler_path, mode='rb') as f:
#            scaler = pickle.load(f)
#    else:
#        print('No standard scaling was selected.')
#        scaler = None

# Model-related data is stored this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Loading model weights from {}...'.format(model_path))

    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))


#    # Replace the default predictor with one that scales the output labels.
#    scaled_predictor = ScaledGraphConvPredictor(model.predictor)
#    scaled_predictor.scaler = scaler
#    model.predictor = scaled_predictor

# Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            model,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    with open(os.path.join(model_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)
Beispiel #15
0
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',
                   'relgat']
    label_names = D.get_tox21_label_names()
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp', help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label', '-l', type=str, choices=label_names,
                        default='', help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--iterator-type', type=str, choices=iterator_type,
                        default='serial', help='iterator type. If `balanced` '
                        'is specified, data is sampled to take same number of'
                        'positive/negative labels during training.')
    parser.add_argument('--eval-mode', type=int, default=1,
                        help='Evaluation mode.'
                        '0: only binary_accuracy is calculated.'
                        '1: binary_accuracy and ROC-AUC score is calculated')
    parser.add_argument('--conv-layers', '-c', type=int, default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='batch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code in CPU.')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to output directory')
    parser.add_argument('--epoch', '-e', type=int, default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume', '-r', type=str, default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--protocol', type=int, default=2,
                        help='protocol version for pickle')
    parser.add_argument('--model-filename', type=str, default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='Number of data to be parsed from parser.'
                             '-1 indicates to parse all data.')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, _ = data.load_dataset(method, labels, num_data=args.num_data)

    # Network
    predictor_ = predictor.build_predictor(
        method, args.unit_num, args.conv_layers, class_num)

    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can be used with only one'
                             'label classification, please specify label to'
                             'be predicted by --label option.')
        train_iter = BalancedSerialIterator(
            train, args.batchsize, train.features[:, -1], ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))
    val_iter = I.SerialIterator(val, args.batchsize,
                                repeat=False, shuffle=False)

    classifier = Classifier(predictor_,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=args.gpu)

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(
        train_iter, optimizer, device=args.gpu, converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(E.Evaluator(val_iter, classifier,
                               device=args.gpu, converter=concat_mols))
    trainer.extend(E.LogReport())

    eval_mode = args.eval_mode
    if eval_mode == 0:
        trainer.extend(E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time']))
    elif eval_mode == 1:
        train_eval_iter = I.SerialIterator(train, args.batchsize,
                                           repeat=False, shuffle=False)
        trainer.extend(ROCAUCEvaluator(
            train_eval_iter, classifier, eval_func=predictor_,
            device=args.gpu, converter=concat_mols, name='train',
            pos_labels=1, ignore_labels=-1, raise_value_error=False))
        # extension name='validation' is already used by `Evaluator`,
        # instead extension name `val` is used.
        trainer.extend(ROCAUCEvaluator(
            val_iter, classifier, eval_func=predictor_,
            device=args.gpu, converter=concat_mols, name='val',
            pos_labels=1, ignore_labels=-1))
        trainer.extend(E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'train/main/roc_auc',
            'validation/main/loss', 'validation/main/accuracy',
            'val/main/roc_auc', 'elapsed_time']))
    else:
        raise ValueError('Invalid accfun_mode {}'.format(eval_mode))
    trainer.extend(E.ProgressBar(update_interval=10))
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    config = {'method': args.method,
              'conv_layers': args.conv_layers,
              'unit_num': args.unit_num,
              'labels': args.label}

    with open(os.path.join(args.out, 'config.json'), 'w') as o:
        o.write(json.dumps(config))

    classifier.save_pickle(os.path.join(args.out, args.model_filename),
                           protocol=args.protocol)
Beispiel #16
0
def main():
    label_names = D.get_tox21_label_names()

    parser = argparse.ArgumentParser(
        description='Predict with a trained model.')
    parser.add_argument('--in-dir',
                        '-i',
                        type=str,
                        default='result',
                        help='Path to the result directory of the training '
                        'script.')
    parser.add_argument('--trainer-snapshot',
                        '-s',
                        type=str,
                        default='',
                        help='Path to the snapshot file of the Chainer '
                        'trainer from which serialized model parameters '
                        'are extracted. If it is not specified, this '
                        'script searches the training result directory '
                        'for the latest snapshot, assuming that '
                        'the naming convension of snapshot files is '
                        '`snapshot_iter_N` where N is the number of '
                        'iterations, which is the default configuration '
                        'of Chainer.')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code in CPU.')
    args = parser.parse_args()

    with open(os.path.join(args.in_dir, 'config.json'), 'r') as i:
        config = json.loads(i.read())

    method = config['method']
    labels = config['labels']
    if labels:
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        class_num = len(label_names)

    _, test, _ = data.load_dataset(method, labels)
    y_test = test.get_datasets()[-1]

    # Load pretrained model
    predictor_ = predictor.build_predictor(method, config['unit_num'],
                                           config['conv_layers'], class_num)
    snapshot_file = args.trainer_snapshot
    if not snapshot_file:
        snapshot_file = _find_latest_snapshot(args.in_dir)
    print('Loading pretrained model parameters from {}'.format(snapshot_file))
    chainer.serializers.load_npz(snapshot_file, predictor_,
                                 'updater/model:main/predictor/')

    clf = Classifier(predictor=predictor_,
                     device=args.gpu,
                     lossfun=F.sigmoid_cross_entropy,
                     metrics_fun=F.binary_accuracy)

    # ---- predict ---
    print('Predicting...')

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features which
    # consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_pred(x):
        x_array = cuda.to_cpu(x.data)
        return numpy.where(x_array > 0, 1, 0)

    y_pred = clf.predict(test,
                         converter=extract_inputs,
                         postprocess_fn=postprocess_pred)
    y_proba = clf.predict_proba(test,
                                converter=extract_inputs,
                                postprocess_fn=F.sigmoid)

    # `predict` method returns the prediction label (0: non-toxic, 1:toxic)
    print('y_pread.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))
    # `predict_proba` method returns the probability to be toxic
    print('y_proba.shape = {}, y_proba[:5, 0] = {}'.format(
        y_proba.shape, y_proba[:5, 0]))
    # --- predict end ---

    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]

    if y_pred.shape != y_test.shape:
        raise RuntimeError('The shape of the prediction result array and '
                           'that of the ground truth array do not match. '
                           'Contents of the input directory may be corrupted '
                           'or modified.')

    statistics = []
    for t, p in six.moves.zip(y_test.T, y_pred.T):
        idx = t != -1
        n_correct = (t[idx] == p[idx]).sum()
        n_total = len(t[idx])
        accuracy = float(n_correct) / n_total
        statistics.append([n_correct, n_total, accuracy])

    print('{:>6} {:>8} {:>8} {:>8}'.format('TaskID', 'Correct', 'Total',
                                           'Accuracy'))
    for idx, (n_correct, n_total, accuracy) in enumerate(statistics):
        print('task{:>2} {:>8} {:>8} {:>8.4f}'.format(idx, n_correct, n_total,
                                                      accuracy))

    prediction_result_file = 'prediction.npz'
    print('Save prediction result to {}'.format(prediction_result_file))
    numpy.savez_compressed(prediction_result_file, y_pred)

    # --- evaluate ---
    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            clf,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    rocauc_result = ROCAUCEvaluator(test_iterator,
                                    clf,
                                    converter=concat_mols,
                                    device=args.gpu,
                                    eval_func=predictor_,
                                    name='test',
                                    ignore_labels=-1)()
    print('ROCAUC Evaluation result: ', rocauc_result)
def main():
    # Supported preprocessing/network list
    method_list = [
        'nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'relgat'
    ]
    label_names = D.get_tox21_label_names()
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp',
                        help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--iterator-type',
                        type=str,
                        choices=iterator_type,
                        default='serial',
                        help='iterator type. If `balanced` '
                        'is specified, data is sampled to take same number of'
                        'positive/negative labels during training.')
    parser.add_argument('--eval-mode',
                        type=int,
                        default=1,
                        help='Evaluation mode.'
                        '0: only binary_accuracy is calculated.'
                        '1: binary_accuracy and ROC-AUC score is calculated')
    parser.add_argument('--conv-layers',
                        '-c',
                        type=int,
                        default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=32,
                        help='batch size')
    parser.add_argument(
        '--device',
        type=str,
        default='-1',
        help='Device specifier. Either ChainerX device specifier or an '
        'integer. If non-negative integer, CuPy arrays with specified '
        'device id are used. If negative integer, NumPy arrays are used')
    parser.add_argument('--out',
                        '-o',
                        type=str,
                        default='result',
                        help='path to output directory')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num',
                        '-u',
                        type=int,
                        default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--protocol',
                        type=int,
                        default=2,
                        help='protocol version for pickle')
    parser.add_argument('--model-filename',
                        type=str,
                        default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data to be parsed from parser.'
                        '-1 indicates to parse all data.')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, _ = data.load_dataset(method, labels, num_data=args.num_data)

    # Network
    predictor_ = set_up_predictor(method, args.unit_num, args.conv_layers,
                                  class_num)

    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can be used with only one'
                             'label classification, please specify label to'
                             'be predicted by --label option.')
        train_iter = BalancedSerialIterator(train,
                                            args.batchsize,
                                            train.features[:, -1],
                                            ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))

    device = chainer.get_device(args.device)
    classifier = Classifier(predictor_,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=device)

    extensions_list = []
    eval_mode = args.eval_mode
    if eval_mode == 1:
        train_eval_iter = I.SerialIterator(train,
                                           args.batchsize,
                                           repeat=False,
                                           shuffle=False)

        extensions_list.append(
            ROCAUCEvaluator(train_eval_iter,
                            classifier,
                            eval_func=predictor_,
                            device=device,
                            converter=concat_mols,
                            name='train',
                            pos_labels=1,
                            ignore_labels=-1,
                            raise_value_error=False))
        # extension name='validation' is already used by `Evaluator`,
        # instead extension name `val` is used.
        val_iter = I.SerialIterator(val,
                                    args.batchsize,
                                    repeat=False,
                                    shuffle=False)
        extensions_list.append(
            ROCAUCEvaluator(val_iter,
                            classifier,
                            eval_func=predictor_,
                            device=device,
                            converter=concat_mols,
                            name='val',
                            pos_labels=1,
                            ignore_labels=-1))

    run_train(classifier,
              train_iter,
              valid=val,
              batch_size=args.batchsize,
              epoch=args.epoch,
              out=args.out,
              device=device,
              converter=concat_mols,
              extensions_list=extensions_list,
              resume_path=args.resume)

    # frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    # trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))
    # trainer.run()

    config = {
        'method': args.method,
        'conv_layers': args.conv_layers,
        'unit_num': args.unit_num,
        'labels': args.label
    }
    save_json(os.path.join(args.out, 'config.json'), config)

    classifier.save_pickle(os.path.join(args.out, args.model_filename),
                           protocol=args.protocol)
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)


#    # Load the standard scaler parameters, if necessary.
#    if args.scale == 'standardize':
#        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
#        print('Loading scaler parameters from {}.'.format(scaler_path))
#        with open(scaler_path, mode='rb') as f:
#            scaler = pickle.load(f)
#    else:
#        print('No standard scaling was selected.')
#        scaler = None

# Model-related data is stored this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print("model_path=" + model_path)
    print('Loading model weights from {}...'.format(model_path))

    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Proposed by Ishiguro
    # ToDo: consider go/no-go with following modification
    # Re-load the best-validation score snapshot
    serializers.load_npz(
        os.path.join(model_dir, "best_val_" + model_filename[task_type]),
        model)

    #    # Replace the default predictor with one that scales the output labels.
    #    scaled_predictor = ScaledGraphConvPredictor(model.predictor)
    #    scaled_predictor.scaler = scaler
    #    model.predictor = scaled_predictor

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            model,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Proposed by Ishiguro: add more stats
    # ToDo: considre go/no-go with the following modification

    if task_type == 'regression':
        # loss = cuda.to_cpu(numpy.array(eval_result['main/loss']))
        # eval_result['main/loss'] = loss

        # convert to native values..
        for k, v in eval_result.items():
            eval_result[k] = float(v)

        save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)
    elif task_type == "classification":
        # For Classifier, we do not equip the model with ROC-AUC evalation function
        # use a seperate ROC-AUC Evaluator here
        rocauc_result = ROCAUCEvaluator(test_iterator,
                                        model,
                                        converter=concat_mols,
                                        device=args.gpu,
                                        eval_func=model.predictor,
                                        name='test',
                                        ignore_labels=-1)()
        print('ROCAUC Evaluation result: ', rocauc_result)
        save_json(os.path.join(args.in_dir, 'eval_result.json'), rocauc_result)
    else:
        pass

    # Save the evaluation results.
    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)
Beispiel #19
0
 def setup_class(cls):
     cls.link = Classifier(links.Linear(10, 3))
     cls.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)
Beispiel #20
0
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'attention']
    label_names = D.get_tox21_label_names()
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp',
                        help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--iterator-type',
                        type=str,
                        choices=iterator_type,
                        default='serial',
                        help='iterator type. If `balanced` '
                        'is specified, data is sampled to take same number of'
                        'positive/negative labels during training.')
    parser.add_argument('--eval-mode',
                        type=int,
                        default=1,
                        help='Evaluation mode.'
                        '0: only binary_accuracy is calculated.'
                        '1: binary_accuracy and ROC-AUC score is calculated')
    parser.add_argument('--conv-layers',
                        '-c',
                        type=int,
                        default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=32,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code in CPU.')
    parser.add_argument('--out',
                        '-o',
                        type=str,
                        default='result',
                        help='path to output directory')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num',
                        '-u',
                        type=int,
                        default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--protocol',
                        type=int,
                        default=2,
                        help='protocol version for pickle')
    parser.add_argument('--model-filename',
                        type=str,
                        default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data to be parsed from parser.'
                        '-1 indicates to parse all data.')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    print("labels : ", labels)
    print("num_data : ", args.num_data)
    train, val, _ = data.load_dataset(method, labels, num_data=args.num_data)
    #train data : 11757
    #len(train[0]) : 3
    #len(train[0][0]) : 34
    #len(train[0][1]) : 34
    #len(train[0][1][0]) : 34
    #len(train[0][1]) : 12 ?
    #import pdb;pdb.set_trace()
    #Network
    predictor_ = predictor.build_predictor(method, args.unit_num,
                                           args.conv_layers, class_num)

    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can be used with only one'
                             'label classification, please specify label to'
                             'be predicted by --label option.')
        train_iter = BalancedSerialIterator(train,
                                            args.batchsize,
                                            train.features[:, -1],
                                            ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))
    val_iter = I.SerialIterator(val,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)

    classifier = Classifier(predictor_,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=args.gpu)

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(
        E.Evaluator(val_iter,
                    classifier,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.LogReport())

    eval_mode = args.eval_mode
    if eval_mode == 0:
        trainer.extend(
            E.PrintReport([
                'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
                'validation/main/accuracy', 'elapsed_time'
            ]))
    elif eval_mode == 1:
        train_eval_iter = I.SerialIterator(train,
                                           args.batchsize,
                                           repeat=False,
                                           shuffle=False)
        trainer.extend(
            ROCAUCEvaluator(train_eval_iter,
                            classifier,
                            eval_func=predictor_,
                            device=args.gpu,
                            converter=concat_mols,
                            name='train',
                            pos_labels=1,
                            ignore_labels=-1,
                            raise_value_error=False))
        # extension name='validation' is already used by `Evaluator`,
        # instead extension name `val` is used.
        trainer.extend(
            ROCAUCEvaluator(val_iter,
                            classifier,
                            eval_func=predictor_,
                            device=args.gpu,
                            converter=concat_mols,
                            name='val',
                            pos_labels=1,
                            ignore_labels=-1))
        trainer.extend(
            E.PrintReport([
                'epoch', 'main/loss', 'main/accuracy', 'train/main/roc_auc',
                'validation/main/loss', 'validation/main/accuracy',
                'val/main/roc_auc', 'elapsed_time'
            ]))
    else:
        raise ValueError('Invalid accfun_mode {}'.format(eval_mode))
    trainer.extend(E.ProgressBar(update_interval=10))
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    config = {
        'method': args.method,
        'conv_layers': args.conv_layers,
        'unit_num': args.unit_num,
        'labels': args.label
    }

    with open(os.path.join(args.out, 'config.json'), 'w') as o:
        o.write(json.dumps(config))

    classifier.save_pickle(os.path.join(args.out, args.model_filename),
                           protocol=args.protocol)
Beispiel #21
0
 def test_invalid_label_key_type(self):
     with pytest.raises(TypeError):
         Classifier(links.Linear(10, 3), label_key=None)
Beispiel #22
0
def main():
    args = parse_arguments()
    generate_drug_list = True if args.generate_drug_list == 'True' else False

    if args.label:
        labels = args.label
        # class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        label_arr = np.asarray(label_list, dtype=np.int32)
        return label_arr

    # Apply a preprocessor to the dataset.
    logging.info('Preprocess test dataset...')
    preprocessor = preprocess_method_dict['ggnn']()
    parser = CSVFileParserForPair(preprocessor,
                                  postprocess_label=postprocess_label,
                                  labels=labels,
                                  smiles_cols=['smiles_1', 'smiles_2'])
    test_dict = parser.parse(args.test_datafile,
                             return_smiles_pair_original=True)
    test = test_dict['dataset']
    # test_smiles_pairs = test_dict['smiles_pair_original']
    from chainer.iterators import SerialIterator
    test_iter = SerialIterator(test, 1, repeat=False, shuffle=False)

    out = 'output' + '/' + args.out
    model_path = os.path.join(out, args.model_filename)
    # `load_pickle` is static method, call from Class to get an instance
    print('model_path: {}'.format(model_path))
    model = Classifier.load_pickle(model_path, args.gpu)

    if args.gpu >= 0:
        model.to_gpu(args.gpu)
    else:
        model.to_cpu()

    snapshot_path = os.path.join(out, args.snapshot)
    try:
        chainer.serializers.load_npz(snapshot_path, model)
    except KeyError as e:
        print(e)

    evaluator = MyEvaluator(
        test_iter,
        model,
        converter=concat_mols,
        device=args.gpu,
        eval_func=model.predictor.predict,
        # mediate_func=models.predictor.mediate_output,
        name='test',
        ignore_labels=-1)
    e1_total, e2_total = evaluator.generate_representations()
    y_total, t_total = evaluator.generate_y_and_t()

    # print('test_datafile: {}'.format(args.test_datafile))
    test_filename = os.path.basename(args.test_datafile).split('.')[0]
    # print('test_filename: {}'.format(test_filename))
    dst_repre_filename = test_filename + '_e' + '.csv'
    dst_repre_filepath = os.path.join(out, dst_repre_filename)
    add_representations(args.test_datafile,
                        dst_repre_filepath,
                        e1_total,
                        e2_total,
                        generate_drug_list=generate_drug_list)

    dst_filename = test_filename + '_e_y' + '.csv'
    dst_filepath = os.path.join(out, dst_filename)
    add_reprensentations_and_y(args.test_datafile, dst_filepath, e1_total,
                               e2_total, y_total)

    perf_dict = dict()
    for metric in [
            'roc_auc', 'prc_auc', 'accuracy', 'precision', 'recall', 'f1'
    ]:
        result = evaluator.compuate(metric=metric)
        perf_dict[metric] = result
        print('{}: {}'.format(metric, result))
    with open(os.path.join(ROOT_PATH, 'eval_result.json'), 'w') as f:
        json.dump(perf_dict, f)

    def eval_func(atoms_1, adj_1, atoms_2, adj_2, label):
        sample = [
            (atoms_1, adj_1),
            (atoms_2, adj_2),
        ]
        sample = concat_mols(sample)
        atoms_1, adj_1 = sample[0]
        atoms_2, adj_2 = sample[1]
        print(atoms_1, adj_1)
        print('shape 1:', atoms_1.shape, adj_1.shape)
        print('shape 2:', atoms_2.shape, adj_2.shape)
        pred, _ = model.predictor.predict(atoms_1, adj_1, atoms_2, adj_2)
        return pred

    calculator = IntegratedGradientsCalculator(
        model.predictor,
        steps=5,
        eval_fun=eval_func,
        target_extractor=VariableMonitorLinkHook(
            model.predictor.graph_conv.embed, timing='post'),
        device=args.gpu)

    M = 1
    # 2. compute
    # saliency_samples_vanilla = calculator.compute(
    #     test, M=1, converter=concat_mols)
    # saliency_samples_smooth = calculator.compute(
    #     test, M=M, converter=concat_mols, noise_sampler=GaussianNoiseSampler())
    saliency_samples_bayes = calculator.compute(test,
                                                M=M,
                                                converter=concat_mols,
                                                train=True)

    visualizer = SmilesVisualizer()

    smiles = list(pd.read_csv(args.test_datafile, index_col=0)['smiles_2'])

    # from IPython.display import display, HTML

    def sv_visualize(i, ratio, method, view):
        # saliency_vanilla = calculator.aggregate(
        #     saliency_samples_vanilla, ch_axis=3, method=method)
        # saliency_smooth = calculator.aggregate(
        #     saliency_samples_smooth, ch_axis=3, method=method)
        saliency_bayes = calculator.aggregate(saliency_samples_bayes,
                                              ch_axis=3,
                                              method=method)

        scaler = abs_max_scaler
        if view == 'view':
            # svg_vanilla = visualizer.visualize(saliency_vanilla[i], smiles[i], visualize_ratio=ratio, scaler=scaler)
            # svg_smooth = visualizer.visualize(saliency_smooth[i], smiles[i], visualize_ratio=ratio, scaler=scaler)
            svg_bayes = visualizer.visualize(saliency_bayes[i],
                                             smiles[i],
                                             visualize_ratio=ratio,
                                             scaler=scaler)
            # display(svg_bayes)
        elif view == 'save':
            if not os.path.exists('results'):
                os.makedirs('results')
            # visualizer.visualize(saliency_vanilla[i], smiles[i], visualize_ratio=ratio, scaler=scaler,
            #                      save_filepath='results/{}_vanilla.png'.format(i))
            # visualizer.visualize(saliency_smooth[i], smiles[i], visualize_ratio=ratio, scaler=scaler,
            #                      save_filepath='results/{}_smooth.png'.format(i))
            visualizer.visualize(
                saliency_bayes[i],
                smiles[i],
                visualize_ratio=ratio,
                scaler=scaler,
                save_filepath='results/{}_bayes.svg'.format(i))
            print('saved {}-th result!'.format(i))
        else:
            print(view, 'not supported')

    sv_visualize(i=2, ratio=0.7, method='raw', view='save')
Beispiel #23
0
 def test_predict_cpu(self):
     clf = Classifier(self.predictor)
     actual_t = clf.predict(self.x)
     assert actual_t.shape == (3, )
     assert actual_t.dtype == numpy.int32
     assert numpy.alltrue(actual_t == self.t)
Beispiel #24
0
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers

    task_type = molnet_default_config[dataset_name]['task_type']
    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }

    print('Using dataset: {}...'.format(dataset_name))

    # Set up some useful variables that will be used later on.
    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Load the train and validation parts of the dataset.
    filenames = [
        dataset_part_filename(p, num_data) for p in ['train', 'valid']
    ]

    paths = [os.path.join(cache_dir, f) for f in filenames]
    if all([os.path.exists(path) for path in paths]):
        dataset_parts = []
        for path in paths:
            print('Loading cached dataset from {}.'.format(path))
            dataset_parts.append(NumpyTupleDataset.load(path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data, labels,
                                                method, cache_dir)
    train, valid = dataset_parts[0], dataset_parts[1]

    # Scale the label values, if necessary.
    scaler = None
    if args.scale == 'standardize':
        if task_type == 'regression':
            print('Applying standard scaling to the labels.')
            scaler = fit_scaler(dataset_parts)
        else:
            print('Label scaling is not available for classification tasks.')
    else:
        print('No label scaling was selected.')

    # Set up the predictor.
    predictor = set_up_predictor(method,
                                 n_unit,
                                 conv_layers,
                                 class_num,
                                 label_scaler=scaler)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid,
                                          args.batchsize,
                                          repeat=False,
                                          shuffle=False)

    # Load metrics for the current dataset.
    metrics = molnet_default_config[dataset_name]['metrics']
    metrics_fun = {
        k: v
        for k, v in metrics.items() if isinstance(v, types.FunctionType)
    }
    loss_fun = molnet_default_config[dataset_name]['loss']

    device = chainer.get_device(args.device)
    if task_type == 'regression':
        model = Regressor(predictor,
                          lossfun=loss_fun,
                          metrics_fun=metrics_fun,
                          device=device)
    elif task_type == 'classification':
        model = Classifier(predictor,
                           lossfun=loss_fun,
                           metrics_fun=metrics_fun,
                           device=device)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    # Save model-related output to this directory.
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=device,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)
    trainer.extend(
        E.Evaluator(valid_iter, model, device=device, converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    # TODO: consider go/no-go of the following block
    # # (i) more reporting for val/evalutaion
    # # (ii) best validation score snapshot
    # if task_type == 'regression':
    #     metric_name_list = list(metrics.keys())
    #     if 'RMSE' in metric_name_list:
    #         trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]),
    #                        trigger=training.triggers.MinValueTrigger('validation/main/RMSE'))
    #     elif 'MAE' in metric_name_list:
    #         trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]),
    #                        trigger=training.triggers.MinValueTrigger('validation/main/MAE'))
    #     else:
    #         print("[WARNING] No validation metric defined?")
    #
    # elif task_type == 'classification':
    #     train_eval_iter = iterators.SerialIterator(
    #         train, args.batchsize, repeat=False, shuffle=False)
    #     trainer.extend(ROCAUCEvaluator(
    #         train_eval_iter, predictor, eval_func=predictor,
    #         device=args.gpu, converter=concat_mols, name='train',
    #         pos_labels=1, ignore_labels=-1, raise_value_error=False))
    #     # extension name='validation' is already used by `Evaluator`,
    #     # instead extension name `val` is used.
    #     trainer.extend(ROCAUCEvaluator(
    #         valid_iter, predictor, eval_func=predictor,
    #         device=args.gpu, converter=concat_mols, name='val',
    #         pos_labels=1, ignore_labels=-1, raise_value_error=False))
    #
    #     trainer.extend(E.snapshot_object(
    #         model, "best_val_" + model_filename[task_type]),
    #         trigger=training.triggers.MaxValueTrigger('val/main/roc_auc'))
    # else:
    #     raise NotImplementedError(
    #         'Not implemented task_type = {}'.format(task_type))

    trainer.extend(AutoPrintReport())
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)
Beispiel #25
0
def main():
    # Parse the arguments.
    args = parse_arguments()
    args.out = os.path.join(args.out, args.method)
    save_args(args, args.out)

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label_float(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)
    def postprocess_label_int(label_list):
        return numpy.asarray(label_list, dtype=numpy.int64)

    # Apply a preprocessor to the dataset.
    if args.train:
    ## training data
        fn,ext = os.path.splitext(args.train)
        if ext==".npz":
            print('Loading training dataset...')
            train = NumpyTupleDataset.load(args.train)
        else:
            print('Preprocessing training dataset...')
            preprocessor = preprocess_method_dict[args.method]()
            if args.classification:
                parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label_int,labels=labels, smiles_col='SMILES')
            else:
                parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label_float,labels=labels, smiles_col='SMILES')
            train = parser.parse(args.train)['dataset']
            NumpyTupleDataset.save(os.path.join(args.out,os.path.split(fn)[1]), train)        
        # Scale the label values, if necessary.
        if args.scale == 'standardize':
            scaler = StandardScaler()
            scaler.fit(train.get_datasets()[-1])
        else:
            scaler = None

    ## test data
    fn,ext = os.path.splitext(args.val)
    if ext==".npz":
        print('Loading test dataset...')
        test = NumpyTupleDataset.load(args.val)
    else:
        print('Preprocessing test dataset...')
        preprocessor = preprocess_method_dict[args.method]()
        if args.classification:
            parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label_int,labels=labels, smiles_col='SMILES')
        else:
            parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label_float,labels=labels, smiles_col='SMILES')
        test = parser.parse(args.val)['dataset']
        NumpyTupleDataset.save(os.path.join(args.out,os.path.split(fn)[1]), test)


    # Set up the model.
    device = chainer.get_device(args.device)
    converter = converter_method_dict[args.method]
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    if args.classification:
        if args.load_model:
            model = Classifier.load_pickle(args.load_model, device=device)
            print("model file loaded: ",args.load_model)
        else:
            predictor = set_up_predictor(args.method, args.unit_num, args.conv_layers, class_num)
            model = Classifier(predictor,
                                    lossfun=F.sigmoid_cross_entropy,
                                    metrics_fun=F.binary_accuracy,
                                    device=device)
    else:
        if args.load_model:
            model = Regressor.load_pickle(args.load_model, device=device)
            print("model file loaded: ",args.load_model)
        else:
            predictor = set_up_predictor(
                args.method+args.method_suffix, args.unit_num,
                args.conv_layers, class_num, label_scaler=scaler)
            model = Regressor(predictor, lossfun=F.mean_squared_error,
                            metrics_fun=metrics_fun, device=device)

    if args.train:
        if args.balanced_iter:
            train = BalancedSerialIterator(train, args.batchsize, train.features[:, -1], ignore_labels=-1)
            train.show_label_stats()
            
        print('Training...')
        log_keys = ['main/mae','main/rmse','validation/main/mae','validation/main/rmse','validation/main/roc_auc']
        extensions_list = [extensions.PlotReport(log_keys, 'iteration', trigger=(100, 'iteration'), file_name='loss.png')]
        if args.eval_roc and args.classification:
            extensions_list.append(ROCAUCEvaluator(
                        test, model, eval_func=predictor,
                        device=device, converter=converter, name='validation',
                        pos_labels=1, ignore_labels=-1, raise_value_error=False))

        save_json(os.path.join(args.out, 'args.json'), vars(args))
        run_train(model, train, valid=test,
                batch_size=args.batchsize, epoch=args.epoch,
                out=args.out, extensions_list=extensions_list,
                device=device, converter=converter) #, resume_path=args.resume)

        # Save the model's parameters.
        model_path = os.path.join(args.out, args.model_filename)
        print('Saving the trained model to {}...'.format(model_path))
        if hasattr(model.predictor.graph_conv, 'reset_state'):
            model.predictor.graph_conv.reset_state()
        model.save_pickle(model_path, protocol=args.protocol)

    ## prediction
    it = SerialIterator(test, args.batchsize, repeat=False, shuffle=False)
    result = []
    for batch in it:
        in_arrays = convert._call_converter(converter, batch, device)
        with chainer.using_config('train', False), chainer.function.no_backprop_mode():
            if isinstance(in_arrays, tuple):
                res = model(*in_arrays)
            elif isinstance(in_arrays, dict):
                res = model(**in_arrays)
            else:
                res = model(in_arrays)
        result.extend(model.y.array.get())

    numpy.savetxt(os.path.join(args.out,"result.csv"), numpy.array(result))

    eval_result = Evaluator(it, model, converter=converter,device=device)()
    print('Evaluation result: ', eval_result)