Example #1
def _test_roc_auc_evaluator_with_labels(data1):
    """test `pos_labels` and `ignore_labels` behavior"""

    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data1)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = ROCAUCEvaluator(
        iterator, predictor, name='val',
        pos_labels=[1, 2], ignore_labels=-1,
    )

    # --- test evaluate ---
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    expected_roc_auc = 0.75
    # print('observation ', observation)
    assert observation['target/roc_auc'] == expected_roc_auc

    # --- test __call__ ---
    result = evaluator()
    # print('result ', result)
    assert result['val/main/roc_auc'] == expected_roc_auc
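
(These tests assume a `DummyPredictor` and a `data1` fixture defined elsewhere in the test module. A minimal sketch of what they could look like is given below; the actual fixture in the test suite may differ, and the values here are an assumption chosen so that the ROC-AUC under `pos_labels=[1, 2]` and `ignore_labels=-1` comes out to 0.75.)

import chainer
import numpy


class DummyPredictor(chainer.Chain):
    """Identity predictor: returns the raw scores unchanged."""

    def __call__(self, x):
        return chainer.Variable(x)


# Hypothetical fixture: scores `x` and labels `t`. Labels 1 and 2 are both
# treated as positive and the two `-1` entries are ignored, leaving positive
# scores (0.8, 0.3) and negative scores (0.1, 0.5) -> ROC-AUC = 3/4 = 0.75.
x = numpy.array([[0.8], [0.3], [0.1], [0.5], [0.9], [0.2]],
                dtype=numpy.float32)
t = numpy.array([[1], [2], [0], [0], [-1], [-1]], dtype=numpy.int32)
data1 = (x, t)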
Example #2
def _test_roc_auc_evaluator_raise_error(data, raise_value_error=True):

    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = ROCAUCEvaluator(iterator,
                                predictor,
                                name='train',
                                pos_labels=1,
                                ignore_labels=None,
                                raise_value_error=raise_value_error)
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    return observation['target/roc_auc']
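
(A sketch of how this helper might be exercised. `data_one_class` is a hypothetical fixture whose labels all belong to a single class, so ROC-AUC is undefined; the NaN fallback when `raise_value_error=False` is an assumption about the evaluator's behavior, not something confirmed here.)

import numpy
import pytest


def test_roc_auc_evaluator_raise_error(data_one_class):
    # With only one class present, computing ROC-AUC raises ValueError.
    with pytest.raises(ValueError):
        _test_roc_auc_evaluator_raise_error(data_one_class,
                                             raise_value_error=True)

    # With raise_value_error=False the evaluator is assumed to swallow the
    # error and report NaN instead of failing.
    roc_auc = _test_roc_auc_evaluator_raise_error(data_one_class,
                                                  raise_value_error=False)
    assert numpy.isnan(roc_auc)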
Example #3
def _test_roc_auc_evaluator_default_args(data0):

    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data0)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = ROCAUCEvaluator(
        iterator, predictor, name='train',
        pos_labels=1, ignore_labels=None
    )
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    expected_roc_auc = 0.75
    # print('observation ', observation)
    assert observation['target/roc_auc'] == expected_roc_auc

    # --- test __call__ ---
    result = evaluator()
    # print('result ', result)
    assert result['train/main/roc_auc'] == expected_roc_auc
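
(For reference, the expected value of 0.75 is just the ordinary ROC-AUC of the raw scores against the binary labels. With a hypothetical `data0`-style fixture it can be reproduced directly with scikit-learn:)

import numpy
from sklearn.metrics import roc_auc_score

# Two positives (0.8, 0.3) and two negatives (0.1, 0.5): three of the four
# positive/negative score pairs are ordered correctly -> AUC = 3/4 = 0.75.
y_true = numpy.array([1, 1, 0, 0])
y_score = numpy.array([0.8, 0.3, 0.1, 0.5])
print(roc_auc_score(y_true, y_score))  # 0.75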
Example #4
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)

    # Model-related data is stored in this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print("model_path=" + model_path)
    print('Loading model weights from {}...'.format(model_path))

    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Re-load the best-validation score snapshot
    # serializers.load_npz(os.path.join(
    #     model_dir, "best_val_" + model_filename[task_type]), model)

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            model,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Add more stats
    if task_type == 'regression':
        # loss = cuda.to_cpu(numpy.array(eval_result['main/loss']))
        # eval_result['main/loss'] = loss

        # Convert to native Python values.
        for k, v in eval_result.items():
            eval_result[k] = float(v)

    elif task_type == "classification":
        # For Classifier, we do not equip the model with a ROC-AUC evaluation
        # function, so use a separate ROC-AUC Evaluator here.
        rocauc_result = ROCAUCEvaluator(test_iterator,
                                        model,
                                        converter=concat_mols,
                                        device=args.gpu,
                                        eval_func=model.predictor,
                                        name='test',
                                        ignore_labels=-1)()
        print('ROCAUC Evaluation result: ', rocauc_result)
        save_json(os.path.join(model_dir, 'rocauc_result.json'), rocauc_result)
    else:
        print('[WARNING] unknown task_type {}.'.format(task_type))

    # Save the evaluation results.
    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)
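
(`save_json` is imported elsewhere in the script. As a rough sketch it can be thought of as a thin wrapper around `json.dump`; this is an assumption about its behavior rather than the library's actual implementation, and it presumes the evaluation dict contains only JSON-serializable values, hence the float() conversion above.)

import json


def save_json(filepath, obj):
    # Minimal sketch: write the evaluation dict as pretty-printed JSON.
    with open(filepath, 'w') as f:
        json.dump(obj, f, indent=4)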
Example #5
def main():
    parser = argparse.ArgumentParser(
        description='Predict with a trained model.')
    parser.add_argument('--in-dir',
                        '-i',
                        type=str,
                        default='result',
                        help='Path to the result directory of the training '
                        'script.')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. A negative value indicates '
                        'not to use the GPU and to run the code on the CPU.')
    parser.add_argument('--model-filename',
                        type=str,
                        default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data points to be parsed from the '
                        'dataset. -1 indicates to parse all data.')
    args = parser.parse_args()

    with open(os.path.join(args.in_dir, 'config.json'), 'r') as i:
        config = json.loads(i.read())

    method = config['method']
    labels = config['labels']

    _, test, _ = data.load_dataset(method, labels, num_data=args.num_data)
    y_test = test.get_datasets()[-1]

    # Load pretrained model
    clf = Classifier.load_pickle(os.path.join(args.in_dir,
                                              args.model_filename),
                                 device=args.gpu)  # type: Classifier

    # ---- predict ---
    print('Predicting...')

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features which
    # consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_pred(x):
        x_array = cuda.to_cpu(x.data)
        return numpy.where(x_array > 0, 1, 0)

    y_pred = clf.predict(test,
                         converter=extract_inputs,
                         postprocess_fn=postprocess_pred)
    y_proba = clf.predict_proba(test,
                                converter=extract_inputs,
                                postprocess_fn=F.sigmoid)

    # `predict` method returns the prediction label (0: non-toxic, 1: toxic)
    print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))
    # `predict_proba` method returns the probability to be toxic
    print('y_proba.shape = {}, y_proba[:5, 0] = {}'.format(
        y_proba.shape, y_proba[:5, 0]))
    # --- predict end ---

    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]

    if y_pred.shape != y_test.shape:
        raise RuntimeError('The shape of the prediction result array and '
                           'that of the ground truth array do not match. '
                           'Contents of the input directory may be corrupted '
                           'or modified.')

    statistics = []
    for t, p in six.moves.zip(y_test.T, y_pred.T):
        idx = t != -1
        n_correct = (t[idx] == p[idx]).sum()
        n_total = len(t[idx])
        accuracy = float(n_correct) / n_total
        statistics.append([n_correct, n_total, accuracy])

    print('{:>6} {:>8} {:>8} {:>8}'.format('TaskID', 'Correct', 'Total',
                                           'Accuracy'))
    for idx, (n_correct, n_total, accuracy) in enumerate(statistics):
        print('task{:>2} {:>8} {:>8} {:>8.4f}'.format(idx, n_correct, n_total,
                                                      accuracy))

    prediction_result_file = 'prediction.npz'
    print('Save prediction result to {}'.format(prediction_result_file))
    numpy.savez_compressed(prediction_result_file, y_pred)

    # --- evaluate ---
    # To calculate loss/accuracy, we can use `Evaluator` and `ROCAUCEvaluator`
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            clf,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    rocauc_result = ROCAUCEvaluator(test_iterator,
                                    clf,
                                    converter=concat_mols,
                                    device=args.gpu,
                                    eval_func=clf.predictor,
                                    name='test',
                                    ignore_labels=-1)()
    print('ROCAUC Evaluation result: ', rocauc_result)
    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(rocauc_result, f)
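
(The `extract_inputs` converter above relies on `concat_mols` returning a tuple whose last element is the label array, so slicing off the last element leaves only the model inputs. The same idea with plain `concat_examples` and toy arrays, purely for illustration:)

import numpy
from chainer.dataset import concat_examples

# A toy batch of (x, t) pairs: concatenation yields (x_batch, t_batch),
# and slicing off the last element keeps only the inputs.
batch = [(numpy.ones(3, dtype=numpy.float32), numpy.int32(1)),
         (numpy.zeros(3, dtype=numpy.float32), numpy.int32(0))]
inputs = concat_examples(batch)[:-1]
print(len(inputs), inputs[0].shape)  # 1 (2, 3)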
Example #6
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)


#    # Load the standard scaler parameters, if necessary.
#    if args.scale == 'standardize':
#        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
#        print('Loading scaler parameters from {}.'.format(scaler_path))
#        with open(scaler_path, mode='rb') as f:
#            scaler = pickle.load(f)
#    else:
#        print('No standard scaling was selected.')
#        scaler = None

    # Model-related data is stored in this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print("model_path=" + model_path)
    print('Loading model weights from {}...'.format(model_path))

    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Proposed by Ishiguro
    # ToDo: consider go/no-go with following modification
    # Re-load the best-validation score snapshot
    serializers.load_npz(
        os.path.join(model_dir, "best_val_" + model_filename[task_type]),
        model)

    #    # Replace the default predictor with one that scales the output labels.
    #    scaled_predictor = ScaledGraphConvPredictor(model.predictor)
    #    scaled_predictor.scaler = scaler
    #    model.predictor = scaled_predictor

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            model,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Proposed by Ishiguro: add more stats
    # ToDo: consider go/no-go with the following modification

    if task_type == 'regression':
        # loss = cuda.to_cpu(numpy.array(eval_result['main/loss']))
        # eval_result['main/loss'] = loss

        # Convert to native Python values.
        for k, v in eval_result.items():
            eval_result[k] = float(v)

        save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)
    elif task_type == "classification":
        # For Classifier, we do not equip the model with a ROC-AUC evaluation
        # function, so use a separate ROC-AUC Evaluator here.
        rocauc_result = ROCAUCEvaluator(test_iterator,
                                        model,
                                        converter=concat_mols,
                                        device=args.gpu,
                                        eval_func=model.predictor,
                                        name='test',
                                        ignore_labels=-1)()
        print('ROCAUC Evaluation result: ', rocauc_result)
        save_json(os.path.join(args.in_dir, 'eval_result.json'), rocauc_result)
    else:
        pass

    # Save the evaluation results.
    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)
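
(The `task_type` lookup above comes from chainer-chemistry's MoleculeNet configuration table. A quick self-contained check of that lookup; the dataset names 'bbbp' and 'lipo' are assumptions about which keys are present in the table:)

from chainer_chemistry.datasets.molnet import molnet_default_config

# Each MoleculeNet dataset entry records, among other things, whether the
# task is a classification or a regression problem.
print(molnet_default_config['bbbp']['task_type'])  # e.g. 'classification'
print(molnet_default_config['lipo']['task_type'])  # e.g. 'regression'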
Example #7
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'nfpdrop', 'ggnndrop']
    label_names = D.get_tox21_label_names() + ['pyridine']
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp',
                        help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--iterator-type',
                        type=str,
                        choices=iterator_type,
                        default='serial',
                        help='iterator type. If `balanced` is specified, '
                        'data is sampled to take the same number of '
                        'positive/negative labels during training.')
    parser.add_argument('--conv-layers',
                        '-c',
                        type=int,
                        default=4,
                        help='number of convolution layers')
    parser.add_argument('--n-layers',
                        type=int,
                        default=1,
                        help='number of mlp layers')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=32,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. A negative value indicates '
                        'not to use the GPU and to run the code on the CPU.')
    parser.add_argument('--out',
                        '-o',
                        type=str,
                        default='result',
                        help='path to output directory')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num',
                        '-u',
                        type=int,
                        default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--dropout-ratio',
                        '-d',
                        type=float,
                        default=0.25,
                        help='dropout_ratio')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument('--num-train',
                        type=int,
                        default=-1,
                        help='number of training data to be used. '
                        'A negative value indicates using all training data.')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, test, train_smiles, val_smiles, test_smiles = data.load_dataset(
        method, labels)

    num_train = args.num_train  # 100
    if num_train > 0:
        # reduce size of train data
        seed = args.seed  # 0
        np.random.seed(seed)
        train_selected_label = np.random.permutation(np.arange(
            len(train)))[:num_train]
        print('num_train', num_train, len(train_selected_label),
              train_selected_label)
        train = NumpyTupleDataset(*train.features[train_selected_label, :])
    # Network
    predictor_ = predictor.build_predictor(method, args.unit_num,
                                           args.conv_layers, class_num,
                                           args.dropout_ratio, args.n_layers)

    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can be used only for '
                             'single-label classification. Please specify '
                             'the label to be predicted with the --label '
                             'option.')
        train_iter = BalancedSerialIterator(train,
                                            args.batchsize,
                                            train.features[:, -1],
                                            ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))
    val_iter = I.SerialIterator(val,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)
    classifier = L.Classifier(predictor_,
                              lossfun=F.sigmoid_cross_entropy,
                              accfun=F.binary_accuracy)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(
        E.Evaluator(val_iter,
                    classifier,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.LogReport())

    # --- ROCAUC Evaluator ---
    train_eval_iter = I.SerialIterator(train,
                                       args.batchsize,
                                       repeat=False,
                                       shuffle=False)
    trainer.extend(
        ROCAUCEvaluator(train_eval_iter,
                        classifier,
                        eval_func=predictor_,
                        device=args.gpu,
                        converter=concat_mols,
                        name='train'))
    trainer.extend(
        ROCAUCEvaluator(val_iter,
                        classifier,
                        eval_func=predictor_,
                        device=args.gpu,
                        converter=concat_mols,
                        name='val'))
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'train/main/roc_auc',
            'validation/main/loss', 'validation/main/accuracy',
            'val/main/roc_auc', 'elapsed_time'
        ]))

    trainer.extend(E.ProgressBar(update_interval=10))
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    with open(os.path.join(args.out, 'args.json'), 'w') as f:
        json.dump(vars(args), f, indent=4)
    chainer.serializers.save_npz(os.path.join(args.out, 'predictor.npz'),
                                 predictor_)
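
(The Classifier above pairs `F.sigmoid_cross_entropy` with `F.binary_accuracy` because Tox21 labels mark missing assay results with -1, which both functions skip. A small self-contained check of that ignore behavior; the concrete numbers are illustrative only:)

import numpy
import chainer.functions as F

# Tox21 labels use -1 for "unknown"; both sigmoid_cross_entropy and
# binary_accuracy ignore entries whose label is -1.
y = numpy.array([[2.0, -1.0, 0.5]], dtype=numpy.float32)  # raw logits
t = numpy.array([[1, 0, -1]], dtype=numpy.int32)          # -1 is ignored
print(F.binary_accuracy(y, t).data)        # 1.0 (only the first two count)
print(F.sigmoid_cross_entropy(y, t).data)  # mean loss over non-ignored entries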
Example #8
def main():
    label_names = D.get_tox21_label_names()

    parser = argparse.ArgumentParser(
        description='Predict with a trained model.')
    parser.add_argument('--in-dir',
                        '-i',
                        type=str,
                        default='result',
                        help='Path to the result directory of the training '
                        'script.')
    parser.add_argument('--trainer-snapshot',
                        '-s',
                        type=str,
                        default='',
                        help='Path to the snapshot file of the Chainer '
                        'trainer from which serialized model parameters '
                        'are extracted. If it is not specified, this '
                        'script searches the training result directory '
                        'for the latest snapshot, assuming that '
                        'the naming convention of snapshot files is '
                        '`snapshot_iter_N` where N is the number of '
                        'iterations, which is the default configuration '
                        'of Chainer.')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. A negative value indicates '
                        'not to use the GPU and to run the code on the CPU.')
    args = parser.parse_args()

    with open(os.path.join(args.in_dir, 'config.json'), 'r') as i:
        config = json.loads(i.read())

    method = config['method']
    labels = config['labels']
    if labels:
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        class_num = len(label_names)

    _, test, _ = data.load_dataset(method, labels)
    y_test = test.get_datasets()[-1]

    # Load pretrained model
    predictor_ = predictor.build_predictor(method, config['unit_num'],
                                           config['conv_layers'], class_num)
    snapshot_file = args.trainer_snapshot
    if not snapshot_file:
        snapshot_file = _find_latest_snapshot(args.in_dir)
    print('Loading pretrained model parameters from {}'.format(snapshot_file))
    chainer.serializers.load_npz(snapshot_file, predictor_,
                                 'updater/model:main/predictor/')

    clf = Classifier(predictor=predictor_,
                     device=args.gpu,
                     lossfun=F.sigmoid_cross_entropy,
                     metrics_fun=F.binary_accuracy)

    # ---- predict ---
    print('Predicting...')

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features which
    # consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_pred(x):
        x_array = cuda.to_cpu(x.data)
        return numpy.where(x_array > 0, 1, 0)

    y_pred = clf.predict(test,
                         converter=extract_inputs,
                         postprocess_fn=postprocess_pred)
    y_proba = clf.predict_proba(test,
                                converter=extract_inputs,
                                postprocess_fn=F.sigmoid)

    # `predict` method returns the prediction label (0: non-toxic, 1: toxic)
    print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))
    # `predict_proba` method returns the probability to be toxic
    print('y_proba.shape = {}, y_proba[:5, 0] = {}'.format(
        y_proba.shape, y_proba[:5, 0]))
    # --- predict end ---

    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]

    if y_pred.shape != y_test.shape:
        raise RuntimeError('The shape of the prediction result array and '
                           'that of the ground truth array do not match. '
                           'Contents of the input directory may be corrupted '
                           'or modified.')

    statistics = []
    for t, p in six.moves.zip(y_test.T, y_pred.T):
        idx = t != -1
        n_correct = (t[idx] == p[idx]).sum()
        n_total = len(t[idx])
        accuracy = float(n_correct) / n_total
        statistics.append([n_correct, n_total, accuracy])

    print('{:>6} {:>8} {:>8} {:>8}'.format('TaskID', 'Correct', 'Total',
                                           'Accuracy'))
    for idx, (n_correct, n_total, accuracy) in enumerate(statistics):
        print('task{:>2} {:>8} {:>8} {:>8.4f}'.format(idx, n_correct, n_total,
                                                      accuracy))

    prediction_result_file = 'prediction.npz'
    print('Save prediction result to {}'.format(prediction_result_file))
    numpy.savez_compressed(prediction_result_file, y_pred)

    # --- evaluate ---
    # To calculate loss/accuracy, we can use `Evaluator` and `ROCAUCEvaluator`
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            clf,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    rocauc_result = ROCAUCEvaluator(test_iterator,
                                    clf,
                                    converter=concat_mols,
                                    device=args.gpu,
                                    eval_func=predictor_,
                                    name='test',
                                    ignore_labels=-1)()
    print('ROCAUC Evaluation result: ', rocauc_result)
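
(`_find_latest_snapshot` is referenced above but defined elsewhere in the script. A hedged sketch of what such a helper could look like, assuming the default `snapshot_iter_N` naming convention mentioned in the argument help:)

import os
import re


def _find_latest_snapshot(result_dir):
    # Hypothetical helper: pick the `snapshot_iter_N` file with the largest N.
    pattern = re.compile(r'snapshot_iter_(\d+)$')
    candidates = []
    for name in os.listdir(result_dir):
        match = pattern.match(name)
        if match:
            candidates.append((int(match.group(1)), name))
    if not candidates:
        raise RuntimeError('No snapshot file found in {}'.format(result_dir))
    return os.path.join(result_dir, max(candidates)[1])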