def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels,
                           smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    test = dataset

    print('Predicting...')
    # Set up the regressor.
    device = chainer.get_device(args.device)
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=device)

    # Perform the prediction.
    print('Evaluating...')
    converter = converter_method_dict[args.method]
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            regressor,
                            converter=converter,
                            device=device)()
    print('Evaluation result: ', eval_result)

    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)
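Note: the CSVFileParser above expects a CSV file with a SMILES column plus one column per target label. A minimal sketch of a compatible input, with the hypothetical label column 'value1' (an assumption, not from the original):

# sample.csv (hypothetical input for the snippet above):
#
#     SMILES,value1
#     CCO,0.52
#     c1ccccc1,1.31
#
# parser.parse('sample.csv') preprocesses each molecule in the 'SMILES'
# column and attaches 'value1' (cast to float32 by postprocess_label) as
# the regression target, provided 'value1' was passed as the label.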
Example #2
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # number of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        scaled_t = scaler.transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_t, )))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test,
                               converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    # Invert the label scaling only when a scaler was attached to the predictor.
    original_t = scaler.inverse_transform(t) if scaler is not None else t

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): original_t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval, target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'.format(
            label_name, y_pred[:n_eval, target_label],
            original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate the mean absolute error for each label.
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        # Cast to a native float so that the value is JSON-serializable.
        eval_result[l] = float(mae[i])
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
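The scaler round-trip used above (fit on the labels, train on scaled values, invert at prediction time) can be exercised in isolation. A minimal sketch, assuming chainer_chemistry's StandardScaler and a toy label array (both assumptions, for illustration only):

import numpy
from chainer_chemistry.links.scaler.standard_scaler import StandardScaler

# Toy label column (hypothetical data).
t = numpy.asarray([[0.0], [1.0], [2.0]], dtype=numpy.float32)
scaler = StandardScaler()
scaler.fit(t)
scaled = scaler.transform(t)                 # roughly zero mean, unit variance
restored = scaler.inverse_transform(scaled)  # back to the original units
assert numpy.allclose(t, restored, atol=1e-5)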
Example #3
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers

    task_type = molnet_default_config[dataset_name]['task_type']
    model_filename = {'classification': 'classifier.pkl',
                      'regression': 'regressor.pkl'}

    print('Using dataset: {}...'.format(dataset_name))

    # Set up some useful variables that will be used later on.
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}_{}'.format(dataset_name,
                                                            method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_{}_all'.format(dataset_name,
                                                             method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Load the train and validation parts of the dataset.
    filenames = [dataset_part_filename(p, num_data)
                 for p in ['train', 'valid']]

    paths = [os.path.join(cache_dir, f) for f in filenames]
    if all([os.path.exists(path) for path in paths]):
        dataset_parts = []
        for path in paths:
            print('Loading cached dataset from {}.'.format(path))
            dataset_parts.append(NumpyTupleDataset.load(path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data, labels,
                                                method, cache_dir)
    train, valid = dataset_parts[0], dataset_parts[1]

    # Scale the label values, if necessary.
    scaler = None
    if args.scale == 'standardize':
        if task_type == 'regression':
            print('Applying standard scaling to the labels.')
            scaler = fit_scaler(dataset_parts)
        else:
            print('Label scaling is not available for classification tasks.')
    else:
        print('No label scaling was selected.')

    # Set up the predictor.
    predictor = set_up_predictor(method, n_unit, conv_layers, class_num,
                                 label_scaler=scaler)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False,
                                          shuffle=False)

    # Load metrics for the current dataset.
    metrics = molnet_default_config[dataset_name]['metrics']
    metrics_fun = {k: v for k, v in metrics.items()
                   if isinstance(v, types.FunctionType)}
    loss_fun = molnet_default_config[dataset_name]['loss']

    device = chainer.get_device(args.device)
    if task_type == 'regression':
        model = Regressor(predictor, lossfun=loss_fun,
                          metrics_fun=metrics_fun, device=device)
    elif task_type == 'classification':
        model = Classifier(predictor, lossfun=loss_fun,
                           metrics_fun=metrics_fun, device=device)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    # Save model-related output to this directory.
    if not os.path.exists(args.out):
        os.makedirs(args.out)
    save_json(os.path.join(args.out, 'args.json'), vars(args))
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set up the updater.
    converter = converter_method_dict[method]
    updater = training.StandardUpdater(train_iter, optimizer, device=device,
                                       converter=converter)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)
    trainer.extend(E.Evaluator(valid_iter, model, device=device,
                               converter=converter))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    # TODO: consider go/no-go of the following block
    # # (i) more reporting for validation/evaluation
    # # (ii) best validation score snapshot
    # if task_type == 'regression':
    #     metric_name_list = list(metrics.keys())
    #     if 'RMSE' in metric_name_list:
    #         trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]),
    #                        trigger=training.triggers.MinValueTrigger('validation/main/RMSE'))
    #     elif 'MAE' in metric_name_list:
    #         trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]),
    #                        trigger=training.triggers.MinValueTrigger('validation/main/MAE'))
    #     else:
    #         print("[WARNING] No validation metric defined?")
    #
    # elif task_type == 'classification':
    #     train_eval_iter = iterators.SerialIterator(
    #         train, args.batchsize, repeat=False, shuffle=False)
    #     trainer.extend(ROCAUCEvaluator(
    #         train_eval_iter, predictor, eval_func=predictor,
    #         device=args.gpu, converter=concat_mols, name='train',
    #         pos_labels=1, ignore_labels=-1, raise_value_error=False))
    #     # extension name='validation' is already used by `Evaluator`,
    #     # instead extension name `val` is used.
    #     trainer.extend(ROCAUCEvaluator(
    #         valid_iter, predictor, eval_func=predictor,
    #         device=args.gpu, converter=concat_mols, name='val',
    #         pos_labels=1, ignore_labels=-1, raise_value_error=False))
    #
    #     trainer.extend(E.snapshot_object(
    #         model, "best_val_" + model_filename[task_type]),
    #         trigger=training.triggers.MaxValueTrigger('val/main/roc_auc'))
    # else:
    #     raise NotImplementedError(
    #         'Not implemented task_type = {}'.format(task_type))

    trainer.extend(AutoPrintReport())
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)
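parse_arguments() is not shown in these snippets. A plausible sketch for this training script, reconstructed from the attributes read above (flag names and defaults are assumptions; the real implementation lives elsewhere in the repository):

import argparse

def parse_arguments():
    parser = argparse.ArgumentParser(description='Train a model on MoleculeNet.')
    parser.add_argument('--dataset', type=str, default='bbbp')
    parser.add_argument('--method', type=str, default='nfp')
    parser.add_argument('--label', type=str, default='')
    parser.add_argument('--conv-layers', type=int, default=4)
    parser.add_argument('--unit-num', type=int, default=16)
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--num-data', type=int, default=-1)
    parser.add_argument('--scale', type=str, default='standardize')
    parser.add_argument('--device', type=str, default='-1')
    parser.add_argument('--out', type=str, default='result')
    parser.add_argument('--protocol', type=int, default=2)
    return parser.parse_args()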
Example #4
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)

    # Model-related data is stored in this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print("model_path=" + model_path)
    print('Loading model weights from {}...'.format(model_path))

    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Re-load the best-validation score snapshot
    # serializers.load_npz(os.path.join(
    #     model_dir, "best_val_" + model_filename[task_type]), model)

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            model,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Add more statistics.
    if task_type == 'regression':
        # Convert numpy scalars to native Python floats so that the results
        # are JSON-serializable.
        for k, v in eval_result.items():
            eval_result[k] = float(v)

    elif task_type == "classification":
        # For Classifier, we do not equip the model with ROC-AUC evalation function
        # use a seperate ROC-AUC Evaluator here
        rocauc_result = ROCAUCEvaluator(test_iterator,
                                        model,
                                        converter=concat_mols,
                                        device=args.gpu,
                                        eval_func=model.predictor,
                                        name='test',
                                        ignore_labels=-1)()
        print('ROCAUC Evaluation result: ', rocauc_result)
        save_json(os.path.join(model_dir, 'rocauc_result.json'), rocauc_result)
    else:
        print('[WARNING] unknown task_type {}.'.format(task_type))

    # Save the evaluation results.
    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)
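save_json is used throughout but never defined in these snippets. A minimal stand-in (an assumption; the real helper may differ in formatting details) would be the following. Note that json.dump cannot serialize numpy scalars, which is why the regression branch above casts each value to a native float first:

import json

def save_json(filepath, params):
    # Hypothetical stand-in for the save_json helper used in these examples.
    with open(filepath, 'w') as f:
        json.dump(params, f, indent=4)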
Example #5
def main():
    # Parse the arguments.
    args = parse_arguments()
    device = args.gpu

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # number of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=device)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        original_t = dataset.get_datasets()[-1]
        if args.gpu >= 0:
            scaled_t = cuda.to_cpu(scaler.transform(
                cuda.to_gpu(original_t)))
        else:
            scaled_t = scaler.transform(original_t)

        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_t,)))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(
        test, converter=extract_inputs,
        postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=device)[-1]
    # Invert the label scaling only when a scaler was attached to the predictor.
    if scaler is not None:
        t = scaler.inverse_transform(t)
    original_t = cuda.to_cpu(t)

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({'y_pred_{}'.format(l): y_pred[:, i],
                        't_{}'.format(l): original_t[:, i], })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval, target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'.format(
            label_name, y_pred[:n_eval, target_label],
            original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
                            device=device)()
    print('Evaluation result: ', eval_result)
    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate the mean absolute error for each label.
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        # Cast to a native float so that the value is JSON-serializable.
        eval_result[l] = float(mae[i])
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
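split_dataset_random, used in both QM9 examples, splits a dataset into a randomly chosen prefix of first_size samples and the remainder; the fixed seed makes the split reproducible between the training and prediction scripts. A minimal sketch with a toy list (the data is an assumption):

from chainer.datasets import split_dataset_random

data = list(range(10))  # toy dataset, for illustration only
first, rest = split_dataset_random(data, first_size=7, seed=777)
assert len(first) == 7 and len(rest) == 3
# The snippet above discards the training part ('_') and keeps 'rest' as test.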
Example #6
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)


    # # Load the standard scaler parameters, if necessary.
    # if args.scale == 'standardize':
    #     scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
    #     print('Loading scaler parameters from {}.'.format(scaler_path))
    #     with open(scaler_path, mode='rb') as f:
    #         scaler = pickle.load(f)
    # else:
    #     print('No standard scaling was selected.')
    #     scaler = None

    # Model-related data is stored in this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print("model_path=" + model_path)
    print('Loading model weights from {}...'.format(model_path))

    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Proposed by Ishiguro.
    # TODO: consider go/no-go with the following modification.
    # Re-load the best-validation score snapshot.
    serializers.load_npz(
        os.path.join(model_dir, 'best_val_' + model_filename[task_type]),
        model)

    # # Replace the default predictor with one that scales the output labels.
    # scaled_predictor = ScaledGraphConvPredictor(model.predictor)
    # scaled_predictor.scaler = scaler
    # model.predictor = scaled_predictor

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            model,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Proposed by Ishiguro: add more statistics.
    # TODO: consider go/no-go with the following modification.

    if task_type == 'regression':
        # Convert numpy scalars to native Python floats so that the results
        # are JSON-serializable.
        for k, v in eval_result.items():
            eval_result[k] = float(v)

        save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)
    elif task_type == "classification":
        # For Classifier, we do not equip the model with ROC-AUC evalation function
        # use a seperate ROC-AUC Evaluator here
        rocauc_result = ROCAUCEvaluator(test_iterator,
                                        model,
                                        converter=concat_mols,
                                        device=args.gpu,
                                        eval_func=model.predictor,
                                        name='test',
                                        ignore_labels=-1)()
        print('ROCAUC Evaluation result: ', rocauc_result)
        save_json(os.path.join(args.in_dir, 'eval_result.json'), rocauc_result)
    else:
        print('[WARNING] Unknown task_type {}.'.format(task_type))

    # Save the evaluation results.
    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)
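The reload above works because snapshot_object saves parameters in NPZ format regardless of the '.pkl'-style file name, so serializers.load_npz can restore them into an identically constructed model. A minimal sketch of that round-trip with a stand-in link (the Linear layer and the path are assumptions):

import numpy
import chainer.links as L
from chainer import serializers

model = L.Linear(3, 2)  # stand-in for the Classifier/Regressor
serializers.save_npz('/tmp/best_val_model.npz', model)

model2 = L.Linear(3, 2)  # must be built with the same architecture
serializers.load_npz('/tmp/best_val_model.npz', model2)
assert numpy.allclose(model.W.array, model2.W.array)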
Example #7
def main():
    # Supported preprocessing/network list
    method_list = [
        'nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'relgat'
    ]
    label_names = D.get_tox21_label_names()
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp',
                        help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--iterator-type',
                        type=str,
                        choices=iterator_type,
                        default='serial',
                        help='iterator type. If `balanced` is specified, '
                        'data is sampled to take the same number of '
                        'positive/negative labels during training.')
    parser.add_argument('--eval-mode',
                        type=int,
                        default=1,
                        help='Evaluation mode. '
                        '0: only binary_accuracy is calculated. '
                        '1: binary_accuracy and ROC-AUC score are calculated.')
    parser.add_argument('--conv-layers',
                        '-c',
                        type=int,
                        default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=32,
                        help='batch size')
    parser.add_argument(
        '--device',
        type=str,
        default='-1',
        help='Device specifier. Either ChainerX device specifier or an '
        'integer. If non-negative integer, CuPy arrays with specified '
        'device id are used. If negative integer, NumPy arrays are used')
    parser.add_argument('--out',
                        '-o',
                        type=str,
                        default='result',
                        help='path to output directory')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num',
                        '-u',
                        type=int,
                        default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--protocol',
                        type=int,
                        default=2,
                        help='protocol version for pickle')
    parser.add_argument('--model-filename',
                        type=str,
                        default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data points to be parsed from the '
                        'parser. -1 indicates parsing all data.')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, _ = data.load_dataset(method, labels, num_data=args.num_data)

    # Network
    predictor_ = set_up_predictor(method, args.unit_num, args.conv_layers,
                                  class_num)

    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can only be used with '
                             'single-label classification; please specify the '
                             'label to be predicted with the --label option.')
        train_iter = BalancedSerialIterator(train,
                                            args.batchsize,
                                            train.features[:, -1],
                                            ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))

    device = chainer.get_device(args.device)
    classifier = Classifier(predictor_,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=device)

    extensions_list = []
    eval_mode = args.eval_mode
    if eval_mode == 1:
        train_eval_iter = I.SerialIterator(train,
                                           args.batchsize,
                                           repeat=False,
                                           shuffle=False)

        extensions_list.append(
            ROCAUCEvaluator(train_eval_iter,
                            classifier,
                            eval_func=predictor_,
                            device=device,
                            converter=concat_mols,
                            name='train',
                            pos_labels=1,
                            ignore_labels=-1,
                            raise_value_error=False))
        # extension name='validation' is already used by `Evaluator`,
        # instead extension name `val` is used.
        val_iter = I.SerialIterator(val,
                                    args.batchsize,
                                    repeat=False,
                                    shuffle=False)
        extensions_list.append(
            ROCAUCEvaluator(val_iter,
                            classifier,
                            eval_func=predictor_,
                            device=device,
                            converter=concat_mols,
                            name='val',
                            pos_labels=1,
                            ignore_labels=-1))

    run_train(classifier,
              train_iter,
              valid=val,
              batch_size=args.batchsize,
              epoch=args.epoch,
              out=args.out,
              device=device,
              converter=concat_mols,
              extensions_list=extensions_list,
              resume_path=args.resume)

    # frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    # trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))
    # trainer.run()

    config = {
        'method': args.method,
        'conv_layers': args.conv_layers,
        'unit_num': args.unit_num,
        'labels': args.label
    }
    save_json(os.path.join(args.out, 'config.json'), config)

    classifier.save_pickle(os.path.join(args.out, args.model_filename),
                           protocol=args.protocol)
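Given the argparse definition above, a typical invocation of this Tox21 script looks like the following (the script name is an assumption; NR-AhR is one of the Tox21 label names returned by D.get_tox21_label_names()):

# python train_tox21.py --method nfp --label NR-AhR --iterator-type balanced \
#     --batchsize 32 --epoch 10 --device 0 --out result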
Example #8
def main():
    # Parse the arguments.
    args = parse_arguments()
    theme_name = t_theme_name.get()

    args.model_folder_name = os.path.join(theme_name, 'chainer')
    # args.epoch = int(float(t_epochs.get()))
    args.out = parent_path / 'models' / theme_name / method_name
    args.method = method_name

    if args.label:
        labels = args.label
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    smiles_col_name = t_smiles.get()
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels,
                           smiles_col=smiles_col_name)

    # args.datafile = parent_path / 'results' / theme_name / method_name \
    #     / high_low / 'brics_virtual' / 'virtual.csv'
    args.datafile = csv_path
    dataset = parser.parse(args.datafile)['dataset']

    @chainer.dataset.converter()
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    print('Predicting the virtual library...')
    # Set up the regressor.
    device = chainer.get_device(args.device)
    model_path = os.path.join(args.out, args.model_folder_name,
                              args.model_filename)

    with open(parent_path / 'models' / theme_name / method_name / high_low /
              'regressor.pickle', 'rb') as f:
        regressor = cloudpickle.loads(f.read())

    # Perform the prediction.
    print('Evaluating...')
    converter = converter_method_dict[args.method]
    data_iterator = SerialIterator(dataset,
                                   16,
                                   repeat=False,
                                   shuffle=False)
    eval_result = Evaluator(data_iterator,
                            regressor,
                            converter=converter,
                            device=device)()
    print('Evaluation result: ', eval_result)

    predict_ = regressor.predict(dataset, converter=extract_inputs)
    predict_ = [i[0] for i in predict_]
    df_data = pd.read_csv(csv_path)

    df_predict = df_data
    df_predict[t_task.get()] = predict_
    df_predict = df_predict.dropna()

    PandasTools.AddMoleculeColumnToFrame(frame=df_predict,
                                         smilesCol=t_smiles.get())
    df_predict['sascore'] = df_predict.ROMol.map(sascorer.calculateScore)

    df_predict.to_csv(csv_path)

    png_generator = (parent_path / 'results' / theme_name / method_name /
                     high_low / data_name /
                     'molecular-structure').glob('*.png')

    for png_path in png_generator:
        # The molecule index is encoded in the PNG file name.
        i = int(png_path.name[4:10])
        if i < len(df_predict[t_task.get()]):
            img = Image.open(png_path)
            draw = ImageDraw.Draw(img)
            font = ImageFont.truetype('arial.ttf', 26)
            draw.text((0, 0),
                      t_task.get() + ' : ' +
                      str(round(df_predict[t_task.get()][i], 2)),
                      (0, 0, 0),
                      font=font)
            draw.text(
                (0, 30),
                'sascore : ' + str(round(df_predict['sascore'][i], 2)),
                (0, 0, 0),
                font=font)

            img.save(png_path)

    save_json(os.path.join(args.out, 'eval_result.json'), eval_result)
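A minimal sketch of the cloudpickle round-trip used above. cloudpickle can serialize objects that the standard pickle module sometimes cannot (e.g. closures or locally defined functions), which is presumably why this GUI script loads the regressor with cloudpickle.loads rather than Regressor.load_pickle. The toy object is an assumption:

import cloudpickle

obj = {'predict': lambda x: x * 2}  # toy stand-in for the regressor
blob = cloudpickle.dumps(obj)       # what the training side would write to disk

restored = cloudpickle.loads(blob)  # what the snippet above does with f.read()
assert restored['predict'](3) == 6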
Example #9
def main():
    # Parse the arguments.
    args = parse_arguments()
    args.out = os.path.join(args.out, args.method)
    save_args(args, args.out)

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label_float(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    def postprocess_label_int(label_list):
        return numpy.asarray(label_list, dtype=numpy.int64)

    # Apply a preprocessor to the dataset.
    scaler = None  # Defined up front so it exists even when --train is absent.
    if args.train:
        # Training data.
        fn, ext = os.path.splitext(args.train)
        if ext == '.npz':
            print('Loading training dataset...')
            train = NumpyTupleDataset.load(args.train)
        else:
            print('Preprocessing training dataset...')
            preprocessor = preprocess_method_dict[args.method]()
            if args.classification:
                parser = CSVFileParser(
                    preprocessor, postprocess_label=postprocess_label_int,
                    labels=labels, smiles_col='SMILES')
            else:
                parser = CSVFileParser(
                    preprocessor, postprocess_label=postprocess_label_float,
                    labels=labels, smiles_col='SMILES')
            train = parser.parse(args.train)['dataset']
            NumpyTupleDataset.save(
                os.path.join(args.out, os.path.split(fn)[1]), train)

        # Scale the label values, if necessary.
        if args.scale == 'standardize':
            scaler = StandardScaler()
            scaler.fit(train.get_datasets()[-1])

    # Test data.
    fn, ext = os.path.splitext(args.val)
    if ext == '.npz':
        print('Loading test dataset...')
        test = NumpyTupleDataset.load(args.val)
    else:
        print('Preprocessing test dataset...')
        preprocessor = preprocess_method_dict[args.method]()
        if args.classification:
            parser = CSVFileParser(
                preprocessor, postprocess_label=postprocess_label_int,
                labels=labels, smiles_col='SMILES')
        else:
            parser = CSVFileParser(
                preprocessor, postprocess_label=postprocess_label_float,
                labels=labels, smiles_col='SMILES')
        test = parser.parse(args.val)['dataset']
        NumpyTupleDataset.save(
            os.path.join(args.out, os.path.split(fn)[1]), test)

    # Set up the model.
    device = chainer.get_device(args.device)
    converter = converter_method_dict[args.method]
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    if args.classification:
        if args.load_model:
            model = Classifier.load_pickle(args.load_model, device=device)
            print('Model file loaded:', args.load_model)
        else:
            predictor = set_up_predictor(args.method, args.unit_num,
                                         args.conv_layers, class_num)
            model = Classifier(predictor,
                               lossfun=F.sigmoid_cross_entropy,
                               metrics_fun=F.binary_accuracy,
                               device=device)
    else:
        if args.load_model:
            model = Regressor.load_pickle(args.load_model, device=device)
            print('Model file loaded:', args.load_model)
        else:
            predictor = set_up_predictor(
                args.method + args.method_suffix, args.unit_num,
                args.conv_layers, class_num, label_scaler=scaler)
            model = Regressor(predictor, lossfun=F.mean_squared_error,
                              metrics_fun=metrics_fun, device=device)

    if args.train:
        if args.balanced_iter:
            train = BalancedSerialIterator(train, args.batchsize,
                                           train.features[:, -1],
                                           ignore_labels=-1)
            train.show_label_stats()

        print('Training...')
        log_keys = ['main/mae', 'main/rmse', 'validation/main/mae',
                    'validation/main/rmse', 'validation/main/roc_auc']
        extensions_list = [extensions.PlotReport(
            log_keys, 'iteration', trigger=(100, 'iteration'),
            file_name='loss.png')]
        if args.eval_roc and args.classification:
            # Use model.predictor here: the local 'predictor' variable is not
            # defined when the model was loaded with --load-model.
            extensions_list.append(ROCAUCEvaluator(
                test, model, eval_func=model.predictor,
                device=device, converter=converter, name='validation',
                pos_labels=1, ignore_labels=-1, raise_value_error=False))

        save_json(os.path.join(args.out, 'args.json'), vars(args))
        run_train(model, train, valid=test,
                  batch_size=args.batchsize, epoch=args.epoch,
                  out=args.out, extensions_list=extensions_list,
                  device=device, converter=converter)  # resume_path=args.resume

        # Save the model's parameters.
        model_path = os.path.join(args.out, args.model_filename)
        print('Saving the trained model to {}...'.format(model_path))
        if hasattr(model.predictor.graph_conv, 'reset_state'):
            model.predictor.graph_conv.reset_state()
        model.save_pickle(model_path, protocol=args.protocol)

    # Prediction.
    it = SerialIterator(test, args.batchsize, repeat=False, shuffle=False)
    result = []
    for batch in it:
        in_arrays = convert._call_converter(converter, batch, device)
        with chainer.using_config('train', False), \
                chainer.function.no_backprop_mode():
            if isinstance(in_arrays, tuple):
                res = model(*in_arrays)
            elif isinstance(in_arrays, dict):
                res = model(**in_arrays)
            else:
                res = model(in_arrays)
        # cuda.to_cpu works for both NumPy and CuPy arrays, unlike .get(),
        # which is only available on CuPy arrays.
        result.extend(cuda.to_cpu(model.y.array))

    numpy.savetxt(os.path.join(args.out, 'result.csv'), numpy.array(result))

    # The Evaluator resets the (exhausted) iterator before running.
    eval_result = Evaluator(it, model, converter=converter, device=device)()
    print('Evaluation result: ', eval_result)
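The rmse metric referenced in metrics_fun above is not defined in this snippet. A plausible one-liner built from chainer.functions (an assumption; the original helper is defined elsewhere):

import chainer.functions as F

def rmse(x, t):
    # Root mean squared error as a differentiable chainer function.
    return F.sqrt(F.mean_squared_error(x, t))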