Example 1
from collections import OrderedDict

import numpy as np
# `metrics` below is the project-local metrics module used throughout these
# examples (mean_squared_error, max errors, tolerance-based accuracy).


def evaluate_model(y, y_est):
    mse = metrics.mean_squared_error(y, y_est)
    # Mean absolute error, computed directly with NumPy (the original called
    # a misspelled `metrics.abs_absolute_error`).
    ae = np.average(np.abs(np.asarray(y) - np.asarray(y_est)))
    ave_y = np.average(y)
    ave_y_est = np.average(y_est)
    bias = ave_y_est - ave_y

    eval_results = OrderedDict()
    # absolute error measures
    eval_results['MSE'] = mse
    eval_results['RMSE'] = np.sqrt(mse)
    eval_results['AE'] = ae
    eval_results['Max AAE'] = metrics.max_absolute_error(y, y_est)
    eval_results['Bias'] = bias

    # relative error measures, normalized by the mean of the reference data
    eval_results['RRMSE'] = np.sqrt(mse) / np.abs(ave_y)
    eval_results['MARE'] = ae / np.abs(ave_y)
    eval_results['Max ARE'] = metrics.max_relative_error(y, y_est)
    eval_results['RBias'] = bias / np.abs(ave_y)

    # fraction of predictions within a relative tolerance of the reference
    eval_results['Accuracy1%'] = metrics.accuracy(y, y_est, 0.01)
    eval_results['Accuracy2%'] = metrics.accuracy(y, y_est, 0.02)
    eval_results['Accuracy5%'] = metrics.accuracy(y, y_est, 0.05)
    eval_results['Accuracy10%'] = metrics.accuracy(y, y_est, 0.1)

    return eval_results
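
A minimal usage sketch (the arrays are invented for illustration, and the project-local `metrics` module must be importable for the `metrics.*` calls inside `evaluate_model` to resolve):

# Illustrative reference values and model predictions.
y_true = np.array([1.00, 0.98, 1.05, 1.10])
y_pred = np.array([1.01, 0.97, 1.08, 1.02])

for name, value in evaluate_model(y_true, y_pred).items():
    print('%-12s %10.4g' % (name, value))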
Example 2
def pca_train(n, normed_trainx, trainy, normed_validx, validy, opt, logger,
              layers, opt_lr, opt_epochs, optimizer):
    trainx, validx, var_ex = pca_nd(normed_trainx, normed_validx, n, logger)
    validy_ = validy.copy()  # keep NumPy copies for metric evaluation below
    trainy_ = trainy.copy()
    if opt.gpu:  # store everything to GPU all at once
        logger.info('Using GPU acceleration')
        device = torch.device("cuda:0")
        trainx = torch.Tensor(trainx).to(device)
        trainy = torch.Tensor(trainy).to(device)
        validx = torch.Tensor(validx).to(device)
        validy = torch.Tensor(validy).to(device)
    model = fitting.TorchMLPRegressor(len(trainx[0]),
                                      len(trainy[0]),
                                      layers,
                                      batch_size=opt.batch,
                                      is_gpu=opt.gpu != 0,
                                      args_opt={
                                          'optimizer': torch.optim.Adam,
                                          'lr': opt_lr[0],  # first-stage LR; opt.lr is a comma-separated string
                                          'weight_decay': opt.l2
                                      })
    model.init_session()
    print(model.regressor)
    model.load_data(trainx, trainy)

    header = ('Epoch Loss MeaSquE MeaSigE MeaUnsE MaxRelE '
              'Acc1% Acc2% Acc5% Acc10%').split()
    logger.info('%-8s %8s %8s %8s %8s %8s %8s %8s %8s %8s' % tuple(header))
    total_epoch = 0

    for k, each_epoch in enumerate(opt_epochs):
        # separate learning rate for each training stage
        model.reset_optimizer({
            'optimizer': optimizer,
            'lr': opt_lr[k],
            'weight_decay': opt.l2
        })
        for i_epoch in range(each_epoch):
            total_epoch += 1
            loss = model.fit_epoch(trainx, trainy)
            if (i_epoch + 1) % 20 == 0 or i_epoch + 1 == each_epoch:
                predy = model.predict_batch(validx)
                err_line = '%d/%d %8.3e %8.3e %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f' % (
                    total_epoch, sum(opt_epochs), loss,
                    metrics.mean_squared_error(validy_, predy),
                    metrics.mean_signed_error(validy_, predy) * 100,
                    metrics.mean_unsigned_error(validy_, predy) * 100,
                    metrics.max_relative_error(validy_, predy) * 100,
                    metrics.accuracy(validy_, predy, 0.01) * 100,
                    metrics.accuracy(validy_, predy, 0.02) * 100,
                    metrics.accuracy(validy_, predy, 0.05) * 100,
                    metrics.accuracy(validy_, predy, 0.10) * 100)
                logger.info(err_line)

    # Use the NumPy copy validy_ here; validy may be a GPU tensor at this point.
    return (var_ex,
            metrics.accuracy(validy_, predy, 0.02) * 100,
            metrics.mean_squared_error(validy_, predy))
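
The core pattern in pca_train is the staged schedule: each (opt_epochs[k], opt_lr[k]) pair gets a freshly reset optimizer. A self-contained sketch of that schedule in plain PyTorch, with the model, data, and stage values invented for illustration:

import torch
import torch.nn.functional as F

opt_epochs = [50, 100]  # stand-in for the parsed --epoch option
opt_lr = [1e-2, 1e-3]   # stand-in for the parsed --lr option

model = torch.nn.Linear(4, 1)
x, y = torch.randn(32, 4), torch.randn(32, 1)

for k, n_epochs in enumerate(opt_epochs):
    # Fresh optimizer per stage, mirroring model.reset_optimizer(...) above.
    optimizer = torch.optim.Adam(model.parameters(), lr=opt_lr[k])
    for _ in range(n_epochs):
        optimizer.zero_grad()
        loss = F.mse_loss(model(x), y)
        loss.backward()
        optimizer.step()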
Example 3
def main():
    logger.info('Reading data and extra features...')
    fp_files = [] if opt.fp is None else opt.fp.split(',')
    fp_extra, y_array, name_array = dataloader.load(opt.input, opt.target, fp_files)
    smiles_list = [name.split()[0] for name in name_array]

    logger.info('Generating molecular graphs with %s...' % opt.graph)
    if opt.graph == 'rdk':
        graph_list, feats_list = smi2dgl(smiles_list)
    elif opt.graph == 'msd':
        msd_list = ['%s.msd' % base64.b64encode(smiles.encode()).decode() for smiles in smiles_list]
        graph_list, feats_list = msd2dgl(msd_list, '../data/msdfiles.zip')
    else:
        raise ValueError('Unsupported graph type: %s' % opt.graph)

    logger.info('Node feature example: (size=%d) %s' % (len(feats_list[0][0]), ','.join(map(str, feats_list[0][0]))))
    logger.info('Extra graph feature example: (size=%d) %s' % (len(fp_extra[0]), ','.join(map(str, fp_extra[0]))))
    logger.info('Output example: (size=%d) %s' % (len(y_array[0]), ','.join(map(str, y_array[0]))))

    if fp_extra.shape[-1] > 0:
        logger.info('Normalizing extra graph features...')
        scaler = preprocessing.Scaler()
        scaler.fit(fp_extra)
        scaler.save(opt.output + '/scale.txt')
        fp_extra = scaler.transform(fp_extra)

    logger.info('Selecting data...')
    selector = preprocessing.Selector(smiles_list)
    if opt.part is not None:
        logger.info('Loading partition file %s' % opt.part)
        selector.load(opt.part)
    else:
        logger.warning('Partition file not provided. Using auto-partition instead')
        selector.partition(0.8, 0.2)

    device = torch.device('cuda:0')
    # batched data for training set
    data_list = [[data[i] for i in np.where(selector.train_index)[0]]
                 for data in (graph_list, y_array, feats_list, fp_extra, name_array, smiles_list)]
    n_batch, (graphs_batch, y_batch, feats_node_batch, feats_extra_batch, names_batch) = \
        preprocessing.separate_batches(data_list[:-1], opt.batch, data_list[-1])
    bg_batch_train = [dgl.batch(graphs).to(device) for graphs in graphs_batch]
    y_batch_train = [torch.tensor(y, dtype=torch.float32, device=device) for y in y_batch]
    feats_node_batch_train = [torch.tensor(np.concatenate(feats_node), dtype=torch.float32, device=device)
                              for feats_node in feats_node_batch]
    feats_extra_batch_train = [torch.tensor(feats_extra, dtype=torch.float32, device=device)
                               for feats_extra in feats_extra_batch]
    # for plot
    y_train_array = np.concatenate(y_batch)
    names_train = np.concatenate(names_batch)

    # data for validation set
    graphs, y, feats_node, feats_extra, names_valid = \
        [[data[i] for i in np.where(selector.valid_index)[0]]
         for data in (graph_list, y_array, feats_list, fp_extra, name_array)]
    bg_valid, y_valid, feats_node_valid, feats_extra_valid = (
        dgl.batch(graphs).to(device),
        torch.tensor(y, dtype=torch.float32, device=device),
        torch.tensor(np.concatenate(feats_node), dtype=torch.float32, device=device),
        torch.tensor(feats_extra, dtype=torch.float32, device=device),
    )
    # for plot
    y_valid_array = y_array[selector.valid_index]

    logger.info('Training size = %d, Validation size = %d' % (len(y_train_array), len(y_valid_array)))
    logger.info('Batches = %d, Batch size ~= %d' % (n_batch, opt.batch))

    in_feats_node = feats_list[0].shape[-1]
    in_feats_extra = fp_extra[0].shape[-1]
    n_heads = list(map(int, opt.head.split(',')))

    logger.info('Building network...')
    logger.info('Attention heads per conv layer = %s' % n_heads)
    logger.info('Learning rate = %s' % opt.lr)
    logger.info('L2 penalty = %f' % opt.l2)

    model = GATModel(in_feats_node, opt.embed, n_head_list=n_heads, extra_feats=in_feats_extra)
    model.cuda()
    print(model)
    for name, param in model.named_parameters():
        print(name, param.data.shape)

    header = 'Step MaxRE(t) Loss MeaSquE MeaSigE MeaUnsE MaxRelE Acc2% Acc5% Acc10%'.split()
    logger.info('%-8s %8s %8s %8s %8s %8s %8s %8s %8s %8s' % tuple(header))

    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.l2)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt.lrsteps, gamma=opt.lrgamma)
    for epoch in range(opt.epoch):
        model.train()
        if (epoch + 1) % opt.check == 0:
            pred_train = [None] * n_batch
        for ib in np.random.permutation(n_batch):
            optimizer.zero_grad()
            pred = model(bg_batch_train[ib], feats_node_batch_train[ib], feats_extra_batch_train[ib])
            loss = F.mse_loss(pred, y_batch_train[ib])
            loss.backward()
            optimizer.step()
            if (epoch + 1) % opt.check == 0:
                pred_train[ib] = pred.detach().cpu().numpy()
        scheduler.step()

        if (epoch + 1) % opt.check == 0:
            model.eval()
            pred_train = np.concatenate(pred_train)
            pred_valid = model(bg_valid, feats_node_valid, feats_extra_valid).detach().cpu().numpy()
            err_line = '%-8i %8.1f %8.2e %8.2e %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f' % (
                epoch + 1,
                metrics.max_relative_error(y_train_array, pred_train) * 100,
                metrics.mean_squared_error(y_train_array, pred_train),
                metrics.mean_squared_error(y_valid_array, pred_valid),
                metrics.mean_signed_error(y_valid_array, pred_valid) * 100,
                metrics.mean_unsigned_error(y_valid_array, pred_valid) * 100,
                metrics.max_relative_error(y_valid_array, pred_valid) * 100,
                metrics.accuracy(y_valid_array, pred_valid, 0.02) * 100,
                metrics.accuracy(y_valid_array, pred_valid, 0.05) * 100,
                metrics.accuracy(y_valid_array, pred_valid, 0.10) * 100)

            logger.info(err_line)
    torch.save(model, opt.output + '/model.pt')

    visualizer = visualize.LinearVisualizer(y_train_array.reshape(-1), pred_train.reshape(-1), names_train, 'train')
    visualizer.append(y_valid_array.reshape(-1), pred_valid.reshape(-1), names_valid, 'valid')
    visualizer.dump(opt.output + '/fit.txt')
    visualizer.dump_bad_molecules(opt.output + '/error-0.10.txt', 'valid', threshold=0.1)
    visualizer.dump_bad_molecules(opt.output + '/error-0.20.txt', 'valid', threshold=0.2)
    visualizer.scatter_yy(savefig=opt.output + '/error-train.png', annotate_threshold=0.1, marker='x', lw=0.2, s=5)
    visualizer.hist_error(savefig=opt.output + '/error-hist.png', label='valid', histtype='step', bins=50)
    plt.show()
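
The training loop above relies on dgl.batch to merge a list of molecular graphs into one disjoint graph, so each minibatch needs only a single forward pass; node features are concatenated in the same row order, which is why the script pairs dgl.batch(graphs) with np.concatenate(feats_node). A toy sketch of the same pattern, with graphs invented for illustration:

import dgl
import torch

# Two toy graphs standing in for the molecular graphs from smi2dgl/msd2dgl.
g1 = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])), num_nodes=3)
g2 = dgl.graph((torch.tensor([0]), torch.tensor([1])), num_nodes=2)

bg = dgl.batch([g1, g2])                     # one disjoint graph: 5 nodes, 3 edges
bg.ndata['h'] = torch.randn(bg.num_nodes(), 8)  # node features, row-aligned
print(bg.batch_size, bg.num_nodes())         # 2 5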
Example 4
def main():
    parser = argparse.ArgumentParser(
        description='Alkane property fitting demo')
    parser.add_argument('-i', '--input', type=str, help='Data')
    parser.add_argument('-f', '--fp', type=str, help='Fingerprints')
    parser.add_argument('-o',
                        '--output',
                        default='out',
                        type=str,
                        help='Output directory')
    parser.add_argument('-t',
                        '--target',
                        default='raw_density',
                        type=str,
                        help='Fitting target')
    parser.add_argument('-p',
                        '--part',
                        default='',
                        type=str,
                        help='Partition cache file')
    parser.add_argument('-l',
                        '--layer',
                        default='16,16',
                        type=str,
                        help='Size of hidden layers')
    parser.add_argument('--visual',
                        default=1,
                        type=int,
                        help='Generate visualization plots')
    parser.add_argument('--gpu', default=1, type=int, help='Using gpu')
    parser.add_argument('--epoch',
                        default="500,2000,2500",
                        type=str,
                        help='Number of epochs')
    parser.add_argument('--batch', default=1000, type=int, help='Batch size')
    parser.add_argument('--lr',
                        default="0.01,0.001,0.0001",
                        type=str,
                        help='Initial learning rate')
    parser.add_argument('--l2', default=0.000, type=float, help='L2 Penalty')
    parser.add_argument('--check',
                        default=50,
                        type=int,
                        help='Interval (in epochs) between convergence checks')
    parser.add_argument('--minstop',
                        default=0.2,
                        type=float,
                        help='Minimum fraction of total epochs before early stop')
    parser.add_argument('--maxconv',
                        default=2,
                        type=int,
                        help='Convergence detections required before stopping')
    parser.add_argument('--featrm',
                        default='',
                        type=str,
                        help='Remove features')
    parser.add_argument('--optim', default='rms', type=str, help='optimizer')
    # argparse's type=bool treats any non-empty string as True, so use a flag
    parser.add_argument('--continuation',
                        action='store_true',
                        help='Continue training from checkpoint')
    parser.add_argument('--pca',
                        default=-1,
                        type=int,
                        help='Number of dimensions to discard via PCA')
    parser.add_argument(
        '--sobol',
        default=-1,
        type=int,
        help='dimensions to reduce according to sensitivity analysis')

    opt = parser.parse_args()

    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []

    opt_lr = list(map(float, opt.lr.split(',')))
    opt_epochs = list(map(int, opt.epoch.split(',')))

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    logger = logging.getLogger('train')
    logger.setLevel(logging.INFO)
    flog = logging.FileHandler(opt.output + '/log.txt', mode='w')
    flog.setLevel(logging.INFO)
    formatter = logging.Formatter(
        fmt='[%(asctime)s] (%(levelname)s) %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    flog.setFormatter(formatter)
    clog = logging.StreamHandler()
    clog.setFormatter(formatter)
    logger.addHandler(flog)
    logger.addHandler(clog)

    if sys.platform == 'linux':
        logger.info('Use non-interactive Agg backend for matplotlib on linux')
        matplotlib.use('Agg')

    if opt.featrm == 'auto':
        logger.info('Automatically remove features')
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))
    logger.info('Remove Feature: %s' % featrm)

    logger.info('Reading data...')
    fp_files = [] if opt.fp is None else opt.fp.split(',')
    datax, datay, data_names = dataloader.load(filename=opt.input,
                                               target=opt.target,
                                               fps=fp_files,
                                               featrm=featrm)

    # Store fingerprint identifier files
    for fp in fp_files:
        if os.path.exists(fp + '.idx') and Path(fp).parent.absolute() != Path(
                opt.output).absolute():
            shutil.copy(fp + '.idx', opt.output)

    logger.info('Selecting data...')
    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        logger.info('Loading partition file %s' % opt.part)
        selector.load(opt.part)
    else:
        logger.warning(
            "Partition file not found. Using auto-partition instead.")
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()

    logger.info('Training size = %d, Validation size = %d' %
                (len(trainx), len(validx)))
    logger.info('X input example: (size=%d) %s' %
                (len(datax[0]), ','.join(map(str, datax[0]))))
    logger.info('Y input example: (size=%d) %s' %
                (len(datay[0]), ','.join(map(str, datay[0]))))
    logger.info('Normalizing...')
    scaler = preprocessing.Scaler()
    scaler.fit(trainx)
    scaler.save(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)

    if opt.sobol != -1:
        with open(opt.output + '/sobol_idx.pkl', 'rb') as file:
            sobol_idx = pickle.load(file)
        normed_trainx, normed_validx = sobol_reduce(
            normed_trainx, normed_validx,
            len(normed_trainx[0]) - 2 - opt.sobol, sobol_idx)
        logger.info('sobol SA reduced dimension:%d' % (opt.sobol))

    if opt.pca != -1:
        normed_trainx, normed_validx, _ = pca_nd(
            normed_trainx, normed_validx,
            len(normed_trainx[0]) - opt.pca, logger)
        logger.info('pca reduced dimension:%d' % (opt.pca))

    logger.info('final input length:%d' % (len(normed_trainx[0])))
    logger.info('Building network...')
    logger.info('Hidden layers = %r' % layers)
    logger.info('optimizer = %s' % (opt.optim))
    logger.info('Learning rate = %s' % opt_lr)
    logger.info('Epochs = %s' % opt_epochs)
    logger.info('L2 penalty = %f' % opt.l2)
    logger.info('Batch size = %d' % opt.batch)

    validy_ = validy.copy()  # keep NumPy copies for metric evaluation below
    trainy_ = trainy.copy()
    if opt.gpu:  # store everything to GPU all at once
        logger.info('Using GPU acceleration')
        device = torch.device("cuda:0")
        normed_trainx = torch.Tensor(normed_trainx).to(device)
        trainy = torch.Tensor(trainy).to(device)
        normed_validx = torch.Tensor(normed_validx).to(device)
        validy = torch.Tensor(validy).to(device)

    if opt.optim == 'sgd':
        optimizer = torch.optim.SGD
    elif opt.optim == 'adam':
        optimizer = torch.optim.Adam
    elif opt.optim == 'rms':
        optimizer = torch.optim.RMSprop
    elif opt.optim == 'ada':
        optimizer = torch.optim.Adagrad
    else:
        raise ValueError('Unknown optimizer: %s' % opt.optim)

    model = fitting.TorchMLPRegressor(len(normed_trainx[0]),
                                      len(trainy[0]),
                                      layers,
                                      batch_size=opt.batch,
                                      is_gpu=opt.gpu != 0,
                                      args_opt={
                                          'optimizer': optimizer,
                                          'lr': opt_lr[0],  # first-stage LR; opt.lr is a comma-separated string
                                          'weight_decay': opt.l2
                                      })

    model.init_session()
    if opt.continuation:
        cpt = opt.output + '/model.pt'
        logger.info('Continue training from checkpoint %s' % (cpt))
        model.load(cpt)

    logger.info('Optimizer = %s' % (optimizer))

    header = 'Step Loss MeaSquE MeaSigE MeaUnsE MaxRelE Acc2% Acc5% Acc10%'.split()
    logger.info('%-8s %8s %8s %8s %8s %8s %8s %8s %8s' % tuple(header))

    mse_history = []
    converge_times = 0
    mse_min = None
    model_saved = False
    converged = False
    all_epoch = sum(opt_epochs)
    total_epoch = 0

    for k, each_epoch in enumerate(opt_epochs):
        # implement separated learning rate
        model.reset_optimizer({
            'optimizer': optimizer,
            'lr': opt_lr[k],
            'weight_decay': opt.l2
        })
        for i in range(each_epoch):
            total_epoch += 1
            loss = model.fit_epoch(normed_trainx, trainy)
            if total_epoch % opt.check == 0:
                predy = model.predict_batch(normed_validx)
                mse = metrics.mean_squared_error(validy_, predy)
                mse_history.append(mse)
                err_line = '%-8i %8.2e %8.2e %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f' % (
                    total_epoch, loss.data.cpu().numpy()
                    if model.is_gpu else loss.data.numpy(), mse,
                    metrics.mean_signed_error(validy_, predy) * 100,
                    metrics.mean_unsigned_error(validy_, predy) * 100,
                    metrics.max_relative_error(validy_, predy) * 100,
                    metrics.accuracy(validy_, predy, 0.02) * 100,
                    metrics.accuracy(validy_, predy, 0.05) * 100,
                    metrics.accuracy(validy_, predy, 0.10) * 100)

                logger.info(err_line)

                if mse_min is None:
                    mse_min = mse
                elif mse < mse_min:
                    model.save(opt.output + '/model.pt')
                    model_saved = True
                    mse_min = mse

                if total_epoch > all_epoch * opt.minstop:
                    conv, cur_conv = validation.is_converge(
                        np.array(mse_history), nskip=25)
                    if conv:
                        logger.info('Model converge detected at epoch %d' %
                                    total_epoch)
                        converge_times += 1

                    if converge_times >= opt.maxconv and cur_conv:
                        logger.info('Model converged at epoch: %d' %
                                    total_epoch)
                        converged = True
                        break

        if converged:
            # propagate the inner break so the remaining LR stages are skipped
            break

    if not converged:
        logger.warning('Model not converged')

    if not model_saved:
        model.save(opt.output + '/model.pt')

    visualizer = visualize.LinearVisualizer(
        trainy_.reshape(-1),
        model.predict_batch(normed_trainx).reshape(-1), trainname, 'training')
    visualizer.append(validy_.reshape(-1),
                      model.predict_batch(normed_validx).reshape(-1),
                      validname, 'validation')
    visualizer.dump(opt.output + '/fit.txt')
    visualizer.dump_bad_molecules(opt.output + '/error-0.05.txt',
                                  'validation',
                                  threshold=0.05)
    visualizer.dump_bad_molecules(opt.output + '/error-0.10.txt',
                                  'validation',
                                  threshold=0.1)
    visualizer.dump_bad_molecules(opt.output + '/error-0.15.txt',
                                  'validation',
                                  threshold=0.15)
    visualizer.dump_bad_molecules(opt.output + '/error-0.20.txt',
                                  'validation',
                                  threshold=0.2)
    logger.info('Fitting result saved')

    if opt.visual:
        visualizer.scatter_yy(savefig=opt.output + '/error-train.png',
                              annotate_threshold=0,
                              marker='x',
                              lw=0.2,
                              s=5)
        visualizer.hist_error(savefig=opt.output + '/error-hist.png',
                              label='validation',
                              histtype='step',
                              bins=50)
        plt.show()
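
An illustrative invocation of this script (the script and data file names are hypothetical; the flags match the argparse definitions above):

python train.py -i data/alkanes.txt -f data/fingerprints -t raw_density \
    -o out --optim adam --epoch 500,2000,2500 --lr 0.01,0.001,0.0001 \
    --batch 1000 --gpu 1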