Example #1
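# Note: these snippets are excerpts and assume the usual module-level imports
# (e.g. argparse, os, numpy as np, pandas as pd, torch, matplotlib.pyplot as plt)
# plus the project's own modules (preprocessing, dataloader, fitting, metrics,
# visualize) are available in scope.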
def main():
    parser = argparse.ArgumentParser(
        description='Alkane property fitting demo')
    parser.add_argument('-i', '--input', type=str, help='Data')
    parser.add_argument('-o',
                        '--output',
                        default='fp',
                        type=str,
                        help='Output directory')

    opt = parser.parse_args()

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    df = pd.read_csv(opt.input, sep=r'\s+', header=0)
    smiles_array = df.SMILES.values
    selector = preprocessing.Selector(smiles_array)
    sel_mol = preprocessing.Selector(df.SMILES.unique())
    fold = 5
    sel_mol.kfold_partition(1.0, fold)
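    # for each fold, flag the rows whose SMILES belong to that fold's training
    # and validation molecules; everything else goes to the test set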
    for n in range(fold):
        sel_mol.kfold_use(n)
        mol_train = sel_mol.training_set()
        mol_valid = sel_mol.validation_set()

        mol_train_dict = dict([(s, 1) for s in mol_train])
        mol_valid_dict = dict([(s, 1) for s in mol_valid])

        selector.train_index = np.array(
            [mol_train_dict.get(m, 0) for m in smiles_array], dtype=bool)
        selector.valid_index = np.array(
            [mol_valid_dict.get(m, 0) for m in smiles_array], dtype=bool)
        selector.test_index = np.logical_not(
            np.logical_or(selector.train_index, selector.valid_index))

        selector.save(opt.output + '/part-%i.txt' % (n + 1))
Example #2
def load_data(opt, logger):
    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    if opt.featrm == 'auto':
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))

    logger.info('loading data...')
    datax, datay, data_names = dataloader.load(filename=opt.input,
                                               target=opt.target,
                                               fps=opt.fp.split(','),
                                               featrm=featrm)

    selector = preprocessing.Selector(datax, datay, data_names)
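    # reuse a cached partition file if one was given; otherwise auto-partition
    # (80% training / 10% validation) and save the split for later runs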
    if opt.part:
        selector.load(opt.part)
    else:
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')

    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()

    logger.info('loading model...')
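    # restore the feature scaler and the trained MLP saved in the output
    # directory by a previous training run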
    scaler = preprocessing.Scaler()
    scaler.load(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)
    model = fitting.TorchMLPRegressor(
        None,
        None,
        [],
        is_gpu=False,
    )
    model.load(opt.output + '/model.pt')
    #if  opt.pca != -1:
    #    normed_trainx, normed_validx, _ = pca_nd(normed_trainx, normed_validx, len(normed_trainx[0]) - opt.pca)
    return normed_validx, validy, model
Example #3
def main():
    logger.info('Reading data and extra features...')
    fp_files = [] if opt.fp is None else opt.fp.split(',')
    fp_extra, y_array, name_array = dataloader.load(opt.input, opt.target, fp_files)
    smiles_list = [name.split()[0] for name in name_array]

    logger.info('Generating molecular graphs with %s...' % opt.graph)
    if opt.graph == 'rdk':
        graph_list, feats_list = smi2dgl(smiles_list)
    elif opt.graph == 'msd':
        msd_list = ['%s.msd' % base64.b64encode(smiles.encode()).decode() for smiles in smiles_list]
        graph_list, feats_list = msd2dgl(msd_list, '../data/msdfiles.zip')
    else:
        raise ValueError('Unsupported graph type: %s' % opt.graph)

    logger.info('Node feature example: (size=%d) %s' % (len(feats_list[0][0]), ','.join(map(str, feats_list[0][0]))))
    logger.info('Extra graph feature example: (size=%d) %s' % (len(fp_extra[0]), ','.join(map(str, fp_extra[0]))))
    logger.info('Output example: (size=%d) %s' % (len(y_array[0]), ','.join(map(str, y_array[0]))))

    if fp_extra.shape[-1] > 0:
        logger.info('Normalizing extra graph features...')
        scaler = preprocessing.Scaler()
        scaler.fit(fp_extra)
        scaler.save(opt.output + '/scale.txt')
        fp_extra = scaler.transform(fp_extra)

    logger.info('Selecting data...')
    selector = preprocessing.Selector(smiles_list)
    if opt.part is not None:
        logger.info('Loading partition file %s' % opt.part)
        selector.load(opt.part)
    else:
        logger.warning('Partition file not provided. Using auto-partition instead')
        selector.partition(0.8, 0.2)

    device = torch.device('cuda:0')
    # batched data for training set
    data_list = [[data[i] for i in np.where(selector.train_index)[0]]
                 for data in (graph_list, y_array, feats_list, fp_extra, name_array, smiles_list)]
    n_batch, (graphs_batch, y_batch, feats_node_batch, feats_extra_batch, names_batch) = \
        preprocessing.separate_batches(data_list[:-1], opt.batch, data_list[-1])
    bg_batch_train = [dgl.batch(graphs).to(device) for graphs in graphs_batch]
    y_batch_train = [torch.tensor(y, dtype=torch.float32, device=device) for y in y_batch]
    feats_node_batch_train = [torch.tensor(np.concatenate(feats_node), dtype=torch.float32, device=device)
                              for feats_node in feats_node_batch]
    feats_extra_batch_train = [torch.tensor(feats_extra, dtype=torch.float32, device=device)
                               for feats_extra in feats_extra_batch]
    # for plot
    y_train_array = np.concatenate(y_batch)
    names_train = np.concatenate(names_batch)

    # data for validation set
    graphs, y, feats_node, feats_extra, names_valid = \
        [[data[i] for i in np.where(selector.valid_index)[0]]
         for data in (graph_list, y_array, feats_list, fp_extra, name_array)]
    bg_valid, y_valid, feats_node_valid, feats_extra_valid = (
        dgl.batch(graphs).to(device),
        torch.tensor(y, dtype=torch.float32, device=device),
        torch.tensor(np.concatenate(feats_node), dtype=torch.float32, device=device),
        torch.tensor(feats_extra, dtype=torch.float32, device=device),
    )
    # for plot
    y_valid_array = y_array[selector.valid_index]

    logger.info('Training size = %d, Validation size = %d' % (len(y_train_array), len(y_valid_array)))
    logger.info('Batches = %d, Batch size ~= %d' % (n_batch, opt.batch))

    in_feats_node = feats_list[0].shape[-1]
    in_feats_extra = fp_extra[0].shape[-1]
    n_heads = list(map(int, opt.head.split(',')))

    logger.info('Building network...')
    logger.info('Conv layers = %s' % n_heads)
    logger.info('Learning rate = %s' % opt.lr)
    logger.info('L2 penalty = %f' % opt.l2)

    model = GATModel(in_feats_node, opt.embed, n_head_list=n_heads, extra_feats=in_feats_extra)
    model.cuda()
    print(model)
    for name, param in model.named_parameters():
        print(name, param.data.shape)

    header = 'Step MaxRE(t) Loss MeaSquE MeaSigE MeaUnsE MaxRelE Acc2% Acc5% Acc10%'.split()
    logger.info('%-8s %8s %8s %8s %8s %8s %8s %8s %8s %8s' % tuple(header))

    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.l2)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt.lrsteps, gamma=opt.lrgamma)
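    # training loop: visit the batches in a fresh random order every epoch and
    # keep the training predictions only on reporting epochs (every opt.check)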
    for epoch in range(opt.epoch):
        model.train()
        if (epoch + 1) % opt.check == 0:
            pred_train = [None] * n_batch
        for ib in np.random.permutation(n_batch):
            optimizer.zero_grad()
            pred = model(bg_batch_train[ib], feats_node_batch_train[ib], feats_extra_batch_train[ib])
            loss = F.mse_loss(pred, y_batch_train[ib])
            loss.backward()
            optimizer.step()
            if (epoch + 1) % opt.check == 0:
                pred_train[ib] = pred.detach().cpu().numpy()
        scheduler.step()

        if (epoch + 1) % opt.check == 0:
            model.eval()
            pred_train = np.concatenate(pred_train)
            pred_valid = model(bg_valid, feats_node_valid, feats_extra_valid).detach().cpu().numpy()
            err_line = '%-8i %8.1f %8.2e %8.2e %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f' % (
                epoch + 1,
                metrics.max_relative_error(y_train_array, pred_train) * 100,
                metrics.mean_squared_error(y_train_array, pred_train),
                metrics.mean_squared_error(y_valid_array, pred_valid),
                metrics.mean_signed_error(y_valid_array, pred_valid) * 100,
                metrics.mean_unsigned_error(y_valid_array, pred_valid) * 100,
                metrics.max_relative_error(y_valid_array, pred_valid) * 100,
                metrics.accuracy(y_valid_array, pred_valid, 0.02) * 100,
                metrics.accuracy(y_valid_array, pred_valid, 0.05) * 100,
                metrics.accuracy(y_valid_array, pred_valid, 0.10) * 100)

            logger.info(err_line)
    torch.save(model, opt.output + '/model.pt')

    visualizer = visualize.LinearVisualizer(y_train_array.reshape(-1), pred_train.reshape(-1), names_train, 'train')
    visualizer.append(y_valid_array.reshape(-1), pred_valid.reshape(-1), names_valid, 'valid')
    visualizer.dump(opt.output + '/fit.txt')
    visualizer.dump_bad_molecules(opt.output + '/error-0.10.txt', 'valid', threshold=0.1)
    visualizer.dump_bad_molecules(opt.output + '/error-0.20.txt', 'valid', threshold=0.2)
    visualizer.scatter_yy(savefig=opt.output + '/error-train.png', annotate_threshold=0.1, marker='x', lw=0.2, s=5)
    visualizer.hist_error(savefig=opt.output + '/error-hist.png', label='valid', histtype='step', bins=50)
    plt.show()
Example #4
def main():
    parser = argparse.ArgumentParser(
        description='Alkane property fitting demo')
    parser.add_argument('-i', '--input', type=str, help='Data')
    parser.add_argument('-o',
                        '--output',
                        default='fp',
                        type=str,
                        help='Output directory')
    parser.add_argument('--fold',
                        default=0,
                        type=int,
                        help='using n-fold partition as validation set')
    parser.add_argument('--similarity',
                        default=-1.0,
                        type=float,
                        help='using similarity partition as validation set')

    opt = parser.parse_args()

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    smiles_list = []
    smiles_list_training = None
    input_list = opt.input.split(',')
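    # input files whose name contains 'train' define the training molecules;
    # the remaining file supplies the full SMILES array to be partitioned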
    for file in input_list:
        df = pd.read_csv(file, sep=r'\s+', header=0)
        if 'train' in file:
            smiles_list_training = df.SMILES.unique().tolist()
        else:
            smiles_array = df.SMILES.values
    selector = preprocessing.Selector(smiles_array)
    sel_mol = preprocessing.Selector(smiles_array)

    if len(input_list) == 2 and smiles_list_training is not None:
        sel_mol.partition_smiles_list(
            smiles_list_training=smiles_list_training)
        mol_train = sel_mol.training_set()
        mol_valid = sel_mol.validation_set()

        mol_train_dict = dict([(s, 1) for s in mol_train])
        mol_valid_dict = dict([(s, 1) for s in mol_valid])

        selector.training_index = np.array(
            [mol_train_dict.get(m, 0) for m in smiles_array], dtype=bool)
        selector.validation_index = np.array(
            [mol_valid_dict.get(m, 0) for m in smiles_array], dtype=bool)
        selector.test_index = np.logical_not(
            np.logical_or(selector.training_index, selector.validation_index))

        selector.save(opt.output + '/part.txt')

    elif opt.similarity > 0.0:
        sel_mol.similarity_partition(cutoff=opt.similarity)
        mol_train = sel_mol.training_set()
        mol_valid = sel_mol.validation_set()

        mol_train_dict = dict([(s, 1) for s in mol_train])
        mol_valid_dict = dict([(s, 1) for s in mol_valid])

        selector.training_index = np.array(
            [mol_train_dict.get(m, 0) for m in smiles_array], dtype=bool)
        selector.validation_index = np.array(
            [mol_valid_dict.get(m, 0) for m in smiles_array], dtype=bool)
        selector.test_index = np.logical_not(
            np.logical_or(selector.training_index, selector.validation_index))

        selector.save(opt.output + '/part-similarity-%.2f.txt' %
                      (opt.similarity))
    elif opt.fold != 0:
        fold = opt.fold
        sel_mol.kfold_partition(1.0, fold)
        for n in range(fold):
            sel_mol.kfold_use(n)
            mol_train = sel_mol.training_set()
            mol_valid = sel_mol.validation_set()

            mol_train_dict = dict([(s, 1) for s in mol_train])
            mol_valid_dict = dict([(s, 1) for s in mol_valid])

            selector.training_index = np.array(
                [mol_train_dict.get(m, 0) for m in smiles_array], dtype=bool)
            selector.validation_index = np.array(
                [mol_valid_dict.get(m, 0) for m in smiles_array], dtype=bool)
            selector.test_index = np.logical_not(
                np.logical_or(selector.training_index,
                              selector.validation_index))

            selector.save(opt.output + '/part-%i.txt' % (n + 1))
Example #5
def main():
    parser = argparse.ArgumentParser(
        description='Alkane property fitting demo')
    parser.add_argument('-i', '--input', type=str, help='Data')
    parser.add_argument('-f', '--fp', type=str, help='Fingerprints')
    parser.add_argument('-o',
                        '--output',
                        default='out',
                        type=str,
                        help='Output directory')
    parser.add_argument('-t',
                        '--target',
                        default='raw_density',
                        type=str,
                        help='Fitting target')
    parser.add_argument('-p',
                        '--part',
                        default='',
                        type=str,
                        help='Partition cache file')
    parser.add_argument('-l',
                        '--layer',
                        default='16,16',
                        type=str,
                        help='Size of hidden layers')
    parser.add_argument('--visual',
                        default=1,
                        type=int,
                        help='Visualize the fitting results')
    parser.add_argument('--gpu', default=1, type=int, help='Using gpu')
    parser.add_argument('--epoch',
                        default="500,2000,2500",
                        type=str,
                        help='Number of epochs')
    parser.add_argument('--batch', default=1000, type=int, help='Batch size')
    parser.add_argument('--lr',
                        default="0.01,0.001,0.0001",
                        type=str,
                        help='Initial learning rate')
    parser.add_argument('--l2', default=0.000, type=float, help='L2 Penalty')
    parser.add_argument('--check',
                        default=50,
                        type=int,
                        help='Number of epochs between convergence checks')
    parser.add_argument('--minstop',
                        default=0.2,
                        type=float,
                        help='Minimum fraction of total epochs before early stopping')
    parser.add_argument('--maxconv',
                        default=2,
                        type=int,
                        help='Number of convergence detections required to stop')
    parser.add_argument('--featrm',
                        default='',
                        type=str,
                        help='Remove features')
    parser.add_argument('--optim', default='rms', type=str, help='optimizer')
    parser.add_argument('--continuation',
                        default=False,
                        type=bool,
                        help='continue training')
    parser.add_argument('--pca',
                        default=-1,
                        type=int,
                        help='dimension to discard')
    parser.add_argument(
        '--sobol',
        default=-1,
        type=int,
        help='dimensions to reduce according to sensitivity analysis')

    opt = parser.parse_args()

    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []

    opt_lr = list(map(float, opt.lr.split(',')))
    opt_epochs = list(map(int, opt.epoch.split(',')))
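    # --lr and --epoch are comma-separated stages: stage k runs opt_epochs[k]
    # epochs at learning rate opt_lr[k] (see the training loop below)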

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    logger = logging.getLogger('train')
    logger.setLevel(logging.INFO)
    flog = logging.FileHandler(opt.output + '/log.txt', mode='w')
    flog.setLevel(logging.INFO)
    formatter = logging.Formatter(
        fmt='[%(asctime)s] (%(levelname)s) %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    flog.setFormatter(formatter)
    clog = logging.StreamHandler()
    clog.setFormatter(formatter)
    logger.addHandler(flog)
    logger.addHandler(clog)

    if sys.platform == 'linux':
        logger.info('Use non-interactive Agg backend for matplotlib on linux')
        matplotlib.use('Agg')

    if opt.featrm == 'auto':
        logger.info('Automatically remove features')
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))
    logger.info('Remove Feature: %s' % featrm)

    logger.info('Reading data...')
    datax, datay, data_names = dataloader.load(filename=opt.input,
                                               target=opt.target,
                                               fps=opt.fp.split(','),
                                               featrm=featrm)

    # Store fingerprint identifier files
    for fp in opt.fp.split(','):
        if os.path.exists(fp + '.idx') and Path(fp).parent.absolute() != Path(
                opt.output).absolute():
            shutil.copy(fp + '.idx', opt.output)

    logger.info('Selecting data...')
    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        logger.info('Loading partition file %s' % opt.part)
        selector.load(opt.part)
    else:
        logger.warning(
            "Partition file not provided. Using auto-partition instead.")
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()

    logger.info('Training size = %d, Validation size = %d' %
                (len(trainx), len(validx)))
    logger.info('X input example: (size=%d) %s' %
                (len(datax[0]), ','.join(map(str, datax[0]))))
    logger.info('Y input example: (size=%d) %s' %
                (len(datay[0]), ','.join(map(str, datay[0]))))
    logger.info('Normalizing...')
    scaler = preprocessing.Scaler()
    scaler.fit(trainx)
    scaler.save(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)

    if opt.sobol != -1:
        with open(opt.output + '/sobol_idx.pkl', 'rb') as file:
            sobol_idx = pickle.load(file)
        normed_trainx, normed_validx = sobol_reduce(
            normed_trainx, normed_validx,
            len(normed_trainx[0]) - 2 - opt.sobol, sobol_idx)
        logger.info('sobol SA reduced dimension:%d' % (opt.sobol))

    if opt.pca != -1:
        normed_trainx, normed_validx, _ = pca_nd(
            normed_trainx, normed_validx,
            len(normed_trainx[0]) - opt.pca, logger)
        logger.info('pca reduced dimension:%d' % (opt.pca))

    logger.info('final input length:%d' % (len(normed_trainx[0])))
    logger.info('Building network...')
    logger.info('Hidden layers = %r' % layers)
    logger.info('optimizer = %s' % (opt.optim))
    logger.info('Learning rate = %s' % opt_lr)
    logger.info('Epochs = %s' % opt_epochs)
    logger.info('L2 penalty = %f' % opt.l2)
    logger.info('Batch size = %d' % opt.batch)

    validy_ = validy.copy()  # for further convenience
    trainy_ = trainy.copy()
    if opt.gpu:  # store everything to GPU all at once
        logger.info('Using GPU acceleration')
        device = torch.device("cuda:0")
        normed_trainx = torch.Tensor(normed_trainx).to(device)
        trainy = torch.Tensor(trainy).to(device)
        normed_validx = torch.Tensor(normed_validx).to(device)
        validy = torch.Tensor(validy).to(device)

    if opt.optim == 'sgd':
        optimizer = torch.optim.SGD
    elif opt.optim == 'adam':
        optimizer = torch.optim.Adam
    elif opt.optim == 'rms':
        optimizer = torch.optim.RMSprop
    elif opt.optim == 'ada':
        optimizer = torch.optim.Adagrad
    else:
        raise ValueError('Unknown optimizer: %s' % opt.optim)

    model = fitting.TorchMLPRegressor(len(normed_trainx[0]),
                                      len(trainy[0]),
                                      layers,
                                      batch_size=opt.batch,
                                      is_gpu=opt.gpu != 0,
                                      args_opt={
                                          'optimizer': optimizer,
                                          'lr': opt.lr,
                                          'weight_decay': opt.l2
                                      })

    model.init_session()
    if opt.continuation:
        cpt = opt.output + '/model.pt'
        logger.info('Continue training from checkpoint %s' % (cpt))
        model.load(cpt)

    logger.info('Optimizer = %s' % (optimizer))

    header = 'Step Loss MeaSquE MeaSigE MeaUnsE MaxRelE Acc2% Acc5% Acc10%'.split()
    logger.info('%-8s %8s %8s %8s %8s %8s %8s %8s %8s' % tuple(header))

    mse_history = []
    converge_times = 0
    mse_min = None
    model_saved = False
    converged = False
    all_epoch = sum(opt_epochs)
    total_epoch = 0

    for k, each_epoch in enumerate(opt_epochs):
        # implement separated learning rate
        model.reset_optimizer({
            'optimizer': optimizer,
            'lr': opt_lr[k],
            'weight_decay': opt.l2
        })
        for i in range(each_epoch):
            total_epoch += 1
            loss = model.fit_epoch(normed_trainx, trainy)
            if total_epoch % opt.check == 0:
                predy = model.predict_batch(normed_validx)
                mse = metrics.mean_squared_error(validy_, predy)
                mse_history.append(mse)
                err_line = '%-8i %8.2e %8.2e %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f' % (
                    total_epoch, loss.data.cpu().numpy()
                    if model.is_gpu else loss.data.numpy(), mse,
                    metrics.mean_signed_error(validy_, predy) * 100,
                    metrics.mean_unsigned_error(validy_, predy) * 100,
                    metrics.max_relative_error(validy_, predy) * 100,
                    metrics.accuracy(validy_, predy, 0.02) * 100,
                    metrics.accuracy(validy_, predy, 0.05) * 100,
                    metrics.accuracy(validy_, predy, 0.10) * 100)

                logger.info(err_line)
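                # checkpoint the model at the best validation MSE seen so far;
                # after opt.minstop of the planned epochs, stop once convergence
                # has been detected opt.maxconv times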

                if mse_min is None:
                    mse_min = mse
                elif mse < mse_min:
                    model.save(opt.output + '/model.pt')
                    model_saved = True
                    mse_min = mse

                if total_epoch > all_epoch * opt.minstop:
                    conv, cur_conv = validation.is_converge(
                        np.array(mse_history), nskip=25)
                    if conv:
                        logger.info('Model converge detected at epoch %d' %
                                    total_epoch)
                        converge_times += 1

                    if converge_times >= opt.maxconv and cur_conv:
                        logger.info('Model converged at epoch: %d' %
                                    total_epoch)
                        converged = True
                        break

    if not converged:
        logger.warning('Model not converged')

    if not model_saved:
        model.save(opt.output + '/model.pt')

    visualizer = visualize.LinearVisualizer(
        trainy_.reshape(-1),
        model.predict_batch(normed_trainx).reshape(-1), trainname, 'training')
    visualizer.append(validy_.reshape(-1),
                      model.predict_batch(normed_validx).reshape(-1),
                      validname, 'validation')
    visualizer.dump(opt.output + '/fit.txt')
    visualizer.dump_bad_molecules(opt.output + '/error-0.05.txt',
                                  'validation',
                                  threshold=0.05)
    visualizer.dump_bad_molecules(opt.output + '/error-0.10.txt',
                                  'validation',
                                  threshold=0.1)
    visualizer.dump_bad_molecules(opt.output + '/error-0.15.txt',
                                  'validation',
                                  threshold=0.15)
    visualizer.dump_bad_molecules(opt.output + '/error-0.20.txt',
                                  'validation',
                                  threshold=0.2)
    logger.info('Fitting result saved')

    if opt.visual:
        visualizer.scatter_yy(savefig=opt.output + '/error-train.png',
                              annotate_threshold=0,
                              marker='x',
                              lw=0.2,
                              s=5)
        visualizer.hist_error(savefig=opt.output + '/error-hist.png',
                              label='validation',
                              histtype='step',
                              bins=50)
        plt.show()
Example #6
def main():
    parser = argparse.ArgumentParser(
        description='Alkane property fitting demo')
    parser.add_argument('-i', '--input', type=str, help='Data')
    parser.add_argument('-f', '--fp', type=str, help='Fingerprints')
    parser.add_argument('-o',
                        '--output',
                        default='out',
                        type=str,
                        help='Output directory')
    parser.add_argument('-t',
                        '--target',
                        default='raw_density',
                        type=str,
                        help='Fitting target')
    parser.add_argument('-p',
                        '--part',
                        default='',
                        type=str,
                        help='Partition cache file')
    parser.add_argument('-l',
                        '--layer',
                        default='16,16',
                        type=str,
                        help='Size of hidden layers')
    parser.add_argument('--visual',
                        default=1,
                        type=int,
                        help='Visualize the fitting results')
    parser.add_argument('--gpu', default=1, type=int, help='Using gpu')
    parser.add_argument('--epoch',
                        default="200",
                        type=str,
                        help='Number of epochs')
    parser.add_argument('--step',
                        default=500,
                        type=int,
                        help='Number of steps trained for each batch')
    parser.add_argument('--batch',
                        default=int(1e9),
                        type=int,
                        help='Batch size')
    parser.add_argument('--lr',
                        default="0.005",
                        type=str,
                        help='Initial learning rate')
    parser.add_argument('--l2', default=0.000, type=float, help='L2 Penalty')
    parser.add_argument(
        '--check',
        default=10,
        type=int,
        help='Number of epochs between convergence checks. Set 0 to disable.')
    parser.add_argument('--minstop',
                        default=0.2,
                        type=float,
                        help='Minimum fraction of total epochs before early stopping')
    parser.add_argument('--maxconv',
                        default=2,
                        type=int,
                        help='Number of convergence detections required to stop')
    parser.add_argument('--featrm',
                        default='',
                        type=str,
                        help='Remove features')
    parser.add_argument('--optim', default='rms', type=str, help='optimizer')
    parser.add_argument('--continuation',
                        default=False,
                        type=bool,
                        help='continue training')
    parser.add_argument('--pca',
                        default=0,
                        type=int,
                        help='dimension to discard')
    parser.add_argument(
        '--sobol',
        default=-1,
        type=int,
        help='dimensions to reduce according to sensitivity analysis')

    opt = parser.parse_args()

    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []

    opt_lr = list(map(float, opt.lr.split(',')))
    opt_epochs = list(map(int, opt.epoch.split(',')))

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    logger = logging.getLogger('train')
    logger.setLevel(logging.INFO)
    flog = logging.FileHandler(opt.output + '/log.txt', mode='w')
    flog.setLevel(logging.INFO)
    formatter = logging.Formatter(
        fmt='[%(asctime)s] (%(levelname)s) %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    flog.setFormatter(formatter)
    clog = logging.StreamHandler()
    clog.setFormatter(formatter)
    logger.addHandler(flog)
    logger.addHandler(clog)

    if opt.featrm == 'auto':
        logger.info('Automatically remove features')
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))
    logger.info('Remove Feature: %s' % featrm)

    logger.info('Reading data...')
    datax, datay, data_names = dataloader.load(filename=opt.input,
                                               target=opt.target,
                                               fps=opt.fp.split(','),
                                               featrm=featrm)

    logger.info('Selecting data...')
    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        logger.info('Loading partition file %s' % opt.part)
        selector.load(opt.part)
    else:
        logger.warning(
            "Partition file not provided. Using auto-partition instead.")
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()

    logger.info('Training size = %d, Validation size = %d' %
                (len(trainx), len(validx)))
    logger.info('X input example: (size=%d) %s' %
                (len(datax[0]), ','.join(map(str, datax[0]))))
    logger.info('Y input example: (size=%d) %s' %
                (len(datay[0]), ','.join(map(str, datay[0]))))
    logger.info('Normalizing...')
    scaler = preprocessing.Scaler()
    scaler.fit(trainx)
    scaler.save(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)

    logger.info('Building network...')
    logger.info('Hidden layers = %r' % layers)
    logger.info('optimizer = %s' % (opt.optim))
    logger.info('Initial learning rate = %f' % opt_lr[0])
    logger.info('L2 penalty = %f' % opt.l2)
    logger.info('Total %d epochs' % sum(opt_epochs))
    logger.info('Batch = (%d values x %d steps)' % (opt.batch, opt.step))

    if opt.optim == 'sgd':
        optimizer = torch.optim.SGD
    elif opt.optim == 'adam':
        optimizer = torch.optim.Adam
    elif opt.optim == 'rms':
        optimizer = torch.optim.RMSprop
    elif opt.optim == 'ada':
        optimizer = torch.optim.Adagrad
    else:
        raise ValueError('Unknown optimizer: %s' % opt.optim)

    result = []
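    # sweep the PCA-reduced input dimension from 60 up to the full feature
    # length in steps of 5 and record the fit quality for each dimension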

    for i in range(60, len(trainx[0]), 5):
        logger.info('Start PCA training of dimension ' + str(i))
        pca_i_result = pca_train(i, normed_trainx, trainy, normed_validx,
                                 validy, opt, logger, layers, opt_lr,
                                 opt_epochs, optimizer)
        logger.info('PCA reduced result of dimension %d :' % (i))
        logger.info('%.3f variance_explained,\t acc2: %.3f,\t MSE %.3f ' %
                    (pca_i_result))
        result.append(pca_i_result)

    logger.info(result)
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dir', default='out', help='Model directory')
    parser.add_argument('-i', '--input', help='Data')
    parser.add_argument('-f', '--fp', help='Fingerprints')
    parser.add_argument('-t', '--target', help='Target property')
    parser.add_argument('-p', '--part', help='Partition file')
    parser.add_argument('--gpu', default=1, type=int, help='Using GPU')
    parser.add_argument('--visual',
                        default=1,
                        type=int,
                        help='Visualize the fitting results')
    parser.add_argument('--visualx',
                        default='',
                        help='Extra error plots for given x column indices')
    parser.add_argument('--dump', default='', help='Output of fitting results')
    parser.add_argument('--featrm',
                        default='',
                        type=str,
                        help='Remove features')

    opt = parser.parse_args()

    model = fitting.TorchMLPRegressor(None, None, [])
    model.is_gpu = opt.gpu == 1
    model.load(opt.dir + '/model.pt')

    scaler = preprocessing.Scaler()
    scaler.load(opt.dir + '/scale.txt')

    if opt.featrm == 'auto':
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))

    datax, datay, data_names = mdlearn.dataloader.load(filename=opt.input,
                                                       target=opt.target,
                                                       fps=opt.fp.split(','),
                                                       featrm=featrm)

    selector = preprocessing.Selector(datax, datay, data_names)
    selector.load(opt.part)

    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()
    testx, testy, testname = selector.test_set()

    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)
    normed_testx = scaler.transform(testx)

    trainy = trainy.flatten()
    validy = validy.flatten()
    testy = testy.flatten()

    trainy_est = model.predict_batch(normed_trainx).flatten()
    validy_est = model.predict_batch(normed_validx).flatten()
    testy_est = model.predict_batch(normed_testx).flatten()

    def evaluate_model(y, y_est):
        mse = metrics.mean_squared_error(y, y_est)
        ae = np.average(metrics.abs_absolute_error(y, y_est))
        ave_y = np.average(y)
        ave_y_est = np.average(y_est)
        bias = (ave_y_est - ave_y)

        eval_results = OrderedDict()
        eval_results['MSE'] = mse
        eval_results['RMSE'] = np.sqrt(mse)
        eval_results['AE'] = ae
        eval_results['Max AAE'] = metrics.max_absolute_error(y, y_est)
        eval_results['Bias'] = bias

        eval_results['RRMSE'] = np.sqrt(mse) / np.abs(ave_y)
        eval_results['MARE'] = ae / np.abs(ave_y)
        eval_results['Max ARE'] = metrics.max_relative_error(y, y_est)
        eval_results['RBias'] = bias / np.abs(ave_y)

        eval_results['Accuracy1%'] = metrics.accuracy(y, y_est, 0.01)
        eval_results['Accuracy2%'] = metrics.accuracy(y, y_est, 0.02)
        eval_results['Accuracy5%'] = metrics.accuracy(y, y_est, 0.05)
        eval_results['Accuracy10%'] = metrics.accuracy(y, y_est, 0.1)

        return eval_results

    results = []
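    # evaluate on the training, validation and test sets, and on all three
    # combined ('Overall')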

    results.append(evaluate_model(trainy, trainy_est))
    results.append(evaluate_model(validy, validy_est))
    results.append(evaluate_model(testy, testy_est))
    results.append(
        evaluate_model(np.concatenate((trainy, validy, testy)),
                       np.concatenate((trainy_est, validy_est, testy_est))))

    print('Dataset\t%s' % ('\t'.join(results[0].keys())))

    fmt = lambda x: '%.3g' % x

    for name, result in zip(['Training', 'Validation', 'Test', 'Overall'],
                            results):
        print('%s\t%s' % (name, '\t'.join([fmt(v) for v in result.values()])))

    visualizer = visualize.LinearVisualizer(trainy, trainy_est, trainname,
                                            'training')
    visualizer.append(validy, validy_est, validname, 'validation')
    visualizer.append(testy, testy_est, testname, 'test')
    if opt.dump:
        visualizer.dump(opt.dump)

    if opt.visual:
        visualizer.scatter_yy(annotate_threshold=0.1,
                              marker='x',
                              lw=0.2,
                              s=5,
                              figure_name='Value')
        visualizer.scatter_error(annotate_threshold=0.1,
                                 marker='x',
                                 lw=0.2,
                                 s=5,
                                 figure_name='Error')
        visualizer.hist_error(label='test',
                              histtype='step',
                              bins=50,
                              figure_name='Error Distribution')

        if opt.visualx:

            for i in map(int, opt.visualx.split(',')):
                visualizer2 = visualize.LinearVisualizer(
                    trainx[:, i], trainy_est - trainy, trainname, 'training')
                visualizer2.append(validx[:, i], validy_est - validy,
                                   validname, 'validation')
                visualizer2.append(testx[:, i], testy_est - testy, testname,
                                   'test')
                visualizer2.scatter_yy(ref=None,
                                       annotate_threshold=-1,
                                       marker='x',
                                       lw=0.2,
                                       s=5,
                                       figure_name=str(i))

        plt.show()