def get_model(pretrained_model_file, latent_dim, n_init_retrain_epochs, n_retrain_epochs, retrain_from_scratch, ite, save_dir, data_enc, data_scores, data_weighter):
    """ load or train the model """

    if ite == 1:
        print_flush("Loading pre-trained model...")
        new_weights_dir = pretrained_model_file

    else:
        print_flush("\t(Re-)training model...")

        # compute sample weights (multiply scores by -1 as our goal is _minimization_)
        sample_weights = data_weighter.weighting_function(-1 * data_scores)
        if data_weighter.weight_type == "rank":
            # for rank-based weighting, normalize the weights and reduce their variance
            sample_weights = DataWeighter.normalize_weights(sample_weights)
            sample_weights, data = DataWeighter.reduce_weight_variance(sample_weights, data_enc)
        else:
            data = data_enc

        # train model
        new_weights_dir = str(save_dir / 'expr.hdf5')
        prev_weights_dir = pretrained_model_file if ite == 2 else new_weights_dir.replace(f"opt{ite}", f"opt{ite-1}")
        n_epochs = int(np.ceil(n_init_retrain_epochs if ite == 1 and n_init_retrain_epochs else n_retrain_epochs))
        train_model(retrain_from_scratch, latent_dim, n_epochs, data, new_weights_dir, prev_weights_dir, sample_weights)

    # load trained model
    model = EquationGrammarModel(new_weights_dir, latent_rep_size=latent_dim)

    return model
def latent_sampling(n_samples, model, latent_dim, n_decode_attempts):
    """ Draws samples from latent space and appends to the dataset """

    print_flush("\t\tPicking new inputs via sampling...")
    new_latents = np.random.randn(n_samples, latent_dim)
    new_inputs = model.decode_from_latent_space(zs=new_latents, n_decode_attempts=n_decode_attempts)
    new_scores = expr_data.score_function(new_inputs)

    return new_inputs, new_scores
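

# --- Minimal, self-contained sketch of the prior-sampling step in latent_sampling.
# `toy_decode` and `toy_score` are stand-ins (NOT part of this codebase) for
# model.decode_from_latent_space and expr_data.score_function.
import numpy as np

def toy_decode(zs):
    """Pretend decoder: map each latent vector to a string 'expression'."""
    return ["x*{:.2f}".format(float(z[0])) for z in zs]

def toy_score(exprs):
    """Pretend score: length of the decoded string."""
    return np.array([len(e) for e in exprs], dtype=float)

latent_dim, n_samples = 25, 4
zs = np.random.randn(n_samples, latent_dim)   # draw latents from the N(0, I) prior
exprs = toy_decode(zs)
scores = toy_score(exprs)
print(list(zip(exprs, scores)))
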
def update_dataset_and_weights(new_inputs, new_scores, data_str, data_enc,
                               data_scores, model):
    """ update the dataet and the sample weights """

    # discard invalid (None) inputs and their corresponding scores
    valid_idx = np.array([x is not None for x in new_inputs])
    valid_inputs = list(np.asarray(new_inputs, dtype=object)[valid_idx])
    valid_scores = new_scores[valid_idx]
    print_flush("\tDiscarding {}/{} new inputs that are invalid!".format(
        len(new_inputs) - len(valid_inputs), len(new_inputs)))

    # add new inputs and scores to dataset, both as plain string and one-hot vector
    print_flush("\tAppending new valid inputs to dataset...")
    data_str += valid_inputs
    new_inputs_one_hot = model.smiles_to_one_hot(valid_inputs)
    data_enc = np.append(data_enc, new_inputs_one_hot, axis=0)
    data_scores = np.append(data_scores, valid_scores)

    return data_str, data_enc, data_scores
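

# --- Standalone sketch (toy data only) of the None-filtering pattern used above:
# invalid decodings come back as None and are dropped with a boolean mask.
import numpy as np

new_inputs = np.array(["sin(x)", None, "x + 1", None], dtype=object)
new_scores = np.array([0.3, np.nan, 0.1, np.nan])   # nan = placeholder for invalid entries

valid_idx = np.array([x is not None for x in new_inputs])
valid_inputs = list(new_inputs[valid_idx])
valid_scores = new_scores[valid_idx]
print(valid_inputs, valid_scores)   # ['sin(x)', 'x + 1'] [0.3 0.1]
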
def get_latent_encodings(use_test_set,
                         use_full_data_for_gp,
                         model,
                         data_file,
                         data_scores,
                         data_str,
                         n_best,
                         n_rand,
                         bs=5000):
    """ get latent encodings and split data into train and test data """

    print_flush(
        "\tComputing latent training data encodings and corresponding scores..."
    )
    n_batches = int(np.ceil(len(data_str) / bs))
    Xs = [
        model.encode(data_str[i * bs:(i + 1) * bs])
        for i in tqdm(range(n_batches))
    ]
    X = np.concatenate(Xs, axis=0)
    y = data_scores.reshape((-1, 1))

    return subsample_dataset(X, y, data_file, use_test_set,
                             use_full_data_for_gp, n_best, n_rand)
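

# --- Standalone sketch of the batched-encoding loop above, with a toy "encoder"
# (`toy_encode` stands in for model.encode and is NOT part of this codebase).
import numpy as np

def toy_encode(batch):
    """Pretend encoder: a 2-D 'latent' per input string."""
    return np.array([[len(s), s.count("x")] for s in batch], dtype=float)

data_str = ["x", "x+1", "sin(x)*x", "exp(x)"]
bs = 2
n_batches = int(np.ceil(len(data_str) / bs))
Xs = [toy_encode(data_str[i * bs:(i + 1) * bs]) for i in range(n_batches)]
X = np.concatenate(Xs, axis=0)
print(X.shape)   # (4, 2)
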
def latent_optimization(n_bo_iters, opt_iter, model, seed, n_inducing_points, directory, start_time, n_decode_attempts, use_test_set, use_full_data_for_gp, data_scores, data_str, n_best, n_rand):
    """ run Bayesian optimization loop """

    # compute latent encodings and corresponding scores to fit the GP on
    data_file = directory / "data.npz"
    X_train, y_train, X_test, y_test, X_mean, y_mean, X_std, y_std = expr_data.get_latent_encodings(
        use_test_set, use_full_data_for_gp, model, data_file, data_scores, data_str, n_best, n_rand)

    gp_file = None
    new_inputs = np.array([])
    new_scores = np.array([])
    for ite in range(n_bo_iters):
        print_flush("\tBO ITERATION {}/{} ({:.3f}s)".format(ite + 1, n_bo_iters, time.time() - start_time))

        # set random seed
        iter_seed = seed * ((opt_iter - 1) * n_bo_iters + ite)
        np.random.seed(iter_seed)
        tf.random.set_seed(iter_seed)

        # fit the GP model (using gpflow)
        print_flush("\t\tFitting predictive model...")
        init = kmean_init = gp_file is None
        new_gp_file = directory / "gp_iter.npz"
        gp_train(nZ=n_inducing_points, data_file=data_file, logfile=directory / "gp_train.log", save_file=new_gp_file,
                 n_perf_measure=1, use_test_set=X_test is not None, init=init, kmeans_init=kmean_init, gp_file=gp_file)
        gp_file = new_gp_file

        # identify the best inputs (using gpflow)
        print_flush("\t\tPicking new inputs via optimization...")
        new_latents = gp_opt(gp_file, data_file, directory / "gp_opt_res.npy", 1, directory / "gp_opt.log")
        new_latents = new_latents * X_std + X_mean

        # compute and save new inputs and corresponding scores
        new_inputs = np.append(new_inputs, model.decode_from_latent_space(zs=new_latents, n_decode_attempts=n_decode_attempts))
        new_scores = np.append(new_scores, expr_data.score_function([new_inputs[-1]]))

        # add new inputs and scores to training set and get new dataset filename
        X_train, y_train = expr_data.append_trainset(X_train, y_train, new_latents, np.array([new_scores[-1]]))
        expr_data.save_data(X_train, y_train, X_test, y_test, X_mean, X_std, y_mean, y_std, data_file)

    return new_inputs, new_scores
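

# --- Sketch of the latent (un)standardization in the BO loop: assuming the GP is
# fit on standardized latents, its proposals are mapped back with X_std / X_mean
# before decoding (as in `new_latents * X_std + X_mean` above). Toy data only.
import numpy as np

X = np.random.randn(100, 25) * 3.0 + 1.0      # toy "raw" latent training data
X_mean, X_std = X.mean(axis=0), X.std(axis=0)
X_standardized = (X - X_mean) / X_std          # what the GP would be fit on
z_proposed = np.random.randn(1, 25)            # a proposal in standardized space
z_raw = z_proposed * X_std + X_mean            # back to the model's latent space
print(z_raw.shape)                             # (1, 25)
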
def main():
    """ Train model from scratch """

    # parse arguments
    parser = argparse.ArgumentParser()
    parser = DataWeighter.add_weight_args(parser)
    parser.add_argument(
        "--latent_dim",
        type=int,
        default=25,
        help="dimensionality of latent space",
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        default="assets/data/expr"
        help="directory of datasets",
    )
    parser.add_argument(
        "--root_dir",
        type=str,
        required=True,
        help="directory of model",
    )
    parser.add_argument(
        "--n_epochs",
        type=int,
        default=50,
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=512,
    )
    parser.add_argument(
        "--ignore_percentile",
        type=int,
        default=50,
        help="percentile of scores to ignore"
    )
    parser.add_argument(
        "--k",
        type=str,
        default="inf",
        help="k parameter for rank weighting",
    )
    args = parser.parse_args()
    args.weight_type = "rank"
    args.rank_weight_k = float(args.k)

    # print python command run
    print_flush(' '.join(sys.argv[1:]))

    # set random seed
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)

    # run tensorflow in eager mode
    tf.config.experimental_run_functions_eagerly(True)

    # load and subsample equation dataset and compute corresponding scores
    data_str = load_data_str(Path(args.data_dir))
    data_enc = load_data_enc(Path(args.data_dir))
    data_scores = score_function(data_str)
    perc = np.percentile(data_scores, args.ignore_percentile)
    perc_idx = data_scores >= perc
    data = data_enc[perc_idx]
    scores = -data_scores[perc_idx]

    # compute sample weights (multiply scores by -1 as our goal is _minimization_)
    data_weighter = DataWeighter(args)
    sample_weights = DataWeighter.normalize_weights(data_weighter.weighting_function(scores))
    sample_weights, data = DataWeighter.reduce_weight_variance(sample_weights, data)

    # train model
    model_dir = Path(args.root_dir) / f"expr-k_{args.k}.hdf5"
    train_model(True, args.latent_dim, args.n_epochs, data, model_dir, sample_weights=sample_weights, batch_size=args.batch_size)
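

# --- Hedged sketch of a rank-based weighting scheme of the kind DataWeighter
# appears to implement (w_i proportional to 1 / (k*N + rank_i), rank 0 = best);
# the real implementation may differ, e.g. in normalization or in how it
# special-cases k = "inf" (which corresponds to near-uniform weights).
import numpy as np

def rank_weights(scores, k):
    """Toy rank weighting: higher score = better, rank 0 is the best point."""
    N = len(scores)
    ranks = np.argsort(np.argsort(-scores))    # 0 for the largest score
    weights = 1.0 / (k * N + ranks)
    return weights / weights.mean()            # normalize to mean 1

scores = np.array([0.1, 2.0, 0.7, 1.5])
print(rank_weights(scores, k=1e-3))            # small k: strongly favors the best points
print(rank_weights(scores, k=1e3))             # large k: close to uniform
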
def main():
    """ main """

    # parse arguments
    parser = argparse.ArgumentParser()
    parser = add_common_args(parser)
    parser = add_gp_args(parser)
    parser = DataWeighter.add_weight_args(parser)
    parser.add_argument(
        "--latent_dim",
        type=int,
        default=25,
        help="dimensionality of latent space",
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        default="/home/ead54/rds/hpc-work/project_data/data",
        help="directory of datasets",
    )
    parser.add_argument(
        '--retrain_from_scratch',
        dest="retrain_from_scratch",
        action="store_true",
        help="flag to retrain the generative model from scratch in every iteration"
    )
    parser.add_argument(
        "--n_data",
        type=int,
        default=100000,
        help="number of datapoints to use",
    )
    parser.add_argument(
        "--ignore_percentile",
        type=int,
        default=0,
        help="percentile of scores to ignore"
    )
    parser.add_argument(
        '--use_test_set',
        dest="use_test_set",
        action="store_true",
        help="flag to use a test set for evaluating the sparse GP"
    )
    parser.add_argument(
        '--use_full_data_for_gp',
        dest="use_full_data_for_gp",
        action="store_true",
        help="flag to use the full dataset for training the GP"
    )
    parser.add_argument(
        "--n_decode_attempts",
        type=int,
        default=100,
        help="number of decoding attempts",
    )

    args = parser.parse_args()

    # set random seed
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)

    # run tensorflow in eager mode
    tf.config.experimental_run_functions_eagerly(True)

    # create result directory
    directory = Path(args.result_root)
    directory.mkdir(parents=True, exist_ok=True)

    # define DataWeighter
    data_weighter = DataWeighter(args)

    # get initial dataset
    data_str, data_enc, data_scores = expr_data.get_initial_dataset_and_weights(
        Path(args.data_dir), args.ignore_percentile, args.n_data)

    # print python command run
    cmd = ' '.join(sys.argv[1:])
    print_flush(f"{cmd}\n")

    # set up results dictionary
    results = dict(
        params=cmd,
        opt_points=[],
        opt_point_properties=[],
        opt_model_version=[],
        sample_points=[],
        sample_versions=[],
        sample_properties=[],
    )

    start_time = time.time()
    n_opt_iters = int(np.ceil(args.query_budget / args.retraining_frequency))
    n_bo_iters = args.query_budget // n_opt_iters
    for ite in range(1, n_opt_iters + 1):
        print_flush("OPTIMIZATION ITERATION {}/{} ({:.3f}s)".format(ite, n_opt_iters, time.time() - start_time))
        opt_dir = directory / "opt{}".format(ite)
        opt_dir.mkdir(exist_ok=True)

        # load/update model
        model = train_expr.get_model(args.pretrained_model_file, args.latent_dim, args.n_init_retrain_epochs,
                                     args.n_retrain_epochs, args.retrain_from_scratch, ite, opt_dir, data_enc, data_scores, data_weighter)

        # draw and store samples from model's latent space
        if args.samples_per_model > 0:
            sample_x, sample_y = latent_sampling(args.samples_per_model, model, args.latent_dim, args.n_decode_attempts)
            results["sample_points"].append(sample_x)
            results["sample_properties"].append(sample_y)
            results["sample_versions"].append(ite-1)

        # select new inputs via optimization or sampling
        if args.lso_strategy == "opt":
            new_inputs, new_scores = latent_optimization(n_bo_iters, ite, model, args.seed, args.n_inducing_points, opt_dir, start_time, args.n_decode_attempts,
                args.use_test_set, args.use_full_data_for_gp, data_scores, data_str, args.n_best_points, args.n_rand_points)
        elif args.lso_strategy == "sample":
            new_inputs, new_scores = latent_sampling(n_bo_iters, model, args.latent_dim, args.n_decode_attempts)
        else:
            raise NotImplementedError(args.lso_strategy)

        # update dataset and weights
        data_str, data_enc, data_scores = expr_data.update_dataset_and_weights(
            new_inputs, new_scores, data_str, data_enc, data_scores, model)

        # add new results
        results["opt_points"] += list(new_inputs)
        results["opt_point_properties"] += list(new_scores)
        results["opt_model_version"] += [ite-1] * len(new_inputs)

        # save results
        np.savez_compressed(str(directory / "results.npz"), **results)

    print_flush("=== DONE ({:.3f}s) ===".format(time.time() - start_time))