Example #1
import os
from typing import Any, Dict, List, Optional

# `get_storage_root` and `METRIC_LOSSES` are assumed project imports.
def shape_get_path(k, predict_target: bool, hdims: Optional[List[int]] = None, latent_dim: int = 2,
                   metric_loss: Optional[str] = None, metric_loss_kw: Optional[Dict[str, Any]] = None):
    """ Get path of directory where models will be stored

    Args:
        latent_dim: dimension of the latent space
        k: weight parameter
        predict_target: whether generative model also predicts target value
        hdims: latent dims of target MLP predictor
        metric_loss: metric loss used to structure the embedding
        metric_loss_kw: kwargs for `metric_loss` (see `METRIC_LOSSES`)

    Returns:
        Path to result dir
    """
    res_path = os.path.join(get_storage_root(), f'logs/train/shapes/shapes-k-{k}')
    exp_spec = 'id'
    if latent_dim != 2:
        exp_spec += f'_z-dim-{latent_dim}'
    if predict_target:
        assert hdims is not None
        exp_spec += '_predy-' + '-'.join(map(str, hdims))
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](**metric_loss_kw)
    res_path = os.path.join(res_path, exp_spec)
    print('res_path', res_path)
    return res_path
def get_path(k, ignore_percentile, good_percentile, predict_target: bool, hdims: Optional[List[int]] = None,
             metric_loss: Optional[str] = None, metric_loss_kw: Optional[Dict[str, Any]] = None):
    """ Get path of directory where models will be stored

    Args:
        k: weight parameter
        ignore_percentile: portion of original equation dataset ignored
        good_percentile: portion of good original equation dataset included
        predict_target: whether generative model also predicts target value
        hdims: latent dims of target MLP predictor
        metric_loss: metric loss used to structure the embedding
        metric_loss_kw: kwargs for `metric_loss` (see `METRIC_LOSSES`)

    Returns:
        Path to result dir
    """
    res_path = os.path.join(get_storage_root(), f'logs/train/expr/torch/expr-k-{k}')
    exp_spec = f'ignore_perc-{ignore_percentile}_good_perc-{good_percentile}'
    if predict_target:
        assert hdims is not None
        exp_spec += '_predy-' + '-'.join(map(str, hdims))
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](**metric_loss_kw)
    res_path = os.path.join(res_path, exp_spec)
    print('res_path', res_path)
    return res_path
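A minimal usage sketch (hypothetical argument values; the prefix depends on `get_storage_root()`):

p = get_path(k=1e-3, ignore_percentile=65, good_percentile=5,
             predict_target=True, hdims=[128, 128])
# -> <storage>/logs/train/expr/torch/expr-k-0.001/ignore_perc-65_good_perc-5_predy-128-128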
Example #3
import os
from typing import Any, Dict, List, Optional
def get_path(k,
             ignore_percentile,
             good_percentile,
             predict_target: bool,
             n_max_epochs: int,
             beta_final: float,
             beta_target_pred_loss: float,
             beta_metric_loss: float,
             latent_dim: int = 25,
             hdims: Optional[List[int]] = None,
             metric_loss: Optional[str] = None,
             metric_loss_kw: Optional[Dict[str, Any]] = None):
    """ Get path of directory where models will be stored

    Args:
        n_max_epochs: number of training epochs
        k: weight parameter
        ignore_percentile: portion of original equation dataset ignored
        good_percentile: portion of good original equation dataset included
        beta_metric_loss: weight of the metric loss added to the ELBO
        beta_final: weight of the KL in the ELBO
        beta_target_pred_loss: weight of the target prediction loss added to the ELBO
        latent_dim: dimension of the latent space
        predict_target: whether generative model also predicts target value
        hdims: latent dims of target MLP predictor
        metric_loss: metric loss used to structure the embedding
        metric_loss_kw: kwargs for `metric_loss` (see `METRIC_LOSSES`)

    Returns:
        Path to result dir
    """
    res_path = os.path.join(get_storage_root(),
                            f'logs/train/expr/torch/expr-k-{k}')
    exp_spec = f'ignore_perc-{ignore_percentile}_good_perc-{good_percentile}_epochs-{n_max_epochs}'
    if latent_dim != 25:
        exp_spec += f'_z-dim-{latent_dim}'
    if predict_target:
        assert hdims is not None
        exp_spec += '_predy-' + '-'.join(map(str, hdims))
        if beta_target_pred_loss != 1:
            exp_spec += f'-b-{float(beta_target_pred_loss):g}'
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](
            **metric_loss_kw)
        if beta_metric_loss != 1:
            exp_spec += f'-b-{float(beta_metric_loss):g}'
    if beta_final != 0.005:
        exp_spec += f'-bkl-{beta_final}'
    res_path = os.path.join(res_path, exp_spec)
    print('res_path', res_path)
    return res_path
def topology_get_path(k,
                      predict_target: bool,
                      n_max_epochs: int,
                      beta_final: float,
                      beta_target_pred_loss: float,
                      beta_metric_loss: float,
                      latent_dim: int = 25,
                      hdims: Optional[List[int]] = None,
                      metric_loss: Optional[str] = None,
                      metric_loss_kw: Optional[Dict[str, Any]] = None,
                      use_binary_data: bool = False):
    """ Get path of directory where models will be stored

    Args:
        k: weight parameter
        n_max_epochs: number of training epochs
        beta_metric_loss: weight of the metric loss added to the ELBO
        beta_final: weight of the KL in the ELBO
        beta_target_pred_loss: weight of the target prediction loss added to the ELBO
        latent_dim: dimension of the latent space
        predict_target: whether generative model also predicts target value
        hdims: latent dims of target MLP predictor
        metric_loss: metric loss used to structure the embedding
        metric_loss_kw: kwargs for `metric_loss` (see `METRIC_LOSSES`)
        use_binary_data: use binarized data

    Returns:
        Path to result dir
    """
    res_path = os.path.join(get_storage_root(), f'logs/train/topology/k-{k}')
    exp_spec = 'id'
    if latent_dim != 2:
        exp_spec += f'-z_dim_{latent_dim}'
    if predict_target:
        assert hdims is not None
        exp_spec += '-predy_' + '_'.join(map(str, hdims))
        if beta_target_pred_loss != 1:
            exp_spec += f'-b_{float(beta_target_pred_loss):g}'
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](
            **metric_loss_kw)
        if beta_metric_loss != 1:
            exp_spec += f'-b_{float(beta_metric_loss):g}'
    exp_spec += f"-bkl_{beta_final}"
    if use_binary_data:
        exp_spec += '-binary_data'
    res_path = os.path.join(res_path, exp_spec)
    print('res_path', res_path)
    return res_path
def shape_get_path(k,
                   predict_target: bool,
                   n_max_epochs: int,
                   beta_final: float,
                   beta_metric_loss: float,
                   beta_target_pred_loss: float,
                   hdims: Optional[List[int]] = None,
                   metric_loss: Optional[str] = None,
                   metric_loss_kw: Optional[Dict[str, Any]] = None,
                   latent_dim: int = 2):
    """ Get path of directory where models will be stored

    Args:
        k: weight parameter
        n_max_epochs: number of training epochs
        predict_target: whether generative model also predicts target value
        beta_metric_loss: weight of the metric loss added to the ELBO
        beta_final: weight of the KL in the ELBO
        beta_target_pred_loss: weight of the target prediction loss added to the ELBO
        latent_dim: dimension of the latent space
        hdims: latent dims of target MLP predictor
        metric_loss: metric loss used to structure the embedding
        metric_loss_kw: kwargs for `metric_loss` (see `METRIC_LOSSES`)

    Returns:
        Path to result dir
    """
    res_path = os.path.join(get_storage_root(),
                            f'logs/train/shapes/shapes-k-{k}')
    exp_spec = 'id'
    if latent_dim != 2:
        exp_spec += f'_z-dim-{latent_dim}'
    if predict_target:
        assert hdims is not None
        exp_spec += '_predy-' + '-'.join(map(str, hdims))
        if beta_target_pred_loss != 1:
            exp_spec += f'-b-{float(beta_target_pred_loss):g}'
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](
            **metric_loss_kw)
        if beta_metric_loss != 1:
            exp_spec += f'-b-{float(beta_metric_loss):g}'
    if beta_final != 10.0:
        exp_spec += f'-bkl-{beta_final}'
    res_path = os.path.join(res_path, exp_spec)
    print('res_path', res_path)
    return res_path
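As an illustration (hypothetical values): `topology_get_path` always appends the `-bkl_...` suffix, whereas `get_path` and `shape_get_path` only do so when `beta_final` differs from their defaults (0.005 and 10.0):

p = topology_get_path(k=1e-3, predict_target=False, n_max_epochs=300,
                      beta_final=1e-4, beta_target_pred_loss=1.,
                      beta_metric_loss=1., latent_dim=2)
# -> <storage>/logs/train/topology/k-0.001/id-bkl_0.0001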
Example #6
import argparse
import os
import traceback
from typing import Optional

# `parse_list`, `parse_dict`, `add_main_args`, `add_gp_fit_args`,
# `WeightedJTNNDataset`, `DataWeighter`, `get_path`, `get_storage_root`,
# `save_w_pickle`, `main_aux` and `ROOT_PROJECT` are assumed project imports.
def main():
    # parse arguments
    parser = argparse.ArgumentParser()
    parser.register('type', list, parse_list)
    parser.register('type', dict, parse_dict)

    parser = add_main_args(parser)
    parser = WeightedJTNNDataset.add_model_specific_args(parser)
    parser = DataWeighter.add_weight_args(parser)
    parser = add_gp_fit_args(parser)

    parser.add_argument(
        "--input_wp",
        action='store_true',
        help="Whether to apply input warping"
    )
    parser.add_argument(
        "--predict_target",
        action='store_true',
        help="Generative model predicts target value",
    )
    parser.add_argument(
        "--target_predictor_hdims",
        type=list,
        default=None,
        help="Hidden dimensions of MLP predicting target values",
    )
    parser.add_argument(
        "--latent_dim",
        type=int,
        default=56,
        help="Hidden dimension the latent space",
    )
    parser.add_argument(
        "--use_pretrained",
        action='store_true',
        help="True if using pretrained VAE model",
    )
    parser.add_argument(
        "--pretrained_model_id",
        type=str,
        default='vanilla',
        help="id of the pretrained VAE model used (should be aligned with the pretrained model file)",
    )

    vae_group = parser.add_argument_group("Metric learning")
    vae_group.add_argument(
        "--metric_loss",
        type=str,
        help="Metric loss to add to VAE loss during training of the generative model to get better "
             "structured latent space (see `METRIC_LOSSES`), one of ['contrastive', 'triplet', 'log_ratio', 'infob']",
    )
    vae_group.add_argument(
        "--metric_loss_kw",
        type=dict,
        default=None,
        help="Threshold parameter for contrastive loss, one of [{'threshold':.1}, {'threshold':.1,'margin':1}]",
    )
    vae_group.add_argument(
        "--beta_target_pred_loss",
        type=float,
        default=1.,
        help="Weight of the target_prediction loss added in the ELBO",
    )
    vae_group.add_argument(
        "--beta_metric_loss",
        type=float,
        default=1.,
        help="Weight of the metric loss added in the ELBO",
    )
    vae_group.add_argument(
        "--beta_final",
        type=float,
        help="Weight of the kl loss in the ELBO",
    )
    vae_group.add_argument(
        "--semi_supervised",
        action='store_true',
        help="Start BO from VAE trained with unlabelled data.",
    )
    vae_group.add_argument(
        "--n_init_bo_points",
        type=int,
        default=None,
        help="Number of data points to use at the start of the BO if using semi-supervised training of the VAE."
             "(We need at least SOME data to fit the GP(s) etc.)",
    )

    vae_group.add_argument(
        "--n_test_points",
        type=int,
        default=2500,
        help="Number of held-out data points to use for gp fit assessment"
    )

    vae_group.add_argument(
        "--use_decoded",
        action='store_true',
        help="whether to use f(x_test) or f(q(p(x_test))) as test target for the gp"
    )
    args = parser.parse_args()

    args.train_path = os.path.join(ROOT_PROJECT, args.train_path)
    args.val_path = os.path.join(ROOT_PROJECT, args.val_path)
    args.vocab_file = os.path.join(ROOT_PROJECT, args.vocab_file)
    args.property_file = os.path.join(ROOT_PROJECT, args.property_file)

    if args.pretrained_model_file is not None:
        args.pretrained_model_file = os.path.join(get_storage_root(), args.pretrained_model_file)
    else:
        raise ValueError("does not support this yet, use pretrained model, please.")

    # create result directory
    result_dir = get_path(
        weight_type=args.weight_type,
        k=args.rank_weight_k,
        predict_target=args.predict_target,
        latent_dim=args.latent_dim,
        hdims=args.target_predictor_hdims,
        metric_loss=args.metric_loss,
        metric_loss_kw=args.metric_loss_kw,
        input_wp=args.input_wp,
        seed=args.seed,
        beta_metric_loss=args.beta_metric_loss,
        beta_target_pred_loss=args.beta_target_pred_loss,
        beta_kl_final=args.beta_final,
        use_pretrained=args.use_pretrained,
        n_init_retrain_epochs=args.n_init_retrain_epochs,
        semi_supervised=args.semi_supervised,
        n_init_bo_points=args.n_init_bo_points,
        pretrained_model_id=args.pretrained_model_id,
        batch_size=args.batch_size,
        use_decoded=args.use_decoded,
        n_test_points=args.n_test_points,
    )
    print(f'result dir: {result_dir}')
    os.makedirs(result_dir, exist_ok=True)
    save_w_pickle(args, result_dir, 'args.pkl')
    logs = ''
    exc: Optional[Exception] = None
    try:
        main_aux(args, result_dir=result_dir)
    except Exception as e:
        logs = traceback.format_exc()
        exc = e
    with open(os.path.join(result_dir, 'logs.txt'), "a") as f:
        f.write('\n' + '--------' * 10)
        f.write(logs)
        f.write('\n' + '--------' * 10)
    if exc is not None:
        raise exc
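`parse_list` and `parse_dict` are registered as argparse converters above but are not shown here; a plausible sketch of such converters, assuming Python-literal CLI values (an assumption, not necessarily the project's implementation):

import ast

def parse_list(s: str) -> list:
    # e.g. --target_predictor_hdims "[128,128]" -> [128, 128]
    return list(ast.literal_eval(s))

def parse_dict(s: str) -> dict:
    # e.g. --metric_loss_kw "{'threshold':.1}" -> {'threshold': 0.1}
    return dict(ast.literal_eval(s))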
Example #7
import os
from typing import Any, Dict, Optional

# `get_storage_root`, `METRIC_LOSSES` and `str_dict` are assumed project imports.
def get_root_path(weight_type, k,
                  predict_target, hdims, latent_dim: int, beta_kl_final: float, beta_metric_loss: float,
                  beta_target_pred_loss: float,
                  metric_loss: str, metric_loss_kw: Dict[str, Any],
                  input_wp: bool,
                  use_pretrained: bool, pretrained_model_id: str, batch_size: int,
                  n_init_retrain_epochs: float, n_test_points: int, use_decoded: bool,
                  semi_supervised: Optional[bool], n_init_bo_points: Optional[int]
                  ):
    """ Get result root result path (associated directory will contain results for all seeds)
    Args:
        batch_size: batch size used for vae training
        pretrained_model_id: id of the pretrained model
        weight_type: type of weighting used for retraining
        k: weighting parameter
        predict_target: whether generative model also predicts target value
        hdims: latent dims of target MLP predictor
        metric_loss: metric loss used to structure embedding space
        metric_loss_kw: kwargs for metric loss
        beta_metric_loss: weight of the metric loss added to the ELBO
        beta_kl_final: weight of the KL in the ELBO
        beta_target_pred_loss: weight of the target prediction loss added to the ELBO
        latent_dim: dimension of the latent space
        use_pretrained: Whether or not to use a pretrained VAE model
        n_init_retrain_epochs: number of retraining epochs to do before using VAE model in BO
        semi_supervised: whether or not to start BO from VAE trained with unlabelled data
        n_init_bo_points: number of initial labelled points considered for BO with semi-supervised setting
        n_test_points: number of test points on which GP fit will be evaluated
        use_decoded: whether to use f(x_test) or f(q(p(x_test))) as target for the GP

    Returns:
        path to result dir
    """
    result_path = os.path.join(
        get_storage_root(),
        f"logs/gp/chem/{weight_type}/k_{k}/")

    exp_spec = f"gp-fit"
    exp_spec += f'-z_dim_{latent_dim}'
    exp_spec += f"-init_{n_init_retrain_epochs:g}"
    if predict_target:
        assert hdims is not None
        exp_spec += '-predy_' + '_'.join(map(str, hdims))
        exp_spec += f'-b_{float(beta_target_pred_loss):g}'
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](**metric_loss_kw)
        exp_spec += f'-b_{float(beta_metric_loss):g}'
    if input_wp:
        exp_spec += '-iw'
    exp_spec += f'-bkl_{beta_kl_final}'
    if semi_supervised:
        assert n_init_bo_points is not None, n_init_bo_points
        exp_spec += "-semi_supervised"
        exp_spec += f"-n-init-{n_init_bo_points}"
    if use_pretrained:
        if pretrained_model_id != 'vanilla':
            exp_spec += f'_pretrain-{pretrained_model_id}'
    exp_spec += f'_bs-{batch_size}'
    result_path = os.path.join(result_path, exp_spec, f"{n_test_points}" + ("-dec" if use_decoded else ""))

    return result_path
def get_root_path(lso_strategy: str, weight_type, k, r, predict_target, hdims,
                  latent_dim: int, beta_kl_final: float,
                  beta_metric_loss: float, beta_target_pred_loss: float,
                  metric_loss: str, metric_loss_kw: Dict[str, Any],
                  acq_func_id: str, acq_func_kwargs: Dict[str, Any],
                  input_wp: bool, random_search_type: Optional[str],
                  use_pretrained: bool, pretrained_model_id: str,
                  batch_size: int, n_init_retrain_epochs: float,
                  semi_supervised: Optional[bool],
                  n_init_bo_points: Optional[int]):
    """ Get result root result path (associated directory will contain results for all seeds)
    Args:
        batch_size: batch size used for vae training
        pretrained_model_id: id of the pretrained model
        lso_strategy: type of optimisation
        weight_type: type of weighting used for retraining
        k: weighting parameter
        r: period of retraining
        predict_target: whether generative model also predicts target value
        hdims: latent dims of target MLP predictor
        metric_loss: metric loss used to structure embedding space
        metric_loss_kw: kwargs for metric loss
        acq_func_id: name of acquisition function
        acq_func_kwargs: acquisition function kwargs
        random_search_type: random search specific strategy
        beta_metric_loss: weight of the metric loss added to the ELBO
        beta_kl_final: weight of the KL in the ELBO
        beta_target_pred_loss: weight of the target prediction loss added to the ELBO
        latent_dim: dimension of the latent space
        use_pretrained: Whether or not to use a pretrained VAE model
        n_init_retrain_epochs: number of retraining epochs to do before using VAE model in BO
        semi_supervised: whether or not to start BO from VAE trained with unlabelled data
        n_init_bo_points: number of initial labelled points considered for BO with semi-supervised training

    Returns:
        path to result dir
    """
    result_path = os.path.join(get_storage_root(),
                               f"logs/opt/chem/{weight_type}/k_{k}/r_{r}")

    exp_spec = f"paper-mol"
    exp_spec += f'-z_dim_{latent_dim}'
    exp_spec += f"-init_{n_init_retrain_epochs:g}"
    if predict_target:
        assert hdims is not None
        exp_spec += '-predy_' + '_'.join(map(str, hdims))
        exp_spec += f'-b_{float(beta_target_pred_loss):g}'
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](
            **metric_loss_kw)
        exp_spec += f'-b_{float(beta_metric_loss):g}'
    exp_spec += f'-bkl_{beta_kl_final}'
    if semi_supervised:
        assert n_init_bo_points is not None, n_init_bo_points
        exp_spec += "-semi_supervised"
        exp_spec += f"-n-init-{n_init_bo_points}"
    if use_pretrained:
        exp_spec += f'_pretrain-{pretrained_model_id}'
    else:
        exp_spec += '_scratch'
    if batch_size != 32:
        exp_spec += f'_bs-{batch_size}'

    if lso_strategy == 'opt':
        acq_func_spec = ''
        if acq_func_id != 'ExpectedImprovement':
            acq_func_spec += acq_func_id

        acq_func_spec += f"{'_inwp_' if input_wp else str(input_wp)}" \
            # if 'ErrorAware' in acq_func_id and cost_aware_gamma_sched is not None:

        #     acq_func_spec += f"_sch-{cost_aware_gamma_sched}"
        if len(acq_func_kwargs) > 0:
            acq_func_spec += f'_{str_dict(acq_func_kwargs)}'
        result_path = os.path.join(result_path, exp_spec, acq_func_spec)

    elif lso_strategy == 'sample':
        raise NotImplementedError('Sample lso strategy not supported')
        # result_path = os.path.join(result_path, exp_spec, f'latent-sample')
    elif lso_strategy == 'random_search':
        base = 'latent-random-search'
        if random_search_type == 'sobol':
            base += '-sobol'
        else:
            assert random_search_type is None, f'{random_search_type} is invalid'
        result_path = os.path.join(result_path, exp_spec, base)
    else:
        raise ValueError(
            f'{lso_strategy} not supported: try `opt` or `random_search`')
    return result_path
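A hypothetical call to the optimisation variant of `get_root_path` (values made up) and the directory it maps to; note the trailing 'False' component comes from the `str(input_wp)` branch:

p = get_root_path(lso_strategy='opt', weight_type='rank', k=1e-3, r=50,
                  predict_target=False, hdims=None, latent_dim=56,
                  beta_kl_final=1e-3, beta_metric_loss=1., beta_target_pred_loss=1.,
                  metric_loss=None, metric_loss_kw={},
                  acq_func_id='ExpectedImprovement', acq_func_kwargs={},
                  input_wp=False, random_search_type=None,
                  use_pretrained=True, pretrained_model_id='vanilla',
                  batch_size=32, n_init_retrain_epochs=1,
                  semi_supervised=False, n_init_bo_points=None)
# -> <storage>/logs/opt/chem/rank/k_0.001/r_50/paper-mol-z_dim_56-init_1-bkl_0.001_pretrain-vanilla/False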
Example #9
import argparse
import os
import sys

import pytorch_lightning as pl

# `JTVAE`, `get_storage_root` and `print_flush` are assumed project imports.
from weighted_retraining.weighted_retraining.chem.chem_data import WeightedJTNNDataset
from weighted_retraining.weighted_retraining import utils

if __name__ == "__main__":

    # Create arg parser
    parser = argparse.ArgumentParser()
    parser = JTVAE.add_model_specific_args(parser)
    parser = WeightedJTNNDataset.add_model_specific_args(parser)
    parser = utils.DataWeighter.add_weight_args(parser)
    utils.add_default_trainer_args(parser, default_root=None)

    # Parse arguments
    hparams = parser.parse_args()

    hparams.root_dir = os.path.join(get_storage_root(), hparams.root_dir)

    pl.seed_everything(hparams.seed)
    print_flush(' '.join(sys.argv[1:]))

    # Create data
    datamodule = WeightedJTNNDataset(hparams, utils.DataWeighter(hparams))
    datamodule.setup("fit")

    # Load model
    model = JTVAE(hparams, datamodule.vocab)

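    # NOTE: `period` is the argument name used by older PyTorch Lightning
    # releases; newer versions renamed it to `every_n_epochs`.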
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        period=1, monitor="loss/val", save_top_k=1,
        save_last=True, mode='min'
    )
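The snippet ends with the callback definition; a minimal sketch of how it would typically be wired into a trainer (assuming `add_default_trainer_args` registered `--max_epochs`):

    trainer = pl.Trainer(
        default_root_dir=hparams.root_dir,
        max_epochs=hparams.max_epochs,  # assumed to be added by add_default_trainer_args
        callbacks=[checkpoint_callback],
    )
    trainer.fit(model, datamodule=datamodule)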