import argparse
import os
import traceback
from typing import Any, Dict, List, Optional

# NB: `get_storage_root`, `METRIC_LOSSES`, and the various `add_*_args` /
# data helpers used below are repo-local and imported at the top of the
# original source files.


def shape_get_path(k, predict_target: bool, hdims: Optional[List[int]] = None, latent_dim: int = 2,
                   metric_loss: Optional[str] = None, metric_loss_kw: Optional[Dict[str, Any]] = None):
    """ Get path of directory where models will be stored

    Args:
        k: weight parameter
        predict_target: whether generative model also predicts target value
        hdims: latent dims of target MLP predictor
        latent_dim: dimension of the latent space
        metric_loss: metric loss used to structure the embedding
        metric_loss_kw: kwargs for `metric_loss` (see `METRIC_LOSSES`)

    Returns:
        Path to result dir
    """
    res_path = os.path.join(get_storage_root(), f'logs/train/shapes/shapes-k-{k}')
    exp_spec = 'id'
    if latent_dim != 2:
        exp_spec += f'_z-dim-{latent_dim}'
    if predict_target:
        assert hdims is not None
        exp_spec += '_predy-' + '-'.join(map(str, hdims))
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](**metric_loss_kw)
    res_path = os.path.join(res_path, exp_spec)
    print('res_path', res_path)
    return res_path
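# The path helpers in this file index into the repo's `METRIC_LOSSES`
# registry, which is defined elsewhere in the codebase. A minimal sketch of
# the contract they rely on (entry names and identifier formats are
# hypothetical, not copied from the source) could look like:
METRIC_LOSSES = {
    'contrastive': {
        # turn the metric-loss kwargs into a short tag used in directory names
        'exp_metric_id': lambda threshold, **kw: f'contrastive-thr-{threshold:g}',
    },
    'triplet': {
        'exp_metric_id': lambda threshold, margin=1, **kw: f'triplet-thr-{threshold:g}-mrg-{margin:g}',
    },
}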
def get_path(k, ignore_percentile, good_percentile, predict_target: bool,
             hdims: Optional[List[int]] = None, metric_loss: Optional[str] = None,
             metric_loss_kw: Optional[Dict[str, Any]] = None):
    """ Get path of directory where models will be stored

    Args:
        k: weight parameter
        ignore_percentile: portion of original equation dataset ignored
        good_percentile: portion of good original equation dataset included
        predict_target: whether generative model also predicts target value
        hdims: latent dims of target MLP predictor
        metric_loss: metric loss used to structure the embedding
        metric_loss_kw: kwargs for `metric_loss` (see `METRIC_LOSSES`)

    Returns:
        Path to result dir
    """
    res_path = os.path.join(get_storage_root(), f'logs/train/expr/torch/expr-k-{k}')
    exp_spec = f'ignore_perc-{ignore_percentile}_good_perc-{good_percentile}'
    if predict_target:
        assert hdims is not None
        exp_spec += '_predy-' + '-'.join(map(str, hdims))
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](**metric_loss_kw)
    res_path = os.path.join(res_path, exp_spec)
    print('res_path', res_path)
    return res_path
def get_path(k, ignore_percentile, good_percentile, predict_target: bool, n_max_epochs: int,
             beta_final: float, beta_target_pred_loss: float, beta_metric_loss: float,
             latent_dim: int = 25, hdims: Optional[List[int]] = None,
             metric_loss: Optional[str] = None, metric_loss_kw: Optional[Dict[str, Any]] = None):
    """ Get path of directory where models will be stored

    Args:
        k: weight parameter
        ignore_percentile: portion of original equation dataset ignored
        good_percentile: portion of good original equation dataset included
        predict_target: whether generative model also predicts target value
        n_max_epochs: number of training epochs
        beta_final: weight of the KL in the ELBO
        beta_target_pred_loss: weight of the target prediction loss added to the ELBO
        beta_metric_loss: weight of the metric loss added to the ELBO
        latent_dim: dimension of the latent space
        hdims: latent dims of target MLP predictor
        metric_loss: metric loss used to structure the embedding
        metric_loss_kw: kwargs for `metric_loss` (see `METRIC_LOSSES`)

    Returns:
        Path to result dir
    """
    res_path = os.path.join(get_storage_root(), f'logs/train/expr/torch/expr-k-{k}')
    exp_spec = f'ignore_perc-{ignore_percentile}_good_perc-{good_percentile}_epochs-{n_max_epochs}'
    if latent_dim != 25:
        exp_spec += f'_z-dim-{latent_dim}'
    if predict_target:
        assert hdims is not None
        exp_spec += '_predy-' + '-'.join(map(str, hdims))
        if beta_target_pred_loss != 1:
            exp_spec += f'-b-{float(beta_target_pred_loss):g}'
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](**metric_loss_kw)
        if beta_metric_loss != 1:
            exp_spec += f'-b-{float(beta_metric_loss):g}'
    if beta_final != 0.005:
        exp_spec += f'-bkl-{beta_final}'
    res_path = os.path.join(res_path, exp_spec)
    print('res_path', res_path)
    return res_path
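# Usage sketch for the variant above (values illustrative; assumes
# get_storage_root() returns '/storage'):
path = get_path(
    k=1e-3, ignore_percentile=65, good_percentile=5,
    predict_target=True, n_max_epochs=300, beta_final=0.005,
    beta_target_pred_loss=1., beta_metric_loss=1., hdims=[128, 128],
)
# -> '/storage/logs/train/expr/torch/expr-k-0.001/'
#    'ignore_perc-65_good_perc-5_epochs-300_predy-128-128'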
def topology_get_path(k, predict_target: bool, n_max_epochs: int, beta_final: float,
                      beta_target_pred_loss: float, beta_metric_loss: float, latent_dim: int = 25,
                      hdims: Optional[List[int]] = None, metric_loss: Optional[str] = None,
                      metric_loss_kw: Optional[Dict[str, Any]] = None, use_binary_data: bool = False):
    """ Get path of directory where models will be stored

    Args:
        k: weight parameter
        predict_target: whether generative model also predicts target value
        n_max_epochs: number of training epochs
        beta_final: weight of the KL in the ELBO
        beta_target_pred_loss: weight of the target prediction loss added to the ELBO
        beta_metric_loss: weight of the metric loss added to the ELBO
        latent_dim: dimension of the latent space
        hdims: latent dims of target MLP predictor
        metric_loss: metric loss used to structure the embedding
        metric_loss_kw: kwargs for `metric_loss` (see `METRIC_LOSSES`)
        use_binary_data: use binarized data

    Returns:
        Path to result dir
    """
    res_path = os.path.join(get_storage_root(), f'logs/train/topology/k-{k}')
    exp_spec = 'id'
    if latent_dim != 2:
        exp_spec += f'-z_dim_{latent_dim}'
    if predict_target:
        assert hdims is not None
        exp_spec += '-predy_' + '_'.join(map(str, hdims))
        if beta_target_pred_loss != 1:
            exp_spec += f'-b_{float(beta_target_pred_loss):g}'
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](**metric_loss_kw)
        if beta_metric_loss != 1:
            exp_spec += f'-b_{float(beta_metric_loss):g}'
    exp_spec += f"-bkl_{beta_final}"
    if use_binary_data:
        exp_spec += '-binary_data'
    res_path = os.path.join(res_path, exp_spec)
    print('res_path', res_path)
    return res_path
def shape_get_path(k, predict_target: bool, n_max_epochs: int, beta_final: float,
                   beta_metric_loss: float, beta_target_pred_loss: float,
                   hdims: Optional[List[int]] = None, metric_loss: Optional[str] = None,
                   metric_loss_kw: Optional[Dict[str, Any]] = None, latent_dim: int = 2):
    """ Get path of directory where models will be stored

    Args:
        k: weight parameter
        predict_target: whether generative model also predicts target value
        n_max_epochs: number of training epochs
        beta_final: weight of the KL in the ELBO
        beta_metric_loss: weight of the metric loss added to the ELBO
        beta_target_pred_loss: weight of the target prediction loss added to the ELBO
        hdims: latent dims of target MLP predictor
        metric_loss: metric loss used to structure the embedding
        metric_loss_kw: kwargs for `metric_loss` (see `METRIC_LOSSES`)
        latent_dim: dimension of the latent space

    Returns:
        Path to result dir
    """
    res_path = os.path.join(get_storage_root(), f'logs/train/shapes/shapes-k-{k}')
    exp_spec = 'id'
    if latent_dim != 2:
        exp_spec += f'_z-dim-{latent_dim}'
    if predict_target:
        assert hdims is not None
        exp_spec += '_predy-' + '-'.join(map(str, hdims))
        if beta_target_pred_loss != 1:
            exp_spec += f'-b-{float(beta_target_pred_loss):g}'
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](**metric_loss_kw)
        if beta_metric_loss != 1:
            exp_spec += f'-b-{float(beta_metric_loss):g}'
    if beta_final != 10.0:
        exp_spec += f'-bkl-{beta_final}'
    res_path = os.path.join(res_path, exp_spec)
    print('res_path', res_path)
    return res_path
def main():
    # parse arguments
    parser = argparse.ArgumentParser()
    parser.register('type', list, parse_list)
    parser.register('type', dict, parse_dict)
    parser = add_main_args(parser)
    parser = WeightedJTNNDataset.add_model_specific_args(parser)
    parser = DataWeighter.add_weight_args(parser)
    parser = add_gp_fit_args(parser)
    parser.add_argument(
        "--input_wp",
        action='store_true',
        help="Whether to apply input warping",
    )
    parser.add_argument(
        "--predict_target",
        action='store_true',
        help="Generative model predicts target value",
    )
    parser.add_argument(
        "--target_predictor_hdims",
        type=list,
        default=None,
        help="Hidden dimensions of MLP predicting target values",
    )
    parser.add_argument(
        "--latent_dim",
        type=int,
        default=56,
        help="Hidden dimension of the latent space",
    )
    parser.add_argument(
        "--use_pretrained",
        action='store_true',
        help="True if using pretrained VAE model",
    )
    parser.add_argument(
        "--pretrained_model_id",
        type=str,
        default='vanilla',
        help="id of the pretrained VAE model used (should be aligned with the pretrained model file)",
    )

    vae_group = parser.add_argument_group("Metric learning")
    vae_group.add_argument(
        "--metric_loss",
        type=str,
        help="Metric loss to add to VAE loss during training of the generative model to get better "
             "structured latent space (see `METRIC_LOSSES`), one of ['contrastive', 'triplet', 'log_ratio', 'infob']",
    )
    vae_group.add_argument(
        "--metric_loss_kw",
        type=dict,
        default=None,
        help="Threshold parameter for contrastive loss, one of [{'threshold':.1}, {'threshold':.1,'margin':1}]",
    )
    vae_group.add_argument(
        "--beta_target_pred_loss",
        type=float,
        default=1.,
        help="Weight of the target_prediction loss added in the ELBO",
    )
    vae_group.add_argument(
        "--beta_metric_loss",
        type=float,
        default=1.,
        help="Weight of the metric loss added in the ELBO",
    )
    vae_group.add_argument(
        "--beta_final",
        type=float,
        help="Weight of the kl loss in the ELBO",
    )
    vae_group.add_argument(
        "--semi_supervised",
        action='store_true',
        help="Start BO from VAE trained with unlabelled data.",
    )
    vae_group.add_argument(
        "--n_init_bo_points",
        type=int,
        default=None,
        help="Number of data points to use at the start of the BO if using semi-supervised training of the VAE. "
             "(We need at least SOME data to fit the GP(s) etc.)",
    )
    vae_group.add_argument(
        "--n_test_points",
        type=int,
        default=2500,
        help="Number of held-out data points to use for gp fit assessment",
    )
    vae_group.add_argument(
        "--use_decoded",
        action='store_true',
        help="Whether to use f(x_test) or f(q(p(x_test))) as test target for the gp",
    )
    args = parser.parse_args()

    args.train_path = os.path.join(ROOT_PROJECT, args.train_path)
    args.val_path = os.path.join(ROOT_PROJECT, args.val_path)
    args.vocab_file = os.path.join(ROOT_PROJECT, args.vocab_file)
    args.property_file = os.path.join(ROOT_PROJECT, args.property_file)

    if args.pretrained_model_file is not None:
        args.pretrained_model_file = os.path.join(get_storage_root(), args.pretrained_model_file)
    else:
        raise ValueError("Training from scratch is not supported yet: please provide a pretrained model file.")

    # create result directory
    result_dir = get_path(
        weight_type=args.weight_type,
        k=args.rank_weight_k,
        predict_target=args.predict_target,
        latent_dim=args.latent_dim,
        hdims=args.target_predictor_hdims,
        metric_loss=args.metric_loss,
        metric_loss_kw=args.metric_loss_kw,
        input_wp=args.input_wp,
        seed=args.seed,
        beta_metric_loss=args.beta_metric_loss,
        beta_target_pred_loss=args.beta_target_pred_loss,
        beta_kl_final=args.beta_final,
        use_pretrained=args.use_pretrained,
        n_init_retrain_epochs=args.n_init_retrain_epochs,
        semi_supervised=args.semi_supervised,
        n_init_bo_points=args.n_init_bo_points,
        pretrained_model_id=args.pretrained_model_id,
        batch_size=args.batch_size,
        use_decoded=args.use_decoded,
        n_test_points=args.n_test_points,
    )
    print(f'result dir: {result_dir}')
    os.makedirs(result_dir, exist_ok=True)
    save_w_pickle(args, result_dir, 'args.pkl')

    logs = ''
    exc: Optional[Exception] = None
    try:
        main_aux(args, result_dir=result_dir)
    except Exception as e:
        logs = traceback.format_exc()
        exc = e
    with open(os.path.join(result_dir, 'logs.txt'), "a") as f:
        f.write('\n' + '--------' * 10)
        f.write(logs)
        f.write('\n' + '--------' * 10)
    if exc is not None:
        raise exc
def get_root_path(weight_type, k, predict_target, hdims, latent_dim: int, beta_kl_final: float,
                  beta_metric_loss: float, beta_target_pred_loss: float, metric_loss: str,
                  metric_loss_kw: Dict[str, Any], input_wp: bool, use_pretrained: bool,
                  pretrained_model_id: str, batch_size: int, n_init_retrain_epochs: float,
                  n_test_points: int, use_decoded: bool, semi_supervised: Optional[bool],
                  n_init_bo_points: Optional[int]):
    """ Get root result path (the associated directory will contain results for all seeds)

    Args:
        weight_type: type of weighting used for retraining
        k: weighting parameter
        predict_target: whether generative model also predicts target value
        hdims: latent dims of target MLP predictor
        latent_dim: dimension of the latent space
        beta_kl_final: weight of the KL in the ELBO
        beta_metric_loss: weight of the metric loss added to the ELBO
        beta_target_pred_loss: weight of the target prediction loss added to the ELBO
        metric_loss: metric loss used to structure embedding space
        metric_loss_kw: kwargs for metric loss
        input_wp: whether to apply input warping
        use_pretrained: whether or not to use a pretrained VAE model
        pretrained_model_id: id of the pretrained model
        batch_size: batch size used for vae training
        n_init_retrain_epochs: number of retraining epochs to do before using VAE model in BO
        n_test_points: number of test points on which gp fit will be evaluated
        use_decoded: whether to use f(x_test) or f(q(p(x_test))) as target for the gp
        semi_supervised: whether or not to start BO from VAE trained with unlabelled data
        n_init_bo_points: number of initial labelled points considered for BO with semi-supervised setting

    Returns:
        Path to result dir
    """
    result_path = os.path.join(get_storage_root(), f"logs/gp/chem/{weight_type}/k_{k}/")
    exp_spec = "gp-fit"
    exp_spec += f'-z_dim_{latent_dim}'
    exp_spec += f"-init_{n_init_retrain_epochs:g}"
    if predict_target:
        assert hdims is not None
        exp_spec += '-predy_' + '_'.join(map(str, hdims))
        exp_spec += f'-b_{float(beta_target_pred_loss):g}'
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](**metric_loss_kw)
        exp_spec += f'-b_{float(beta_metric_loss):g}'
    if input_wp:
        exp_spec += '-iw'
    exp_spec += f'-bkl_{beta_kl_final}'
    if semi_supervised:
        assert n_init_bo_points is not None, n_init_bo_points
        exp_spec += "-semi_supervised"
        exp_spec += f"-n-init-{n_init_bo_points}"
    if use_pretrained:
        if pretrained_model_id != 'vanilla':
            exp_spec += f'_pretrain-{pretrained_model_id}'
    exp_spec += f'_bs-{batch_size}'
    result_path = os.path.join(result_path, exp_spec, f"{n_test_points}" + ("-dec" if use_decoded else ""))
    return result_path
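# Usage sketch (values illustrative, including weight_type='rank'; assumes
# get_storage_root() returns '/storage'):
path = get_root_path(
    weight_type='rank', k=1e-3, predict_target=False, hdims=None,
    latent_dim=56, beta_kl_final=1., beta_metric_loss=1.,
    beta_target_pred_loss=1., metric_loss=None, metric_loss_kw=None,
    input_wp=False, use_pretrained=True, pretrained_model_id='vanilla',
    batch_size=32, n_init_retrain_epochs=1, n_test_points=2500,
    use_decoded=True, semi_supervised=False, n_init_bo_points=None,
)
# -> '/storage/logs/gp/chem/rank/k_0.001/gp-fit-z_dim_56-init_1-bkl_1.0_bs-32/2500-dec'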
def get_root_path(lso_strategy: str, weight_type, k, r, predict_target, hdims, latent_dim: int,
                  beta_kl_final: float, beta_metric_loss: float, beta_target_pred_loss: float,
                  metric_loss: str, metric_loss_kw: Dict[str, Any], acq_func_id: str,
                  acq_func_kwargs: Dict[str, Any], input_wp: bool, random_search_type: Optional[str],
                  use_pretrained: bool, pretrained_model_id: str, batch_size: int,
                  n_init_retrain_epochs: float, semi_supervised: Optional[bool],
                  n_init_bo_points: Optional[int]):
    """ Get root result path (the associated directory will contain results for all seeds)

    Args:
        lso_strategy: type of optimisation
        weight_type: type of weighting used for retraining
        k: weighting parameter
        r: period of retraining
        predict_target: whether generative model also predicts target value
        hdims: latent dims of target MLP predictor
        latent_dim: dimension of the latent space
        beta_kl_final: weight of the KL in the ELBO
        beta_metric_loss: weight of the metric loss added to the ELBO
        beta_target_pred_loss: weight of the target prediction loss added to the ELBO
        metric_loss: metric loss used to structure embedding space
        metric_loss_kw: kwargs for metric loss
        acq_func_id: name of acquisition function
        acq_func_kwargs: acquisition function kwargs
        input_wp: whether to apply input warping
        random_search_type: random search specific strategy
        use_pretrained: whether or not to use a pretrained VAE model
        pretrained_model_id: id of the pretrained model
        batch_size: batch size used for vae training
        n_init_retrain_epochs: number of retraining epochs to do before using VAE model in BO
        semi_supervised: whether or not to start BO from VAE trained with unlabelled data
        n_init_bo_points: number of initial labelled points considered for BO with semi-supervised training

    Returns:
        Path to result dir
    """
    result_path = os.path.join(get_storage_root(), f"logs/opt/chem/{weight_type}/k_{k}/r_{r}")
    exp_spec = "paper-mol"
    exp_spec += f'-z_dim_{latent_dim}'
    exp_spec += f"-init_{n_init_retrain_epochs:g}"
    if predict_target:
        assert hdims is not None
        exp_spec += '-predy_' + '_'.join(map(str, hdims))
        exp_spec += f'-b_{float(beta_target_pred_loss):g}'
    if metric_loss is not None:
        exp_spec += '-' + METRIC_LOSSES[metric_loss]['exp_metric_id'](**metric_loss_kw)
        exp_spec += f'-b_{float(beta_metric_loss):g}'
    exp_spec += f'-bkl_{beta_kl_final}'
    if semi_supervised:
        assert n_init_bo_points is not None, n_init_bo_points
        exp_spec += "-semi_supervised"
        exp_spec += f"-n-init-{n_init_bo_points}"
    if use_pretrained:
        exp_spec += f'_pretrain-{pretrained_model_id}'
    else:
        exp_spec += '_scratch'
    if batch_size != 32:
        exp_spec += f'_bs-{batch_size}'

    if lso_strategy == 'opt':
        acq_func_spec = ''
        if acq_func_id != 'ExpectedImprovement':
            acq_func_spec += acq_func_id
        acq_func_spec += f"{'_inwp_' if input_wp else str(input_wp)}"
        # if 'ErrorAware' in acq_func_id and cost_aware_gamma_sched is not None:
        #     acq_func_spec += f"_sch-{cost_aware_gamma_sched}"
        if len(acq_func_kwargs) > 0:
            acq_func_spec += f'_{str_dict(acq_func_kwargs)}'
        result_path = os.path.join(result_path, exp_spec, acq_func_spec)
    elif lso_strategy == 'sample':
        raise NotImplementedError('Sample lso strategy not supported')
        # result_path = os.path.join(result_path, exp_spec, 'latent-sample')
    elif lso_strategy == 'random_search':
        base = 'latent-random-search'
        if random_search_type == 'sobol':
            base += '-sobol'
        else:
            assert random_search_type is None, f'{random_search_type} is invalid'
        result_path = os.path.join(result_path, exp_spec, base)
    else:
        raise ValueError(f'{lso_strategy} not supported: try `opt`, `sample`...')
    return result_path
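# `str_dict` is a repo-local helper used above to fold acquisition-function
# kwargs into the directory name. A plausible minimal sketch (implementation
# assumed, not taken from the source):
def str_dict(d: dict) -> str:
    # e.g. {'beta': 0.1} -> 'beta-0.1'
    return '_'.join(f'{key}-{d[key]}' for key in sorted(d))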
import argparse
import os
import sys

import pytorch_lightning as pl

from weighted_retraining.weighted_retraining.chem.chem_data import WeightedJTNNDataset
from weighted_retraining.weighted_retraining import utils
# NB: `JTVAE`, `get_storage_root` and `print_flush` are imported from other
# repo-local modules in the original file.

if __name__ == "__main__":
    # Create arg parser
    parser = argparse.ArgumentParser()
    parser = JTVAE.add_model_specific_args(parser)
    parser = WeightedJTNNDataset.add_model_specific_args(parser)
    parser = utils.DataWeighter.add_weight_args(parser)
    utils.add_default_trainer_args(parser, default_root=None)

    # Parse arguments
    hparams = parser.parse_args()
    hparams.root_dir = os.path.join(get_storage_root(), hparams.root_dir)
    pl.seed_everything(hparams.seed)
    print_flush(' '.join(sys.argv[1:]))

    # Create data
    datamodule = WeightedJTNNDataset(hparams, utils.DataWeighter(hparams))
    datamodule.setup("fit")

    # Load model
    model = JTVAE(hparams, datamodule.vocab)

    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        period=1,  # NB: `period` was replaced by `every_n_epochs` in recent pytorch-lightning releases
        monitor="loss/val",
        save_top_k=1,
        save_last=True,
        mode='min',
    )
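# The snippet stops after building the checkpoint callback; the original
# script presumably goes on to run training. A sketch of how such a callback
# is typically wired into a pl.Trainer (argument values assumed, e.g. that
# add_default_trainer_args provides `max_epochs`; not the repo's actual call):
trainer = pl.Trainer(
    default_root_dir=hparams.root_dir,
    max_epochs=hparams.max_epochs,
    callbacks=[checkpoint_callback],
)
trainer.fit(model, datamodule=datamodule)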