import json

import numpy as np

# The remaining names (MultiTaskReplayBuffer, MlpEncoder, FlattenMlp,
# env_producer, logger, ptu, ...) are assumed to come from the project's
# own modules.


def experiment(variant, bcq_policies, bcq_buffers, ensemble_params_list,
               prev_exp_state=None):
    # Create the multitask replay buffer based on the buffer list
    train_buffer = MultiTaskReplayBuffer(
        bcq_buffers_list=bcq_buffers,
    )

    # Create the multi-task environment and read off its dimensions
    env = env_producer(variant['domain'], variant['seed'])
    env_max_action = float(env.action_space.high[0])
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    vae_latent_dim = 2 * action_dim

    # The context encoder consumes (s, a, r) tuples, optionally extended
    # with the next observation s'
    mlp_encoder_input_size = (
        2 * obs_dim + action_dim + 1
        if variant['use_next_obs_in_context']
        else obs_dim + action_dim + 1
    )

    variant['env_max_action'] = env_max_action
    variant['obs_dim'] = obs_dim
    variant['action_dim'] = action_dim
    variant['mlp_encoder_input_size'] = mlp_encoder_input_size

    # Instantiate networks
    mlp_encoder = MlpEncoder(
        hidden_sizes=[200, 200, 200],
        input_size=mlp_encoder_input_size,
        output_size=2 * variant['latent_dim'],
    )
    context_encoder = ProbabilisticContextEncoder(
        mlp_encoder,
        variant['latent_dim'],
    )
    ensemble_predictor = EnsemblePredictor(ensemble_params_list)
    Qs = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    vae_decoder = VaeDecoder(
        max_action=env_max_action,
        hidden_sizes=variant['vae_hidden_sizes'],
        input_size=obs_dim + vae_latent_dim + variant['latent_dim'],
        output_size=action_dim,
    )
    perturbation_generator = PerturbationGenerator(
        max_action=env_max_action,
        hidden_sizes=variant['perturbation_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=action_dim,
    )

    trainer = SuperQTrainer(
        ensemble_predictor=ensemble_predictor,
        num_network_ensemble=variant['num_network_ensemble'],
        bcq_policies=bcq_policies,
        std_threshold=variant['std_threshold'],
        is_combine=variant['is_combine'],
        nets=[context_encoder, Qs, vae_decoder, perturbation_generator],
    )

    path_collector = RemotePathCollector(variant)

    algorithm = BatchMetaRLAlgorithm(
        trainer,
        path_collector,
        train_buffer,
        **variant['algo_params'],
    )

    algorithm.to(ptu.device)

    # Resume from the epoch after the last checkpoint, if one was given
    start_epoch = prev_exp_state['epoch'] + 1 if prev_exp_state is not None else 0

    # Log the variant
    logger.log("Variant:")
    logger.log(json.dumps(dict_to_safe_json(variant), indent=2))

    algorithm.train(start_epoch)
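# ---------------------------------------------------------------------------
# Example: a minimal `variant` dict covering the keys that `experiment` reads.
# This is an illustrative sketch, not the repo's actual config -- the domain
# name, hidden sizes, and thresholds below are placeholder values, and
# `algo_params` is assumed to hold whatever BatchMetaRLAlgorithm expects.
# ---------------------------------------------------------------------------
example_variant = dict(
    domain='ant-dir',                     # assumed environment name
    seed=0,
    latent_dim=20,
    use_next_obs_in_context=True,
    Qs_hidden_sizes=[400, 300],
    vae_hidden_sizes=[750, 750],
    perturbation_hidden_sizes=[400, 300],
    num_network_ensemble=7,
    std_threshold=0.1,
    is_combine=True,
    algo_params=dict(),                   # filled in by the launcher script
)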
        # (tail of the buffer-loading loop; rp_buffer and
        # buffer_loader_id_list are defined earlier in the script)
        bcq_buffers.append(rp_buffer)

    # Block until every remote buffer has finished loading
    ray.get(buffer_loader_id_list)

    start = variant['start']
    end = variant['end']

    for i in range(start, end):
        variant['algo_params']['train_goal_id'] = i
        variant['train_goal'] = train_goals[i]

        # Set up the logger for this goal
        variant['log_dir'] = get_log_dir(variant)
        logger.reset()
        setup_logger(log_dir=variant['log_dir'],
                     snapshot_gap=100, snapshot_mode="gap")

        # Log the variant
        logger.log("Variant:")
        logger.log(json.dumps(dict_to_safe_json(variant), indent=2))

        logger.log(f'Seed: {seed}')
        set_seed(seed)

        logger.log(f'Using GPU: {True}')
        set_gpu_mode(mode=True, gpu_id=0)

        gt.reset()

        experiment(variant, bcq_policies, bcq_buffers,
                   ensemble_params_list, prev_exp_state=None)
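# ---------------------------------------------------------------------------
# The buffer loading above follows a common Ray pattern: launch the remote
# loads, collect the returned object refs, then block once on ray.get() so
# all buffers load in parallel. A self-contained sketch of that pattern
# (the `DataBuffer` actor and its `load` method are hypothetical):
# ---------------------------------------------------------------------------
import ray

ray.init(ignore_reinit_error=True)


@ray.remote
class DataBuffer:
    def __init__(self):
        self.data = None

    def load(self, path):
        # Stand-in for the real deserialization work.
        self.data = f'contents of {path}'
        return True


buffers, load_ids = [], []
for path in ['task_0.gz', 'task_1.gz', 'task_2.gz']:
    buf = DataBuffer.remote()
    load_ids.append(buf.load.remote(path))
    buffers.append(buf)

# Blocks until every remote load has completed.
ray.get(load_ids)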