def experiment(variant, data):
    # Make a new env; reloading with data['evaluation/env'] seems to cause a bug.
    eval_env = gym.make("panda-v0", **{"headless": variant["headless"]})
    eval_env.seed(variant['seed'])
    expl_env = eval_env

    # Restore networks from the loaded snapshot.
    qf1 = data['trainer/qf1']
    qf2 = data['trainer/qf2']
    target_qf1 = data['trainer/target_qf1']
    target_qf2 = data['trainer/target_qf2']
    policy = data['trainer/policy']
    eval_policy = data["evaluation/policy"]

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )

    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    if variant['load_buffer'] and buffer_filename is not None:
        replay_buffer.load_buffer(buffer_filename)
    else:
        dataset = get_dataset(variant["h5path"], eval_env)
        load_hdf5(d4rl.qlearning_dataset(eval_env, dataset), replay_buffer)

    trainer = CQLTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train(start_epoch=variant["start_epoch"])
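
# The function above relies on a get_dataset(h5path, env) helper that is not defined in this
# section. The sketch below is only an illustration of what such a loader might do, assuming the
# HDF5 file stores flat 'observations', 'actions', 'rewards', and 'terminals' arrays (plus an
# optional 'timeouts' array); the actual helper used here may differ.
import h5py
import numpy as np

def get_dataset_sketch(h5path, env):
    """Load an offline dataset from an HDF5 file into an in-memory dict of numpy arrays."""
    data = {}
    with h5py.File(h5path, 'r') as f:
        for key in ['observations', 'actions', 'rewards', 'terminals']:
            data[key] = np.array(f[key])
        if 'timeouts' in f:
            data['timeouts'] = np.array(f['timeouts'])
    # Sanity check: observation width should match the environment's observation space.
    assert data['observations'].shape[1] == env.observation_space.low.size
    return data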
def experiment(variant):
    eval_env = gym.make(variant['env_name'])
    expl_env = eval_env
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = FlattenMlp_Dropout(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp_Dropout(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp_Dropout(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp_Dropout(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    vae_policy = VAEPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[750, 750],
        latent_dim=action_dim * 2,
    )
    eval_path_collector = CustomMDPPathCollector(
        eval_env,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )

    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    load_hdf5(eval_env.unwrapped.get_dataset(), replay_buffer,
              max_size=variant['replay_buffer_size'])

    trainer = UWACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        vae=vae_policy,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        batch_rl=True,
        q_learning_alg=True,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    eval_env = gym.make(variant['env_name'])
    expl_env = eval_env
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )

    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    if variant['load_buffer'] and buffer_filename is not None:
        replay_buffer.load_buffer(buffer_filename)
    elif 'random-expert' in variant['env_name']:
        load_hdf5(d4rl.basic_dataset(eval_env), replay_buffer)
    else:
        load_hdf5(d4rl.qlearning_dataset(eval_env), replay_buffer)

    trainer = CQLTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
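
# For reference, a variant for the D4RL/CQL experiment above might be shaped roughly as below.
# The keys are read directly from the function body; the concrete values (env name, network
# width, trainer/algorithm kwargs) are illustrative placeholders, not the settings used here.
example_variant = dict(
    env_name='halfcheetah-medium-v0',   # any d4rl env id
    layer_size=256,                     # hidden width M for the Q-networks and policy
    buffer_filename=None,               # set to a saved buffer path to use replay_buffer.load_buffer(...)
    load_buffer=True,                   # also forwarded as batch_rl to TorchBatchRLAlgorithm
    replay_buffer_size=int(2E6),
    trainer_kwargs=dict(                # CQLTrainer hyperparameters (illustrative values)
        discount=0.99,
        policy_lr=1E-4,
        qf_lr=3E-4,
    ),
    algorithm_kwargs=dict(              # TorchBatchRLAlgorithm settings (illustrative values)
        num_epochs=1000,
        batch_size=256,
        max_path_length=1000,
        num_eval_steps_per_epoch=1000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
# experiment(example_variant)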
def experiment(variant):
    expl_env = gym.make(variant['env_name'])
    eval_env = expl_env
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],  # kept small to make it easier to visualize
    )
    vae_policy = VAEPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        latent_dim=action_dim * 2,
    )
    eval_path_collector = CustomMDPPathCollector(
        eval_env,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    load_hdf5(eval_env.unwrapped.get_dataset(), replay_buffer)

    trainer = BEARTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        vae=vae_policy,
        **variant['trainer_kwargs'],
    )
    # variant['algorithm_kwargs']['max_path_length'] = expl_env._max_episode_steps
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        batch_rl=True,
        q_learning_alg=True,  # must be True: BEAR is a Q-learning algorithm
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
file = '/home/tibo/Documents/Prog/Git/d4rl/d4rl_evaluations/bear/data/BEAR-launch/27189/BEAR_launch/27189_2020_08_11_13_10_11_0000--s-0/params.pkl'
env_name = 'flow-ring-v0'

# Load network data from the saved snapshot.
network_data = torch.load(file)
policy = network_data['trainer/policy']
qf1 = network_data['trainer/qf1']
qf2 = network_data['trainer/qf2']
target_qf1 = network_data['trainer/target_qf1']
target_qf2 = network_data['trainer/target_qf2']
vae = network_data['trainer/vae']

eval_env = gym.make(env_name)
expl_env = eval_env

eval_path_collector = CustomMDPPathCollector(
    eval_env,
)
expl_path_collector = MdpPathCollector(
    expl_env,
    policy,
)

replay_buffer_size = int(2E4)  # no trailing comma here, otherwise this becomes a 1-tuple
buffer_filename = None
print(replay_buffer_size)
replay_buffer = EnvReplayBuffer(
    replay_buffer_size,
    expl_env,
)
# load_hdf5(offline_dataset, replay_buffer, max_size=replay_buffer_size)

algorithm_kwargs = dict(
def experiment(variant):
    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env
    action_dim = eval_env.action_space.low.size

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=48,
        input_height=48,
        input_channels=3,
        output_size=1,
        added_fc_input_size=action_dim,
    )
    cnn_params.update(
        output_size=256,
        added_fc_input_size=0,
        hidden_sizes=[1024, 512],
    )
    policy_obs_processor = CNN(**cnn_params)
    policy = TanhGaussianPolicy(
        obs_dim=cnn_params['output_size'],
        action_dim=action_dim,
        hidden_sizes=[256, 256, 256],
        obs_processor=policy_obs_processor,
    )

    if variant['stoch_eval_policy']:
        eval_policy = policy
    else:
        eval_policy = MakeDeterministic(policy)

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )

    observation_key = 'image'
    replay_buffer = load_data_from_npy_chaining(
        variant, expl_env, observation_key)

    trainer = BCTrainer(
        env=eval_env,
        policy=policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=True,
        **variant['algorithm_kwargs']
    )
    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
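
# A rough sketch of the variant consumed by the image-based BC experiment above. The keys listed
# here are the ones read directly in the function body; load_data_from_npy_chaining and
# VideoSaveFunction read additional keys that are not visible in this snippet. All values are
# illustrative placeholders, and the field names inside cnn_params must match whatever the CNN
# class constructor actually expects.
example_bc_variant = dict(
    env='Widow250PickTray-v0',      # hypothetical roboverse env id
    stoch_eval_policy=False,        # False -> evaluate with MakeDeterministic(policy)
    cnn_params=dict(
        # Convolutional architecture for the policy obs processor; input width/height/channels,
        # output_size and added_fc_input_size are overwritten inside experiment().
        kernel_sizes=[3, 3, 3],
        n_channels=[16, 16, 16],
        strides=[1, 1, 1],
        paddings=[1, 1, 1],
    ),
    trainer_kwargs=dict(),          # BCTrainer hyperparameters
    algorithm_kwargs=dict(),        # TorchBatchRLAlgorithm settings (epochs, batch size, ...)
)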
def experiment(variant):
    env_name = variant['env_name']
    if env_name in ENVS:
        eval_env = NormalizedBoxEnv(ENVS[env_name]())
        expl_env = eval_env
    else:
        eval_env = NormalizedBoxEnv(gym.make(variant['env_name']))
        expl_env = eval_env
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    vae_policy = VAEPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[750, 750],
        latent_dim=action_dim * 2,
    )
    eval_path_collector = CustomMDPPathCollector(
        eval_env,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )

    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    # './data/sac-point-robot/2021_01_04_22_25_16_exp_0000_s_0/offline_buffer_itr_140.hdf5'
    # './data/sac-point-robot/2021_01_04_22_25_16_exp_0000_s_0/online_buffer.hdf5'
    file_path = './data/sac-point-robot/2021_01_04_22_25_16_exp_0000_s_0/offline_buffer_itr_140.hdf5'
    load_hdf5(get_dataset(file_path), replay_buffer,
              max_size=variant['replay_buffer_size'])

    trainer = BEARTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        vae=vae_policy,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        batch_rl=True,
        q_learning_alg=True,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    eval_env = gym.make(
        variant['env_name'],
        **{
            "headless": variant["headless"],
            "verbose": variant["verbose"],
        })
    eval_env.seed(variant['seed'])
    expl_env = eval_env
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )

    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    if variant['load_buffer'] and buffer_filename is not None:
        replay_buffer.load_buffer(buffer_filename)
    else:
        dataset = get_dataset(variant["h5path"], eval_env)
        load_hdf5(d4rl.qlearning_dataset(eval_env, dataset), replay_buffer)

    trainer = CQLTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)

    # TODO: remove this `with` block once the anomaly has been tracked down.
    with torch.autograd.set_detect_anomaly(True):
        algorithm.train()
def experiment(variant):
    eval_env = gym.make(variant['env_name'])
    expl_env = eval_env
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    # Q and policy networks
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    ).to(ptu.device)
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    ).to(ptu.device)
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    ).to(ptu.device)
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    ).to(ptu.device)

    # Initialize the policy with behavior cloning or not.
    if variant['bc_model'] is None:
        policy = TanhGaussianPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[M, M],
        ).to(ptu.device)
    else:
        bc_model = Mlp(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=[64, 64],
            output_activation=F.tanh,
        ).to(ptu.device)
        checkpoint = torch.load(variant['bc_model'], map_location=map_location)
        bc_model.load_state_dict(checkpoint['network_state_dict'])
        print('Loading bc model: {}'.format(variant['bc_model']))
        # Policy initialized with the BC model as its mean network.
        policy = TanhGaussianPolicy_BC(
            obs_dim=obs_dim,
            action_dim=action_dim,
            mean_network=bc_model,
            hidden_sizes=[M, M],
        ).to(ptu.device)

    # If a bonus is used: define the bonus networks.
    if not variant['offline']:
        bonus_layer_size = variant['bonus_layer_size']
        bonus_network = Mlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            hidden_sizes=[bonus_layer_size, bonus_layer_size],
            output_activation=F.sigmoid,
        ).to(ptu.device)
        checkpoint = torch.load(variant['bonus_path'], map_location=map_location)
        bonus_network.load_state_dict(checkpoint['network_state_dict'])
        print('Loading bonus model: {}'.format(variant['bonus_path']))

        if variant['initialize_Q'] and bonus_layer_size == M:
            target_qf1.load_state_dict(checkpoint['network_state_dict'])
            target_qf2.load_state_dict(checkpoint['network_state_dict'])
            print('Initialize QF1 and QF2 with the bonus model: {}'.format(
                variant['bonus_path']))
        if variant['initialize_Q'] and bonus_layer_size != M:
            print('Size mismatch between Q and bonus - turning off the initialization')

    # eval_policy = MakeDeterministic(policy)
    eval_path_collector = CustomMDPPathCollector(
        eval_env,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )

    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    dataset = eval_env.unwrapped.get_dataset()
    load_hdf5(dataset, replay_buffer, max_size=variant['replay_buffer_size'])

    if variant['normalize']:
        obs_mu, obs_std = dataset['observations'].mean(axis=0), dataset['observations'].std(axis=0)
        bonus_norm_param = [obs_mu, obs_std]
    else:
        bonus_norm_param = [None] * 2

    # Shift the reward if requested.
    if variant['reward_shift'] is not None:
        rewards_shift_param = min(dataset['rewards']) - variant['reward_shift']
        print('.... reward is shifted: {}'.format(rewards_shift_param))
    else:
        rewards_shift_param = None

    if variant['offline']:
        trainer = SACTrainer(
            env=eval_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            target_qf1=target_qf1,
            target_qf2=target_qf2,
            rewards_shift_param=rewards_shift_param,
            **variant['trainer_kwargs'],
        )
        print('Agent of type offline SAC created')
    elif variant['bonus'] == 'bonus_add':
        trainer = SAC_BonusTrainer(
            env=eval_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            target_qf1=target_qf1,
            target_qf2=target_qf2,
            bonus_network=bonus_network,
            beta=variant['bonus_beta'],
            use_bonus_critic=variant['use_bonus_critic'],
            use_bonus_policy=variant['use_bonus_policy'],
            use_log=variant['use_log'],
            bonus_norm_param=bonus_norm_param,
            rewards_shift_param=rewards_shift_param,
            device=ptu.device,
            **variant['trainer_kwargs'],
        )
        print('Agent of type SAC + additive bonus created')
    elif variant['bonus'] == 'bonus_mlt':
        trainer = SAC_BonusTrainer_Mlt(
            env=eval_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            target_qf1=target_qf1,
            target_qf2=target_qf2,
            bonus_network=bonus_network,
            beta=variant['bonus_beta'],
            use_bonus_critic=variant['use_bonus_critic'],
            use_bonus_policy=variant['use_bonus_policy'],
            bonus_norm_param=bonus_norm_param,
            rewards_shift_param=rewards_shift_param,
            device=ptu.device,
            **variant['trainer_kwargs'],
        )
        print('Agent of type SAC + multiplicative bonus created')
    else:
        raise ValueError('Not implemented error')

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        batch_rl=True,
        q_learning_alg=True,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env
    action_dim = eval_env.action_space.low.size

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=48,
        input_height=48,
        input_channels=3,
        output_size=1,
        added_fc_input_size=action_dim,
    )
    qf1 = ConcatCNN(**cnn_params)
    qf2 = ConcatCNN(**cnn_params)
    target_qf1 = ConcatCNN(**cnn_params)
    target_qf2 = ConcatCNN(**cnn_params)

    cnn_params.update(
        output_size=256,
        added_fc_input_size=0,
        hidden_sizes=[1024, 512],
    )
    policy_obs_processor = CNN(**cnn_params)
    policy = TanhGaussianPolicy(
        obs_dim=cnn_params['output_size'],
        action_dim=action_dim,
        hidden_sizes=[256, 256, 256],
        obs_processor=policy_obs_processor,
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )

    observation_key = 'image'
    replay_buffer = load_data_from_npy_chaining(
        variant, expl_env, observation_key)

    # Translate 0/1 rewards to +4/+10 rewards.
    if variant['use_positive_rew']:
        if set(np.unique(replay_buffer._rewards)).issubset({0, 1}):
            replay_buffer._rewards = replay_buffer._rewards * 6.0
            replay_buffer._rewards = replay_buffer._rewards + 4.0
        assert set(np.unique(replay_buffer._rewards)).issubset(
            set(6.0 * np.array([0, 1]) + 4.0))

    trainer = CQLTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=True,
        **variant['algorithm_kwargs'],
    )
    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
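
# None of the experiment functions above set up logging or select a device themselves. A typical
# rlkit-style entry point would look roughly like the sketch below; the exact launcher used with
# these scripts may differ, though setup_logger and set_gpu_mode are standard rlkit utilities.
if __name__ == "__main__":
    from rlkit.launchers.launcher_util import setup_logger
    import rlkit.torch.pytorch_util as ptu

    variant = dict()  # fill in per the experiment being launched (see the examples above)
    setup_logger('offline-rl-experiment', variant=variant)
    ptu.set_gpu_mode(True)  # select GPU before algorithm.to(ptu.device) is called
    experiment(variant)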