def __init__(self, task=None, n_tasks=2, randomize_tasks=False):
    # Avoid a mutable default argument for `task`.
    task = {} if task is None else task
    distributions = [-1, 1]
    self.tasks = [{'distributions': distribution}
                  for distribution in distributions]
    self._task = task
    self._goal_dir = task.get('distributions', 1)  # defaults to 1
    self._goal = self._goal_dir
    # Load both ends of the quality spectrum for HalfCheetah.
    self.env_cheetah_0 = gym.make('halfcheetah-random-v0')
    self.env_cheetah_1 = gym.make('halfcheetah-expert-v0')
    self.d0 = d4rl.qlearning_dataset(self.env_cheetah_0)
    self.d1 = d4rl.qlearning_dataset(self.env_cheetah_1)
    self.d2 = self.env_cheetah_0.get_dataset()
    super(HalfCheetahDistEnv, self).__init__()
def main(args):
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)
    tester.configure(task_name='model_learn',
                     private_config_path=os.path.join(get_package_path(), 'rla_config.yaml'),
                     run_file='train_model_offline.py',
                     log_root=get_package_path())
    tester.log_files_gen()
    tester.print_args()
    env = gym.make('{}-{}-v0'.format(args.env, args.quality))
    dataset = d4rl.qlearning_dataset(env)  # env.qlearning_dataset()
    obs_dim = dataset['observations'].shape[1]
    act_dim = dataset['actions'].shape[1]
    model = construct_model(obs_dim=obs_dim, act_dim=act_dim,
                            hidden_dim=args.hidden_dim,
                            num_networks=args.num_networks,
                            num_elites=args.num_elites,
                            model_type=args.model_type,
                            separate_mean_var=args.separate_mean_var,
                            name=model_name(args))
    dataset['rewards'] = np.expand_dims(dataset['rewards'], 1)
    train_inputs, train_outputs = format_samples_for_training(dataset)
    model.train(train_inputs, train_outputs,
                batch_size=args.batch_size,
                holdout_ratio=args.holdout_ratio,
                max_epochs=args.max_epochs,
                max_t=args.max_t)
    model.save(args.model_dir, 0)
def train_d4rl_sbc(args):
    test_env = gym.make(args.env_id)
    test_env.seed(args.seed)
    state_space = test_env.observation_space
    action_space = test_env.action_space

    # create agent
    agent = dc.sbc.SBCAgent(
        state_space.shape[0],
        action_space.shape[0],
        args.log_std_low,
        args.log_std_high,
    )

    # get offline dataset
    dset = d4rl.qlearning_dataset(test_env)
    dset_size = dset["observations"].shape[0]

    # create replay buffer
    buffer = dc.replay.ReplayBuffer(
        size=dset_size,
        state_shape=state_space.shape,
        state_dtype=float,
        action_shape=action_space.shape,
    )
    buffer.load_experience(
        dset["observations"],
        dset["actions"],
        dset["rewards"],
        dset["next_observations"],
        dset["terminals"],
    )

    # run sbc
    dc.sbc.sbc(agent=agent, test_env=test_env, buffer=buffer, **vars(args))
def create_d4rl_env_and_dataset(task_name, batch_size):
    """Create gym environment and dataset for d4rl.

    Args:
      task_name: Name of d4rl task.
      batch_size: Mini batch size.

    Returns:
      Gym env and dataset.
    """
    env = gym.make(task_name)
    env = wrappers.GymWrapper(env)
    dataset = d4rl.qlearning_dataset(env)
    states = np.array(dataset['observations'], dtype=np.float32)
    actions = np.array(dataset['actions'], dtype=np.float32)
    rewards = np.array(dataset['rewards'], dtype=np.float32)
    discounts = np.array(np.logical_not(dataset['terminals']), dtype=np.float32)
    next_states = np.array(dataset['next_observations'], dtype=np.float32)
    dataset = tf_data.Dataset.from_tensor_slices(
        Inputs(data=(states, actions, rewards, discounts, next_states))
    ).cache().shuffle(
        states.shape[0], reshuffle_each_iteration=True
    ).repeat().batch(
        batch_size, drop_remainder=True
    ).prefetch(tf_data.experimental.AUTOTUNE)
    return env, dataset
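# Usage sketch (hedged): the tf.data pipeline returned above repeats forever,
# so a training loop would draw batches from an iterator rather than iterate
# the dataset to exhaustion. Task name and field access are illustrative.
def demo_create_d4rl_env_and_dataset():
    env, dataset = create_d4rl_env_and_dataset('hopper-medium-v0', batch_size=256)
    data_iter = iter(dataset)
    batch = next(data_iter)  # one Inputs tuple of (s, a, r, discount, s') tensors
    print(batch.data[0].shape)  # (256, obs_dim)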
def populate_replay_buffer(self, env_name):
    data_envs = {
        'HalfCheetah-v2': (
            "awac_data/hc_action_noise_15.npy",
            "awac_data/hc_off_policy_15_demos_100.npy"),
        'Ant-v2': (
            "awac_data/ant_action_noise_15.npy",
            "awac_data/ant_off_policy_15_demos_100.npy"),
        'Walker2d-v2': (
            "awac_data/walker_action_noise_15.npy",
            "awac_data/walker_off_policy_15_demos_100.npy"),
    }
    if env_name in data_envs:
        print('Loading saved data')
        for file in data_envs[env_name]:
            if not os.path.exists(file):
                warnings.warn(colored(
                    'Offline data not found. Follow awac_data/instructions.txt '
                    'to download. Running without offline data.', 'red'))
                break
            data = np.load(file, allow_pickle=True)
            for demo in data:
                for transition in list(zip(demo['observations'], demo['actions'],
                                           demo['rewards'], demo['next_observations'],
                                           demo['terminals'])):
                    self.replay_buffer.store(*transition)
    else:
        dataset = d4rl.qlearning_dataset(self.env)
        N = dataset['rewards'].shape[0]
        for i in range(N):
            self.replay_buffer.store(dataset['observations'][i],
                                     dataset['actions'][i],
                                     dataset['rewards'][i],
                                     dataset['next_observations'][i],
                                     float(dataset['terminals'][i]))
        print("Loaded dataset")
def restore_pool_d4rl(replay_pool, name):
    import gym
    import d4rl
    data = d4rl.qlearning_dataset(gym.make(name))
    # Reshape rewards and terminals to column vectors before adding to the pool.
    data['rewards'] = np.expand_dims(data['rewards'], axis=1)
    data['terminals'] = np.expand_dims(data['terminals'], axis=1)
    replay_pool.add_samples(data)
def load_buffer_d4rl(expert_data_task: str) -> ReplayBuffer:
    dataset = d4rl.qlearning_dataset(gym.make(expert_data_task))
    replay_buffer = ReplayBuffer.from_data(
        obs=dataset["observations"],
        act=dataset["actions"],
        rew=dataset["rewards"],
        done=dataset["terminals"],
        obs_next=dataset["next_observations"])
    return replay_buffer
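# Usage sketch (hedged): with tianshou's ReplayBuffer.from_data as above, the
# loader can be called with any d4rl task id; the buffer length then equals
# the number of transitions. Task name is illustrative.
def demo_load_buffer_d4rl():
    buffer = load_buffer_d4rl("halfcheetah-medium-v0")
    print(len(buffer))  # number of stored transitions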
def populate_replay_buffer(self):
    dataset = d4rl.qlearning_dataset(self.env)
    n = dataset['observations'].shape[0]
    # Copy the full dataset into the front of each pre-allocated buffer array.
    self.replay_buffer.obs_buf[:n, :] = dataset['observations']
    self.replay_buffer.act_buf[:n, :] = dataset['actions']
    self.replay_buffer.obs2_buf[:n, :] = dataset['next_observations']
    self.replay_buffer.rew_buf[:n] = dataset['rewards']
    self.replay_buffer.done_buf[:n] = dataset['terminals']
    self.replay_buffer.size = n
    self.replay_buffer.ptr = (self.replay_buffer.size + 1) % self.replay_buffer.max_size
def __init__(self, inputs: str, ioctx: IOContext = None):
    """Initialize a D4RLReader.

    Args:
        inputs (str): String corresponding to the D4RL environment name.
        ioctx (IOContext): Current IO context object.
    """
    import d4rl
    self.env = gym.make(inputs)
    self.dataset = convert_to_batch(d4rl.qlearning_dataset(self.env))
    assert self.dataset.count >= 1
    self.counter = 0
def experiment(variant, data):
    # Make a fresh env; reloading with data['evaluation/env'] seems to trigger a bug.
    eval_env = gym.make("panda-v0", **{"headless": variant["headless"]})
    eval_env.seed(variant['seed'])
    expl_env = eval_env

    qf1 = data['trainer/qf1']
    qf2 = data['trainer/qf2']
    target_qf1 = data['trainer/target_qf1']
    target_qf2 = data['trainer/target_qf2']
    policy = data['trainer/policy']
    eval_policy = data["evaluation/policy"]

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(eval_env, )

    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    if variant['load_buffer'] and buffer_filename is not None:
        replay_buffer.load_buffer(buffer_filename)
    else:
        dataset = get_dataset(variant["h5path"], eval_env)
        load_hdf5(d4rl.qlearning_dataset(eval_env, dataset), replay_buffer)

    trainer = CQLTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train(start_epoch=variant["start_epoch"])
def __init__(self, env_name, device, size=0, max_size=int(1e6)):
    env = gym.make(env_name)
    dataset = d4rl.qlearning_dataset(env)
    self.max_size = max_size
    self.ptr = 0
    self.size = size
    self.state = dataset['observations']
    self.action = dataset['actions']
    self.next_state = dataset['next_observations']
    self.reward = dataset['rewards']
    self.not_done = np.invert(dataset['terminals']).astype(int)
    self.device = device
    self.size = min(self.size, len(self.reward))
def load_d4rl_buffer(task):
    # Strip the benchmark prefix (e.g. 'd4rl-') from the task name.
    env = gym.make(task[5:])
    dataset = d4rl.qlearning_dataset(env)
    buffer = SampleBatch(
        obs=dataset['observations'],
        obs_next=dataset['next_observations'],
        act=dataset['actions'],
        rew=np.expand_dims(np.squeeze(dataset['rewards']), 1),
        done=np.expand_dims(np.squeeze(dataset['terminals']), 1),
    )
    logger.info('obs shape: {}', buffer.obs.shape)
    logger.info('obs_next shape: {}', buffer.obs_next.shape)
    logger.info('act shape: {}', buffer.act.shape)
    logger.info('rew shape: {}', buffer.rew.shape)
    logger.info('done shape: {}', buffer.done.shape)
    logger.info('Episode reward: {}', buffer.rew.sum() / np.sum(buffer.done))
    logger.info('Number of terminals on: {}', np.sum(buffer.done))
    return buffer
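# Usage sketch (hedged): the task[5:] slice above implies a five-character
# prefix (e.g. 'd4rl-') in front of the gym id, so a call would look like:
def demo_load_d4rl_buffer():
    buffer = load_d4rl_buffer("d4rl-walker2d-medium-v0")  # makes "walker2d-medium-v0"
    return buffer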
def get_d4rl_dataset(env, get_num=None) -> dict:
    """
    d4rl dataset: https://github.com/rail-berkeley/d4rl
    install: pip install git+https://github.com/rail-berkeley/d4rl@master#egg=d4rl

    :param get_num: how many transitions to sample from the dataset
    """
    dataset = d4rl.qlearning_dataset(env)
    if get_num is None:
        data = dict(obs=dataset['observations'],
                    acts=dataset['actions'],
                    rews=dataset['rewards'],
                    next_obs=dataset['next_observations'],
                    done=dataset['terminals'])
    else:
        # Sample `get_num` transitions uniformly without replacement.
        data_num = dataset['actions'].shape[0]
        ind = np.random.choice(data_num, size=get_num, replace=False)
        data = dict(obs=dataset['observations'][ind],
                    acts=dataset['actions'][ind],
                    rews=dataset['rewards'][ind],
                    next_obs=dataset['next_observations'][ind],
                    done=dataset['terminals'][ind])
    return data
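# Usage sketch (hedged): subsampling with get_num keeps the same dict layout,
# just with fewer rows. Env name is illustrative.
def demo_get_d4rl_dataset():
    env = gym.make('hopper-medium-v0')
    data = get_d4rl_dataset(env, get_num=10000)
    assert data['obs'].shape[0] == 10000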
obs = env.reset()
env.render()
while True:
    obs, rew, done, info = env.step(env.action_space.sample())
    env.render()

# Each task is associated with a dataset.
# The dataset contains observations, actions, rewards, terminals, and infos.
dataset = env.get_dataset()
print(dataset['observations'].shape)  # An N x dim_observation numpy array of observations (N = 1e6)
print(dataset['actions'].shape)
print(dataset['rewards'].shape)

# Alternatively, use d4rl.qlearning_dataset, which
# also adds next_observations.
dataset = d4rl.qlearning_dataset(env)

'''
import gym
env = gym.make('HalfCheetah-v2')
while True:
    obs = env.reset()
    env.render()
    env.step(env.action_space.sample())
    env.render()
'''
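# Sketch (hedged, assuming d4rl is imported as above): the key difference
# between the two loaders is that get_dataset() returns the raw logged fields,
# while qlearning_dataset() re-slices them into (s, a, r, s', done) transitions
# and guarantees a next_observations key.
def compare_d4rl_loaders(env):
    raw = env.get_dataset()
    qlr = d4rl.qlearning_dataset(env)
    print('next_observations' in raw)   # version-dependent for raw datasets
    print('next_observations' in qlr)   # always True after qlearning_dataset
    print(qlr['observations'].shape, qlr['next_observations'].shape)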
# Setup Environment
env = gym.make(args.env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Set seeds
env.seed(args.seed)
env.action_space.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Load Dataset
if args.env_name == args.dataset:
    dataset = d4rl.qlearning_dataset(env)  # Load d4rl dataset
else:
    if args.dataset == 'hopper-medium-expert':
        # Build the medium-expert mixture by concatenating both datasets.
        dataset1 = d4rl.qlearning_dataset(gym.make('hopper-medium-v0'))
        dataset2 = d4rl.qlearning_dataset(gym.make('hopper-expert-v0'))
        dataset = {
            key: np.concatenate([dataset1[key], dataset2[key]])
            for key in dataset1.keys()
        }
        print("Loaded data from hopper-medium-v0 and hopper-expert-v0")
    else:
        dataset_file = os.path.dirname(os.path.abspath(
            __file__)) + '/dataset/' + args.dataset + '.pkl'
        dataset = pickle.load(open(dataset_file, 'rb'))
        print("Loaded data from " + dataset_file)
def experiment(variant):
    eval_env = gym.make(
        variant['env_name'], **{
            "headless": variant["headless"],
            "verbose": variant["verbose"]
        })
    eval_env.seed(variant['seed'])
    expl_env = eval_env
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(eval_env, )

    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    if variant['load_buffer'] and buffer_filename is not None:
        replay_buffer.load_buffer(buffer_filename)
    else:
        dataset = get_dataset(variant["h5path"], eval_env)
        load_hdf5(d4rl.qlearning_dataset(eval_env, dataset), replay_buffer)

    trainer = CQLTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    # TODO: remove this `with` once the issue is figured out!
    with torch.autograd.set_detect_anomaly(True):
        algorithm.train()
def test_cql():
    args = get_args()
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]  # float
    print("device:", args.device)
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    print("Action range:", np.min(env.action_space.low),
          np.max(env.action_space.high))
    args.state_dim = args.state_shape[0]
    args.action_dim = args.action_shape[0]
    print("Max_action", args.max_action)

    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    test_envs.seed(args.seed)

    # model
    # actor network
    net_a = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        device=args.device,
    )
    actor = ActorProb(net_a,
                      action_shape=args.action_shape,
                      max_action=args.max_action,
                      device=args.device,
                      unbounded=True,
                      conditioned_sigma=True).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)

    # critic network
    net_c1 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    net_c2 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    critic1 = Critic(net_c1, device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net_c2, device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)

    if args.auto_alpha:
        target_entropy = -np.prod(env.action_space.shape)
        log_alpha = torch.zeros(1, requires_grad=True, device=args.device)
        alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
        args.alpha = (target_entropy, log_alpha, alpha_optim)

    policy = CQLPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        cql_alpha_lr=args.cql_alpha_lr,
        cql_weight=args.cql_weight,
        tau=args.tau,
        gamma=args.gamma,
        alpha=args.alpha,
        temperature=args.temperature,
        with_lagrange=args.with_lagrange,
        lagrange_threshold=args.lagrange_threshold,
        min_action=np.min(env.action_space.low),
        max_action=np.max(env.action_space.high),
        device=args.device,
    )

    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)

    # collector
    test_collector = Collector(policy, test_envs)

    # log
    now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
    args.algo_name = "cql"
    log_name = os.path.join(args.task, args.algo_name, str(args.seed), now)
    log_path = os.path.join(args.logdir, log_name)

    # logger
    if args.logger == "wandb":
        logger = WandbLogger(
            save_interval=1,
            name=log_name.replace(os.path.sep, "__"),
            run_id=args.resume_id,
            config=args,
            project=args.wandb_project,
        )
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    if args.logger == "tensorboard":
        logger = TensorboardLogger(writer)
    else:  # wandb
        logger.load(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

    def watch():
        if args.resume_path is None:
            args.resume_path = os.path.join(log_path, "policy.pth")
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=torch.device("cpu")))
        policy.eval()
        collector = Collector(policy, env)
        collector.collect(n_episode=1, render=1 / 35)

    if not args.watch:
        dataset = d4rl.qlearning_dataset(gym.make(args.expert_data_task))
        dataset_size = dataset["rewards"].size
        print("dataset_size", dataset_size)
        replay_buffer = ReplayBuffer(dataset_size)
        for i in range(dataset_size):
            replay_buffer.add(
                Batch(
                    obs=dataset["observations"][i],
                    act=dataset["actions"][i],
                    rew=dataset["rewards"][i],
                    done=dataset["terminals"][i],
                    obs_next=dataset["next_observations"][i],
                ))
        print("dataset loaded")
        # trainer
        result = offline_trainer(
            policy,
            replay_buffer,
            test_collector,
            args.epoch,
            args.step_per_epoch,
            args.test_num,
            args.batch_size,
            save_best_fn=save_best_fn,
            logger=logger,
        )
        pprint.pprint(result)
    else:
        watch()

    # Let's watch its performance!
    policy.eval()
    test_envs.seed(args.seed)
    test_collector.reset()
    result = test_collector.collect(n_episode=args.test_num, render=args.render)
    print(
        f"Final reward: {result['rews'].mean()}, length: {result['lens'].mean()}"
    )
def test_gail(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    print("Action range:", np.min(env.action_space.low),
          np.max(env.action_space.high))
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv([
        lambda: NoRewardEnv(gym.make(args.task))
        for _ in range(args.training_num)
    ], norm_obs=True)
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)],
        norm_obs=True,
        obs_rms=train_envs.obs_rms,
        update_obs_rms=False)

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    net_a = Net(args.state_shape,
                hidden_sizes=args.hidden_sizes,
                activation=nn.Tanh,
                device=args.device)
    actor = ActorProb(net_a,
                      args.action_shape,
                      max_action=args.max_action,
                      unbounded=True,
                      device=args.device).to(args.device)
    net_c = Net(args.state_shape,
                hidden_sizes=args.hidden_sizes,
                activation=nn.Tanh,
                device=args.device)
    critic = Critic(net_c, device=args.device).to(args.device)
    torch.nn.init.constant_(actor.sigma_param, -0.5)
    for m in list(actor.modules()) + list(critic.modules()):
        if isinstance(m, torch.nn.Linear):
            # orthogonal initialization
            torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
            torch.nn.init.zeros_(m.bias)
    # do last policy layer scaling, this will make initial actions have (close to)
    # 0 mean and std, and will help boost performances,
    # see https://arxiv.org/abs/2006.05990, Fig.24 for details
    for m in actor.mu.modules():
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.zeros_(m.bias)
            m.weight.data.copy_(0.01 * m.weight.data)
    optim = torch.optim.Adam(ActorCritic(actor, critic).parameters(), lr=args.lr)

    # discriminator
    net_d = Net(args.state_shape,
                action_shape=args.action_shape,
                hidden_sizes=args.hidden_sizes,
                activation=nn.Tanh,
                device=args.device,
                concat=True)
    disc_net = Critic(net_d, device=args.device).to(args.device)
    for m in disc_net.modules():
        if isinstance(m, torch.nn.Linear):
            # orthogonal initialization
            torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
            torch.nn.init.zeros_(m.bias)
    disc_optim = torch.optim.Adam(disc_net.parameters(), lr=args.disc_lr)

    lr_scheduler = None
    if args.lr_decay:
        # decay learning rate to 0 linearly
        max_update_num = np.ceil(
            args.step_per_epoch / args.step_per_collect) * args.epoch
        lr_scheduler = LambdaLR(
            optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num)

    def dist(*logits):
        return Independent(Normal(*logits), 1)

    # expert replay buffer
    dataset = d4rl.qlearning_dataset(gym.make(args.expert_data_task))
    dataset_size = dataset['rewards'].size
    print("dataset_size", dataset_size)
    expert_buffer = ReplayBuffer(dataset_size)
    for i in range(dataset_size):
        expert_buffer.add(
            Batch(
                obs=dataset['observations'][i],
                act=dataset['actions'][i],
                rew=dataset['rewards'][i],
                done=dataset['terminals'][i],
                obs_next=dataset['next_observations'][i],
            ))
    print("dataset loaded")

    policy = GAILPolicy(actor,
                        critic,
                        optim,
                        dist,
                        expert_buffer,
                        disc_net,
                        disc_optim,
                        disc_update_num=args.disc_update_num,
                        discount_factor=args.gamma,
                        gae_lambda=args.gae_lambda,
                        max_grad_norm=args.max_grad_norm,
                        vf_coef=args.vf_coef,
                        ent_coef=args.ent_coef,
                        reward_normalization=args.rew_norm,
                        action_scaling=True,
                        action_bound_method=args.bound_action_method,
                        lr_scheduler=lr_scheduler,
                        action_space=env.action_space,
                        eps_clip=args.eps_clip,
                        value_clip=args.value_clip,
                        dual_clip=args.dual_clip,
                        advantage_normalization=args.norm_adv,
                        recompute_advantage=args.recompute_adv)

    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)

    # collector
    if args.training_num > 1:
        buffer = VectorReplayBuffer(args.buffer_size, len(train_envs))
    else:
        buffer = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
    test_collector = Collector(policy, test_envs)

    # log
    t0 = datetime.datetime.now().strftime("%m%d_%H%M%S")
    log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_gail'
    log_path = os.path.join(args.logdir, args.task, 'gail', log_file)
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer, update_interval=100, train_interval=100)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    if not args.watch:
        # trainer
        result = onpolicy_trainer(policy,
                                  train_collector,
                                  test_collector,
                                  args.epoch,
                                  args.step_per_epoch,
                                  args.repeat_per_collect,
                                  args.test_num,
                                  args.batch_size,
                                  step_per_collect=args.step_per_collect,
                                  save_best_fn=save_best_fn,
                                  logger=logger,
                                  test_in_train=False)
        pprint.pprint(result)

    # Let's watch its performance!
    policy.eval()
    test_envs.seed(args.seed)
    test_collector.reset()
    result = test_collector.collect(n_episode=args.test_num, render=args.render)
    print(
        f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}'
    )
def experiment(variant):
    eval_env = gym.make(variant['env_name'])
    expl_env = eval_env
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )
    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    if variant['load_buffer'] and buffer_filename is not None:
        replay_buffer.load_buffer(buffer_filename)
    elif 'random-expert' in variant['env_name']:
        load_hdf5(d4rl.basic_dataset(eval_env), replay_buffer)
    else:
        load_hdf5(d4rl.qlearning_dataset(eval_env), replay_buffer)

    trainer = CQLTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def test_il():
    args = get_args()
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]  # float
    print("device:", args.device)
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    print("Action range:", np.min(env.action_space.low),
          np.max(env.action_space.high))
    args.state_dim = args.state_shape[0]
    args.action_dim = args.action_shape[0]
    print("Max_action", args.max_action)

    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)]
    )
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    test_envs.seed(args.seed)

    # model
    net = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        device=args.device,
    )
    actor = Actor(
        net,
        action_shape=args.action_shape,
        max_action=args.max_action,
        device=args.device
    ).to(args.device)
    optim = torch.optim.Adam(actor.parameters(), lr=args.lr)
    policy = ImitationPolicy(
        actor,
        optim,
        action_space=env.action_space,
        action_scaling=True,
        action_bound_method="clip"
    )

    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)

    # collector
    test_collector = Collector(policy, test_envs)

    # log
    now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
    args.algo_name = "il"
    log_name = os.path.join(args.task, args.algo_name, str(args.seed), now)
    log_path = os.path.join(args.logdir, log_name)

    # logger
    if args.logger == "wandb":
        logger = WandbLogger(
            save_interval=1,
            name=log_name.replace(os.path.sep, "__"),
            run_id=args.resume_id,
            config=args,
            project=args.wandb_project,
        )
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    if args.logger == "tensorboard":
        logger = TensorboardLogger(writer)
    else:  # wandb
        logger.load(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

    def watch():
        if args.resume_path is None:
            args.resume_path = os.path.join(log_path, "policy.pth")
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=torch.device("cpu"))
        )
        policy.eval()
        collector = Collector(policy, env)
        collector.collect(n_episode=1, render=1 / 35)

    if not args.watch:
        dataset = d4rl.qlearning_dataset(gym.make(args.expert_data_task))
        dataset_size = dataset["rewards"].size
        print("dataset_size", dataset_size)
        replay_buffer = ReplayBuffer(dataset_size)
        for i in range(dataset_size):
            replay_buffer.add(
                Batch(
                    obs=dataset["observations"][i],
                    act=dataset["actions"][i],
                    rew=dataset["rewards"][i],
                    done=dataset["terminals"][i],
                    obs_next=dataset["next_observations"][i],
                )
            )
        print("dataset loaded")
        # trainer
        result = offline_trainer(
            policy,
            replay_buffer,
            test_collector,
            args.epoch,
            args.step_per_epoch,
            args.test_num,
            args.batch_size,
            save_best_fn=save_best_fn,
            logger=logger,
        )
        pprint.pprint(result)
    else:
        watch()

    # Let's watch its performance!
    policy.eval()
    test_envs.seed(args.seed)
    test_collector.reset()
    result = test_collector.collect(n_episode=args.test_num, render=args.render)
    print(f"Final reward: {result['rews'].mean()}, length: {result['lens'].mean()}")
def experiment(variant):
    eval_env = gym.make(
        variant['env_name'], **{
            "headless": variant["headless"],
            "verbose": variant["verbose"]
        })
    eval_env.seed(variant['seed'])
    expl_env = eval_env
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    dataset = get_dataset(variant["h5path"], eval_env)
    load_hdf5(d4rl.qlearning_dataset(eval_env, dataset), replay_buffer)

    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    if variant.get("pretrained_algorithm_path", False):
        resume(variant)
        return

    normalize_env = variant.get('normalize_env', True)
    env_id = variant.get('env_id', None)
    env_class = variant.get('env_class', None)
    env_kwargs = variant.get('env_kwargs', {})

    expl_env = make(env_id, env_class, env_kwargs, normalize_env)
    eval_env = make(env_id, env_class, env_kwargs, normalize_env)

    seed = int(variant["seed"])
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    eval_env.seed(seed)
    expl_env.seed(seed)

    if variant.get('add_env_demos', False):
        variant["path_loader_kwargs"]["demo_paths"].append(variant["env_demo_path"])
    if variant.get('add_env_offpolicy_data', False):
        variant["path_loader_kwargs"]["demo_paths"].append(variant["env_offpolicy_data_path"])

    path_loader_kwargs = variant.get("path_loader_kwargs", {})
    stack_obs = path_loader_kwargs.get("stack_obs", 1)
    if stack_obs > 1:
        expl_env = StackObservationEnv(expl_env, stack_obs=stack_obs)
        eval_env = StackObservationEnv(eval_env, stack_obs=stack_obs)

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    if hasattr(expl_env, 'info_sizes'):
        env_info_sizes = expl_env.info_sizes
    else:
        env_info_sizes = dict()

    qf_kwargs = variant.get("qf_kwargs", {})
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **qf_kwargs
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **qf_kwargs
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **qf_kwargs
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **qf_kwargs
    )

    vf_kwargs = variant.get("vf_kwargs", dict(hidden_sizes=[256, 256, ],))
    vf = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        **vf_kwargs
    )

    policy_class = variant.get("policy_class", TanhGaussianPolicy)
    policy_kwargs = variant['policy_kwargs']
    policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **policy_kwargs,
    )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )

    expl_policy = policy
    exploration_kwargs = variant.get('exploration_kwargs', {})
    if exploration_kwargs:
        if exploration_kwargs.get("deterministic_exploration", False):
            expl_policy = MakeDeterministic(policy)
        exploration_strategy = exploration_kwargs.get("strategy", None)
        if exploration_strategy is None:
            pass
        elif exploration_strategy == 'ou':
            es = OUStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs['noise'],
                min_sigma=exploration_kwargs['noise'],
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        elif exploration_strategy == 'gauss_eps':
            es = GaussianAndEpsilonStrategy(
                action_space=expl_env.action_space,
                max_sigma=exploration_kwargs['noise'],
                min_sigma=exploration_kwargs['noise'],  # constant sigma
                epsilon=0,
            )
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=expl_policy,
            )
        else:
            raise ValueError('Unknown exploration strategy: %s' % exploration_strategy)

    replay_buffer_kwargs = dict(
        max_replay_buffer_size=variant['replay_buffer_size'],
        env=expl_env,
    )
    replay_buffer = variant.get('replay_buffer_class', EnvReplayBuffer)(
        **replay_buffer_kwargs,
    )
    demo_train_buffer = EnvReplayBuffer(
        **replay_buffer_kwargs,
    )
    demo_test_buffer = EnvReplayBuffer(
        **replay_buffer_kwargs,
    )

    trainer_class = variant.get("trainer_class", AWACTrainer)
    trainer = trainer_class(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        vf=vf,
        **variant['trainer_kwargs']
    )

    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=variant['max_path_length'],
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)

    if variant.get("save_video", False):
        def get_img_env(env):
            renderer = EnvRenderer(**variant["renderer_kwargs"])
            img_env = InsertImageEnv(GymToMultiEnv(env), renderer=renderer)

        image_eval_env = ImageEnv(GymToMultiEnv(eval_env), **variant["image_env_kwargs"])
        # image_eval_env = get_img_env(eval_env)
        image_eval_path_collector = ObsDictPathCollector(
            image_eval_env,
            eval_policy,
            observation_key="state_observation",
        )
        image_expl_env = ImageEnv(GymToMultiEnv(expl_env), **variant["image_env_kwargs"])
        # image_expl_env = get_img_env(expl_env)
        image_expl_path_collector = ObsDictPathCollector(
            image_expl_env,
            expl_policy,
            observation_key="state_observation",
        )
        video_func = VideoSaveFunction(
            image_eval_env,
            variant,
            image_expl_path_collector,
            image_eval_path_collector,
        )
        algorithm.post_train_funcs.append(video_func)

    if variant.get('save_paths', False):
        algorithm.post_train_funcs.append(save_paths)

    if variant.get('load_demos', False):
        path_loader_class = variant.get('path_loader_class', MDPPathLoader)
        path_loader = path_loader_class(trainer,
                                        replay_buffer=replay_buffer,
                                        demo_train_buffer=demo_train_buffer,
                                        demo_test_buffer=demo_test_buffer,
                                        **path_loader_kwargs
                                        )
        path_loader.load_demos()
    if variant.get('load_env_dataset_demos', False):
        path_loader_class = variant.get('path_loader_class', HDF5PathLoader)
        path_loader = path_loader_class(trainer,
                                        replay_buffer=replay_buffer,
                                        demo_train_buffer=demo_train_buffer,
                                        demo_test_buffer=demo_test_buffer,
                                        **path_loader_kwargs
                                        )
        import d4rl
        dataset = d4rl.qlearning_dataset(expl_env)
        # dataset = expl_env.get_dataset()
        path_loader.load_demos(dataset)
    if variant.get('normalize_rewards_by_return_range'):
        normalizer = get_normalization(replay_buffer)
        trainer.reward_transform = normalizer

    if variant.get('save_initial_buffers', False):
        buffers = dict(
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
        )
        buffer_path = osp.join(logger.get_snapshot_dir(), 'buffers.p')
        pickle.dump(buffers, open(buffer_path, "wb"))

    algorithm.train()
def test_bcq():
    args = get_args()
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]  # float
    print("device:", args.device)
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    print("Action range:", np.min(env.action_space.low),
          np.max(env.action_space.high))
    args.state_dim = args.state_shape[0]
    args.action_dim = args.action_shape[0]
    print("Max_action", args.max_action)

    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    test_envs.seed(args.seed)

    # model
    # perturbation network
    net_a = MLP(
        input_dim=args.state_dim + args.action_dim,
        output_dim=args.action_dim,
        hidden_sizes=args.hidden_sizes,
        device=args.device,
    )
    actor = Perturbation(net_a,
                         max_action=args.max_action,
                         device=args.device,
                         phi=args.phi).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)

    net_c1 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    net_c2 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    critic1 = Critic(net_c1, device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net_c2, device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)

    # vae
    # output_dim = 0, so the last Module in the encoder is ReLU
    vae_encoder = MLP(
        input_dim=args.state_dim + args.action_dim,
        hidden_sizes=args.vae_hidden_sizes,
        device=args.device,
    )
    if not args.latent_dim:
        args.latent_dim = args.action_dim * 2
    vae_decoder = MLP(
        input_dim=args.state_dim + args.latent_dim,
        output_dim=args.action_dim,
        hidden_sizes=args.vae_hidden_sizes,
        device=args.device,
    )
    vae = VAE(
        vae_encoder,
        vae_decoder,
        hidden_dim=args.vae_hidden_sizes[-1],
        latent_dim=args.latent_dim,
        max_action=args.max_action,
        device=args.device,
    ).to(args.device)
    vae_optim = torch.optim.Adam(vae.parameters())

    policy = BCQPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        vae,
        vae_optim,
        device=args.device,
        gamma=args.gamma,
        tau=args.tau,
        lmbda=args.lmbda,
    )

    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)

    # collector
    test_collector = Collector(policy, test_envs)

    # log
    now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
    args.algo_name = "bcq"
    log_name = os.path.join(args.task, args.algo_name, str(args.seed), now)
    log_path = os.path.join(args.logdir, log_name)

    # logger
    if args.logger == "wandb":
        logger = WandbLogger(
            save_interval=1,
            name=log_name.replace(os.path.sep, "__"),
            run_id=args.resume_id,
            config=args,
            project=args.wandb_project,
        )
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    if args.logger == "tensorboard":
        logger = TensorboardLogger(writer)
    else:  # wandb
        logger.load(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

    def watch():
        if args.resume_path is None:
            args.resume_path = os.path.join(log_path, "policy.pth")
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=torch.device("cpu")))
        policy.eval()
        collector = Collector(policy, env)
        collector.collect(n_episode=1, render=1 / 35)

    if not args.watch:
        dataset = d4rl.qlearning_dataset(gym.make(args.expert_data_task))
        dataset_size = dataset["rewards"].size
        print("dataset_size", dataset_size)
        replay_buffer = ReplayBuffer(dataset_size)
        for i in range(dataset_size):
            replay_buffer.add(
                Batch(
                    obs=dataset["observations"][i],
                    act=dataset["actions"][i],
                    rew=dataset["rewards"][i],
                    done=dataset["terminals"][i],
                    obs_next=dataset["next_observations"][i],
                ))
        print("dataset loaded")
        # trainer
        result = offline_trainer(
            policy,
            replay_buffer,
            test_collector,
            args.epoch,
            args.step_per_epoch,
            args.test_num,
            args.batch_size,
            save_best_fn=save_best_fn,
            logger=logger,
        )
        pprint.pprint(result)
    else:
        watch()

    # Let's watch its performance!
    policy.eval()
    test_envs.seed(args.seed)
    test_collector.reset()
    result = test_collector.collect(n_episode=args.test_num, render=args.render)
    print(
        f"Final reward: {result['rews'].mean()}, length: {result['lens'].mean()}"
    )
assert (dset["actions"].shape[0] == N
        ), "Action number does not match (%d vs %d)" % (
            dset["actions"].shape[0], N)
assert (dset["rewards"].shape[0] == N
        ), "Reward number does not match (%d vs %d)" % (
            dset["rewards"].shape[0], N)
assert (dset["terminals"].shape[0] == N
        ), "Terminals number does not match (%d vs %d)" % (
            dset["terminals"].shape[0], N,
        )
print("\t num terminals: %d" % np.sum(dset["terminals"]))
env.reset()
env.step(env.action_space.sample())
score = env.get_normalized_score(0.0)

# Re-run the same shape checks on the qlearning_dataset view of the data.
dset = d4rl.qlearning_dataset(env, dataset=dset)
assert "observations" in dset, "Observations not in dataset"
assert "next_observations" in dset, "Next observations not in dataset"
assert "actions" in dset, "Actions not in dataset"
assert "rewards" in dset, "Rewards not in dataset"
assert "terminals" in dset, "Terminals not in dataset"
N = dset["observations"].shape[0]
print("\t %d samples" % N)
assert (dset["next_observations"].shape[0] == N
        ), "NextObs number does not match (%d vs %d)" % (
            dset["next_observations"].shape[0], N)
assert (dset["actions"].shape[0] == N
        ), "Action number does not match (%d vs %d)" % (
            dset["actions"].shape[0], N)
assert (dset["rewards"].shape[0] == N
        ), "Reward number does not match (%d vs %d)" % (
            dset["rewards"].shape[0], N)
import gym
import d4rl
import numpy as np

env = gym.make("halfcheetah-expert-v1")
offline_data = d4rl.qlearning_dataset(env)
print(len(offline_data['observations']))
print(offline_data.keys())

rollouts = []
start_idx = 0
print(np.any(offline_data['terminals']))
# Split the flat transition arrays into per-episode rollouts at terminal flags.
for i in range(len(offline_data['observations'])):
    if offline_data['terminals'][i]:
        rollout = np.array(offline_data['observations'][start_idx:i + 1])
        start_idx = i + 1
        rollouts.append(rollout)
        print(len(rollout))
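# Hedged alternative sketch: the same per-episode split can be done without a
# Python loop by locating terminal indices with np.where and slicing between
# consecutive episode boundaries (matches the loop above, including dropping
# any trailing partial episode).
term_idx = np.where(offline_data['terminals'])[0]
bounds = np.concatenate([[0], term_idx + 1])
rollouts_fast = [offline_data['observations'][s:e]
                 for s, e in zip(bounds[:-1], bounds[1:])]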