class SAC:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_v=1e-3,
                 lr_q=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 explore_size=10000,
                 step_per_iter=3000,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 target_update_delay=1,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.polyak = polyak
        self.memory = FixedMemory(memory_size)
        self.explore_size = explore_size
        self.step_per_iter = step_per_iter
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.lr_q = lr_q
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.target_update_delay = target_update_delay
        self.model_path = model_path
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert env_continuous, "SAC is only applicable to continuous action spaces!"

        self.action_low, self.action_high = \
            self.env.action_space.low[0], self.env.action_space.high[0]

        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(num_states,
                                 self.num_actions,
                                 max_action=self.action_high,
                                 use_sac=True).to(device)
        self.value_net = Value(num_states).to(device)
        self.value_net_target = Value(num_states).to(device)
        self.q_net_1 = QValue(num_states, self.num_actions).to(device)
        self.q_net_2 = QValue(num_states, self.num_actions).to(device)

        self.running_state = ZFilter((num_states,), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_sac.p".format(self.env_id))
            self.policy_net, self.value_net, self.q_net_1, self.q_net_2, self.running_state = \
                pickle.load(open('{}/{}_sac.p'.format(self.model_path, self.env_id), "rb"))

        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(), lr=self.lr_v)
        self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q)
        self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, _ = self.policy_net.rsample(state)
        action = action.cpu().numpy()[0]
        return action, None

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)
            action, _ = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """interact"""
        global_steps = (i_iter - 1) * self.step_per_iter + 1
        log = dict()
        num_steps = 0
        num_episodes = 0
        total_reward = 0
        min_episode_reward = float('inf')
        max_episode_reward = float('-inf')

        while num_steps < self.step_per_iter:
            state = self.env.reset()
            state = self.running_state(state)
            episode_reward = 0

            for t in range(10000):
                if self.render:
                    self.env.render()

                if global_steps < self.explore_size:  # explore
                    action = self.env.action_space.sample()
                else:  # exploit
                    action, _ = self.choose_action(state)

                next_state, reward, done, _ = self.env.step(action)
                next_state = self.running_state(next_state)
                mask = 0 if done else 1
                # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
                self.memory.push(state, action, reward, next_state, mask, None)

                episode_reward += reward
                global_steps += 1
                num_steps += 1

                if global_steps >= self.min_update_step and global_steps % self.update_step == 0:
                    for k in range(1, self.update_step + 1):
                        batch = self.memory.sample(self.batch_size)  # random sample batch
                        self.update(batch, k)

                if done or num_steps >= self.step_per_iter:
                    break

                state = next_state

            num_episodes += 1
            total_reward += episode_reward
            min_episode_reward = min(episode_reward, min_episode_reward)
            max_episode_reward = max(episode_reward, max_episode_reward)

        self.env.close()

        log['num_steps'] = num_steps
        log['num_episodes'] = num_episodes
        log['total_reward'] = total_reward
        log['avg_reward'] = total_reward / num_episodes
        log['max_episode_reward'] = max_episode_reward
        log['min_episode_reward'] = min_episode_reward

        print(f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
              f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
              f"average reward: {log['avg_reward']: .4f}")

        # record reward information
        writer.add_scalars(
            "sac", {
                "total reward": log['total_reward'],
                "average reward": log['avg_reward'],
                "min reward": log['min_episode_reward'],
                "max reward": log['max_episode_reward'],
                "num steps": log['num_steps']
            }, i_iter)

    def update(self, batch, k_iter):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by SAC
        sac_step(self.policy_net, self.value_net, self.value_net_target,
                 self.q_net_1, self.q_net_2, self.optimizer_p,
                 self.optimizer_v, self.optimizer_q_1, self.optimizer_q_2,
                 batch_state, batch_action, batch_reward, batch_next_state,
                 batch_mask, self.gamma, self.polyak,
                 k_iter % self.target_update_delay == 0)

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.policy_net, self.value_net, self.q_net_1,
                     self.q_net_2, self.running_state),
                    open('{}/{}_sac.p'.format(save_path, self.env_id), 'wb'))
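# Usage sketch (not part of the class above): a minimal training driver built only
# from the SAC constructor/learn/eval/save signatures defined above. The env id,
# log directory, checkpoint directory, and iteration counts are illustrative
# placeholders, not values taken from this repo.
from torch.utils.tensorboard import SummaryWriter

if __name__ == '__main__':
    writer = SummaryWriter(log_dir="runs/sac_demo")   # hypothetical log directory
    agent = SAC(env_id="Hopper-v2", seed=1)           # hypothetical env id

    for i_iter in range(1, 501):                      # illustrative iteration budget
        agent.learn(writer, i_iter)                   # collect step_per_iter steps and update
        if i_iter % 50 == 0:
            agent.eval(i_iter)                        # single greedy rollout for monitoring
            agent.save("checkpoints")                 # hypothetical save directory
    writer.close()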
class PPO:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=4,
                 min_batch_size=2048,
                 lr_p=3e-4,
                 lr_v=3e-4,
                 gamma=0.99,
                 tau=0.95,
                 clip_epsilon=0.2,
                 ppo_epochs=10,
                 ppo_mini_batch_size=64,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.tau = tau
        self.ppo_epochs = ppo_epochs
        self.ppo_mini_batch_size = ppo_mini_batch_size
        self.clip_epsilon = clip_epsilon
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.min_batch_size = min_batch_size
        self.model_path = model_path
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id)

        # seeding
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        if env_continuous:
            self.policy_net = Policy(num_states, num_actions).to(device)
        else:
            self.policy_net = DiscretePolicy(num_states, num_actions).to(device)

        self.value_net = Value(num_states).to(device)
        self.running_state = ZFilter((num_states,), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_ppo.p".format(self.env_id))
            self.policy_net, self.value_net, self.running_state = pickle.load(
                open('{}/{}_ppo.p'.format(self.model_path, self.env_id), "rb"))

        self.collector = MemoryCollector(self.env,
                                         self.policy_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(), lr=self.lr_v)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy_net.get_action_log_prob(state)
        return action, log_prob

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)
            action, _ = self.choose_action(state)
            action = action.cpu().numpy()[0]
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
              f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
              f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}")

        # record reward information
        writer.add_scalars(
            "ppo", {
                "total reward": log['total_reward'],
                "average reward": log['avg_reward'],
                "min reward": log['min_episode_reward'],
                "max reward": log['max_episode_reward'],
                "num steps": log['num_steps']
            }, i_iter)

        batch = memory.sample()  # sample all items in memory
        # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_mask = FLOAT(batch.mask).to(device)
        batch_log_prob = FLOAT(batch.log_prob).to(device)

        with torch.no_grad():
            batch_value = self.value_net(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)

        v_loss, p_loss = torch.empty(1), torch.empty(1)

        for _ in range(self.ppo_epochs):
            if self.ppo_mini_batch_size:
                batch_size = batch_state.shape[0]
                mini_batch_num = int(
                    math.ceil(batch_size / self.ppo_mini_batch_size))

                # update with shuffled mini-batches
                index = torch.randperm(batch_size)
                for i in range(mini_batch_num):
                    ind = index[slice(
                        i * self.ppo_mini_batch_size,
                        min(batch_size, (i + 1) * self.ppo_mini_batch_size))]
                    state, action, returns, advantages, old_log_pis = \
                        batch_state[ind], batch_action[ind], batch_return[ind], \
                        batch_advantage[ind], batch_log_prob[ind]

                    v_loss, p_loss = ppo_step(self.policy_net, self.value_net,
                                              self.optimizer_p,
                                              self.optimizer_v, 1, state,
                                              action, returns, advantages,
                                              old_log_pis, self.clip_epsilon,
                                              1e-3)
            else:
                v_loss, p_loss = ppo_step(self.policy_net, self.value_net,
                                          self.optimizer_p, self.optimizer_v,
                                          1, batch_state, batch_action,
                                          batch_return, batch_advantage,
                                          batch_log_prob, self.clip_epsilon,
                                          1e-3)

        return v_loss, p_loss

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.policy_net, self.value_net, self.running_state),
                    open('{}/{}_ppo.p'.format(save_path, self.env_id), 'wb'))
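# ppo_step is imported from the repo's utilities and is not shown in this file.
# The function below is NOT that implementation; it is a minimal, hedged sketch of
# the clipped-surrogate update the positional call above expects (policy_net,
# value_net, optimizer_policy, optimizer_value, optim_value_iternum, states,
# actions, returns, advantages, old_log_probs, clip_epsilon, l2_reg). It assumes
# the policy exposes get_log_prob(states, actions); that helper is an assumption.
import torch


def ppo_step_sketch(policy_net, value_net, optimizer_policy, optimizer_value,
                    optim_value_iternum, states, actions, returns, advantages,
                    old_log_probs, clip_epsilon, l2_reg):
    # value update: regress V(s) onto the empirical returns, plus an L2 penalty
    for _ in range(optim_value_iternum):
        values_pred = value_net(states)
        value_loss = (values_pred - returns).pow(2).mean()
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

    # policy update: maximize the PPO clipped surrogate objective
    log_probs = policy_net.get_log_prob(states, actions)  # assumed helper
    ratio = torch.exp(log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()

    return value_loss, policy_loss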
class TRPO:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=1,
                 min_batch_size=2048,
                 lr_v=3e-4,
                 gamma=0.99,
                 tau=0.95,
                 max_kl=1e-2,
                 damping=1e-2,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.tau = tau
        self.max_kl = max_kl
        self.damping = damping
        self.render = render
        self.num_process = num_process
        self.lr_v = lr_v
        self.min_batch_size = min_batch_size
        self.model_path = model_path
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id)

        # seeding
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        if env_continuous:
            self.policy_net = Policy(num_states, num_actions).double().to(device)  # current policy
        else:
            self.policy_net = DiscretePolicy(num_states, num_actions).double().to(device)

        self.value_net = Value(num_states).double().to(device)
        self.running_state = ZFilter((num_states,), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_trpo.p".format(self.env_id))
            self.policy_net, self.value_net, self.running_state = pickle.load(
                open('{}/{}_trpo.p'.format(self.model_path, self.env_id), "rb"))

        self.collector = MemoryCollector(self.env,
                                         self.policy_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_v = optim.Adam(self.value_net.parameters(), lr=self.lr_v)

    def choose_action(self, state):
        """select action"""
        state = DOUBLE(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy_net.get_action_log_prob(state)
        return action, log_prob

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)
            action, _ = self.choose_action(state)
            action = action.cpu().numpy()[0]
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
              f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
              f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}")

        # record reward information
        writer.add_scalars(
            "trpo", {
                "total reward": log['total_reward'],
                "average reward": log['avg_reward'],
                "min reward": log['min_episode_reward'],
                "max reward": log['max_episode_reward'],
                "num steps": log['num_steps']
            }, i_iter)

        batch = memory.sample()  # sample all items in memory
        batch_state = DOUBLE(batch.state).to(device)
        batch_action = DOUBLE(batch.action).to(device)
        batch_reward = DOUBLE(batch.reward).to(device)
        batch_mask = DOUBLE(batch.mask).to(device)
        batch_log_prob = DOUBLE(batch.log_prob).to(device)

        with torch.no_grad():
            batch_value = self.value_net(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)

        # update by TRPO
        trpo_step(self.policy_net, self.value_net, batch_state, batch_action,
                  batch_return, batch_advantage, batch_log_prob, self.max_kl,
                  self.damping, 1e-3, None)

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.policy_net, self.value_net, self.running_state),
                    open('{}/{}_trpo.p'.format(save_path, self.env_id), 'wb'))
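# estimate_advantages is imported from the repo's utilities and is not shown in
# this file. The function below is a hedged sketch of Generalized Advantage
# Estimation (GAE) matching the call sites above: per-step rewards, non-terminal
# masks, value predictions, gamma, and tau (the GAE lambda) go in; normalized
# advantages and value-regression targets come out. Shapes are assumed to match
# (all tensors [N] or [N, 1]); the normalization constant 1e-8 is illustrative.
import torch


def estimate_advantages_sketch(rewards, masks, values, gamma, tau):
    deltas = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)

    prev_value = 0.0
    prev_advantage = 0.0
    for t in reversed(range(rewards.size(0))):
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        deltas[t] = rewards[t] + gamma * prev_value * masks[t] - values[t]
        # GAE recursion: A_t = delta_t + gamma * tau * mask_t * A_{t+1}
        advantages[t] = deltas[t] + gamma * tau * prev_advantage * masks[t]
        prev_value = values[t]
        prev_advantage = advantages[t]

    returns = values + advantages  # targets for the value network
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages, returns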
class GAIL:
    def __init__(self,
                 render=False,
                 num_process=4,
                 config=None,
                 expert_data_path=None,
                 env_id=None):
        self.render = render
        self.env_id = env_id
        self.num_process = num_process
        self.expert_data_path = expert_data_path
        self.config = config

        self._load_expert_trajectory()
        self._init_model()

    def _load_expert_trajectory(self):
        self.expert_dataset = ExpertDataset(
            expert_data_path=self.expert_data_path,
            train_fraction=self.config["expert_data"]["train_fraction"],
            traj_limitation=self.config["expert_data"]["traj_limitation"],
            shuffle=self.config["expert_data"]["shuffle"],
            batch_size=self.config["expert_data"]["batch_size"])

    def _init_model(self):
        # seeding
        seed = self.config["train"]["general"]["seed"]
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id)

        # check env
        assert num_states == self.expert_dataset.num_states and num_actions == self.expert_dataset.num_actions, \
            "Expected the expert dataset and the environment to have matching state/action dimensions"

        dim_dict = {"dim_state": num_states, "dim_action": num_actions}
        self.config["value"].update(dim_dict)
        self.config["policy"].update(dim_dict)
        self.config["discriminator"].update(dim_dict)

        self.value = Value(dim_state=self.config["value"]["dim_state"],
                           dim_hidden=self.config["value"]["dim_hidden"],
                           activation=resolve_activate_function(
                               self.config["value"]["activation"]))
        self.policy = Policy(config=self.config["policy"])
        self.discriminator = Discriminator(
            dim_state=self.config["discriminator"]["dim_state"],
            dim_action=self.config["discriminator"]["dim_action"],
            dim_hidden=self.config["discriminator"]["dim_hidden"],
            activation=resolve_activate_function(
                self.config["discriminator"]["activation"]))
        self.discriminator_func = nn.BCELoss()

        self.running_state = None

        self.collector = MemoryCollector(self.env,
                                         self.policy,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        print("Model Structure")
        print(self.policy)
        print(self.value)
        print(self.discriminator)
        print()

        self.optimizer_policy = optim.Adam(
            self.policy.parameters(),
            lr=self.config["policy"]["learning_rate"])
        self.optimizer_value = optim.Adam(
            self.value.parameters(),
            lr=self.config["value"]["learning_rate"])
        self.optimizer_discriminator = optim.Adam(
            self.discriminator.parameters(),
            lr=self.config["discriminator"]["learning_rate"])

        to_device(self.value, self.policy, self.discriminator,
                  self.discriminator_func)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy.get_action_log_prob(state)
        return action, log_prob

    def learn(self, writer, i_iter):
        memory, log = self.collector.collect_samples(
            self.config["train"]["generator"]["sample_batch_size"])

        self.policy.train()
        self.value.train()
        self.discriminator.train()

        print(f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
              f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
              f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}")

        # record reward information
        writer.add_scalar("gail/average reward", log['avg_reward'], i_iter)
        writer.add_scalar("gail/num steps", log['num_steps'], i_iter)

        # collect generated batch
        # gen_batch = self.collect_samples(self.config["ppo"]["sample_batch_size"])
        gen_batch = memory.sample()
        gen_batch_state = FLOAT(gen_batch.state).to(device)  # [batch size, state size]
        gen_batch_action = FLOAT(gen_batch.action).to(device)  # [batch size, action size]
        gen_batch_old_log_prob = FLOAT(gen_batch.log_prob).to(device)  # [batch size, 1]
        gen_batch_mask = FLOAT(gen_batch.mask).to(device)  # [batch size, 1]

        ####################################################
        # update discriminator
        ####################################################
        d_optim_i_iters = self.config["train"]["discriminator"]["optim_step"]
        if i_iter % d_optim_i_iters == 0:
            for step, (expert_batch_state, expert_batch_action) in enumerate(
                    self.expert_dataset.train_loader):
                if step >= d_optim_i_iters:
                    break

                # calculate probs and logits
                gen_prob, gen_logits = self.discriminator(
                    gen_batch_state, gen_batch_action)
                expert_prob, expert_logits = self.discriminator(
                    expert_batch_state.to(device),
                    expert_batch_action.to(device))

                # calculate accuracy
                gen_acc = torch.mean((gen_prob < 0.5).float())
                expert_acc = torch.mean((expert_prob > 0.5).float())

                # calculate regression loss
                expert_labels = torch.ones_like(expert_prob)
                gen_labels = torch.zeros_like(gen_prob)
                e_loss = self.discriminator_func(expert_prob, target=expert_labels)
                g_loss = self.discriminator_func(gen_prob, target=gen_labels)
                d_loss = e_loss + g_loss

                # calculate entropy loss
                logits = torch.cat([gen_logits, expert_logits], 0)
                entropy = ((1. - torch.sigmoid(logits)) * logits -
                           torch.nn.functional.logsigmoid(logits)).mean()
                entropy_loss = -self.config["train"]["discriminator"]["ent_coeff"] * entropy

                total_loss = d_loss + entropy_loss

                self.optimizer_discriminator.zero_grad()
                total_loss.backward()
                self.optimizer_discriminator.step()

            writer.add_scalar('discriminator/d_loss', d_loss.item(), i_iter)
            writer.add_scalar("discriminator/e_loss", e_loss.item(), i_iter)
            writer.add_scalar("discriminator/g_loss", g_loss.item(), i_iter)
            writer.add_scalar("discriminator/ent", entropy.item(), i_iter)
            writer.add_scalar('discriminator/expert_acc', expert_acc.item(), i_iter)
            writer.add_scalar('discriminator/gen_acc', gen_acc.item(), i_iter)

        ####################################################
        # update policy by ppo [mini_batch]
        ####################################################
        with torch.no_grad():
            gen_batch_value = self.value(gen_batch_state)
            d_out, _ = self.discriminator(gen_batch_state, gen_batch_action)
            gen_batch_reward = -torch.log(1 - d_out + 1e-6)

        gen_batch_advantage, gen_batch_return = estimate_advantages(
            gen_batch_reward, gen_batch_mask, gen_batch_value,
            self.config["train"]["generator"]["gamma"],
            self.config["train"]["generator"]["tau"])

        ppo_optim_i_iters = self.config["train"]["generator"]["optim_step"]
        ppo_mini_batch_size = self.config["train"]["generator"]["mini_batch_size"]

        for _ in range(ppo_optim_i_iters):
            if ppo_mini_batch_size > 0:
                gen_batch_size = gen_batch_state.shape[0]
                optim_iter_num = int(
                    math.ceil(gen_batch_size / ppo_mini_batch_size))
                perm = torch.randperm(gen_batch_size)

                for i in range(optim_iter_num):
                    ind = perm[slice(
                        i * ppo_mini_batch_size,
                        min((i + 1) * ppo_mini_batch_size, gen_batch_size))]
                    mini_batch_state, mini_batch_action, mini_batch_advantage, \
                        mini_batch_return, mini_batch_old_log_prob = \
                        gen_batch_state[ind], gen_batch_action[ind], \
                        gen_batch_advantage[ind], gen_batch_return[ind], \
                        gen_batch_old_log_prob[ind]

                    v_loss, p_loss, ent_loss = ppo_step(
                        policy_net=self.policy,
                        value_net=self.value,
                        optimizer_policy=self.optimizer_policy,
                        optimizer_value=self.optimizer_value,
                        optim_value_iternum=self.config["value"]["optim_value_iter"],
                        states=mini_batch_state,
                        actions=mini_batch_action,
                        returns=mini_batch_return,
                        old_log_probs=mini_batch_old_log_prob,
                        advantages=mini_batch_advantage,
                        clip_epsilon=self.config["train"]["generator"]["clip_ratio"],
                        l2_reg=self.config["value"]["l2_reg"])
            else:
                v_loss, p_loss, ent_loss = ppo_step(
                    policy_net=self.policy,
                    value_net=self.value,
                    optimizer_policy=self.optimizer_policy,
                    optimizer_value=self.optimizer_value,
                    optim_value_iternum=self.config["value"]["optim_value_iter"],
                    states=gen_batch_state,
                    actions=gen_batch_action,
                    returns=gen_batch_return,
                    old_log_probs=gen_batch_old_log_prob,
                    advantages=gen_batch_advantage,
                    clip_epsilon=self.config["train"]["generator"]["clip_ratio"],
                    l2_reg=self.config["value"]["l2_reg"])

        writer.add_scalar('generator/p_loss', p_loss, i_iter)
        writer.add_scalar('generator/v_loss', v_loss, i_iter)
        writer.add_scalar('generator/ent_loss', ent_loss, i_iter)

        print(f" Training episode:{i_iter} ".center(80, "#"))
        print('d_gen_prob:', gen_prob.mean().item())
        print('d_expert_prob:', expert_prob.mean().item())
        print('d_loss:', d_loss.item())
        print('e_loss:', e_loss.item())
        print("d/bernoulli_entropy:", entropy.item())

    def eval(self, i_iter, render=False):
        self.policy.eval()
        self.value.eval()
        self.discriminator.eval()

        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            if self.running_state:
                state = self.running_state(state)
            action, _ = self.choose_action(state)
            action = action.cpu().numpy()[0]
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def save_model(self, save_path):
        check_path(save_path)
        # torch.save((self.discriminator, self.policy, self.value), f"{save_path}/{self.exp_name}.pt")
        torch.save(self.discriminator, f"{save_path}/{self.env_id}_Discriminator.pt")
        torch.save(self.policy, f"{save_path}/{self.env_id}_Policy.pt")
        torch.save(self.value, f"{save_path}/{self.env_id}_Value.pt")

    def load_model(self, model_path):
        # load the model components individually
        # self.discriminator, self.policy, self.value = torch.load(model_path, map_location=device)
        self.discriminator = torch.load(f"{model_path}_Discriminator.pt",
                                        map_location=device)
        self.policy = torch.load(f"{model_path}_Policy.pt", map_location=device)
        self.value = torch.load(f"{model_path}_Value.pt", map_location=device)
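# Usage sketch (not part of the class above): a minimal training driver, assuming
# the constructor/learn/eval/save_model signatures defined above and a YAML config
# whose keys match the ones this class reads (expert_data, train.general,
# train.generator, train.discriminator, policy, value, discriminator). The config
# path, expert data file, env id, log/checkpoint directories, and iteration counts
# are illustrative placeholders.
import yaml
from torch.utils.tensorboard import SummaryWriter

if __name__ == '__main__':
    with open("config/gail_config.yml") as f:           # hypothetical config path
        config = yaml.safe_load(f)

    writer = SummaryWriter(log_dir="runs/gail_demo")     # hypothetical log directory
    gail = GAIL(render=False,
                num_process=4,
                config=config,
                expert_data_path="data/expert.npz",      # hypothetical expert data file
                env_id="Hopper-v2")                      # hypothetical env id

    for i_iter in range(1, 501):                         # illustrative iteration budget
        gail.learn(writer, i_iter)                       # discriminator + PPO generator update
        if i_iter % 50 == 0:
            gail.eval(i_iter)
            gail.save_model("checkpoints")               # hypothetical save directory
    writer.close()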