import argparse
import time

import gym
import numpy as np
import torch
from torch.autograd import Variable

# Policy, Value, Env, ReplayMemory, Memory, normal, ensure_shared_grads and
# `options` are defined elsewhere in the repository.


def test(rank, params, shared_p):
    """Evaluation worker: runs the shared policy deterministically and logs episode stats."""
    torch.manual_seed(params.seed + rank)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    policy = Policy(num_inputs, num_outputs)

    state = env.reset()
    state = Variable(torch.Tensor(state).unsqueeze(0))
    reward_sum = 0
    start_time = time.time()
    episode_length = 0

    while True:
        episode_length += 1
        # Sync with the shared policy before acting.
        policy.load_state_dict(shared_p.state_dict())
        mu, _ = policy(state)
        action = mu.data  # deterministic evaluation: act with the Gaussian mean
        env_action = action.squeeze().numpy()
        state, reward, done, _ = env.step(env_action)
        reward_sum += reward
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)  # throttle evaluation so training workers keep the CPU
        state = Variable(torch.Tensor(state).unsqueeze(0))
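# `Policy` and `Value` are not defined in this file. Below is a minimal sketch
# of the interface that `test` and `train` assume: a Gaussian policy head
# returning (mu, sigma_sq) and a scalar state-value network. Layer sizes and
# activations here are illustrative assumptions, not the repository's actual
# architecture.
import torch.nn as nn
import torch.nn.functional as F


class Policy(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(Policy, self).__init__()
        self.fc = nn.Linear(num_inputs, 64)
        self.mu = nn.Linear(64, num_outputs)         # mean of the Gaussian
        self.sigma_sq = nn.Linear(64, num_outputs)   # variance head

    def forward(self, x):
        x = torch.tanh(self.fc(x))
        # softplus keeps the variance strictly positive
        return self.mu(x), F.softplus(self.sigma_sq(x))


class Value(nn.Module):
    def __init__(self, num_inputs):
        super(Value, self).__init__()
        self.fc = nn.Linear(num_inputs, 64)
        self.head = nn.Linear(64, 1)

    def forward(self, x):
        return self.head(torch.tanh(self.fc(x)))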
def load_models(args, state_size, n_drones, action_size):
    """Load one pretrained A2C policy per drone from saved checkpoints."""
    nets = []
    for i in range(n_drones):
        model = Policy(state_size, n_drones, action_size,
                       policy_type=args.policy)
        # NOTE: `icm_model_name` is assumed to be a module-level string (e.g. a
        # prefix tagging checkpoints trained with ICM, or "" without it).
        model.load_state_dict(
            torch.load(
                f"A2C_models/{args.policy}_policy/A2C_drone_{icm_model_name}{i}.bin"
            ))
        nets.append(model)
    return nets
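# Hypothetical usage of `load_models` (the sizes below are illustrative, and
# `args.policy` must match the folder name the checkpoints were saved under):
#
#   nets = load_models(args, state_size=42, n_drones=3, action_size=5)
#   for net in nets:
#       net.eval()  # inference mode for evaluation rollouts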
def train(rank, params, shared_p, shared_v, optimizer_p, optimizer_v):
    """Training worker: collects rollouts and performs PPO-style
    adaptive-KL updates on the shared policy and value networks."""
    torch.manual_seed(params.seed + rank)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    policy = Policy(num_inputs, num_outputs)
    value = Value(num_inputs)
    memory = ReplayMemory(int(1e6))
    batch_size = 10000

    state = env.reset()
    state = Variable(torch.Tensor(state).unsqueeze(0))
    episode_length = 0
    while True:
        episode_length += 1
        # Sync local networks with the shared ones.
        policy.load_state_dict(shared_p.state_dict())
        value.load_state_dict(shared_v.state_dict())

        # Collect at least batch_size transitions.
        w = -1
        while w < batch_size:
            states = []
            actions = []
            rewards = []
            values = []
            returns = []
            advantages = []

            # Perform K steps.
            for step in range(params.num_steps):
                w += 1
                states.append(state)
                mu, sigma_sq = policy(state)
                # Sample from the Gaussian policy.
                eps = torch.randn(mu.size())
                action = mu + sigma_sq.sqrt() * Variable(eps)
                actions.append(action)
                v = value(state)
                values.append(v)

                env_action = action.data.squeeze().numpy()
                state, reward, done, _ = env.step(env_action)
                done = (done or episode_length >= params.max_episode_length)
                reward = max(min(reward, 1), -1)  # clip reward to [-1, 1]
                rewards.append(reward)
                if done:
                    episode_length = 0
                    state = env.reset()
                state = Variable(torch.Tensor(state).unsqueeze(0))
                if done:
                    break

            # Bootstrap from the value function unless the episode ended.
            R = torch.zeros(1, 1)
            if not done:
                v = value(state)
                R = v.data

            # Compute returns and advantages.
            values.append(Variable(R))
            R = Variable(R)
            for i in reversed(range(len(rewards))):
                R = params.gamma * R + rewards[i]
                returns.insert(0, R)
                A = R - values[i]
                advantages.insert(0, A)

            # Store useful info.
            memory.push([states, actions, returns, advantages])

        batch_states, batch_actions, batch_returns, batch_advantages = \
            memory.sample(batch_size)

        # Policy gradient updates with an adaptive KL penalty.
        mu_old, sigma_sq_old = policy(batch_states)
        probs_old = normal(batch_actions, mu_old, sigma_sq_old)
        policy_new = Policy(num_inputs, num_outputs)
        kl = 0.
        kl_coef = 1.
        kl_target = Variable(torch.Tensor([params.kl_target]))
        for m in range(100):
            policy_new.load_state_dict(shared_p.state_dict())
            mu_new, sigma_sq_new = policy_new(batch_states)
            probs_new = normal(batch_actions, mu_new, sigma_sq_new)
            policy_loss = torch.mean(batch_advantages *
                                     torch.sum(probs_new / probs_old, 1))
            kl = torch.mean(probs_old * torch.log(probs_old / probs_new))
            kl_loss = kl_coef * kl + \
                params.ksi * torch.clamp(kl - 2 * kl_target, max=0) ** 2
            total_policy_loss = -policy_loss + kl_loss
            # Early stopping when the KL divergence grows too large.
            if kl > 4 * kl_target:
                break
            # Asynchronous update of the shared policy.
            optimizer_p.zero_grad()
            total_policy_loss.backward()
            ensure_shared_grads(policy_new, shared_p)
            optimizer_p.step()

        # Value function updates.
        for b in range(100):
            value.load_state_dict(shared_v.state_dict())
            v = value(batch_states)
            value_loss = torch.mean((batch_returns - v) ** 2)
            # Asynchronous update of the shared value network.
            optimizer_v.zero_grad()
            value_loss.backward()
            ensure_shared_grads(value, shared_v)
            optimizer_v.step()

        # Adapt the KL penalty coefficient (`beta_hight` keeps the spelling
        # used by the params object defined elsewhere).
        if kl > params.beta_hight * kl_target:
            kl_coef *= params.alpha
        if kl < params.beta_low * kl_target:
            kl_coef /= params.alpha
        print("update done!")
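# `normal` and `ensure_shared_grads` are used by `train` but defined elsewhere.
# Minimal sketches, assuming `normal` returns the per-dimension Gaussian
# density of an action and `ensure_shared_grads` points the shared model's
# gradients at the worker's, as in common A3C implementations:
import math


def normal(x, mu, sigma_sq):
    # Element-wise Gaussian density N(x; mu, sigma_sq).
    a = (-1 * (x - mu).pow(2) / (2 * sigma_sq)).exp()
    b = 1 / (2 * sigma_sq * math.pi).sqrt()
    return a * b


def ensure_shared_grads(model, shared_model):
    # Let the shared optimizer step on the worker's gradients; skip if the
    # shared gradients are already set by another worker.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad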
def main():
    # Evaluation settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--act-every', type=int, default=3,
                        help='act every N frames (default: 3)')
    parser.add_argument('--duration', type=int, default=1800,
                        help='duration in frames (default: 1800 = 30 seconds)')
    parser.add_argument('--dummy', action='store_true', default=False,
                        help='Dummy env')
    parser.add_argument('--human', action='store_true', default=False,
                        help='P2 is human')
    parser.add_argument('--load-model', type=str,
                        default='results/latest/model.pth')
    args = parser.parse_args()
    args.max_rollout_length = args.duration

    if args.human:
        options["player2"] = "human"
    env = Env(args, 'cpu', options=options, dummy=args.dummy)
    observation_dim = env.observation_dim
    action_dim = env.action_dim
    print(action_dim)

    actor = Policy(action_dim)
    actor_state_dict = torch.load(args.load_model, map_location='cpu')
    actor.load_state_dict(actor_state_dict, strict=False)

    obs = env.reset()
    t = 0
    r = 0
    # actions = [0, 0]
    actions = 0
    with torch.no_grad():
        while True:
            try:
                action = actor.act(obs)
                action = action[0].cpu().numpy()
                obs, reward, done = env.step(action)
                if done:
                    env.close()
                    break
                if t >= args.duration:
                    env.close()
                    break
            except Exception as e:
                print(e)
                env.close()
                raise  # re-raise after closing the environment
            t += 1
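# Hypothetical invocation (the script name is an assumption; flags are the
# ones defined above). With 1800 frames = 30 seconds, 3600 frames runs a
# one-minute match against a human player 2:
#
#   python play.py --human --duration 3600 --load-model results/latest/model.pth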
class Actor(object):
    """Worker that rolls out the current policy and pushes trajectories to a queue."""

    def __init__(self, args, rollout_queue, shared_state_dict,
                 actor_name=None, rank=0):
        self.args = args
        self.rollout_queue = rollout_queue
        self.actor_name = actor_name
        self.rank = rank
        self.device = args.device
        self.env = None
        self.policy = None
        self.memory = None
        self.shared_state_dict = shared_state_dict

    def initialize(self):
        print('Build Environment for {}'.format(self.actor_name))
        if self.env is None:
            self.env = Env(self.args, self.device, options=self.args.options,
                           dummy=self.args.dummy, rank=self.rank)
        self.policy = Policy(self.env.action_dim).to(self.device)
        self.memory = Memory()

    def performing(self):
        torch.manual_seed(self.args.seed + self.rank)
        self.initialize()
        obs = self.env.reset()
        with torch.no_grad():
            while True:
                # Sync the local policy with the learner's shared weights.
                self.policy.load_state_dict(
                    self.shared_state_dict.state_dict())
                try:
                    self.policy.reset_rnn()
                    obs = self.env.reset()
                except Exception:
                    # Reset failed: start the rollout from the last frame of
                    # the previous one.
                    obs = obs[-1:]
                print(obs.shape)
                self.memory.observations.append(obs)
                for step in range(self.args.num_steps):
                    action, action_log_prob = self.policy(obs)
                    self.memory.actions.append(action)
                    self.memory.actions_log_probs.append(action_log_prob)
                    send_action = action[-1].cpu().numpy()
                    obs, reward, done = self.env.step(send_action)
                    self.memory.observations.append(obs)
                    self.memory.rewards.append(
                        torch.from_numpy(reward.astype(np.float32)))
                # Extra forward pass on the final observation; all but the
                # last element is stored to keep batch shapes aligned.
                action, action_log_prob = self.policy(obs)
                self.memory.actions.append(action[0:-1])
                self.memory.actions_log_probs.append(action_log_prob[0:-1])
                self.rollout_queue.put(self.memory.get_batch())
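# `Memory` is not defined here. A minimal sketch consistent with how `Actor`
# uses it; the field names follow the calls above, while the stacking in
# `get_batch` and the buffer reset are assumptions:
class Memory(object):
    def __init__(self):
        self.observations = []
        self.actions = []
        self.actions_log_probs = []
        self.rewards = []

    def get_batch(self):
        # Concatenate each list along the time dimension, then clear the
        # buffers so the actor starts the next rollout fresh.
        batch = (torch.cat(self.observations),
                 torch.cat(self.actions),
                 torch.cat(self.actions_log_probs),
                 torch.stack(self.rewards))
        self.__init__()
        return batch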