def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = ActorCritic(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    net.train()
    running_score = 0

    for e in range(3000):
        done = False
        score = 0

        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            if args.render:
                env.render()

            policy, value = net(state)
            action = get_action(policy, num_actions)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            # Penalize early termination unless the episode reached the 500-step cap.
            reward = reward if not done or score == 499 else -1

            transition = [state, next_state, action, reward, mask]
            train_model(net, optimizer, transition, policy, value)

            score += reward
            state = next_state

        # Undo the terminal -1 penalty so the logged score reflects episode length.
        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score

        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            # The global step should be the episode index, not the running score.
            writer.add_scalar('log/score', float(score), e)

        if running_score > args.goal_score:
            ckpt_path = os.path.join(args.save_path, 'model.pth')
            torch.save(net.state_dict(), ckpt_path)
            print('Running score exceeded args.goal_score; stopping training.')
            break
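# The call to get_action() in main() assumes a sampling helper that is not
# shown here. The sketch below is a minimal assumed version: it takes the
# policy head's output to be softmax probabilities over the discrete actions
# and samples one index on the CPU; the original project may sample differently.
import numpy as np

def get_action(policy, num_actions):
    # Sample one action index from the categorical distribution given by the policy output.
    probs = policy.detach().cpu().numpy()[0]
    action = np.random.choice(num_actions, 1, p=probs)[0]
    return action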
def run(args):
    device = torch.device("cpu")
    env = gym.make('SpaceInvaders-v0')
    state_size = env.observation_space.shape
    action_size = env.action_space.n

    model = ActorCritic([1, 4, 84, 84], action_size).to(device)
    opt = SharedRMSprop(model.parameters(),
                        lr=args.lr,
                        alpha=args.alpha,
                        eps=1e-8,
                        weight_decay=args.weight_decay,
                        momentum=args.momentum,
                        centered=False)
    opt_lock = mp.Lock()
    scheduler = LRScheduler(args)

    if args.load_fp:
        checkpoint = torch.load(args.load_fp)
        model.load_state_dict(checkpoint['model_state_dict'])
        opt.load_state_dict(checkpoint['optimizer_state_dict'])

    if args.train:
        start = time.time()

        model.share_memory()
        model.train()

        step_counter, max_reward, ma_reward, ma_loss = [
            mp.Value('d', 0.0) for _ in range(4)
        ]

        processes = []
        if args.num_procs == -1:
            args.num_procs = mp.cpu_count()
        for rank in range(args.num_procs):
            p = mp.Process(target=train,
                           args=(rank, args, device, model, opt, opt_lock,
                                 scheduler, step_counter, max_reward,
                                 ma_reward, ma_loss))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        if args.verbose > 0:
            print(f"Seconds taken: {time.time() - start}")
        if args.save_fp:
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    # 'optimizer_state_dict': opt.state_dict(),
                }, args.save_fp)

    if args.test:
        model.eval()
        test(args, device, model)
def train_single_player_return():
    max_episodes = 50000
    episodes_per_update = 10
    render = False
    gamma = 0.99
    lr = 0.005
    betas = (0.9, 0.999)
    path = 'models/single_player.pkl'

    env = SinglePlayerReturn(DiscreteActionBotSim())
    policy = ActorCritic()
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)

    r = []   # sliding window of recent episode rewards
    v = 0    # number of times the convergence criterion has been met
    i_episode = 0
    while i_episode < max_episodes:
        state = env.reset()
        done = False
        while not done:
            action = policy(state)
            state, reward, done, _ = env.step(action)
            policy.temp_rewards.append(reward)

        i_episode += 1
        r.append(reward)

        # Accumulate discounted returns; only take a gradient step
        # every `episodes_per_update` episodes.
        if i_episode % episodes_per_update != 0:
            policy.updateMemory(gamma)
            continue

        # Updating the policy:
        optimizer.zero_grad()
        loss = policy.calculateLoss()
        loss.backward()
        policy.clip_grads()
        optimizer.step()
        policy.clearMemory()

        if i_episode % 500 == 0:
            if len(r) > 50:
                print('Episode ', i_episode, ': avg reward ', sum(r) / len(r))
            else:
                print('Episode ', i_episode, ': avg reward n/a')

        if i_episode % 1000 == 0:
            torch.save(policy, path)

        if len(r) > 100:
            r.pop(0)
            if sum(r) / len(r) > 0.8:
                r = []
                v += 1
                print('CONVERGED v', v)
                break

    torch.save(policy, path)
class PPO():
    def __init__(self, state_dim, action_dim, lr, betas, gamma, K_epochs,
                 eps_clip, device):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.device = device

        self.policy = ActorCritic(state_dim, action_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=lr,
                                          betas=betas)
        # Start the old policy from the same weights as the current policy.
        self.policy_old = ActorCritic(state_dim, action_dim).to(self.device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def update(self, memory):
        # Monte Carlo estimate of the discounted returns
        rewards = []
        discount_reward = 0
        for reward in reversed(memory.rewards):
            discount_reward = reward + (self.gamma * discount_reward)
            rewards.insert(0, discount_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # Convert lists to tensors
        old_states = torch.stack(memory.states).to(self.device).detach()
        old_actions = torch.stack(memory.actions).to(self.device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(self.device).detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            # Evaluating old actions and values:
            logprobs, state_values, dist_entropy = self.policy.evaluate(
                old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta_old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding the clipped surrogate loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                1 + self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(
                state_values, rewards) - 0.01 * dist_entropy

            # Take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
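# PPO.update() reads states, actions, logprobs and rewards off a rollout buffer
# object. The buffer itself is not shown above; the sketch below is a minimal
# assumed interface (field names taken from update()), not necessarily the
# original project's Memory class.
class Memory:
    def __init__(self):
        self.states = []    # observation tensors
        self.actions = []   # action tensors
        self.logprobs = []  # log-probabilities under the old policy
        self.rewards = []   # per-step rewards

    def clear_memory(self):
        # Empty all buffers after each PPO update.
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]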
def train():
    # Default parameters:
    # gamma = 0.99
    # lr = 0.02
    # betas = (0.9, 0.999)
    # random_seed = 543

    render = False
    gamma = 0.99
    lr = 0.02
    betas = (0.9, 0.999)
    random_seed = 543

    torch.manual_seed(random_seed)

    env = gym.make('LunarLander-v2')
    env.seed(random_seed)

    policy = ActorCritic()
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    print(lr, betas)

    running_reward = 0
    for i_episode in range(0, 10000):
        state = env.reset()
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            policy.rewards.append(reward)
            running_reward += reward
            if render and i_episode > 1000:
                env.render()
            if done:
                break

        # Updating the policy:
        optimizer.zero_grad()
        loss = policy.calculateLoss(gamma)
        loss.backward()
        optimizer.step()
        policy.clearMemory()

        # Optionally save a checkpoint after 1000 episodes:
        # if i_episode > 999:
        #     torch.save(policy.state_dict(),
        #                './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))

        # Stop once the cumulative reward over the last 20 episodes exceeds 4000,
        # i.e. an average reward above 200 per episode.
        if running_reward > 4000:
            torch.save(policy.state_dict(),
                       './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
            print("########## Solved! ##########")
            test(name='LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
            break

        if i_episode % 20 == 0:
            running_reward = running_reward / 20
            print('Episode {}\tlength: {}\treward: {}'.format(
                i_episode, t, running_reward))
            running_reward = 0
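# policy.calculateLoss(gamma) and policy.clearMemory() above are methods of the
# ActorCritic module that are not shown. The standalone sketch below only
# illustrates one common way such a loss can be formed from the per-step
# buffers (log-probabilities, state values, rewards) the policy keeps; the
# project's actual implementation may differ.
import torch
import torch.nn.functional as F

def calculate_loss_sketch(logprobs, state_values, rewards, gamma):
    # Discounted returns, accumulated backwards through the episode.
    returns = []
    discounted = 0.0
    for reward in reversed(rewards):
        discounted = reward + gamma * discounted
        returns.insert(0, discounted)
    returns = torch.tensor(returns, dtype=torch.float32)
    returns = (returns - returns.mean()) / (returns.std() + 1e-5)

    loss = 0.0
    for logprob, value, ret in zip(logprobs, state_values, returns):
        advantage = ret - value.item()
        action_loss = -logprob * advantage                    # policy-gradient term
        value_loss = F.smooth_l1_loss(value.squeeze(), ret)   # critic regression term
        loss = loss + action_loss + value_loss
    return loss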
def run():
    # Build a throwaway env only to read the observation/action spaces.
    dummy_env = get_env(env_name)
    model = ActorCritic(dummy_env.observation_space.shape[0] * num_stack,
                        dummy_env.action_space)
    del dummy_env

    optimizer = optim.RMSprop(model.parameters(), lr, eps=eps, alpha=alpha)

    train(model, optimizer, lambda: get_env(env_name), num_envs, num_stack,
          num_steps, num_updates, gamma, value_loss_coef, entropy_coef,
          max_grad_norm)
def load_checkpoint(filepath):
    # Alternative: restore a fully pickled model object.
    # checkpoint = torch.load(filepath)
    # model = checkpoint['model']
    # model.load_state_dict(checkpoint['state_dict'])
    # for parameter in model.parameters():
    #     parameter.requires_grad = False
    # model.eval()
    #####################

    # NOTE: `state` and `params` are expected to be defined at module level;
    # the checkpoint paths come from `params`, so `filepath` is currently unused.
    model = ActorCritic(len(state), params.output_space)
    optimizer = my_optim.SharedAdam(model.parameters(), lr=params.lr)
    checkpoint = torch.load(params.file_path_shared_model)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    model.eval()

    model_test = ActorCritic(len(state), params.output_space)
    optimizer_test = my_optim.SharedAdam(model_test.parameters(), lr=params.lr)
    checkpoint = torch.load(params.file_path_shared_model_test)
    model_test.load_state_dict(checkpoint['state_dict'])
    optimizer_test.load_state_dict(checkpoint['optimizer'])
    model_test.eval()
    ###########################

    # Only the shared model is returned; model_test is loaded but not used here.
    return model