def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = ActorCritic(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    
    net.to(device)
    net.train()
    running_score = 0

    for e in range(3000):
        done = False
        score = 0

        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            if args.render:
                env.render()

            policy, value = net(state)
            action = get_action(policy, num_actions)

            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            
            mask = 0 if done else 1
            # CartPole-style shaping: penalize termination before the 500-step cap.
            reward = reward if not done or score == 499 else -1
            transition = [state, next_state, action, reward, mask]
            train_model(net, optimizer, transition, policy, value)

            score += reward
            state = next_state

        # Undo the -1 terminal penalty so the logged score matches the episode length.
        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(score), e)

        if running_score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds {}, stopping training'.format(args.goal_score))
            break
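The loop above relies on two helpers, get_action and train_model, that are not shown. Below is a minimal sketch of what they might look like, assuming the network's policy head outputs softmax probabilities and using a one-step advantage actor-critic update; the discount factor of 0.99 and the entropy weight are assumptions.

import numpy as np
import torch
import torch.nn.functional as F

def get_action(policy, num_actions):
    # Sample an action index from the categorical policy distribution.
    probs = policy.squeeze(0).detach().cpu().numpy()
    return int(np.random.choice(num_actions, p=probs))

def train_model(net, optimizer, transition, policy, value, gamma=0.99):
    state, next_state, action, reward, mask = transition

    # One-step TD target: r + gamma * V(s'); bootstrapping stops at terminals.
    with torch.no_grad():
        _, next_value = net(next_state)
    target = reward + mask * gamma * next_value.squeeze()

    # Advantage of the sampled action under the current value estimate.
    advantage = (target - value.squeeze()).detach()

    log_probs = torch.log(policy.squeeze(0) + 1e-8)
    actor_loss = -log_probs[action] * advantage
    critic_loss = F.mse_loss(value.squeeze(), target)
    entropy = -(policy.squeeze(0) * log_probs).sum()

    loss = actor_loss + critic_loss - 0.01 * entropy
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()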
Example #2
def run(args):
    device = torch.device("cpu")
    env = gym.make('SpaceInvaders-v0')
    state_size = env.observation_space.shape
    action_size = env.action_space.n

    model = ActorCritic([1, 4, 84, 84], action_size).to(device)
    opt = SharedRMSprop(model.parameters(),
                        lr=args.lr,
                        alpha=args.alpha,
                        eps=1e-8,
                        weight_decay=args.weight_decay,
                        momentum=args.momentum,
                        centered=False)
    opt_lock = mp.Lock()
    scheduler = LRScheduler(args)

    if args.load_fp:
        checkpoint = torch.load(args.load_fp)
        model.load_state_dict(checkpoint['model_state_dict'])
        opt.load_state_dict(checkpoint['optimizer_state_dict'])

    if args.train:
        start = time.time()

        model.share_memory()
        model.train()

        step_counter, max_reward, ma_reward, ma_loss = [
            mp.Value('d', 0.0) for _ in range(4)
        ]

        processes = []
        if args.num_procs == -1:
            args.num_procs = mp.cpu_count()
        for rank in range(args.num_procs):
            p = mp.Process(target=train,
                           args=(rank, args, device, model, opt, opt_lock,
                                 scheduler, step_counter, max_reward,
                                 ma_reward, ma_loss))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        if args.verbose > 0:
            print(f"Seconds taken: {time.time() - start}")
        if args.save_fp:
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    # 'optimizer_state_dict': opt.state_dict(),
                },
                args.save_fp)

    if args.test:
        model.eval()
        test(args, device, model)
Example #3
def train_single_player_return():
    max_episodes = 50000
    episodes_per_update = 10
    render = False
    gamma = 0.99
    lr = 0.005
    betas = (0.9, 0.999)
    path = 'models/single_player.pkl'

    env = SinglePlayerReturn(DiscreteActionBotSim())

    policy = ActorCritic()

    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)

    r = []
    v = 0  # convergence counter, incremented when the average reward crosses the threshold
    i_episode = 0
    while i_episode < max_episodes:
        state = env.reset()
        done = False
        while not done:
            action = policy(state)
            state, reward, done, i = env.step(action)
            policy.temp_rewards.append(reward)

        i_episode += 1
        r.append(reward)

        policy.updateMemory(gamma)
        if i_episode % episodes_per_update:
            continue

        # Updating the policy :
        optimizer.zero_grad()
        loss = policy.calculateLoss()
        loss.backward()
        policy.clip_grads()
        optimizer.step()
        policy.clearMemory()

        if i_episode % 500 == 0:
            if len(r) > 50:
                print('Episode ', i_episode, ': avg reward ', sum(r) / len(r))
            else:
                print('Episode ', i_episode, ': avg reward n/a')

        if i_episode % 1000 == 0:
            torch.save(policy, path)

        if len(r) > 100:
            r.pop(0)
            if sum(r) / len(r) > 0.8:
                r = []
                v += 1
                print('CONVERGED v', v)
                break

    torch.save(policy, path)
Example #4
class PPO():
    def __init__(self, state_dim, action_dim, lr, betas, gamma, K_epochs,
                 eps_clip, device):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.device = device

        self.policy = ActorCritic(state_dim, action_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=lr,
                                          betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim).to(self.device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def update(self, memory):
        # Monte Carlo estimate of the discounted returns
        rewards = []
        discount_reward = 0
        for reward in reversed(memory.rewards):
            discount_reward = reward + (self.gamma * discount_reward)
            rewards.insert(0, discount_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # Convert the lists to tensors
        old_states = torch.stack(memory.states).to(self.device).detach()
        old_actions = torch.stack(memory.actions).to(self.device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(self.device).detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            # Evaluating old actions and values:
            logprobs, state_values, dist_entropy = self.policy.evaluate(
                old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta_old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                1 + self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(
                state_values, rewards) - 0.01 * dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
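The PPO class above expects a memory object exposing states, actions, logprobs, and rewards lists, but neither the memory container nor the action-selection interface is shown. Below is a rough usage sketch; the Memory class and the policy_old.act() method are assumptions made for illustration.

import gym
import torch

class Memory:
    def __init__(self):
        self.states, self.actions, self.logprobs, self.rewards = [], [], [], []

    def clear(self):
        self.__init__()

env = gym.make('CartPole-v1')
device = torch.device('cpu')
ppo = PPO(state_dim=env.observation_space.shape[0],
          action_dim=env.action_space.n,
          lr=0.002, betas=(0.9, 0.999), gamma=0.99,
          K_epochs=4, eps_clip=0.2, device=device)
memory = Memory()

for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        state_t = torch.FloatTensor(state).to(device)
        # Assumed interface: act() samples an action and returns its log-prob.
        with torch.no_grad():
            action, logprob = ppo.policy_old.act(state_t)
        memory.states.append(state_t)
        memory.actions.append(action)
        memory.logprobs.append(logprob)
        state, reward, done, _ = env.step(action.item())
        memory.rewards.append(reward)

    # Run the clipped-surrogate update on the collected rollout, then reset it.
    ppo.update(memory)
    memory.clear()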
Example #5
def train():
    # Default parameters:
    #    gamma = 0.99
    #    lr = 0.02
    #    betas = (0.9, 0.999)
    #    random_seed = 543

    render = False
    gamma = 0.99
    lr = 0.02
    betas = (0.9, 0.999)
    random_seed = 543
    
    torch.manual_seed(random_seed)
    
    env = gym.make('LunarLander-v2')
    env.seed(random_seed)
    
    policy = ActorCritic()
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    print(lr, betas)
    
    running_reward = 0
    for i_episode in range(0, 10000):
        state = env.reset()
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            policy.rewards.append(reward)
            running_reward += reward
            if render and i_episode > 1000:
                env.render()
            if done:
                break
                    
        # Updating the policy :
        optimizer.zero_grad()
        loss = policy.calculateLoss(gamma)
        loss.backward()
        optimizer.step()        
        policy.clearMemory()
        
        # saving the model if episodes > 999 OR avg reward > 200 
        #if i_episode > 999:
        #    torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
        
        if running_reward > 4000:
            torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
            print("########## Solved! ##########")
            test(name='LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
            break
        
        if i_episode % 20 == 0:
            running_reward = running_reward/20
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
            running_reward = 0
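The loop above delegates the learning step to policy.calculateLoss(gamma) and policy.clearMemory(), which live on the ActorCritic class and are not shown. Below is an illustrative guess at what such a loss method typically computes, assuming the forward pass records per-step log-probabilities and value estimates in self.logprobs and self.state_values (both names are assumptions).

import torch
import torch.nn.functional as F

def calculateLoss(self, gamma=0.99):
    # Discounted returns, accumulated backwards over the stored rewards.
    returns = []
    discounted = 0.0
    for reward in reversed(self.rewards):
        discounted = reward + gamma * discounted
        returns.insert(0, discounted)
    returns = torch.tensor(returns, dtype=torch.float32)
    returns = (returns - returns.mean()) / (returns.std() + 1e-5)

    loss = 0
    for logprob, value, ret in zip(self.logprobs, self.state_values, returns):
        advantage = ret - value.item()
        action_loss = -logprob * advantage                    # policy-gradient term
        value_loss = F.smooth_l1_loss(value.squeeze(), ret)   # critic regression
        loss += action_loss + value_loss
    return loss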
Example #6
File: main.py Project: agajews/deep
def run():
    dummy_env = get_env(env_name)
    model = ActorCritic(dummy_env.observation_space.shape[0] * num_stack,
                        dummy_env.action_space)
    del dummy_env
    optimizer = optim.RMSprop(model.parameters(), lr, eps=eps, alpha=alpha)
    train(model, optimizer, lambda: get_env(env_name), num_envs, num_stack,
          num_steps, num_updates, gamma, value_loss_coef, entropy_coef,
          max_grad_norm)
Example #7
def load_checkpoint(filepath):
    # Note: the `filepath` argument is unused; checkpoint paths come from the
    # module-level `params` object, and `state` is expected to exist globally.
    #    checkpoint = torch.load(filepath)
    #    model = checkpoint['model']
    #    model.load_state_dict(checkpoint['state_dict'])
    #    for parameter in model.parameters():
    #        parameter.requires_grad = False
    #    model.eval()
    #####################
    model = ActorCritic(len(state), params.output_space)
    optimizer = my_optim.SharedAdam(model.parameters(), lr=params.lr)
    checkpoint = torch.load(params.file_path_shared_model)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    model.eval()

    model_test = ActorCritic(len(state), params.output_space)
    optimizer_test = my_optim.SharedAdam(model_test.parameters(), lr=params.lr)
    checkpoint = torch.load(params.file_path_shared_model_test)
    model_test.load_state_dict(checkpoint['state_dict'])
    optimizer_test.load_state_dict(checkpoint['optimizer'])
    model_test.eval()
    ###########################
    return model