Example 1
def coordinator(rank, args, share_model, exp_queues, model_params):
    assert len(exp_queues) == args.num_processes

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # print(device)

    model = ActorCritic()
    model.train()
    # model.load_state_dict(share_model.state_dict())
    for i in range(args.num_processes):
        model_params[i].put(model.state_dict())

    # if args.cuda:
    # model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5)
    entropy_coef = args.entropy_coef

    count = 0
    while True:
        count += 1
        if count >= 14000:
            entropy_coef = 1
        if count >= 17000:
            entropy_coef = 0.5
        if count >= 19000:
            entropy_coef = 0.1

        # assemble experiences from the agents
        for i in range(args.num_processes):
            s_batch, a_batch, r_batch, done = exp_queues[i].get()
            loss = compute_loss(args, s_batch, a_batch, r_batch, done, model,
                                entropy_coef)
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            if torch.isnan(loss):
                torch.save(s_batch, 's_batch-coor.pt')
                torch.save(loss, 'loss.pt')
                print('s_batch', s_batch)
                print('loss: ', loss)
                break
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            # for param in model.parameters():
            # param.grad.data.clamp_(-1, 1)
            optimizer.step()
        print('update model parameters ', count)
        if torch.isnan(loss):
            break
        # model.zero_grad()
        # if args.cuda:
        # model = model.cpu()
        for i in range(args.num_processes):
            model_params[i].put(model.state_dict())
        share_model.load_state_dict(model.state_dict())
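
For context, the coordinator above assumes worker processes that push their rollouts into exp_queues[i] and pull fresh weights from model_params[i]. A minimal worker sketch under that assumption (the rollout helper and the exact signature are hypothetical):

def agent(rank, args, exp_queue, param_queue, env):
    model = ActorCritic()
    while True:
        # sync with the coordinator before collecting new experience
        model.load_state_dict(param_queue.get())
        s_batch, a_batch, r_batch, done = rollout(env, model, args)  # hypothetical helper
        exp_queue.put((s_batch, a_batch, r_batch, done))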
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = ActorCritic(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    
    net.to(device)
    net.train()
    running_score = 0

    for e in range(3000):
        done = False
        score = 0

        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            if args.render:
                env.render()

            policy, value = net(state)
            action = get_action(policy, num_actions)

            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            
            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            transition = [state, next_state, action, reward, mask]
            train_model(net, optimizer, transition, policy, value)

            score += reward
            state = next_state

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(score), e)

        if running_score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds the goal score, stopping')
            break
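
get_action and train_model above are helpers that are not shown here. A plausible sketch of get_action, assuming the policy head returns a batch of action probabilities (names and shapes are assumptions):

import numpy as np

def get_action(policy, num_actions):
    policy = policy.detach().cpu().numpy()[0]              # probabilities for one state
    return np.random.choice(num_actions, 1, p=policy)[0]   # sample an action index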
Example 3
def run(args):
    device = torch.device("cpu")
    env = gym.make('SpaceInvaders-v0')
    state_size = env.observation_space.shape
    action_size = env.action_space.n

    model = ActorCritic([1, 4, 84, 84], action_size).to(device)
    opt = SharedRMSprop(model.parameters(),
                        lr=args.lr,
                        alpha=args.alpha,
                        eps=1e-8,
                        weight_decay=args.weight_decay,
                        momentum=args.momentum,
                        centered=False)
    opt_lock = mp.Lock()
    scheduler = LRScheduler(args)

    if args.load_fp:
        checkpoint = torch.load(args.load_fp)
        model.load_state_dict(checkpoint['model_state_dict'])
        opt.load_state_dict(checkpoint['optimizer_state_dict'])

    if args.train:
        start = time.time()

        model.share_memory()
        model.train()

        step_counter, max_reward, ma_reward, ma_loss = [
            mp.Value('d', 0.0) for _ in range(4)
        ]

        processes = []
        if args.num_procs == -1:
            args.num_procs = mp.cpu_count()
        for rank in range(args.num_procs):
            p = mp.Process(target=train,
                           args=(rank, args, device, model, opt, opt_lock,
                                 scheduler, step_counter, max_reward,
                                 ma_reward, ma_loss))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        if args.verbose > 0:
            print(f"Seconds taken: {time.time() - start}")
        if args.save_fp:
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    # 'optimizer_state_dict': opt.state_dict(),
                },
                args.save_fp)

    if args.test:
        model.eval()
        test(args, device, model)
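
The shared counters above (mp.Value('d', 0.0)) are meant to be updated from several worker processes at once. A minimal sketch of how a worker can do that safely with the value's built-in lock, assuming the worker receives step_counter as passed in the Process call above:

with step_counter.get_lock():
    step_counter.value += 1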
Example 4
class PPO():
    def __init__(self, state_dim, action_dim, lr, betas, gamma, K_epochs,
                 eps_clip, device):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.device = device

        self.policy = ActorCritic(state_dim, action_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=lr,
                                          betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim).to(self.device)

        self.MseLoss = nn.MSELoss()

    def update(self, memory):
        # Monte Carlo estimate of the returns
        rewards = []
        discount_reward = 0
        for reward in reversed(memory.rewards):
            discount_reward = reward + (self.gamma * discount_reward)
            rewards.insert(0, discount_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert lists to tensors
        old_states = torch.stack(memory.states).to(self.device).detach()
        old_actions = torch.stack(memory.actions).to(self.device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(self.device).detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            # Evaluating old actions and values:
            logprobs, state_values, dist_entropy = self.policy.evaluate(
                old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta_old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip)
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(
                state_values, rewards) - 0.01 * dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
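
update() above expects a rollout buffer holding rewards, states, actions and logprobs collected under the old policy. A minimal sketch of such a container (the field names follow the attributes accessed above; everything else is an assumption):

class Memory:
    def __init__(self):
        self.states, self.actions, self.logprobs, self.rewards = [], [], [], []

    def clear(self):
        # drop the rollout after each PPO update
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]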
Example 5
def train():
    # Defaults parameters:
    #    gamma = 0.99
    #    lr = 0.02
    #    betas = (0.9, 0.999)
    #    random_seed = 543

    render = False
    gamma = 0.99
    lr = 0.02
    betas = (0.9, 0.999)
    random_seed = 543
    
    torch.manual_seed(random_seed)
    
    env = gym.make('LunarLander-v2')
    env.seed(random_seed)
    
    policy = ActorCritic()
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    print(lr,betas)
    
    running_reward = 0
    for i_episode in range(0, 10000):
        state = env.reset()
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            policy.rewards.append(reward)
            running_reward += reward
            if render and i_episode > 1000:
                env.render()
            if done:
                break
                    
        # Updating the policy :
        optimizer.zero_grad()
        loss = policy.calculateLoss(gamma)
        loss.backward()
        optimizer.step()        
        policy.clearMemory()
        
        # saving the model if episodes > 999 OR avg reward > 200 
        #if i_episode > 999:
        #    torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
        
        if running_reward > 4000:
            torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
            print("########## Solved! ##########")
            test(name='LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
            break
        
        if i_episode % 20 == 0:
            running_reward = running_reward/20
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
            running_reward = 0
Example 6
class A2C:
    def __init__(self, state_dim, action_dim, cfg):
        self.gamma = cfg.gamma
        self.model = ActorCritic(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(cfg.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=cfg.lr)
        self.device = cfg.device
        self.loss = 0
        self.env = cfg.env

    def choose_action(self, state):
        state = torch.tensor([state], device=self.device, dtype=torch.float32)
        dist, value = self.model(state)
        action = dist.sample().item()
        return action, value, dist

    def update(self, values, next_values, step_rewards, log_probs, mask_dones, entropy):  # update using the data of one episode
        expected_values = []
        advantages = []
        actor_losses = []
        critic_losses = []
        for step in range(len(step_rewards)):
            # one-step TD target and advantage; keep them as tensors so gradients can flow
            expected_values.append(step_rewards[step] + self.gamma * next_values[step].squeeze() * mask_dones[step].squeeze())
            advantages.append(expected_values[step].detach() - values[step].squeeze())
            actor_losses.append(-advantages[step].detach() * log_probs[step].squeeze())
            critic_losses.append(nn.MSELoss()(values[step].squeeze(), expected_values[step].detach()))
        actor_loss = torch.stack(actor_losses).mean()
        critic_loss = torch.stack(critic_losses).mean()
        self.loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

    def save(self, path):
        model_checkpoint = os.path.join(path, self.env+'actor_critic.pt')
        torch.save(self.model.state_dict(), model_checkpoint)
        print('Model Saved!')

    def load(self, path):
        model_checkpoint = os.path.join(path, self.env+'actor_critic.pt')
        self.model.load_state_dict(torch.load(model_checkpoint))
        print('Model Loaded!')
Example 7
def train(rank, shared_model, optimizer):
    """
    :param rank: worker-ID
    :param shared_model: model to sync between workers
    :param optimizer:
    :return:
    """
    # torch.manual_seed(SEED + rank)
    ac_steps = 20  # number of rollout steps before each learning update
    max_episode_length = 10000  # episodes are cut off after this many steps
    gamma = 0.99
    tau = 1.0
    max_grad_norm = 50.0  # gradients are clipped to this norm for stability
    checkpoint_n = 20  # save a checkpoint every this many episodes

    env = create_atari_env(
        romname
    )  # create the game environment; romname selects the Atari game of your choice
    env.seed(SEED + rank)  # per-worker seed so runs are reproducible
    state = env.reset()
    # Wrap the pixel observation in a FloatTensor; unsqueeze(0) adds the batch dimension.
    state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor),
                     requires_grad=False)
    # Build the local model from the environment's observation and action spaces
    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    t = 0
    done = True  # start as "done" so the LSTM state is initialised on the first iteration
    episodes = 0
    reward_sum = 0
    reward_sum1 = 0
    start_time = time.time()
    best_reward = -999
    isbest = 0
    cx = hx = None
    while True:
        model.load_state_dict(shared_model.state_dict(
        ))  # Pull the up to date model from the shared model
        if done:  # need to reset LSTM cell's input
            # the LSTM units need their own output to feed into next step
            # input (hence the name of the kind: recurrent neural nets).
            # At the beginning of an episode, to get things started,
            # we need to allocate some initial values in the required format,
            # i.e. the same size as the output of the layer.
            #
            # see http://pytorch.org/docs/master/_modules/torch/nn/modules/rnn.html#LSTM
            # for details
            #
            # Optionally, you can remove LSTM to simplify the code
            # Think: what is the possible loss?
            cx = Variable(torch.zeros(1, 256)).type(
                FloatTensor
            )  # torch.zeros - setting the values to all zeros since there's nothing there yet
            hx = Variable(torch.zeros(1, 256)).type(FloatTensor)
        else:
            cx = Variable(
                cx.data)  # takes the last computed value for the next input
            hx = Variable(
                hx.data
            )  # basically this is to detach from previous comp graph

        states = []
        values = []
        log_probs = []
        rewards = []
        entropies = []

        for i in range(ac_steps):  # roll out up to ac_steps steps
            t += 1
            v, logit, (hx, cx) = model(
                (state, (hx, cx))
            )  # the model returns the value estimate, the action logits and the new LSTM state
            states.append(state)
            prob = F.softmax(logit)  # action probabilities
            log_prob = F.log_softmax(
                logit)  # log-probabilities, computed directly for numerical stability
            entropy = -(log_prob * prob).sum(
                1, keepdim=True
            )  # policy entropy; a bonus on it encourages exploration
            entropies.append(entropy)

            # detach so that backpropagation does not flow through the sampling operation
            action = prob.multinomial().detach(
            )  # detach -- so the backprop will NOT go through multinomial()
            # use the sampled action as an index to get the
            # corresponding log probability
            log_prob = log_prob.gather(
                1, action
            )  # gather picks out the log-probability of each taken action

            action = action.data[
                0,
                0]  # extract the action as a plain Python integer
            # step the environment: it returns the next state, the reward, the done flag,
            # and an info dict that we ignore here (hence the underscore).
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            reward_sum1 += reward  # a second running sum, averaged over checkpoint_n episodes below
            done = (done or t >= max_episode_length)
            if done:
                t_ = t
                t = 0
                state = env.reset()
                episodes += 1
                if episodes % 10 == 0:
                    time_str = time.strftime(
                        "%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
                    print("Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {}".format(time_str, rank, episodes,
                                                     reward_sum / 10.0, t_))
                    reward_sum = 0.0

                if episodes % checkpoint_n == 0:
                    ave_reward = reward_sum1 / checkpoint_n
                    if best_reward < ave_reward:
                        isbest = 1
                        best_reward = ave_reward

                    print("Saving checkpoint Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {} best_reward {}".format(
                              get_elapsed_time_str(), rank, episodes,
                              ave_reward, t_, best_reward))
                    checkpoint_fname = os.path.join(
                        args.savedir,
                        args.rom + '_worker' + str(rank) + '_' + str(episodes))
                    save_checkpoint(
                        {
                            'epoch': episodes,
                            'average_reward': ave_reward,
                            'time': time.time(),
                            'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                        }, isbest, checkpoint_fname)
                    reward_sum1 = 0.0

            state = Variable(
                torch.from_numpy(state).unsqueeze(0).type(FloatTensor),
                requires_grad=False)
            reward = max(min(reward, 1), -1)
            values.append(v)
            log_probs.append(log_prob)  # Keep record
            rewards.append(reward)

            if done:
                break

        # We reach here because either
        # i) an episode ended (e.g. game over), or
        # ii) we have explored ac_steps steps into the future and it is now
        #     time to look back and summarise the rollout.
        if done:
            R = torch.zeros(1, 1).type(
                FloatTensor
            )  # terminal states are worth 0
        else:
            value, _, _ = model(
                (state, (hx, cx))
            )  # otherwise bootstrap with the model's value estimate for the last state
            R = value.data

        values.append(Variable(R))
        critic_loss = 0
        actor_loss = 0
        R = Variable(R)
        gae = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]  # R - longterm reward
            advantage = R - values[
                i]  # type: Variable, advantage against the average

            # Compare the actual long-term reward. Note: we are reversing the
            # experience of a complete trajectory. If the full length is 100
            # (time indexes are among 0, 1, 2, ..., 99), and now i=50, that means
            # we have processed all information in steps, 51, 52, ..., 99
            # and R will contain the actual long term reward at time step 51 at
            # the beginning of this step. The above computation injects the reward
            # information in step 50 to R. Now R is the long-term reward at this
            # step.
            #
            # So-called advantage is then the "unexpected gain/loss". It forms the base
            # of evaluating the action taken at this step (50).
            #
            # critic_loss accumulates those "exceptional gain/loss" so that later we will
            # adjust our expectation for each state and reduce future exceptions (to better
            # evaluate actions, say, the advantage against expectation is only meaningful
            # when the expectation itself is meaningful).
            critic_loss += 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            # see https://arxiv.org/abs/1506.02438
            # we can use advantage in the computation of the direction to adjust policy,
            # but the manipulation here improves stability (as claims by the paper).
            #
            # Note advantage implicitly contributes to GAE, since it helps
            # achieve a good estimation of state-values.
            td_error = rewards[i] + gamma * values[i + 1].data - values[i].data
            gae = gae * gamma * tau + td_error

            # log_probs[i] is the log-probability(action-taken). If GAE is great, that
            # means the choice we had made was great, and we want to make the same
            # action decision in future -- make log_probs[i] large. Otherwise,
            # we add log_probs to our regret and will be less likely to take the same
            # action in future.
            #
            # entropy means the variety in a probabilistic distribution,
            # to encourage big entropies is to make more exploration.
            actor_loss -= (Variable(gae) * log_probs[i] + 0.01 * entropies[i])

        optimizer.zero_grad(
        )  # clear any previously accumulated gradients
        total_loss = actor_loss + critic_loss * 0.5  # type: Variable
        total_loss.backward()  # backpropagate through the rollout
        # this is to improve stability
        torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
        ensure_shared_grads(
            model, shared_model)  # copy local gradients into the shared model so the shared optimizer updates it
        optimizer.step()
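
The reversed loop above interleaves two recursions: the discounted return R and the GAE accumulator gae. The same bookkeeping on plain floats, for reference (a standalone sketch, not part of the original code):

def gae_returns(rewards, values, gamma=0.99, tau=1.0):
    # values must hold len(rewards) + 1 entries; the last one is the bootstrap value R.
    R, gae, out = values[-1], 0.0, []
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]
        td_error = rewards[i] + gamma * values[i + 1] - values[i]
        gae = gae * gamma * tau + td_error
        out.append((R, gae))
    return list(reversed(out))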
Example 8
            rewards = trajectory_collector.scores_by_episode[n_episodes : ]

            # record the number of "dones" per trajectory
            writer.add_scalar("episodes_per_trajectory", len(rewards), step)
            step += 1

            end_time = time.time()
            for idx_r, reward in enumerate(rewards):
                mean_reward = reward_tracker.reward(reward, n_episodes + idx_r, end_time - start if start is not None else 0)
                
                # we switch LR to 1e-4 in the middle
                scheduler.step()

                # checkpoint new best scores (and periodically every SAVE_EVERY episodes)
                if n_episodes > 0 and (reward > max_score or (n_episodes + idx_r) % SAVE_EVERY == 0):
                    torch.save(policy.state_dict(), os.path.join(ckpt_path, f'checkpoint_actor_{reward:.03f}.pth'))
                    max_score = reward

                if mean_reward is not None and mean_reward >= SOLVED_SCORE:
                    torch.save(policy.state_dict(), os.path.join(ckpt_path, f'checkpoint_actor_{mean_reward:.03f}.pth'))
                    solved_episode = n_episodes + idx_r - AVG_WIN - 1
                    print(f"Solved in {solved_episode if solved_episode > 0 else n_episodes + idx_r} episodes")
                    solved = True
                    break

            if solved:
                break

            start = time.time()
            # train agents in a round-robin for the number of epochs
            for epoch in range(EPOCHS):
Example 9
def main():
    # pick the device for network computation
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # build the network
    net = ActorCritic()
    net = net.to(device)

    # set up the optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=3e-4)

    # set up the environments
    envs = Envs(NUM_WORKERS, gamma=GAMMA)

    # start training
    for episode in range(EPISODES):

        # collect one episode of data from the parallel environments
        net.eval()
        with torch.no_grad():
            states = envs.reset()
            done = False
            while not done:
                states = states.to(device)
                _, policys = net(states)
                policys = policys.cpu()  # easier to post-process on the CPU
                # zero out the probability of illegal moves
                for i in range(NUM_WORKERS):
                    if envs.reversis[i].next != 0:
                        for y, x in itertools.product(range(SIZE), repeat=2):
                            if not envs.reversis[i].good[y][x]:
                                policys[i][y * SIZE + x] = 0.
                            else:
                                policys[i][y * SIZE + x] += 1e-8  # avoid an all-zero probability vector
                actions = Categorical(probs=policys).sample()
                done, states = envs.step(actions)

        envs.setReturn()
        data = EpisodeData(envs.readHistory())
        loader = DataLoader(data,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            num_workers=2)

        # train the network
        net.train()

        # running metrics
        value_loss_total = 0.
        entropy_total = 0.

        for states, actions, Returns in loader:
            states, actions, Returns = states.to(device), actions.to(
                device), Returns.to(device)
            values, policys = net(states)

            dist = Categorical(probs=policys)
            action_log_probs = dist.log_prob(actions).view(-1, 1)
            dist_entropy = dist.entropy().mean()  # we want a larger entropy, to keep the model exploring

            advantages = Returns.view(-1, 1) - values

            value_loss = advantages.pow(2).mean()
            action_loss = -(advantages.detach() * action_log_probs).mean()

            optimizer.zero_grad()
            (VALUE_LOSS_COEF * value_loss + action_loss -
             ENTROPY_LOSS_COEF * dist_entropy).backward()
            optimizer.step()

            value_loss_total += value_loss.item()
            entropy_total += dist_entropy.item()

        print('Episode: {:>10d}, Value Loss: {:g}, Entropy: {:g}'.format(
            episode, value_loss_total / len(loader),
            entropy_total / len(loader)),
              flush=True)

        if episode != 0 and episode % SAVE_INTERVAL == 0:
            if not os.path.isdir('models'):
                os.mkdir('models')
            torch.save(net.state_dict(),
                       'models/{}.pt'.format(episode // SAVE_INTERVAL))
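
Zeroing the illegal moves works because Categorical renormalizes whatever non-negative probs it is given, so only legal moves can ever be sampled. A quick illustration:

import torch
from torch.distributions import Categorical

probs = torch.tensor([0.0, 0.3, 0.0, 0.1])  # illegal positions already zeroed
dist = Categorical(probs=probs)             # internally rescaled to [0.0, 0.75, 0.0, 0.25]
action = dist.sample()                      # indices with zero probability are never drawn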
Example 10
        advantages = returns - values

        actor_loss = -(log_probs * advantages.detach()).mean()
        critic_loss = advantages.pow(2).mean()
        entropy_loss = entropies.mean()

        loss = args.actor_loss_coefficient * actor_loss + \
               args.critic_loss_coefficient * critic_loss - \
               args.entropy_loss_coefficient * entropy_loss

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(actor_critic.parameters(),
                                       args.max_grad_norm)
        optimizer.step()

        if len(rewards) > 1:
            end = time.time()
            total_num_steps = (episode_n +
                               1) * args.num_episodes * args.num_steps
            print("********************************************************")
            print("Episode: {0}, total steps: {1}".format(
                episode_n, total_num_steps))
            print("Episode rewards: {:.1f}".format(np.sum(rewards)))
            print("Actor loss: {:.5f}, Critic loss: {:.5f}, Entropy: {:.5f}".
                  format(actor_loss.item(), critic_loss.item(),
                         entropy_loss.item()))
            print("********************************************************")
        if episode_n % args.storage_freq == 0:
            torch.save(actor_critic.state_dict(),
                       args.storage_path + 'a2c_model.pt')
Example 11
episode_length = 1
while True:

    if episode_length % steps == 0:
        model.low_lr(rate)

    if (episode_length % 1000 == 0) and (episode_length > 20000):
        if dataset == 'cifar':
            model.eval()
            map = test_util.test(Dtest, model, batch_size, bit_len)
            file = open(logpath, "a")
            file.write('#### map=' + str(map) + '\n')
            file.close()
        path = checkpoint_path + '/' + str(episode_length) + '.model'
        torch.save(model.state_dict(), path)

    model.train()

    if dataset == 'cifar':
        ori, pos, neg = traintest.get_batch_cifar_nus(batch_size)
    else:
        ori, pos, neg = traintest.get_batch_flk_nus(batch_size)

    ori = Variable(ori).cuda()
    pos = Variable(pos).cuda()
    neg = Variable(neg).cuda()

    hash_o = Variable(torch.zeros(batch_size, 1).cuda())
    hash_p = Variable(torch.zeros(batch_size, 1).cuda())
    hash_n = Variable(torch.zeros(batch_size, 1).cuda())
Example 12
def train(args, scorer, summary_writer=None):
    device = args.device
    env = create_crop_env(args, scorer)

    model = ActorCritic(args).to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # import pdb; pdb.set_trace();
    training_log_file = open(os.path.join(
        args.model_save_path, 'training.log'), 'w')
    validation_log_file = open(os.path.join(
        args.model_save_path, 'validation.log'), 'w')

    training_log_file.write('Epoch,Cost\n')
    validation_log_file.write('Epoch,Cost\n')

    for train_iter in range(args.n_epochs):
        episode = BatchEpisodes(batch_size=args.batch_size, gamma=args.gamma, device=device)

        for _ in range(args.batch_size):
            done = True
            observation_np = env.reset()

            observations_np, rewards_np, actions_np, hs_ts, cs_ts = [], [], [], [], []
            cx = torch.zeros(1, args.hidden_dim).to(device)
            hx = torch.zeros(1, args.hidden_dim).to(device)
            
            for step in range(args.num_steps):
                observations_np.append(observation_np[0])
                hs_ts.append(hx)
                cs_ts.append(cx)

                with torch.no_grad():
                    observation_ts = torch.from_numpy(observation_np).to(device)
                    value_ts, logit_ts, (hx, cx) = model((observation_ts,
                                                (hx, cx)))       
                    prob = F.softmax(logit_ts, dim=-1)         
                    action_ts = prob.multinomial(num_samples=1).detach()
                
                action_np = action_ts.cpu().numpy()
                actions_np.append(action_np[0][0])
                observation_np, reward_num, done, _ = env.step(action_np)
                if step == args.num_steps - 1:
                    reward_num = 0 if done else value_ts.item()
                rewards_np.append(reward_num)

                if done:
                    break

            observations_np, actions_np, rewards_np = \
                map(lambda x: np.array(x).astype(np.float32), [observations_np, actions_np, rewards_np])
            episode.append(observations_np, actions_np, rewards_np, hs_ts, cs_ts)

        log_probs = []
        values = []
        entropys = []
        for i in range(len(episode)):
            (hs_ts, cs_ts) = episode.hiddens[0][i], episode.hiddens[1][i]
            value_ts, logit_ts, (_, _) = model((episode.observations[i], (hs_ts, cs_ts)))
            prob = F.softmax(logit_ts, dim=-1)
            log_prob = F.log_softmax(logit_ts, dim=-1)
            entropy = -(log_prob * prob).sum(1)
            log_prob = log_prob.gather(1, episode.actions[i].unsqueeze(1).long())
            log_probs.append(log_prob)
            entropys.append(entropy)
            values.append(value_ts)

        log_probs_ts = torch.stack(log_probs).squeeze(2)
        values_ts = torch.stack(values).squeeze(2)
        entropys_ts = torch.stack(entropys)

        advantages_ts = episode.gae(values_ts)
        advantages_ts = weighted_normalize(advantages_ts, weights=episode.mask)
        policy_loss = - weighted_mean(log_probs_ts * advantages_ts, dim=0,
                weights=episode.mask)
        # import pdb; pdb.set_trace();
        value_loss = weighted_mean((values_ts - episode.returns).pow(2), dim=0,
                weights = episode.mask)
        entropy_loss = - weighted_mean(entropys_ts, dim=0,
                weights = episode.mask)
        
        optimizer.zero_grad()
        tot_loss = policy_loss + entropy_loss + args.value_loss_coef * value_loss
        tot_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        print("Epoch [%2d/%2d] : Tot Loss: %5.5f, Policy Loss: %5.5f, Value Loss: %5.5f, Entropy Loss: %5.5f" %
              (train_iter, args.n_epochs, tot_loss.item(), policy_loss.item(), value_loss.item(), entropy_loss.item()))
        # print("Train_iter: ", train_iter, " Total Loss: ", tot_loss.item(), " Value Loss: ", value_loss.item(), " Policy Loss: ", policy_loss.item(), "Entropy Loss: ", entropy_loss.item())
        if summary_writer:
            summary_writer.add_scalar('loss_policy', policy_loss.item(), train_iter)
            summary_writer.add_scalar('loss_value', value_loss.item(), train_iter)
            summary_writer.add_scalar('loss_entropy', entropy_loss.item(), train_iter)
            summary_writer.add_scalar('loss_tot', tot_loss.item(), train_iter)
        train_iter += 1

        if (train_iter + 1) % args.save_per_epoch == 0:
            torch.save(model.state_dict(), os.path.join(args.model_save_path,
                                                        'model_{}_{}.pth').format(train_iter, tot_loss.item()))

        training_log_file.write('{},{}\n'.format(train_iter, tot_loss.item()))
        validation_log_file.write('{},{}\n'.format(train_iter, 0))
        training_log_file.flush()
        validation_log_file.flush()

    training_log_file.close()
    validation_log_file.close()
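
weighted_mean and weighted_normalize above are helpers (not shown) that average and normalize only over the valid timesteps indicated by episode.mask. A plausible sketch of weighted_mean under that assumption:

def weighted_mean(tensor, dim=0, weights=None):
    # mean over the entries selected by the (0/1) mask in `weights`;
    # `dim` is accepted for call compatibility, the reduction here is global
    if weights is None:
        return tensor.mean()
    return (tensor * weights).sum() / weights.sum()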
Example 13
class A3C():
    '''Implementation of N-step Asynchronous Advantage Actor-Critic.'''
    def __init__(self, args, env, train=True):
        self.args = args
        self.set_random_seeds()
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        # Create the environment.
        self.env = gym.make(env)
        self.environment_name = env

        # Setup model.
        self.policy = ActorCritic(4, self.env.action_space.n)
        self.policy.apply(self.initialize_weights)

        # Setup critic model.
        self.critic = ActorCritic(4, self.env.action_space.n)
        self.critic.apply(self.initialize_weights)

        # Setup optimizer.
        self.eps = 1e-10  # To avoid divide-by-zero error.
        self.policy_optimizer = optim.Adam(self.policy.parameters(),
                                           lr=args.policy_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=args.critic_lr)

        # Model weights path.
        self.timestamp = datetime.now().strftime(
            'a2c-breakout-%Y-%m-%d_%H-%M-%S')
        self.weights_path = 'models/%s/%s' % (self.environment_name,
                                              self.timestamp)

        # Load pretrained weights.
        if args.weights_path: self.load_model()
        self.policy.to(self.device)
        self.critic.to(self.device)

        # Video render mode.
        if args.render:
            self.policy.eval()
            self.generate_episode(render=True)
            self.plot()
            return

        # Data for plotting.
        self.rewards_data = []  # n * [epoch, mean(returns), std(returns)]

        # Network training mode.
        if train:
            # Tensorboard logging.
            self.logdir = 'logs/%s/%s' % (self.environment_name,
                                          self.timestamp)
            self.summary_writer = SummaryWriter(self.logdir)

            # Save hyperparameters.
            with open(self.logdir + '/training_parameters.json', 'w') as f:
                json.dump(vars(self.args), f, indent=4)

    def initialize_weights(self, layer):
        if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def set_random_seeds(self):
        torch.manual_seed(self.args.random_seed)
        np.random.seed(self.args.random_seed)
        torch.backends.cudnn.benchmark = True

    def save_model(self, epoch):
        '''Helper function to save model state and weights.'''
        if not os.path.exists(self.weights_path):
            os.makedirs(self.weights_path)
        torch.save(
            {
                'policy_state_dict': self.policy.state_dict(),
                'policy_optimizer': self.policy_optimizer.state_dict(),
                'critic_state_dict': self.critic.state_dict(),
                'critic_optimizer': self.critic_optimizer.state_dict(),
                'rewards_data': self.rewards_data,
                'epoch': epoch
            }, os.path.join(self.weights_path, 'model_%d.h5' % epoch))

    def load_model(self):
        '''Helper function to load model state and weights. '''
        if os.path.isfile(self.args.weights_path):
            print('=> Loading checkpoint', self.args.weights_path)
            self.checkpoint = torch.load(self.args.weights_path)
            self.policy.load_state_dict(self.checkpoint['policy_state_dict'])
            self.policy_optimizer.load_state_dict(
                self.checkpoint['policy_optimizer'])
            self.critic.load_state_dict(self.checkpoint['critic_state_dict'])
            self.critic_optimizer.load_state_dict(
                self.checkpoint['critic_optimizer'])
            self.rewards_data = self.checkpoint['rewards_data']
        else:
            raise Exception('No checkpoint found at %s' %
                            self.args.weights_path)

    def train(self):
        '''Trains the model, one episode per epoch, with n-step advantage actor-critic updates.'''
        for epoch in range(self.args.num_episodes):
            # Generate episode data.
            returns, log_probs, value_function, train_rewards = self.generate_episode(
            )
            self.summary_writer.add_scalar('train/cumulative_rewards',
                                           train_rewards, epoch)
            self.summary_writer.add_scalar('train/trajectory_length',
                                           returns.size()[0], epoch)

            # Compute loss and policy gradient.
            self.policy_optimizer.zero_grad()
            policy_loss = ((returns - value_function.detach()) *
                           -log_probs).mean()
            policy_loss.backward()
            self.policy_optimizer.step()

            self.critic_optimizer.zero_grad()
            critic_loss = F.mse_loss(returns, value_function)
            critic_loss.backward()
            self.critic_optimizer.step()

            # Test the model.
            if epoch % self.args.test_interval == 0:
                self.policy.eval()
                print('\nTesting')
                rewards = [
                    self.generate_episode(test=True)
                    for epoch in range(self.args.test_episodes)
                ]
                rewards_mean, rewards_std = np.mean(rewards), np.std(rewards)
                print(
                    'Test Rewards (Mean): %.3f | Test Rewards (Std): %.3f\n' %
                    (rewards_mean, rewards_std))
                self.rewards_data.append([epoch, rewards_mean, rewards_std])
                self.summary_writer.add_scalar('test/rewards_mean',
                                               rewards_mean, epoch)
                self.summary_writer.add_scalar('test/rewards_std', rewards_std,
                                               epoch)
                self.policy.train()

            # Logging.
            if epoch % self.args.log_interval == 0:
                print(
                    'Epoch: {0:05d}/{1:05d} | Policy Loss: {2:.3f} | Value Loss: {3:.3f}'
                    .format(epoch, self.args.num_episodes, policy_loss,
                            critic_loss))
                self.summary_writer.add_scalar('train/policy_loss',
                                               policy_loss, epoch)
                self.summary_writer.add_scalar('train/critic_loss',
                                               critic_loss, epoch)

            # Save the model.
            if epoch % self.args.save_interval == 0:
                self.save_model(epoch)

        self.save_model(epoch)
        self.summary_writer.close()

    def generate_episode(self,
                         gamma=0.99,
                         test=False,
                         render=False,
                         max_iters=10000):
        '''
        Generates an episode by executing the current policy in the given env.
        Returns:
        - a list of states, indexed by time epoch
        - a list of actions, indexed by time epoch
        - a list of cumulative discounted returns, indexed by time epoch
        '''
        iters = 0
        done = False
        state = self.env.reset()

        # Set video save path if render enabled.
        if render:
            save_path = 'videos/%s/epoch-%s' % (self.environment_name,
                                                self.checkpoint['epoch'])
            if not os.path.exists(save_path): os.makedirs(save_path)
            monitor = gym.wrappers.Monitor(self.env, save_path, force=True)

        batches = []
        states = [torch.zeros(84, 84, device=self.device).float()] * 3
        rewards, returns = [], []
        actions, log_probs = [], []

        while not done:
            # Run policy on current state to log probabilities of actions.
            states.append(
                torch.tensor(preprocess(state),
                             device=self.device).float().squeeze(0))
            batches.append(torch.stack(states[-4:]))
            action_probs = self.policy.forward(
                batches[-1].unsqueeze(0)).squeeze(0)

            # Sample action from the log probabilities.
            if test and self.args.det_eval: action = torch.argmax(action_probs)
            else:
                action = torch.argmax(
                    torch.distributions.Multinomial(
                        logits=action_probs).sample())
            actions.append(action)
            log_probs.append(action_probs[action])

            # Run simulation with current action to get new state and reward.
            if render: monitor.render()
            state, reward, done, _ = self.env.step(action.cpu().numpy())
            rewards.append(reward)

            # Break if the episode takes too long.
            iters += 1
            if iters > max_iters: break

        # Save video and close rendering.
        cum_rewards = np.sum(rewards)
        if render:
            monitor.close()
            print('\nCumulative Rewards:', cum_rewards)
            return

        # Return cumulative rewards for test mode.
        if test: return cum_rewards

        # Scale the rewards; the loop below then walks them in reverse, from T-1 down to 0.
        rewards = np.array(rewards) / self.args.reward_normalizer

        # Compute value.
        values = []
        minibatches = torch.split(torch.stack(batches), 256)
        for minibatch in minibatches:
            values.append(
                self.critic.forward(minibatch, action=False).squeeze(1))
        values = torch.cat(values)
        discounted_values = values * gamma**self.args.n

        # Compute the cumulative discounted returns.
        n_step_rewards = np.zeros((1, self.args.n))
        for i in reversed(range(rewards.shape[0])):
            if i + self.args.n >= rewards.shape[0]:
                V_end = 0
            else:
                V_end = discounted_values[i + self.args.n]
            n_step_rewards[0, :-1] = n_step_rewards[0, 1:] * gamma
            n_step_rewards[0, -1] = rewards[i]

            n_step_return = torch.tensor(
                n_step_rewards.sum(), device=self.device).unsqueeze(0) + V_end
            returns.append(n_step_return)

        # Normalize returns.
        # returns = torch.stack(returns)
        # mean_return, std_return = returns.mean(), returns.std()
        # returns = (returns - mean_return) / (std_return + self.eps)

        return torch.stack(returns[::-1]).detach().squeeze(1), torch.stack(
            log_probs), values.squeeze(), cum_rewards

    def plot(self):
        # Save the plot.
        filename = os.path.join(
            'plots',
            *self.args.weights_path.split('/')[-2:]).replace('.h5', '.png')
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))

        # Make error plot with mean, std of rewards.
        data = np.asarray(self.rewards_data)
        plt.errorbar(data[:, 0],
                     data[:, 1],
                     data[:, 2],
                     lw=2.5,
                     elinewidth=1.5,
                     ecolor='grey',
                     barsabove=True,
                     capthick=2,
                     capsize=3)
        plt.title('Cumulative Rewards (Mean/Std) Plot for A3C Algorithm')
        plt.xlabel('Number of Episodes')
        plt.ylabel('Cumulative Rewards')
        plt.grid()
        plt.savefig(filename, dpi=300)
        plt.show()
Example 14
def train(rank, shared_model, optimizer):
    """
    :param rank: worker-ID
    :param shared_model: model to sync between workers
    :param optimizer:
    :return:
    """
    # torch.manual_seed(SEED + rank)
    ac_steps = 20
    max_episode_length = 10000
    gamma = 0.99
    tau = 1.0
    max_grad_norm = 50.0
    checkpoint_n = 20

    env = create_atari_env(romname)
    env.seed(SEED + rank)
    state = env.reset()
    state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor), requires_grad=False)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    t = 0
    done = True
    episodes = 0
    reward_sum = 0
    reward_sum1 = 0
    start_time = time.time()
    best_reward = -999
    isbest = 0
    cx = hx = None
    while True:
        model.load_state_dict(shared_model.state_dict())
        if done:  # need to reset LSTM cell's input
            cx = Variable(torch.zeros(1, 256)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 256)).type(FloatTensor)
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)  # detach from the previous computation graph

        states = []
        values = []
        log_probs = []
        rewards = []
        entropies = []

        for i in range(ac_steps):
            t += 1
            v, logit, (hx, cx) = model((state, (hx, cx)))
            states.append(state)
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial().detach()  # detach -- so backprop will NOT go through multinomial()
            log_prob = log_prob.gather(1, action)
            action = action.data[0, 0]
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            reward_sum1 += reward
            done = done or t >= max_episode_length
            if done:
                t_ = t
                t = 0
                state = env.reset()
                episodes += 1
                if episodes % 10 == 0:
                    time_str = time.strftime(
                        "%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
                    print("Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {}".
                          format(time_str, rank, episodes, reward_sum / 10.0, t_))
                    reward_sum = 0.0

                if episodes % checkpoint_n == 0:
                    ave_reward = reward_sum1 / checkpoint_n
                    if best_reward < ave_reward:
                        isbest = 1
                        best_reward = ave_reward

                    print("Saving checkpoint Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {} best_reward {}".
                          format(get_elapsed_time_str(), rank, episodes, ave_reward, t_, best_reward))
                    checkpoint_fname = os.path.join(
                        args.savedir,
                        args.rom + '_worker' + str(rank) + '_' + str(episodes))
                    save_checkpoint({'epoch': episodes,
                                     'average_reward': ave_reward,
                                     'time': time.time(),
                                     'state_dict': model.state_dict(),
                                     'optimizer': optimizer.state_dict(),
                                     }, isbest, checkpoint_fname)
                    reward_sum1 = 0.0

            state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor), requires_grad=False)
            reward = max(min(reward, 1), -1)
            values.append(v)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # We reach here because either
        # i) an episode ended (e.g. game over), or
        # ii) we have explored ac_steps steps into the future and it is now
        #     time to look back and summarise the rollout.
        if done:
            R = torch.zeros(1, 1).type(FloatTensor)
        else:
            value, _, _ = model((state, (hx, cx)))
            R = value.data

        values.append(Variable(R))
        critic_loss = 0
        actor_loss = 0
        R = Variable(R)
        gae = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            advantage = R - values[i]  # type: Variable
            critic_loss += 0.5 * advantage.pow(2)
            td_error = rewards[i] + gamma * values[i + 1].data - values[i].data
            gae = gae * gamma * tau + td_error
            actor_loss -= (Variable(gae) * log_probs[i] + 0.01 * entropies[i])

        optimizer.zero_grad()
        total_loss = actor_loss + critic_loss * 0.5  # type: Variable
        total_loss.backward()  # backpropagate through the rollout
        torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
Example 15
def test(rank, args, shared_model):

    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    (f, ckpt_path), (log_dir, ckpt_dir) = setup(args)
    if args.task == 'eval':
        env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(args.env_name), force=True)
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    if args.task == 'eval':
        reward_list = []
    done = True
    #env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(args.env_name), force=True)
    start_time = time.time()
    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_i = 0
    episode_length = 0
    try:
        while True:
            episode_length += 1
            # Sync with the shared model
            if done:
                model.load_state_dict(shared_model.state_dict())
                cx = Variable(torch.zeros(1, 128), volatile=True)
                hx = Variable(torch.zeros(1, 128), volatile=True)
            else:
                cx = Variable(cx.data, volatile=True)
                hx = Variable(hx.data, volatile=True)

            # for mujoco, env returns DoubleTensor
            value, mu, sigma_sq, (hx, cx) = model(
                (Variable(state.float().unsqueeze(0)), (hx, cx)))
            sigma_sq = F.softplus(sigma_sq)
            eps = torch.randn(mu.size())
            # sample an action from the Gaussian policy
            action = (mu + sigma_sq.sqrt() * Variable(eps)).data

            state, reward, done, _ = env.step(action[0, 0])
            if args.display:
                env.render()
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward

            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True

            if done:
                episode_i += 1
                if args.task == 'eval':
                    reward_list.append(reward_sum)
                if args.task == 'eval' and episode_i >= 100:
                    print("Testing over %d episodes, Average reward = %f" %
                          (episode_i, sum(reward_list) / episode_i))
                    break
                if episode_i % args.save_freq == 0:
                    torch.save(model.state_dict(),
                               os.path.join(ckpt_dir, args.env_name + "." +
                                            args.model_name + "." + str(episode_i) + ".pkl"))
                info_str = "Time {}, episode reward {}, episode length {}".format(
                    time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length)
                print(info_str)
                f.write(info_str + '\n')
                reward_sum = 0
                episode_length = 0
                actions.clear()
                state = env.reset()
                if args.task == 'train':
                    time.sleep(60)

            state = torch.from_numpy(state)
    except KeyboardInterrupt:
        env.close()
        f.close()
        torch.save(model.state_dict(), ckpt_path)
Example 16
File: main.py Project: PGKANG/PPO
def main(args):
    current_dir = os.path.abspath('.')
    exp_dir = current_dir + '/results/exp/'
    model_dir = current_dir + '/results/model/'
    os.makedirs(exp_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)
    writer = SummaryWriter(exp_dir)

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.device == 'cuda':
        torch.cuda.manual_seed(args.seed)

    sampler = MemorySampler(args)
    num_inputs, num_actions = sampler.get_space

    network = ActorCritic(num_inputs, num_actions, layer_norm=args.layer_norm)
    optimizer = opt.Adam(network.parameters(), lr=args.lr)

    clip_now = args.clip

    for i_episode in range(args.num_episode):
        # step1: perform current policy to collect trajectories
        # this is an on-policy method!
        memory = sampler.sample(network)

        # step2: extract variables from trajectories
        batch = memory.sample()
        batch_size = len(memory)

        rewards = torch.Tensor(batch.reward)
        values = torch.Tensor(batch.value)
        masks = torch.Tensor(batch.mask)
        actions = torch.Tensor(batch.action)
        observations = torch.Tensor(batch.observation)
        oldlogproba = torch.Tensor(batch.logproba)

        returns = torch.Tensor(batch_size)
        deltas = torch.Tensor(batch_size)
        advantages = torch.Tensor(batch_size)

        prev_return = 0
        prev_value = 0
        prev_advantage = 0
        for i in reversed(range(batch_size)):
            returns[i] = rewards[i] + args.gamma * prev_return * masks[i]
            deltas[i] = rewards[i] + args.gamma * prev_value * masks[i] - values[i]
            # ref: https://arxiv.org/pdf/1506.02438.pdf (generalized advantage estimation)
            advantages[i] = deltas[i] + args.gamma * args.lamda * prev_advantage * masks[i]

            prev_return = returns[i]
            prev_value = values[i]
            prev_advantage = advantages[i]
        if args.advantage_norm:
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             args.EPS)

        observations = observations.to(args.device)
        actions = actions.to(args.device)
        oldlogproba = oldlogproba.to(args.device)
        advantages = advantages.to(args.device)
        returns = returns.to(args.device)

        for i_epoch in range(
                int(args.num_epoch * batch_size / args.minibatch_size)):
            # sample from current batch
            minibatch_ind = np.random.choice(batch_size,
                                             args.minibatch_size,
                                             replace=False)
            minibatch_observations = observations[minibatch_ind]
            minibatch_actions = actions[minibatch_ind]
            minibatch_oldlogproba = oldlogproba[minibatch_ind]
            minibatch_newlogproba, entropy = network.get_logproba(
                minibatch_observations, minibatch_actions)
            minibatch_advantages = advantages[minibatch_ind]
            minibatch_returns = returns[minibatch_ind]
            minibatch_newvalues = network._forward_critic(
                minibatch_observations).flatten()

            assert minibatch_oldlogproba.shape == minibatch_newlogproba.shape
            ratio = torch.exp(minibatch_newlogproba - minibatch_oldlogproba)
            assert ratio.shape == minibatch_advantages.shape
            surr1 = ratio * minibatch_advantages
            surr2 = ratio.clamp(1 - clip_now,
                                1 + clip_now) * minibatch_advantages
            loss_surr = -torch.mean(torch.min(surr1, surr2))

            # not sure the value loss should be clipped as well
            # clip example: https://github.com/Jiankai-Sun/Proximal-Policy-Optimization-in-Pytorch/blob/master/ppo.py
            # however, it does not make sense to clip score-like value by a dimensionless clipping parameter
            # moreover, original paper does not mention clipped value
            if args.lossvalue_norm:
                minibatch_return_6std = 6 * minibatch_returns.std()
                loss_value = torch.mean(
                    (minibatch_newvalues -
                     minibatch_returns).pow(2)) / minibatch_return_6std
            else:
                loss_value = torch.mean(
                    (minibatch_newvalues - minibatch_returns).pow(2))

            # loss_entropy = torch.mean(torch.exp(minibatch_newlogproba) * minibatch_newlogproba)
            loss_entropy = -torch.mean(entropy)

            total_loss = loss_surr + args.loss_coeff_value * loss_value + args.loss_coeff_entropy * loss_entropy
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

        if args.schedule_clip == 'linear':
            ep_ratio = 1 - (i_episode / args.num_episode)
            clip_now = args.clip * ep_ratio

        if args.schedule_adam == 'linear':
            ep_ratio = 1 - (i_episode / args.num_episode)
            lr_now = args.lr * ep_ratio
            # set learning rate
            # ref: https://stackoverflow.com/questions/48324152/
            for g in optimizer.param_groups:
                g['lr'] = lr_now

        if i_episode % args.log_num_episode == 0:
            mean_reward = (torch.sum(rewards) / memory.num_episode).data
            mean_step = len(memory) // memory.num_episode
            print('Finished episode: {} | Reward: {:.4f} | total_loss = {:.4f} = {:.4f} + {} * {:.4f} + {} * {:.4f}' \
                  .format(i_episode, mean_reward, total_loss.cpu().data, loss_surr.cpu().data,
                          args.loss_coeff_value, loss_value.cpu().data, args.loss_coeff_entropy, loss_entropy.cpu().data), end=' | ')
            print('Step: {:d}'.format(mean_step))
            writer.add_scalar('reward', mean_reward, i_episode)
            writer.add_scalar('total_loss', total_loss.cpu().data, i_episode)
            torch.save(network.state_dict(),
                       model_dir + 'network_{}.pth'.format(i_episode))

    sampler.close()
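The minibatch update above implements PPO's clipped surrogate objective (surr1, surr2 and loss_surr). As a standalone, hedged sketch of that same objective, separate from the project's code, the function name ppo_clipped_loss and the default clip_eps=0.2 below are illustrative only:

import torch


def ppo_clipped_loss(new_logprob, old_logprob, advantages, clip_eps=0.2):
    # Probability ratio pi_new(a|s) / pi_old(a|s), computed in log space.
    ratio = torch.exp(new_logprob - old_logprob)
    surr1 = ratio * advantages                                    # unclipped surrogate
    surr2 = ratio.clamp(1 - clip_eps, 1 + clip_eps) * advantages  # clipped surrogate
    # Take the pessimistic (minimum) surrogate and negate it for gradient descent.
    return -torch.min(surr1, surr2).mean()


# Toy usage with random tensors of matching shape:
new_lp, old_lp, adv = torch.randn(32), torch.randn(32), torch.randn(32)
print(ppo_clipped_loss(new_lp, old_lp, adv))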
Example No. 17
0
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.lstm_size)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    #actions=deque(maxlen=100)
    episode_length = 0

    currentPath = os.getcwd()
    File = open(currentPath + '/record.txt', 'a+')
    print("\n\n\n\n------------------------------\n\n\n\n\n")
    File.write("\n\n\n\n------------------------------\n\n\n\n\n")
    File.close()

    cnt = 0
    episode_number = 0

    while True:
        env.render()
        cnt = cnt + 1
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            hx = Variable(torch.zeros(1, args.lstm_size), volatile=True)
            cx = Variable(torch.zeros(1, args.lstm_size), volatile=True)
        else:
            hx = Variable(hx.data, volatile=True)
            cx = Variable(cx.data, volatile=True)

        #print(state)
        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0),
                                                 volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        #action=prob.max(1)[1].data.numpy()
        action = prob.multinomial().data

        #if(args.env_name=='Breakout-v3'):
        #    state,reward,done,_=env.step(1)
        #     reward_sum+=reward
        #state,reward,done,_ =env.step(action[0,0])
        state, reward, done, _ = env.step(action.numpy())
        done = done  #or episode_length >= args.max_episode_length
        if episode_length >= args.max_episode_length:
            done = True
            reward_sum -= 30
        reward_sum += reward

        #actions.append(action[0,0])
        #if actions.count(actions[0])==actions.maxlen:
        #    done=True
        #if reward!=0:
        #  print("ep %d : game finished,reward: %d " %(episode_number,reward))+('' if reward == #-1 else ' !!!!!!!!')

        if done:
            hour = int(
                time.strftime("%H", time.gmtime(time.time() - start_time)))
            _min = int(
                time.strftime("%M", time.gmtime(time.time() - start_time)))

            print("Time {},episode reward {}, episode length {} ".format(
                hour * 60 + _min + args.starttime, reward_sum, episode_length))

            File = open(currentPath + '/record.txt', 'a+')
            File.write(
                "Time {},episode reward {}, episode length {} \n".format(
                    hour * 60 + _min + args.starttime, reward_sum,
                    episode_length))
            File.close()

            reward_sum = 0
            episode_length = 0
            #actions.clear()
            state = env.reset()

            torch.save(model.state_dict(), currentPath + '/A3C.t7')
            episode_number += 1
            time.sleep(60)

        state = torch.from_numpy(state)
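The snippet above targets the pre-0.4 PyTorch API (Variable with volatile=True, prob.multinomial() with no num_samples argument). Purely as a hedged, self-contained sketch, the same softmax action selection under current PyTorch looks roughly like this; the toy logits and action count are made up for illustration:

import torch
import torch.nn.functional as F

logit = torch.randn(1, 6)                      # e.g. 6 discrete actions
prob = F.softmax(logit, dim=-1)
with torch.no_grad():                          # replaces volatile=True
    sampled = prob.multinomial(num_samples=1)  # stochastic action, shape (1, 1)
    greedy = prob.max(1, keepdim=True)[1]      # deterministic action
print(sampled.item(), greedy.item())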
Example No. 18
0
    if args.checkpoint_path and os.path.isfile(args.checkpoint_path):
        checkpoint = torch.load(args.checkpoint_path)
        counter.value = checkpoint['episodes']
        shared_model.load_state_dict(checkpoint['model'])
        shared_model.share_memory()
        optimizer.load_state_dict(checkpoint['optimizer'])
        optimizer.share_memory()
    else:
        checkpoint = {}

    processes = []

    logging = build_logger(
        lambda: dict(episodes=counter.value,
                     model=shared_model.state_dict(),
                     optimizer=optimizer.state_dict()), checkpoint, args.run,
        args.visdom_port)

    p = mp.Process(target=test,
                   args=(args.num_processes, args, shared_model,
                         (counter, steps,
                          args.max_test_episodes), logging, kill))
    p.start()
    processes.append(p)

    for rank in range(0, args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, (counter, steps), lock,
                             optimizer, logging, kill))
        p.start()
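The fragment above restores a checkpoint dict with 'episodes', 'model' and 'optimizer' keys, but the matching save step is not shown. A hypothetical counterpart, assuming the same keys and a torch.save-based format (path and function name are assumptions, not the project's code):

import torch


def save_checkpoint(path, episodes, shared_model, optimizer):
    # Mirror of the keys the loader above expects.
    torch.save({'episodes': episodes,
                'model': shared_model.state_dict(),
                'optimizer': optimizer.state_dict()}, path)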
Example No. 19
0
def train(training_scene,
          train_object,
          rank,
          shared_model,
          scheduler,
          counter,
          lock,
          config,
          arguments=dict(),
          optimizer=None):
    torch.manual_seed(arguments['seed'] + rank)
    # To prevent out of memory
    if (arguments['train_cnn'] and rank < 10):
        arguments.update({"gpu_ids": [-1]})

    gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])]

    if gpu_id >= 0:
        torch.cuda.manual_seed(arguments['seed'] + rank)

    if optimizer is None:
        optimizer = optim.RMSprop(shared_model.parameters(),
                                  lr=arguments['lr'],
                                  alpha=0.99,
                                  eps=0.1)

    env = AI2ThorDumpEnv(training_scene,
                         train_object,
                         config,
                         arguments,
                         seed=arguments['seed'] + rank)

    state, score, target = env.reset()
    starting = env.current_state_id
    done = True
    print("Done initalizing process {}. Now find {} in {}! Use gpu: {}".format(
        rank, env.target, env.scene, 'yes' if gpu_id >= 0 else 'no'))

    model = ActorCritic(config, arguments, gpu_id)
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            model = model.cuda()
            dtype = torch.cuda.FloatTensor
    else:
        dtype = torch.FloatTensor

    model.train()

    # monitoring
    total_reward_for_num_steps_list = []
    redundancies = []
    success = []
    avg_entropies = []
    learning_rates = []
    dist_to_goal = []

    start = time.time()
    episode_length = 0

    for epoch in range(arguments['num_epochs']):
        # Sync with the shared model
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                model.load_state_dict(shared_model.state_dict())
        else:
            model.load_state_dict(shared_model.state_dict())

        if arguments['lstm']:
            if done:
                cx = torch.zeros(1, 512).type(dtype)
                hx = torch.zeros(1, 512).type(dtype)
            else:
                cx = cx.detach()
                hx = hx.detach()

        if scheduler is not None:
            scheduler.step()
            learning_rates.append(optimizer.param_groups[0]['lr'])

        values = []
        log_probs = []
        rewards = []
        entropies = []
        starting = env.current_state_id

        dist_to_goal.append(
            min([env.shortest[starting][t] for t in env.target_ids]))

        for step in range(arguments['num_iters']):
            episode_length += 1
            if arguments['lstm']:
                value, logit, (hx, cx) = model((state, (hx, cx)), score,
                                               target)
            else:
                value, logit = model(state, score, target)

            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            action_int = action.cpu().numpy()[0][0].item()
            state, score, reward, done = env.step(action_int)

            if done:
                success.append(1)
            elif episode_length >= arguments['max_episode_length']:
                success.append(0)

            done = done or episode_length >= arguments['max_episode_length']

            with lock:
                counter.value += 1

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            ending = env.current_state_id
            if done:
                state, score, target = env.reset()

                print('[P-{}] Epoch: {}. Episode length: {}. Total reward: {:.3f}. Time elapsed: {:.3f}'\
                        .format(rank, epoch + 1, episode_length, sum(rewards), (time.time() - start) / 3600))

                episode_length = 0
                break

        if not done:
            success.append(0)

        # No interaction with environment below.
        # Monitoring
        total_reward_for_num_steps_list.append(sum(rewards))
        redundancies.append(step + 1 - env.shortest[ending, starting])
        avg_entropies.append(torch.tensor(entropies).numpy().mean())

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        if not done:  # to change last reward to predicted value to ....
            if arguments['lstm']:
                value, _, (hx, cx) = model((state, (hx, cx)), score, target)
            else:
                value, _ = model(state, score, target)

            R = value.detach()

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        values.append(R)

        policy_loss = 0
        value_loss = 0

        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()

        for i in reversed(range(len(rewards))):

            R = arguments['gamma'] * R + rewards[i]

            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            if arguments['use_gae']:
                # Generalized Advantage Estimation
                delta_t = rewards[i] + arguments['gamma'] * values[
                    i + 1] - values[i]
                gae = gae * arguments['gamma'] * arguments['tau'] + delta_t

            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                          arguments['ec'] * entropies[i]

        optimizer.zero_grad()

        (policy_loss + arguments['vc'] * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       arguments['max_grad_norm'])

        ensure_shared_grads(model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()

        if (epoch + 1) % 1000 == 0 and np.mean(success[-500:]) >= 0.8 and \
            not os.path.isfile("training-history/{}/net_good.pth".format(arguments['about'])):
            torch.save(
                model.state_dict(),
                "training-history/{}/net_good.pth".format(arguments['about']))

        if (epoch + 1) % 2000 == 0:
            with open(
                    'training-history/{}/{}_{}_{}.pkl'.format(
                        arguments['about'], training_scene, train_object,
                        rank), 'wb') as f:
                pickle.dump(
                    {
                        "rewards": total_reward_for_num_steps_list,
                        "dist_to_goal": dist_to_goal,
                        "success_rate": success,
                        'redundancies': redundancies,
                        "entropies": avg_entropies,
                        'lrs': learning_rates
                    }, f, pickle.HIGHEST_PROTOCOL)

    torch.save(
        model.state_dict(),
        "training-history/{}/net_{}.pth".format(arguments['about'],
                                                train_object))
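The training loop above calls ensure_shared_grads(model, shared_model, gpu=...), which is not defined in this snippet. In many A3C implementations the helper simply copies the worker's gradients onto the shared model before optimizer.step(); a hedged sketch of that common pattern follows (the real project's version may differ):

def ensure_shared_grads(model, shared_model, gpu=False):
    # Copy local gradients to the shared model so the shared optimizer can apply them.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            return  # on CPU, keep gradients another worker already wrote this step
        if gpu:
            shared_param._grad = param.grad.cpu()  # move grads off the GPU worker
        else:
            shared_param._grad = param.grad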
Example No. 20
0
    def __init__(self, model: ActorCritic, shared_model: ActorCritic):
        self.model = model
        self.shared_model = shared_model

        self.model.load_state_dict(shared_model.state_dict())
Example No. 21
0
def test(rank, args, T, shared_model):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.eval()

    save_dir = os.path.join('results', args.name)

    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    l = str(len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    # stores step, reward, avg_steps and time
    results_dict = {'t': [], 'reward': [], 'avg_steps': [], 'time': []}

    while T.value() <= args.T_max:
        if can_test:
            t_start = T.value()  # Reset counter

            # Evaluate over several episodes and average results
            avg_rewards, avg_episode_lengths = [], []
            for _ in range(args.evaluation_episodes):
                while True:
                    # Reset or pass on hidden state
                    if done:
                        # Sync with shared model every episode
                        model.load_state_dict(shared_model.state_dict())
                        hx = torch.zeros(1, args.hidden_size)
                        cx = torch.zeros(1, args.hidden_size)
                        # Reset environment and done flag
                        state = state_to_tensor(env.reset())
                        done, episode_length = False, 0
                        reward_sum = 0

                    # Optionally render validation states
                    if args.render:
                        env.render()

                    # Calculate policy
                    with torch.no_grad():
                        policy, _, _, (hx, cx), _ = model(state, (hx, cx))

                    # Choose action greedily
                    action = policy.max(1)[1][0]

                    # Step
                    state, reward, done, _ = env.step(action.item())
                    state = state_to_tensor(state)
                    reward_sum += reward
                    done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                    episode_length += 1  # Increase episode counter

                    # Log and reset statistics at the end of every episode
                    if done:
                        avg_rewards.append(reward_sum)
                        avg_episode_lengths.append(episode_length)
                        break
            print(('[{}] Step: {:<' + l +
                   '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                       datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3],
                       t_start,
                       sum(avg_rewards) / args.evaluation_episodes,
                       sum(avg_episode_lengths) / args.evaluation_episodes))
            fields = [
                t_start,
                sum(avg_rewards) / args.evaluation_episodes,
                sum(avg_episode_lengths) / args.evaluation_episodes,
                str(datetime.now())
            ]

            # storing data in the dictionary.
            results_dict['t'].append(t_start)
            results_dict['reward'].append(
                sum(avg_rewards) / args.evaluation_episodes)
            results_dict['avg_steps'].append(
                sum(avg_episode_lengths) / args.evaluation_episodes)
            results_dict['time'].append(str(datetime.now()))

            # Dumping the results in pickle format
            with open(os.path.join(save_dir, 'results.pck'), 'wb') as f:
                pickle.dump(results_dict, f)

            # Saving the data in csv format
            with open(os.path.join(save_dir, 'results.csv'), 'a') as f:
                writer = csv.writer(f)
                writer.writerow(fields)

            if args.evaluate:
                return

            rewards.append(avg_rewards)  # Keep all evaluations
            steps.append(t_start)
            plot_line(steps, rewards, save_dir)  # Plot rewards
            torch.save(model.state_dict(),
                       os.path.join(save_dir,
                                    'model.pth'))  # Save model params
            #   torch.save(model.state_dict(), os.path.join(save_dir, 'model_{}.pth'.format(t_start)))  # Save model params
            can_test = False  # Finish testing
        else:
            if T.value() - t_start >= args.evaluation_interval:
                can_test = True

        time.sleep(0.001)  # Check if available to test every millisecond

    # Dumping the results in pickle format
    with open(os.path.join(save_dir, 'results.pck'), 'wb') as f:
        pickle.dump(results_dict, f)

    env.close()
Example No. 22
0
def test(rank, args, shared_model, shared_curiosity, counter, pids, optimizer,
         train_policy_losses, train_value_losses, train_rewards):
    models_dir = os.path.join(args.sum_base_dir, 'models')
    if not os.path.exists(models_dir):
        logging.info("Created models dir")
        os.makedirs(models_dir)

    recordings_dir = os.path.join(args.sum_base_dir, 'recordings')
    if (not os.path.exists(recordings_dir)) and (args.game == 'doom'):
        logging.info("Created recordings dir")
        os.makedirs(recordings_dir)

    videos_dir = os.path.join(args.sum_base_dir, 'videos')
    if (not os.path.exists(videos_dir)) and (args.game == 'atari'):
        logging.info("Created videos dir")
        os.makedirs(videos_dir)

    torch.manual_seed(args.seed + rank)

    if args.game == 'doom':
        env = create_doom_env(args.env_name,
                              rank,
                              num_skip=args.num_skip,
                              num_stack=args.num_stack)
        env.set_recordings_dir(recordings_dir)
        logging.info("Set recordings dir")
        env.seed(args.seed + rank)
    elif args.game == 'atari':
        env_to_wrap = create_atari_env(args.env_name)
        env_to_wrap.seed(args.seed + rank)
        env = env_to_wrap
    elif args.game == 'picolmaze':
        env_to_wrap = create_picolmaze_env(args.num_rooms)
        env_to_wrap.seed(args.seed + rank)
        env = env_to_wrap

    env.step(0)

    model = ActorCritic(
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)
    curiosity = IntrinsicCuriosityModule(  # ICM
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)

    model.eval()
    curiosity.eval()  # ICM

    external_reward_sum = 0
    curiosity_reward_sum = 0  # ICM
    curiosity_reward_sum_clipped = 0  # ICM
    inv_loss = torch.tensor(0.0)  # ICM
    forw_loss = torch.tensor(0.0)  # ICM
    curiosity_loss = 0  # ICM
    done = True

    count_done = 0

    start_time = time.time()

    passed_time = 0
    current_counter = 0

    # a quick hack to prevent the agent from getting stuck
    # actions = deque(maxlen=100)
    actions = deque(maxlen=args.max_episode_length_test)
    episode_length = 0
    while True:
        episode_length += 1

        if done:
            passed_time = time.time() - start_time
            current_counter = counter.value

            # Sync with the shared model
            model.load_state_dict(shared_model.state_dict())
            curiosity.load_state_dict(shared_curiosity.state_dict())  # ICM
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)

            if count_done % args.save_video_again_eps == 0:
                if args.game == 'atari':
                    video_dir = os.path.join(
                        videos_dir,
                        'video_' + time.strftime('%Y.%m.%d-%H.%M.%S_') +
                        str(current_counter))
                    if not os.path.exists(video_dir):
                        os.makedirs(video_dir)
                    logging.info("Created new video dir")
                    env = wrappers.Monitor(env_to_wrap, video_dir, force=False)
                    logging.info("Created new wrapper")
                elif args.game == 'doom':
                    env.set_current_counter(current_counter)
                    env.set_record()
                    logging.info("Set new recording")

            state = env.reset()
            state = torch.from_numpy(state)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            value, logit, (hx, cx) = model(state.unsqueeze(0), hx, cx)
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].flatten().detach()

        state_old = state  # ICM

        state, external_reward, done, _ = env.step(action)
        state = torch.from_numpy(state)

        # external reward = 0 if ICM-only mode
        # external_reward = external_reward * (1 - args.icm_only)
        external_reward_sum += external_reward

        # <---ICM---
        inv_out, forw_out, curiosity_reward = \
            curiosity(
                state_old.unsqueeze(0), action,
                state.unsqueeze(0))
        # In noreward-rl:
        # self.invloss = tf.reduce_mean(
        #     tf.nn.sparse_softmax_cross_entropy_with_logits(logits, aindex),
        #     name="invloss")
        # self.forwardloss = 0.5 * tf.reduce_mean(tf.square(tf.subtract(f, phi2)), name='forwardloss')
        # self.forwardloss = self.forwardloss * 288.0 # lenFeatures=288. Factored out to make hyperparams not depend on it.
        current_inv_loss = F.nll_loss(F.log_softmax(inv_out, dim=-1), action)
        current_forw_loss = curiosity_reward
        inv_loss += current_inv_loss
        forw_loss += current_forw_loss

        curiosity_reward = args.eta * curiosity_reward
        curiosity_reward_sum += curiosity_reward.detach()
        curiosity_reward_sum_clipped += \
            max(min(curiosity_reward.detach(), args.clip), -args.clip)
        # ---ICM--->

        done = done or episode_length >= args.max_episode_length

        # a quick hack to prevent the agent from getting stuck
        actions.append(action)
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            # <---ICM---
            inv_loss = inv_loss / episode_length
            forw_loss = forw_loss * (32 * 3 * 3) * 0.5 / episode_length

            curiosity_loss = args.lambda_1 * (
                (1 - args.beta) * inv_loss + args.beta * forw_loss)
            # ---ICM--->

            train_policy_loss_mean = sum(train_policy_losses) / \
                len(train_policy_losses)
            train_value_loss_mean = sum(train_value_losses) / \
                len(train_value_losses)
            train_rewards_mean = sum(train_rewards) / \
                len(train_rewards)
            logging.info(
                "\n\nEp {:3d}: time {}, num steps {}, FPS {:.0f}, len {},\n"
                "        total R {:.6f}, train policy loss {:.6f}, train value loss {:.6f},\n"
                "        train mean R {:.6f}, curiosity R {:.3f}, curiosity R clipped {:.3f},\n"
                "        inv loss {:.3f}, forw loss {:.3f}, curiosity loss {:.3f}.\n"
                "".format(
                    count_done,
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(passed_time)), current_counter,
                    current_counter / passed_time, episode_length,
                    external_reward_sum, train_policy_loss_mean,
                    train_value_loss_mean, train_rewards_mean,
                    curiosity_reward_sum, curiosity_reward_sum_clipped,
                    inv_loss, forw_loss, curiosity_loss))

            if ((count_done % args.save_model_again_eps == 0)
                    and (optimizer is not None)):
                torch.save(
                    model.state_dict(), models_dir + '/model_' +
                    time.strftime('%Y.%m.%d-%H.%M.%S') +
                    f'_{current_counter}.pth')
                torch.save(
                    curiosity.state_dict(), models_dir + '/curiosity_' +
                    time.strftime('%Y.%m.%d-%H.%M.%S') +
                    f'_{current_counter}.pth')
                torch.save(
                    optimizer.state_dict(), models_dir + '/optimizer_' +
                    time.strftime('%Y.%m.%d-%H.%M.%S') +
                    f'_{current_counter}.pth')
                logging.info("Saved the model")

            tb.log_value('steps_second', current_counter / passed_time,
                         current_counter)
            tb.log_value('reward', external_reward_sum, current_counter)
            tb.log_value('reward_icm', curiosity_reward_sum, current_counter)
            tb.log_value('reward_icm_clipped', curiosity_reward_sum_clipped,
                         current_counter)
            tb.log_value('loss_inv', inv_loss, current_counter)
            tb.log_value('loss_forw', forw_loss, current_counter)
            tb.log_value('loss_curiosity', curiosity_loss, current_counter)
            tb.log_value('loss_train_policy_mean', train_policy_loss_mean,
                         current_counter)
            tb.log_value('loss_train_value_mean', train_value_loss_mean,
                         current_counter)
            tb.log_value('reward_train_mean', train_rewards_mean,
                         current_counter)

            if args.game == 'atari':
                env.close()  # Close the window after the rendering session
                env_to_wrap.close()
            logging.info("Episode done, close all")

            episode_length = 0
            external_reward_sum = 0
            curiosity_reward_sum = 0  # ICM
            curiosity_reward_sum_clipped = 0  # ICM
            inv_loss = torch.tensor(0.0)  # ICM
            forw_loss = torch.tensor(0.0)  # ICM
            curiosity_loss = 0  # ICM
            actions.clear()

            if count_done >= args.max_episodes:
                for pid in pids:
                    os.kill(pid, signal.SIGTERM)
                env.close()
                os.kill(os.getpid(), signal.SIGKILL)

            count_done += 1
            time.sleep(args.time_sleep)
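The comments above quote noreward-rl's inverse and forward losses, but the IntrinsicCuriosityModule itself is not shown. As an illustrative sketch only (the feature size, eta default and toy forward model are assumptions, not the project's implementation), an ICM-style curiosity reward is the forward model's prediction error in feature space:

import torch
import torch.nn.functional as F


def intrinsic_reward(phi_state, phi_next_state, forward_model, action_onehot, eta=0.01):
    # Predict phi(s_{t+1}) from phi(s_t) and the chosen action.
    pred_phi_next = forward_model(torch.cat([phi_state, action_onehot], dim=1))
    # Curiosity bonus: eta/2 * || phi(s_{t+1}) - prediction ||^2 per sample.
    return eta * 0.5 * (pred_phi_next - phi_next_state).pow(2).sum(dim=1)


# Toy usage with 288-d features and 4 actions (sizes chosen only for the example):
forward_model = torch.nn.Linear(288 + 4, 288)
phi_s, phi_s1 = torch.randn(1, 288), torch.randn(1, 288)
action = F.one_hot(torch.tensor([2]), num_classes=4).float()
print(intrinsic_reward(phi_s, phi_s1, forward_model, action))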
Example No. 23
0
        # actions.append(action)
        
        # state = next_state
        # frame_idx += 1

    # next_state = torch.FloatTensor(next_state).to(device)
    # _, next_value = model(next_state)
    # use last value
    final_value = 40 if won else -40
    returns = compute_gae(final_value, rewards, masks, values)
    
    returns   = torch.stack(returns).detach()
    log_probs = torch.stack(log_probs).detach()
    values    = torch.stack(values).detach()
    states    = torch.stack(states)
    actions   = torch.stack(actions)
    advantage = returns  # - values
    print("log probs {} values {} returns {} advantage {}".format(
        log_probs.size(), values.size(), returns.size(), advantage.size()))
    
    ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)

    torch.save(model.state_dict(), "run/weights")

    if game_idx % 1 == 0:
        print("Completed game {}/{}, total reward = {}".format(game_idx + 1, N_GAMES, total_reward))
        # test_reward = np.mean([test_env() for _ in range(10)])
        # test_rewards.append(test_reward)
        # plot(frame_idx, test_rewards)
        # if test_reward > threshold_reward: early_stop = True
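The fragment calls compute_gae(final_value, rewards, masks, values), which is not included here. A hypothetical version following the common generalized advantage estimation recursion (the gamma/tau defaults are assumptions, not the project's settings):

def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    # Bootstrap from the value assigned to the state after the last step.
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) * mask - V(s_t)
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        # Discounted, lambda-weighted sum of residuals.
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns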
            
Example No. 24
0
def run_acer(variant):
    # BLAS setup
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'

    # Setup
    # args = parser.parse_args()
    # Creating directories.
    save_dir = os.path.join('results', 'results')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    print(' ' * 26 + 'Options')
    """
  # Saving parameters
  with open(os.path.join(save_dir, 'params.txt'), 'w') as f:
    for k, v in vars(args).items():
      print(' ' * 26 + k + ': ' + str(v))
      f.write(k + ' : ' + str(v) + '\n')
  """
    # args.env = 'CartPole-v1'  # TODO: Remove hardcoded environment when code is more adaptable
    # mp.set_start_method(platform.python_version()[0] == '3' and 'spawn' or 'fork')  # Force true spawning (not forking) if available
    torch.manual_seed(variant['seed'])
    T = Counter()  # Global shared counter
    # gym.logger.set_level(gym.logger.ERROR)  # Disable Gym warnings

    # Create shared network
    env = gym.make(variant['env'])
    shared_model = ActorCritic(env.observation_space, env.action_space,
                               variant['hidden_size'])
    shared_model.share_memory()
    """
  if args.model and os.path.isfile(args.model):
    # Load pretrained weights
    shared_model.load_state_dict(torch.load(args.model))
  """
    # Create average network
    shared_average_model = ActorCritic(env.observation_space, env.action_space,
                                       variant['hidden_size'])
    shared_average_model.load_state_dict(shared_model.state_dict())
    shared_average_model.share_memory()
    for param in shared_average_model.parameters():
        param.requires_grad = False
    # Create optimiser for shared network parameters with shared statistics
    optimiser = SharedRMSprop(shared_model.parameters(),
                              lr=variant['lr'],
                              alpha=0.99)
    optimiser.share_memory()
    env.close()

    fields = ['t', 'rewards', 'avg_steps', 'time']
    with open(os.path.join(save_dir, 'test_results.csv'), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(fields)
    # Start validation agent
    processes = []
    p = mp.Process(target=test, args=(0, variant, T, shared_model))
    p.start()
    processes.append(p)

    if not variant['evaluate']:
        # Start training agents
        for rank in range(1, variant['num-processes'] + 1):
            p = mp.Process(target=train,
                           args=(rank, variant, T, shared_model,
                                 shared_average_model, optimiser))
            p.start()
            print('Process ' + str(rank) + ' started')
            processes.append(p)

    # Clean up
    for p in processes:
        p.join()
Example No. 25
0
File: test.py Project: Luo1996/ACER
def test(rank, args, T, shared_model):
  torch.manual_seed(args.seed + rank)

  env = gym.make(args.env)
  env.seed(args.seed + rank)
  model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
  model.eval()

  can_test = True  # Test flag
  t_start = 1  # Test step counter to check against global counter
  rewards, steps = [], []  # Rewards and steps for plotting
  l = str(len(str(args.T_max)))  # Max num. of digits for logging steps
  done = True  # Start new episode

  while T.value() <= args.T_max:
    if can_test:
      t_start = T.value()  # Reset counter

      # Evaluate over several episodes and average results
      avg_rewards, avg_episode_lengths = [], []
      for _ in range(args.evaluation_episodes):
        while True:
          # Reset or pass on hidden state
          if done:
            # Sync with shared model every episode
            model.load_state_dict(shared_model.state_dict())
            hx = Variable(torch.zeros(1, args.hidden_size), volatile=True)
            cx = Variable(torch.zeros(1, args.hidden_size), volatile=True)
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            done, episode_length = False, 0
            reward_sum = 0

          # Optionally render validation states
          if args.render:
            env.render()

          # Calculate policy
          policy, _, _, (hx, cx) = model(Variable(state, volatile=True), (hx.detach(), cx.detach()))  # Break graph for memory efficiency

          # Choose action greedily
          action = policy.max(1)[1].data[0, 0]

          # Step
          state, reward, done, _ = env.step(action)
          state = state_to_tensor(state)
          reward_sum += reward
          done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
          episode_length += 1  # Increase episode counter

          # Log and reset statistics at the end of every episode
          if done:
            avg_rewards.append(reward_sum)
            avg_episode_lengths.append(episode_length)
            break

      print(('[{}] Step: {:<' + l + '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
            datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3],
            t_start,
            sum(avg_rewards) / args.evaluation_episodes,
            sum(avg_episode_lengths) / args.evaluation_episodes))

      if args.evaluate:
        return

      rewards.append(avg_rewards)  # Keep all evaluations
      steps.append(t_start)
      plot_line(steps, rewards)  # Plot rewards
      torch.save(model.state_dict(), 'model.pth')  # Save model params
      can_test = False  # Finish testing
    else:
      if T.value() - t_start >= args.evaluation_interval:
        can_test = True

    time.sleep(0.001)  # Check if available to test every millisecond

  env.close()
Example No. 26
0
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)
    env = env_wrapper.create_doom(args.record, outdir=args.outdir)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=2100)
    episode_length = 0
    result = []

    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)), icm=False)

        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()

        state, reward, done, _ = env.step(action[0, 0])
        state = torch.from_numpy(state)

        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            end_time = time.time()
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(end_time - start_time)), reward_sum,
                episode_length))
            result.append((reward_sum, end_time - start_time))
            f = open('output/result.pickle', 'wb')  # pickle requires a binary file handle
            pickle.dump(result, f)
            f.close()
            torch.save(model.state_dict(), 'output/{}.pth'.format(
                (end_time - start_time)))

            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            state = torch.from_numpy(state)
            time.sleep(60)
Example No. 27
0
        returns = torch.cat(returns).detach()
        log_probs = torch.cat(log_probs).detach()
        values = torch.cat(values).detach()
        states = torch.cat(states)
        actions = torch.cat(actions)
        advantage = returns - values
        advantage = normalize(advantage)

        ppo_update(frame_idx, states, actions, log_probs, returns, advantage)
        train_epoch += 1

        if train_epoch % TEST_EPOCHS == 0:
            test_reward = np.mean([
                test_env(env, model, device, num_outputs)
                for _ in range(NUM_TESTS)
            ])
            writer.add_scalar("test_rewards", test_reward, frame_idx)
            print('Frame %s. reward: %s' % (frame_idx, test_reward))
            # Save a checkpoint every time we achieve a best reward
            if best_reward is None or best_reward < test_reward:
                if best_reward is not None:
                    print("Best reward updated: %.3f -> %.3f" %
                          (best_reward, test_reward))
                    name = "%s_best_%+.3f_%d.weights" % (
                        "connectx", test_reward, frame_idx)
                    fname = os.path.join('.', 'checkpoints', name)
                    torch.save(model.state_dict(), fname)
                best_reward = test_reward
            if test_reward > TARGET_REWARD: early_stop = True
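normalize(advantage) is used above without its definition; a minimal sketch of the usual zero-mean, unit-variance advantage normalisation (the eps value is an assumption):

def normalize(x, eps=1e-8):
    # Standardise advantages to stabilise the scale of the policy gradient.
    return (x - x.mean()) / (x.std() + eps)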
Example No. 28
0
    args.env = 'CartPole-v1'  # TODO: Remove hardcoded environment when code is more adaptable
    torch.manual_seed(args.seed)
    T = Counter()  # Global shared counter

    # Create shared network
    env = gym.make(args.env)
    shared_model = ActorCritic(env.observation_space, env.action_space,
                               args.hidden_size)
    shared_model.share_memory()
    if args.model and os.path.isfile(args.model):
        # Load pretrained weights
        shared_model.load_state_dict(torch.load(args.model))
    # Create average network
    shared_average_model = ActorCritic(env.observation_space, env.action_space,
                                       args.hidden_size)
    shared_average_model.load_state_dict(shared_model.state_dict())
    shared_average_model.share_memory()
    for param in shared_average_model.parameters():
        param.requires_grad = False
    # Create optimiser for shared network parameters with shared statistics
    optimiser = SharedRMSprop(shared_model.parameters(),
                              lr=args.lr,
                              alpha=args.rmsprop_decay)
    optimiser.share_memory()
    env.close()

    # Start validation agent
    processes = []
    p = mp.Process(target=test, args=(0, args, T, shared_model))
    p.start()
    processes.append(p)
Example No. 29
0
def test(args, shared_model):
    action_map = _set_action_map()

    env = FixedEnvWrap()

    # time.sleep(10)
    model = ActorCritic()
    model.load_state_dict(shared_model.state_dict())
    model.eval()

    state = env.reset()

    training_time = 0
    vis = visdom.Visdom(env='final')
    line_plot = vis.line(Y=np.array([0]),
                         opts=dict(xlabel='testing count',
                                   ylabel='average reward',
                                   title='ali-v1'))

    start = time.time()
    vis_count = 0
    while True:
        video_count = 1
        reward_all_sum = 0
        reward_all = 0
        reward_all_ave = 0
        reward_gop = 0
        action = 3
        last_action = 3
        # update model before testing all trace files
        # time.sleep(5)
        print('load updated model')
        model.load_state_dict(shared_model.state_dict())
        while True:
            # get the reward for one gop
            while True:
                _, done, decision_flag = env.step_gop(action)
                if decision_flag or done:
                    reward_gop = env.get_reward_gop()
                    state = env.get_state_gop()
                    break
                else:
                    continue
            # print('testing')
            # get action from model
            last_action = action
            with torch.no_grad():
                state = torch.FloatTensor(state)
                logit, _ = model(
                    state.view(-1, args.s_gop_info, args.s_gop_len))
                prob = F.softmax(logit, dim=1)
                _, action = torch.max(prob, 1)
                action = action.data.numpy()[0]

            bitrate, target_buffer = action_map[last_action]
            # print('bitrate: %d, target_buffer: %d, reward is %s' % (bitrate, target_buffer, reward_gop))
            if done:
                print("video count %d, reward is %.5f" %
                      (video_count, reward_all))
                # reward_all_sum += reward_all / 100
                reward_all_sum += reward_all
                video_count += 1
                if reward_all < 0:
                    print('bad model ! just break this loop')
                    reward_all_ave = 0
                    break
                if video_count > env.traces_len * 2:
                    reward_all_ave = reward_all_sum / video_count
                    break
                action = 3
                last_action = 3
                reward_all = 0

            reward_all += reward_gop

        # update the figure of average reward of all testing files
        vis_count += 1
        reward_all_ave = max(reward_all_ave, 0)
        vis.line(Y=np.array([reward_all_ave]),
                 X=np.array([vis_count]),
                 win=line_plot,
                 update='append')
        path = 'ali-v1/actor.pt-' + str(vis_count)
        torch.save(model.state_dict(), path)

        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)

        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes),
                                              seconds))
        print("average reward of traces are: ", reward_all_ave)
        print('saved one model in epoch:', vis_count)
Example No. 30
0
def test(rank, args, T, shared_model):
    torch.manual_seed(args.seed + rank)

    env = JacoEnv(args.width,
                  args.height,
                  args.frame_skip,
                  args.rewarding_distance,
                  args.control_magnitude,
                  args.reward_continuous)
    env.seed(args.seed + rank)
    if args.render:
        (_, _, obs_rgb_view2) = env.reset()
        plt.ion()
        f, ax = plt.subplots()
        im = ax.imshow(obs_rgb_view2)

    model = ActorCritic(None, args.non_rgb_state_size, None, args.hidden_size)
    model.eval()
    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    n_digits = str(
        len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    while T.value() <= args.T_max:
        if can_test:
            t_start = T.value()  # Reset counter

            # Evaluate over several episodes and average results
            avg_rewards, avg_episode_lengths = [], []
            for _ in range(args.evaluation_episodes):
                while True:
                    # Reset or pass on hidden state
                    if done:
                        # Sync with shared model every episode
                        model.load_state_dict(shared_model.state_dict())
                        hx = Variable(
                            torch.zeros(1, args.hidden_size), volatile=True)
                        cx = Variable(
                            torch.zeros(1, args.hidden_size), volatile=True)
                        # Reset environment and done flag
                        state = state_to_tensor(env.reset())
                        action, reward, done, episode_length = (0, 0, 0, 0, 0,
                                                                0), 0, False, 0
                        reward_sum = 0

                    # Calculate policy
                    policy, _, (hx, cx) = model(
                        Variable(
                            state[0], volatile=True),
                        Variable(
                            state[1], volatile=True),
                        (hx.detach(),
                         cx.detach()))  # Break graph for memory efficiency

                    # Choose action greedily
                    action = [p.max(1)[1].data[0, 0] for p in policy]

                    # Step
                    state, reward, done = env.step(action)
                    obs_rgb_view1 = state[1]
                    obs_rgb_view2 = state[2]
                    state = state_to_tensor(state)
                    reward_sum += reward
                    done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                    episode_length += 1  # Increase episode counter

                    # Optionally render validation states
                    if args.render:
                        # rendering the first camera view
                        im.set_data(obs_rgb_view1)
                        plt.draw()
                        plt.pause(0.05)

                        # rendering mujoco simulation
                        # viewer = mujoco_py.MjViewer(env.sim)
                        # viewer.render()

                    # Log and reset statistics at the end of every episode
                    if done:
                        avg_rewards.append(reward_sum)
                        avg_episode_lengths.append(episode_length)
                        break

            print(('[{}] Step: {:<' + n_digits +
                   '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                       datetime.utcnow().strftime(
                           '%Y-%m-%d %H:%M:%S,%f')[:-3], t_start,
                       sum(avg_rewards) / args.evaluation_episodes,
                       sum(avg_episode_lengths) / args.evaluation_episodes))

            rewards.append(avg_rewards)  # Keep all evaluations
            steps.append(t_start)
            plot_line(steps, rewards)  # Plot rewards
            torch.save(model.state_dict(),
                       os.path.join('results', str(t_start) +
                                    '_model.pth'))  # Checkpoint model params
            can_test = False  # Finish testing
            if args.evaluate:
                return
        else:
            if T.value() - t_start >= args.evaluation_interval:
                can_test = True

        time.sleep(0.001)  # Check if available to test every millisecond