Example 1
import gym
import torch
from torch.distributions import Categorical


def main():
    # Pre-trained "human" actor-critic, queried whenever the agent picks the extra "ask" action.
    human_model = ActorCritic()
    human_model.load_state_dict(torch.load('ac_para.pkl'))

    env = gym.make('CartPole-v1')
    model = AskActorCritic()    
    print_interval = 20
    score = 0.0

    for n_epi in range(10000):
        done = False
        s = env.reset()
        step,ask_step = 0,0
        while not done:
            for t in range(n_rollout):  # n_rollout: n-step rollout length, assumed defined at module level
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()
                if a == 2:  # extra "ask" action: defer to the human policy
                    prob = human_model.pi(torch.from_numpy(s).float())
                    m = Categorical(prob)
                    a = m.sample().item()
                    model.put_human_data((s, a))
                    ask_step += 1
                
                s_prime, r, done, info = env.step(a)

                model.put_data((s,a,r,s_prime,done))
                
                s = s_prime
                score += r
                step += 1
                if done:
                    break                     
            
            model.train_net()
            
        if n_epi%print_interval==0 and n_epi!=0:
            print("# of episode :{}, avg score : {:.1f}, ask rate : {:.2f}".format(n_epi, score/print_interval, ask_step/step))
            score = 0.0
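
Example 1 depends on `ActorCritic`, `AskActorCritic`, and `n_rollout`, none of which appear in the excerpt. Below is a minimal sketch of what `AskActorCritic` could look like given how it is used above; the layer sizes, learning rate, reward scaling, and the decision to only store the human-labelled pairs are assumptions, not the original code. The pre-trained `ActorCritic` is assumed to expose the same `pi` interface over the two real CartPole actions.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class AskActorCritic(nn.Module):
    """Sketch (assumption): CartPole actor-critic with a third "ask the human" action
    and a minimalRL-style put_data / train_net interface."""
    def __init__(self, state_dim=4, n_actions=3, gamma=0.98, lr=2e-4):
        super().__init__()
        self.gamma = gamma
        self.data, self.human_data = [], []
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc_pi = nn.Linear(128, n_actions)   # actions: 0, 1, and 2 = "ask"
        self.fc_v = nn.Linear(128, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def pi(self, x, softmax_dim=0):
        return F.softmax(self.fc_pi(F.relu(self.fc1(x))), dim=softmax_dim)

    def v(self, x):
        return self.fc_v(F.relu(self.fc1(x)))

    def put_data(self, transition):        # (s, a, r, s', done)
        self.data.append(transition)

    def put_human_data(self, pair):        # (s, human_action); only stored in this sketch
        self.human_data.append(pair)

    def train_net(self):
        if not self.data:
            return
        s, a, r, s_prime, done = zip(*self.data)
        s = torch.tensor(s, dtype=torch.float)
        a = torch.tensor(a).unsqueeze(1)
        r = torch.tensor(r, dtype=torch.float).unsqueeze(1)
        s_prime = torch.tensor(s_prime, dtype=torch.float)
        done_mask = torch.tensor([0.0 if d else 1.0 for d in done]).unsqueeze(1)

        # n-step advantage actor-critic update on the stored rollout
        td_target = r / 100.0 + self.gamma * self.v(s_prime) * done_mask
        delta = td_target - self.v(s)
        pi_a = self.pi(s, softmax_dim=1).gather(1, a)
        loss = -torch.log(pi_a) * delta.detach() + \
            F.smooth_l1_loss(self.v(s), td_target.detach())

        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step()
        self.data = []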
Example 2
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n

    env_model = EnvModel(envs.observation_space.shape, envs.action_space.n, num_pixels, len(mode_rewards["regular"]))
    actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(env_model.parameters())

    env_model = env_model.to(DEVICE)
    actor_critic = actor_critic.to(DEVICE)

    checkpoint = torch.load(os.path.join(ACTOR_CRITIC_PATH, "actor_critic_checkpoint"))
    actor_critic.load_state_dict(checkpoint['actor_critic_state_dict'])


    reward_coef = 0.1
    num_updates = args.epoch

    losses = []
    all_rewards = []

    for frame_idx, states, actions, rewards, next_states, dones in play_games(envs, num_updates, actor_critic):
        states      = torch.FloatTensor(states)
        actions     = torch.LongTensor(actions)

        batch_size = states.size(0)
        
    onehot_actions = torch.zeros(batch_size, num_actions, *state_shape[1:])  # one-hot action planes broadcast over the frame's spatial dims
Example 3
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


class PPO:
    def __init__(self, device, state_dim, action_dim, action_std, lr, betas,
                 gamma, K_epochs, eps_clip):
        self.lr = lr
        self.device = device
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        #self.optimizer = RAdam(self.policy.parameters(), lr=lr, betas=betas)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)

        self.policy_old = ActorCritic(state_dim, action_dim,
                                      action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory):
        if np.any(np.isnan(state)):
            print('in select action: state is nan', state)
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        return self.policy_old.act(state, memory).cpu().data.numpy().flatten()

    def update(self, memory):
        # Monte Carlo estimate of rewards:
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards),
                                       reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert list to tensor
        old_states_ = torch.squeeze(
            torch.stack(memory.states).to(self.device)).detach()
        old_actions_ = torch.squeeze(
            torch.stack(memory.actions).to(self.device)).detach()
        old_logprobs_ = torch.squeeze(torch.stack(memory.logprobs)).to(
            self.device).detach()

        batch_size = old_states_.shape[0]
        mini_batch_size = batch_size // 8  # one eighth of the rollout per minibatch

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values :
            for i in range(batch_size // mini_batch_size):
                rand_ids = np.random.randint(0, batch_size, mini_batch_size)
                old_states = old_states_[rand_ids, :]
                old_actions = old_actions_[rand_ids, :]
                old_logprobs = old_logprobs_[rand_ids, :]
                rewards_batch = rewards[rand_ids]

                logprobs, state_values, dist_entropy = self.policy.evaluate(
                    old_states, old_actions)

                # Finding the ratio (pi_theta / pi_theta__old):
                ratios = torch.exp(logprobs - old_logprobs.detach())

                # Clipped surrogate loss (PPO objective):
                advantages = rewards_batch - state_values.detach()
                advantages = advantages.reshape((-1, 1))
                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                    1 + self.eps_clip) * advantages

                surr = -torch.min(surr1, surr2).mean()
                w_crit_loss = 1
                loss = surr + w_crit_loss * (rewards_batch - state_values).pow(
                    2).mean()  #- 0.01 * dist_entropy

                # take gradient step
                self.optimizer.zero_grad()
                loss.mean().backward()
                self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
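
The PPO class above expects a rollout buffer with `states`, `actions`, `logprobs`, `rewards`, and `is_terminals` lists, which `policy_old.act` is assumed to fill; the excerpt does not show that loop. A minimal driver under those assumptions follows. The `Memory` container, environment name, and hyperparameters are illustrative, and the environment needs a multi-dimensional action space because `update()` slices 2-D action and log-prob tensors.

import gym
import torch

class Memory:
    def __init__(self):
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        self.__init__()

def train(env_name='BipedalWalker-v3', max_episodes=500, update_timestep=4000):
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]   # > 1, so the squeezed tensors stay 2-D
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    ppo = PPO(device, state_dim, action_dim, action_std=0.5, lr=3e-4,
              betas=(0.9, 0.999), gamma=0.99, K_epochs=10, eps_clip=0.2)
    memory = Memory()

    timestep = 0
    for episode in range(max_episodes):
        state = env.reset()
        done = False
        while not done:
            timestep += 1
            action = ppo.select_action(state, memory)   # act() is assumed to log s, a, logprob
            state, reward, done, _ = env.step(action)
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            if timestep % update_timestep == 0:         # one PPO update per rollout
                ppo.update(memory)
                memory.clear()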
Example 4
import torch
import torch.nn as nn


class PPO(nn.Module):
    def __init__(self,
                 state_dim,
                 action_dim,
                 eps=0.2,
                 gamma=0.99,
                 lambda_=0.95,
                 K_epoch=80,
                 batch_size=64):
        super(PPO, self).__init__()
        self.eps = eps
        self.gamma = gamma
        self.lambda_ = lambda_
        self.K_epoch = K_epoch
        self.batch_size = batch_size

        self.model = ActorCritic(state_dim, action_dim)
        self.model_old = ActorCritic(state_dim, action_dim)
        for param in self.model_old.parameters():
            param.requires_grad = False
        self.copy_weights()

    def forward(self, x):
        self.pi, self.v = self.model_old(x)

        return self.pi, self.v

    def copy_weights(self):
        self.model_old.load_state_dict(self.model.state_dict())

    def update(self, buffer, optimizer):
        self.model.train()
        self.model_old.eval()
        self.advantage_fcn(buffer.data)

        batch_loss, batch_clip_loss, batch_vf_loss = [], [], []
        for epoch in range(self.K_epoch):
            for state, action, next_s, reward, log_prob_old, entropy, advantage in buffer.get_data(
                    self.batch_size):
                pi, v = self.model(state)
                log_prob_pi = pi.log_prob(action)

                prob_ratio = torch.exp(log_prob_pi - log_prob_old)

                first_term = prob_ratio * advantage
                second_term = self.clip_by_value(prob_ratio) * advantage
                loss_clip = (torch.min(first_term, second_term)).mean()

                _, v_next = self.model_old(next_s)
                v_target = reward + self.gamma * v_next  # note: no terminal masking; every step bootstraps from v(s')
                loss_vf = ((v - v_target)**2).mean(
                )  # squared error loss: (v(s_t) - v_target)**2

                loss = -(loss_clip - loss_vf
                         )  #-(loss_clip - 0.5*loss_vf + 0.01*entropy.mean())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_loss.append(loss.detach().numpy())
                batch_clip_loss.append(loss_clip.detach().numpy())
                batch_vf_loss.append(loss_vf.detach().numpy())

        self.copy_weights()
        buffer.reset()

    def advantage_fcn(self, buffer, normalize=True):
        _, v_st1 = self.model(torch.stack(buffer['next_s']))
        _, v_s = self.model(torch.stack(buffer['s']))
        deltas = torch.stack(buffer['r']) + self.gamma * v_st1 - v_s

        advantage, temp = [], 0
        idxs = torch.tensor(range(len(deltas) - 1, -1, -1))  #reverse
        reverse_deltas = deltas.index_select(0, idxs)
        for delta_t in reverse_deltas:
            temp = delta_t + self.lambda_ * self.gamma * temp
            advantage.append(temp)

        advantage = torch.as_tensor(advantage[::-1])  #re-reverse
        if normalize:
            advantage = (advantage - advantage.mean()) / advantage.std()

        buffer['advantage'] = advantage.unsqueeze(1)

    def clip_by_value(self, x):
        return x.clamp(1 - self.eps, 1 + self.eps)  # clamp(min, max)
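
For reference, the backward loop in `advantage_fcn` implements the standard GAE(λ) recursion over the TD residuals it first computes:

    \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad
    \hat{A}_t = \delta_t + \gamma \lambda \, \hat{A}_{t+1} = \sum_{l \ge 0} (\gamma \lambda)^l \, \delta_{t+l},

with \hat{A}_T = 0 at the end of the buffer, followed by the optional normalization.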
Example 5
    """
    policy = ActorCritic(state_size=g_state_size,
              action_size=g_action_size,
              shared_layers=[128, 64],
              critic_hidden_layers=[],
              actor_hidden_layers=[],
              init_type='xavier-uniform',
              seed=0).to(g_device)
    saved_model = 'ppo_128x64_a0_c0_470e.pth'
    """
    policy = ActorCritic(state_size=g_state_size,
                         action_size=g_action_size,
                         shared_layers=[128, 128],
                         critic_hidden_layers=[64],
                         actor_hidden_layers=[64],
                         init_type='xavier-uniform',
                         seed=0).to(g_device)
    saved_model = 'ppo_128x128_a64_c64_193e.pth'

    # load the model
    policy.load_state_dict(torch.load(saved_model))

    # evaluate the model
    for e in range(episode):
        rewards = eval_policy(envs=g_env, policy=policy, tmax=1000)
        total_rewards = np.sum(rewards, 0)
        scores_window.append(total_rewards.mean())
        print("Episode: {0:d}, score: {1}".format(e + 1,
                                                  np.mean(scores_window)),
              end="\n")

    g_env.close()
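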
Example 6
import gym
import numpy as np
import torch as T


def worker(name, input_shape, n_actions, global_agent, global_icm,
           optimizer, icm_optimizer, env_id, n_threads, icm=False):
    T_MAX = 20

    local_agent = ActorCritic(input_shape, n_actions)

    if icm:
        local_icm = ICM(input_shape, n_actions)
        algo = 'ICM'
    else:
        intrinsic_reward = T.zeros(1)
        algo = 'A3C'

    memory = Memory()

    env = gym.make(env_id)

    t_steps, max_eps, episode, scores, avg_score = 0, 1000, 0, [], 0

    while episode < max_eps:
        obs = env.reset()
        hx = T.zeros(1, 256)
        score, done, ep_steps = 0, False, 0
        while not done:
            state = T.tensor([obs], dtype=T.float)
            action, value, log_prob, hx = local_agent(state, hx)
            obs_, reward, done, info = env.step(action)
            t_steps += 1
            ep_steps += 1
            score += reward
            reward = 0  # turn off extrinsic rewards
            memory.remember(obs, action, reward, obs_, value, log_prob)
            obs = obs_
            if ep_steps % T_MAX == 0 or done:
                states, actions, rewards, new_states, values, log_probs = \
                        memory.sample_memory()
                if icm:
                    intrinsic_reward, L_I, L_F = \
                            local_icm.calc_loss(states, new_states, actions)

                loss = local_agent.calc_loss(obs, hx, done, rewards, values,
                                             log_probs, intrinsic_reward)

                optimizer.zero_grad()
                hx = hx.detach_()
                if icm:
                    icm_optimizer.zero_grad()
                    (L_I + L_F).backward()

                loss.backward()
                T.nn.utils.clip_grad_norm_(local_agent.parameters(), 40)

                for local_param, global_param in zip(
                                        local_agent.parameters(),
                                        global_agent.parameters()):
                    global_param._grad = local_param.grad
                optimizer.step()
                local_agent.load_state_dict(global_agent.state_dict())

                if icm:
                    for local_param, global_param in zip(
                                            local_icm.parameters(),
                                            global_icm.parameters()):
                        global_param._grad = local_param.grad
                    icm_optimizer.step()
                    local_icm.load_state_dict(global_icm.state_dict())
                memory.clear_memory()

        if name == '1':
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            print('{} episode {} thread {} of {} steps {:.2f}M score {:.2f} '
                  'intrinsic_reward {:.2f} avg score (100) {:.1f}'.format(
                      algo, episode, name, n_threads,
                      t_steps/1e6, score,
                      T.sum(intrinsic_reward),
                      avg_score))
        episode += 1
    if name == '1':
        x = [z for z in range(episode)]
        fname = algo + '_CartPole_no_rewards.png'
        plot_learning_curve(x, scores, fname)
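
The excerpt does not show how `worker` is launched. A hypothetical launcher is sketched below; it assumes the `ActorCritic` and `ICM` constructors shown in the snippet and CartPole-style shapes (`input_shape=[4]`, `n_actions=2`). A full A3C setup usually also keeps the optimizer state in shared memory (a "SharedAdam" variant); a plain Adam over the shared parameters is used here as a simplification, so each process keeps its own moment estimates.

import torch as T
import torch.multiprocessing as mp

if __name__ == '__main__':
    env_id, input_shape, n_actions, n_threads = 'CartPole-v1', [4], 2, 4

    # Global networks whose parameters live in shared memory, visible to all workers.
    global_agent = ActorCritic(input_shape, n_actions)
    global_agent.share_memory()
    global_icm = ICM(input_shape, n_actions)
    global_icm.share_memory()

    optimizer = T.optim.Adam(global_agent.parameters(), lr=1e-4)
    icm_optimizer = T.optim.Adam(global_icm.parameters(), lr=1e-4)

    workers = [mp.Process(target=worker,
                          args=(str(i), input_shape, n_actions, global_agent,
                                global_icm, optimizer, icm_optimizer, env_id,
                                n_threads, True))
               for i in range(1, n_threads + 1)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()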
Example 7
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import autograd
from tqdm import tqdm


def main():
    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n

    env_model = EnvModel(envs.observation_space.shape, num_pixels,
                         len(mode_rewards["regular"]))
    actor_critic = ActorCritic(envs.observation_space.shape,
                               envs.action_space.n)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(env_model.parameters())

    actor_critic.load_state_dict(torch.load("actor_critic_" + mode))

    def get_action(state):
        if state.ndim == 4:
            state = torch.FloatTensor(np.float32(state))
        else:
            state = torch.FloatTensor(np.float32(state)).unsqueeze(0)

        with torch.no_grad():  # inference only (replaces the removed volatile=True flag)
            action = actor_critic.act(state)
        action = action.data.cpu().squeeze(1).numpy()
        return action

    def play_games(envs, frames):
        states = envs.reset()

        for frame_idx in range(frames):
            actions = get_action(states)
            next_states, rewards, dones, _ = envs.step(actions)

            yield frame_idx, states, actions, rewards, next_states, dones

            states = next_states

    reward_coef = 0.1
    num_updates = 5000

    losses = []
    all_rewards = []

    for frame_idx, states, actions, rewards, next_states, dones in tqdm(
            play_games(envs, num_updates), total=num_updates):
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)

        batch_size = states.size(0)

        onehot_actions = torch.zeros(batch_size, num_actions, *state_shape[1:])
        onehot_actions[range(batch_size), actions] = 1
        inputs = autograd.Variable(torch.cat([states, onehot_actions], 1))

        #if USE_CUDA:
        #    inputs = inputs.cuda()

        imagined_state, imagined_reward = env_model(inputs)

        target_state = pix_to_target(next_states)
        target_state = autograd.Variable(torch.LongTensor(target_state))

        target_reward = rewards_to_target(mode, rewards)
        target_reward = autograd.Variable(torch.LongTensor(target_reward))

        optimizer.zero_grad()
        image_loss = criterion(imagined_state, target_state)
        reward_loss = criterion(imagined_reward, target_reward)
        loss = image_loss + reward_coef * reward_loss
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        all_rewards.append(np.mean(rewards))

        if frame_idx % num_updates == 0:
            plot(frame_idx, all_rewards, losses)

    torch.save(env_model.state_dict(), "env_model_" + mode)

    import time

    env = MiniPacman(mode, 1000)
    batch_size = 1

    done = False
    state = env.reset()
    iss = []
    ss = []

    steps = 0

    while not done:
        steps += 1
        actions = get_action(state)
        onehot_actions = torch.zeros(batch_size, num_actions, *state_shape[1:])
        onehot_actions[range(batch_size), actions] = 1
        state = torch.FloatTensor(state).unsqueeze(0)

        inputs = autograd.Variable(torch.cat([state, onehot_actions], 1))
        #if USE_CUDA:
        #    inputs = inputs.cuda()

        imagined_state, imagined_reward = env_model(inputs)
        imagined_state = F.softmax(imagined_state, dim=-1)  # per-pixel distribution over palette classes
        iss.append(imagined_state)

        next_state, reward, done, _ = env.step(actions[0])
        ss.append(state)
        state = next_state

        imagined_image = target_to_pix(
            imagined_state.view(batch_size, -1,
                                len(pixels))[0].max(1)[1].data.cpu().numpy())
        imagined_image = imagined_image.reshape(15, 19, 3)
        state_image = torch.FloatTensor(next_state).permute(1, 2,
                                                            0).cpu().numpy()

        #clear_output()
        plt.figure(figsize=(10, 3))
        plt.subplot(131)
        plt.title("Imagined")
        plt.imshow(imagined_image)
        plt.subplot(132)
        plt.title("Actual")
        plt.imshow(state_image)
        plt.show()
        time.sleep(0.3)

        if steps > 30:
            break
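
Example 7 also relies on project-level helpers that the excerpt does not define: `pixels`, `num_pixels`, `pix_to_target`, `target_to_pix`, `rewards_to_target`, and `mode_rewards`. The sketch below shows the kind of per-pixel and per-reward classification targets they are expected to produce; the palette and reward values are placeholders, not the real MiniPacman ones.

import numpy as np

# Placeholder palette: in the real project this enumerates the distinct RGB values
# that MiniPacman frames can contain.
pixels = [(0.0, 1.0, 1.0), (0.0, 1.0, 0.0), (0.0, 0.0, 1.0)]
pixel_to_class = {pix: i for i, pix in enumerate(pixels)}
num_pixels = len(pixels)

# Placeholder reward table: the discrete reward values each game mode can emit.
mode_rewards = {"regular": [0.0, 1.0, 2.0]}

def pix_to_target(next_states):
    # (N, C, H, W) frames -> flat list of per-pixel palette indices
    target = []
    for pixel in np.asarray(next_states).transpose(0, 2, 3, 1).reshape(-1, 3):
        target.append(pixel_to_class[tuple(pixel)])
    return target

def target_to_pix(class_indices):
    # palette indices -> flat (H*W, 3) array of RGB values, reshaped by the caller
    return np.array([pixels[c] for c in class_indices])

def rewards_to_target(mode, rewards):
    # raw rewards -> class indices into mode_rewards[mode]
    return [mode_rewards[mode].index(r) for r in np.asarray(rewards).flatten()]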