Example #1
def train(policy, save_name, load_count = 0, summarize=True, load_path=None, log_path = './logs'):
    
    #Minigrid maze env
    env_name = "MiniGrid-BlockMaze-v0"
    def make_env(env_name):
        return lambda: gym_minigrid.wrappers.PadImgObsWrapper(gym.make(env_name))

    envs = [make_env(env_name) for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    with tf.Session() as sess:
        actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space,
                ac_space, policy, summarize)
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (N_ENVS*N_STEPS, nw, nh, nc)

        dones = [False for _ in range(N_ENVS)]
        nbatch = N_ENVS * N_STEPS

        episode_rewards = np.zeros((N_ENVS, ))
        final_rewards   = np.zeros((N_ENVS, ))

        for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
            # mb stands for mini batch
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
            for n in range(N_STEPS):
                actions, values, _ = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, _ = envs.step(actions)

                #print(obs[0:3, :,:,0])

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)

            mb_dones.append(dones)

            #batch of steps to batch of rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)  # np.bool is removed in recent NumPy
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            #discount/bootstrap off value fn
            for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards+[value], d+[0], GAMMA)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, GAMMA)
                mb_rewards[n] = rewards

            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()

            if summarize:
                loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(mb_obs,
                        mb_rewards, mb_masks, mb_actions, mb_values, update,
                        summary_op)
                writer.add_summary(summary, update)
            else:
                loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(mb_obs,
                        mb_rewards, mb_masks, mb_actions, mb_values, update)

            if update % LOG_INTERVAL == 0 or update == 1:
                print('%i: %.4f, %.4f, %.4f' % (update, policy_loss, value_loss, policy_entropy))
                print(final_rewards.mean())

            if update % SAVE_INTERVAL == 0:
                print('Saving model')
                actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')

        actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
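
# The training loop above relies on a helper `discount_with_dones`; a minimal
# sketch of the standard implementation (as in OpenAI Baselines' a2c utilities),
# shown here as a reference in case the project defines it elsewhere:
def discount_with_dones(rewards, dones, gamma):
    """Discount a reward sequence, resetting the return whenever done == 1."""
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)  # done = 1 cuts the bootstrapped tail
        discounted.append(r)
    return discounted[::-1]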
Example #2
max_frames = 12000
max_steps = 500
frame_idx = 0
rewards = []
batch_size = 128

while frame_idx < max_frames:
    state = envs.reset()
    ou_noise.reset()
    episode_reward = 0

    for step in range(max_steps):
        action = policy_net.get_action(state)
        action = ou_noise.get_action(action, step)
        # print(action)
        next_state, reward, done, _ = envs.step(action)

        replay_buffer.push(state, action, reward, next_state, done)
        if len(replay_buffer) > batch_size:
            ddpg_update(batch_size)

        state = next_state
        episode_reward += reward
        frame_idx += 1

        if frame_idx % max(1000 * NUM_PROCESS, max_steps + 1) == 0:
            if rewards:
                print(frame_idx, rewards[-1])
            # plot(frame_idx, rewards)
            torch.save(policy_net.state_dict(),
                       "DDPG_original_pendulum_weight.pth")
Example #3
    # prevents the policy from becoming exactly 0 or 1, which helps exploration
    # add 1.e-10 to avoid log(0), which gives nan
    entropy = -(new_probs*torch.log(old_probs+1.e-10)+ \
        (1.0-new_probs)*torch.log(1.0-old_probs+1.e-10))

    return torch.mean(clipped_surrogate.add(entropy.mul(beta)))
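
# The fragment above is the tail of a PPO-style clipped-surrogate objective; a
# minimal self-contained sketch of how such a helper is commonly written (the
# names below are assumptions, not the project's actual code):
import torch

def clipped_surrogate_sketch(new_probs, old_probs, advantages, epsilon=0.1, beta=0.01):
    ratio = new_probs / (old_probs + 1.e-10)                         # importance-sampling ratio
    clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon)
    surrogate = torch.min(ratio * advantages, clipped * advantages)  # clipped policy objective
    # same entropy bonus as above: keeps probabilities away from exactly 0 or 1
    entropy = -(new_probs * torch.log(old_probs + 1.e-10)
                + (1.0 - new_probs) * torch.log(1.0 - old_probs + 1.e-10))
    return torch.mean(surrogate + beta * entropy)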


model = ActorCritic().to(device)  #return dist, v
if args.load_weight:
    model.load_state_dict(
        torch.load(f'PongDeterministic-v4_{load_weight_n}.pth'))
optimizer = optim.Adam(model.parameters(), lr=lr)

f1 = envs.reset()
f2 = envs.step([0] * num_envs)

if __name__ == "__main__":
    while not early_stop and frame_idx < max_frames:
        frame_idx += 1
        print(frame_idx)
        if frame_idx % 100 == 0:
            num_steps += args.additional_num_step
        log_probs, states, actions, rewards, next_state, masks, values = collect_trajectories(
            envs, model, num_steps)
        scores = np.asarray(rewards).sum(axis=0)
        scores_list.append(scores.mean())
        print("Mean:", scores.mean(), "\nRaw:", scores)

        # stop if any of the trajectories is done
        # we want all the lists to be rectangular
    while frame_idx < max_frames and not early_stop:
        i_update += 1

        values = []
        obs = []
        acs = []
        rewards = []
        masks = []
        entropy = 0

        for current_step in range(num_steps):

            #print("     Current Step: {0}".format(current_step))
            ac = ppo.get_action(ob)
            next_ob, _, done, _ = envs.step(ac)
            reward = [
                discriminator.get_reward(np.concatenate([ob, ac], axis=1))
            ]
            #f.write(str(reward)+'\n')
            #print(reward)

            value = ppo.get_value(ob)
            values.append(value)
            rewards.append(reward)  #[:, np.newaxis])
            masks.append((1 - done))  #[:, np.newaxis])

            obs.append(ob)
            acs.append(ac)

            ob = next_ob
Example #5
            
            # actor1 acts in all parallel envs
            action_p1 = agent1.act(make_cuda(state)).squeeze(1).cpu().numpy() 
            
            # actor2 acts in all parallel envs
            action_p2 = agent2.act(make_cuda(state)).squeeze(1).cpu().numpy() 

            # separate actions
            action_tuples = []              
            for i in range(num_envs):
                actions = []
                actions.append(action_p1[i])    # player1
                actions.append(action_p2[i])    # player2
                action_tuples.append(actions)
            
            next_observation, reward, finished, _ = envs.step(action_tuples)    # pass actions to environments

            # separate rewards
            reward1 = []                    
            reward2 = []
            for i in range(num_envs):
                reward1.append(reward[i][player0]) # player1
                reward2.append(reward[i][player1]) # player2

            reward1 = torch.FloatTensor(reward1).unsqueeze(1) # player1
            reward2 = torch.FloatTensor(reward2).unsqueeze(1) # player2
            episode_rewards1 += reward1 # player1
            episode_rewards2 += reward2 # player2

            finished_masks = torch.FloatTensor(1-np.array(finished)).unsqueeze(1)                                                       
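
# The excerpt above uses a small helper `make_cuda`; a plausible minimal sketch
# (an assumption -- the project's own definition may differ):
import torch

def make_cuda(tensor):
    return tensor.cuda() if torch.cuda.is_available() else tensor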
Example #6
all_rewards = []
all_losses = []

state = envs.reset()
state = torch.FloatTensor(np.float32(state))

rollout.states[0].copy_(state)

episode_rewards = torch.zeros(num_envs, 1)
final_rewards = torch.zeros(num_envs, 1)

for i_update in range(num_batch):

    for step in range(num_steps):
        action = actor_critic.act(Variable(state))
        next_state, reward, done, _ = envs.step(
            action.squeeze(1).cpu().data.numpy())

        reward = torch.FloatTensor(reward).unsqueeze(1)
        episode_rewards += reward
        masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
        final_rewards *= masks
        final_rewards += (1 - masks) * episode_rewards
        episode_rewards *= masks

        if USE_CUDA:
            masks = masks.cuda()

        state = torch.FloatTensor(np.float32(next_state))
        rollout.insert(step, state, action.data, reward, masks)

    _, next_value = actor_critic(
Example #7
File: main.py Project: km01/myrl
    envs = SubprocVecEnv([make_env(env_name) for i in range(num_envs)])
    net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(), nn.Linear(128, 2))
    agent = Model(net, 2).to(device)
    solver = optim.Adam(agent.parameters())
    memory = Memory(replay_memory_capacity)

    eps = 1.0
    duration = []
    frame_count = 0
    lifespan = [[0] for _ in range(num_envs)]
    s_gotten = None
    while frame_count < max_frame:
        s = envs.reset() if s_gotten is None else s_gotten
        preprocessed_s = torch.FloatTensor(s)
        a = agent.response(preprocessed_s, eps)
        s_gotten, r, done, _ = envs.step(a)

        for i in range(num_envs):
            lifespan[i][-1] += 1
            if done[i]:
                if lifespan[i][-1] < 500:
                    r[i] = PENALTY
                    memory.push(s[i], a[i], r[i], s_gotten[i], done[i])
                duration.append(lifespan[i][-1])
                lifespan[i].append(0)

            if lifespan[i][-1] > 0:  # skipped when the episode hit 500 steps (that transition is discarded)
                memory.push(s[i], a[i], r[i], s_gotten[i], done[i])

        if frame_count > initial_exploration:
            eps -= 0.00005
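
# Example #7 pushes transitions into a `Memory` object; a minimal sketch of such
# a replay buffer with the same push signature (an assumption -- the km01/myrl
# project's own class may differ):
import random
from collections import deque

class Memory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)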
Example #8
File: main.py Project: km01/myrl
    obs_gotten = None

    while frame_count < max_frame:

        cache = {'obs': [], 'acts': [], 'rews': [], 'dones': []}
        probs_cache = {'mu': [], 'sig': []}

        for _ in range(n_steps):
            obs = envs.reset() if obs_gotten is None else obs_gotten
            obs_in = torch.FloatTensor(obs).to(device)
            mu, sig = actor(obs_in)
            with torch.no_grad():
                a = Normal(mu, sig).sample()
                a.clamp_(-2.0 + 1e-7, 2.0 - 1e-7)

            obs_gotten, rews, dones, _ = envs.step(a)

            for i in range(num_envs):
                rewards[i][-1] += rews[i]
                if dones[i]:
                    global_rewards.append(rewards[i][-1])
                    rewards[i].append(0.)

            cache['obs'].append(obs)
            cache['acts'].append(a)
            cache['rews'].append(rews * 0.1)
            cache['dones'].append(dones)

            probs_cache['mu'].append(mu)
            probs_cache['sig'].append(sig)
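
# A hedged sketch (names assumed) of how the cached n-step rewards and done
# flags collected above are commonly turned into bootstrapped discounted returns:
import numpy as np

def n_step_returns(rews, dones, last_value, gamma=0.99):
    """rews/dones: lists of per-step arrays of shape (num_envs,);
    last_value: the critic's estimate for the final observation."""
    returns = []
    ret = last_value                          # bootstrap from the critic
    for r, d in zip(reversed(rews), reversed(dones)):
        ret = np.asarray(r) + gamma * ret * (1.0 - np.asarray(d, dtype=np.float32))
        returns.append(ret)
    return returns[::-1]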
Example #9
    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []
    entropy = 0

    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)
        functional.reset_net(model)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(
            torch.max(action, 1)[1].cpu().numpy())

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

        states.append(state)
        actions.append(action)

        state = next_state
        step_idx += 1
Example #10
all_rewards = []
all_losses  = []

state = envs.reset()
state = torch.FloatTensor(np.float32(state))

rollout.states[0].copy_(state)

episode_rewards = torch.zeros(num_envs, 1)
final_rewards   = torch.zeros(num_envs, 1)

for i_update in range(num_batch):

    for step in range(num_steps):
        action = actor_critic.act(Variable(state))
        next_state, reward, done, _ = envs.step(action.squeeze(1).cpu().data.numpy())

        reward = torch.FloatTensor(reward).unsqueeze(1)
        episode_rewards += reward
        masks = torch.FloatTensor(1-np.array(done)).unsqueeze(1)
        final_rewards *= masks
        final_rewards += (1-masks) * episode_rewards
        episode_rewards *= masks

        if USE_CUDA:
            masks = masks.cuda()

        state = torch.FloatTensor(np.float32(next_state))
        rollout.insert(step, state, action.data, reward, masks)

Example #11
def train(env_fn=None,
          spectrum=False,
          a2c_arch=None,
          nenvs=16,
          nsteps=100,
          max_iters=1e6,
          gamma=0.99,
          pg_coeff=1.0,
          vf_coeff=0.5,
          ent_coeff=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          alpha=0.99,
          epsilon=1e-5,
          log_interval=100,
          summarize=True,
          load_path=None,
          log_path=None,
          cpu_cores=1):

    # Construct the vectorized parallel environments
    envs = [env_fn for _ in range(nenvs)]
    envs = SubprocVecEnv(envs)

    # Set some random seeds for the environment
    envs.seed(0)
    if spectrum:
        envs.spectrum()

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=cpu_cores,
                               intra_op_parallelism_threads=cpu_cores)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:

        actor_critic = ActorCritic(sess, a2c_arch, ob_space, ac_space,
                                   pg_coeff, vf_coeff, ent_coeff,
                                   max_grad_norm, lr, alpha, epsilon,
                                   summarize)

        load_count = 0
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (-1, nw, nh, nc)

        dones = [False for _ in range(nenvs)]

        episode_rewards = np.zeros((nenvs, ))
        final_rewards = np.zeros((nenvs, ))

        print('a2c Training Start!')
        print('Model will be saved on intervals of %i' % (log_interval))
        for i in tqdm(range(load_count + 1,
                            int(max_iters) + 1),
                      ascii=True,
                      desc='ActorCritic'):

            # Create the minibatch lists
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_depth = [], [], [], [], [], []
            total_reward = 0

            for n in range(nsteps):

                # Get the actions and values from the actor critic, we don't need neglogp
                actions, values, neglogp = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, info = envs.step(actions)
                total_reward += np.sum(rewards)

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)
                mb_depth.append(
                    np.array(
                        [info_item['scramble_depth'] for info_item in info]))

            mb_dones.append(dones)

            # Convert batch steps to batch rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(
                1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards,
                                    dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=np.float32).swapaxes(1, 0)
            mb_depth = np.asarray(mb_depth, dtype=np.int32).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            # discounting
            for n, (rewards, d,
                    value) in enumerate(zip(mb_rewards, mb_dones,
                                            last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards + [value], d + [0],
                                                  gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, gamma)
                mb_rewards[n] = rewards

            # Flatten the whole minibatch
            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()
            mb_depth = mb_depth.flatten()

            # Save the information to tensorboard
            if summarize:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _, summary = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    mb_depth, i, summary_op)
                writer.add_summary(summary, i)
            else:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _ = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    mb_depth, i)

            if i % log_interval == 0:
                actor_critic.save(log_path, i)

        actor_critic.save(log_path, 'final')
        print('a2c model is finished training')
Example #12
class Ppo:
    
    def __init__(self, numOfEnvs):
        
        self.testRewards = []
        
#         self.num_envs = 16
#         self.num_envs = numOfEnvs
        self.num_envs = 6
        
        self.env_name = "Pendulum-v0"
        self.env = gym.make(self.env_name)
        
        self.envs = [self.make_env() for i in range(self.num_envs)]
        self.envs = SubprocVecEnv(self.envs)
        
        self.num_inputs  = self.envs.observation_space.shape[0]
        self.num_outputs = self.envs.action_space.shape[0]

        #Hyper params:
        self.hidden_size      = 256
        self.lr               = 3e-3

        self.model = ActorCritic(self.num_inputs, self.num_outputs, self.hidden_size).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

            
    def make_env(self):
        def _thunk():
            env = gym.make(self.env_name)
            return env

        return _thunk        

#     def compute_gae(self, next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    def compute_gae(self, next_value, rewards, masks, values, g, t):
        
        gamma = float(g)
        tau = float(t)

        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns
    
    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2):
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):
                dist, value = self.model(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

                actor_loss  = - torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()

                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        return loss
                
    def plot(self, frame_idx, rewards):
        clear_output(True)
        plt.figure(figsize=(20,5))
        plt.subplot(131)
        plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
        plt.plot(rewards)
        plt.show()
#         plt.savefig("{0}/{1}_rewardGraph.png".format(saveGraphPath, frame_idx))
        
    def test_env(self, vis=False):
        state = self.env.reset()
        if vis: self.env.render()
        done = False
        total_reward = 0
        while not done:
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            dist, _ = self.model(state)
            next_state, reward, done, _ = self.env.step(dist.sample().cpu().numpy()[0])
            state = next_state
            if vis: self.env.render()
            total_reward += reward
        return total_reward
                
    def main(self, inputVals):
        gam = inputVals[0]
        lam = inputVals[1]
        
        print ("Gam: ", gam)
        print ("Lam: ", lam)
        
        num_inputs  = self.envs.observation_space.shape[0]
        num_outputs = self.envs.action_space.shape[0]

        #Hyper params:
#         hidden_size      = 256
#         lr               = 3e-3
        num_steps        = 20
        mini_batch_size  = 5
        ppo_epochs       = 4
        threshold_reward = -200

#         model = a.ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
#         optimizer = optim.Adam(self.model.parameters(), lr=lr)
        
        max_frames = 12000
#         max_frames = 2000
        frame_idx  = 0
        self.test_rewards = []
        
        state = self.envs.reset()
        early_stop = False

        while frame_idx < max_frames and not early_stop:

            log_probs = []
            values    = []
            states    = []
            actions   = []
            rewards   = []
            masks     = []
            entropy = 0

            for _ in range(num_steps):
                state = torch.FloatTensor(state).to(device)
                dist, value = self.model(state)

                action = dist.sample()
                next_state, reward, done, _ = self.envs.step(action.cpu().numpy())

                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
                masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

                states.append(state)
                actions.append(action)

                state = next_state
                frame_idx += 1

                if frame_idx % 1000 == 0:
                    test_reward = np.mean([self.test_env() for _ in range(10)])
                    self.test_rewards.append(test_reward)
                    self.plot(frame_idx, self.test_rewards)
                    if test_reward > threshold_reward: early_stop = True
                    print ("rewards: ", test_reward)


            next_state = torch.FloatTensor(next_state).to(device)
            _, next_value = self.model(next_state)
            returns = self.compute_gae(next_value, rewards, masks, values, gam, lam)

            returns   = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values    = torch.cat(values).detach()
            states    = torch.cat(states)
            actions   = torch.cat(actions)
            advantage = returns - values

            lastLoss = self.ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)
#             print ("loss: ", [lastLoss])
            
#         re = rewards[-1].cpu()
#         print ("RE: ", np.asarray(re))
#         return (np.asarray(re))
        return lastLoss.item()
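
# A minimal usage sketch for the Ppo class above (assumes the same surrounding
# imports -- gym, torch, SubprocVecEnv, ActorCritic, device -- are available);
# main() takes [gamma, lambda] and returns the last PPO loss:
if __name__ == "__main__":
    ppo = Ppo(numOfEnvs=6)      # note: the constructor currently hard-codes num_envs = 6
    last_loss = ppo.main([0.99, 0.95])
    print("final PPO loss:", last_loss)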
Example #13
    # store the current state as the first entry of the rollouts object used for advantage learning
    rollouts.observations[0].copy_(current_obs)

    # main loop
    for j in tqdm(range(NUM_UPDATES)):
        # repeat for each step within the advantage-learning horizon
        for step in range(NUM_ADVANCED_STEP):

            # decide an action
            with torch.no_grad():
                action = actor_critic.act(rollouts.observations[step])

            cpu_actions = action.squeeze(1).cpu().numpy()  # convert the tensor to a NumPy array

            # execute one step in all parallel envs; the returned obs has shape (16, 1, 84, 84)
            obs, reward, done, info = envs.step(cpu_actions)

            # convert the rewards to a tensor and add them to the running episode reward,
            # reshaping from (16,) to (16, 1)
            reward = np.expand_dims(np.stack(reward), 1)
            reward = torch.from_numpy(reward).float()
            episode_rewards += reward

            # for each process: 0 if done is True, 1 if done is False
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])

            # update the total reward of the most recent episode
            final_rewards *= masks  # reset: multiply by 0 where done is True, by 1 where False
            # add 0 where done is False, add episode_rewards where done is True
            final_rewards += (1 - masks) * episode_rewards
Example #14
state = envs.reset()

while frame_idx < max_frames:

    log_probs = []
    values    = []
    rewards   = []
    masks     = []
    entropy = 0

    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()
        
        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        
        state = next_state
        frame_idx += 1
        
        if frame_idx % 1000 == 0:
            test_rewards.append(np.mean([test_env() for _ in range(10)]))
            # plot(frame_idx, test_rewards)
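
# Example #14 calls a helper `test_env()`; a minimal sketch of the evaluation
# helper these tutorials usually pair with it (assumes a single evaluation
# environment `env`, plus the same `model` and `device`):
def test_env():
    state = env.reset()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        total_reward += reward
    return total_reward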
Example #15
def main():
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n

    model = ActorCritic(num_inputs, num_outputs, hidden_size,
                        hd2_size).to(device)
    optimizer = optim.Adam(model.parameters())

    max_frames = 10000
    frame_idx = 0
    test_rewards = []

    state = envs.reset()

    while frame_idx < max_frames:

        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)

            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values

        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        print(f'\rframe: {frame_idx}\t loss: {loss}', end='')
        if frame_idx % 100 == 0:
            rewards, scores = map(
                list, zip(*((test_env(model, False) for _ in range(10)))))
            avg_rewards = np.mean(rewards)
            avg_scores = np.mean(scores)
            print(
                f'\rframe: {frame_idx}\t avg_rewards: {avg_rewards:.2f}\t avg_scores: {avg_scores:.2f}\t loss: {loss}'
            )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # render a few evaluation episodes (use a list comprehension so the calls actually run)
    [test_env(model, True) for _ in range(10)]
    envs.close()
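
# Example #15 depends on `compute_returns`; a minimal sketch of the standard
# bootstrapped discounted-return computation it most likely refers to:
def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]  # mask 0 (done) cuts the return
        returns.insert(0, R)
    return returns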
Example #16
    duration = []
    frame_count = 0
    lifespan = [[0] for _ in range(num_envs)]
    s_gotten = None

    while frame_count * n_step < max_frame:
        obs_l, acts_l, rews_l, dones_l, probs_l = [], [], [], [], []
        accept_sample = [True for _ in range(num_envs)]
        for _ in range(n_step):
            obs = envs.reset() if s_gotten is None else s_gotten
            obs_in = torch.FloatTensor(obs).to(device)
            prob = actor(obs_in)

            with torch.no_grad():
                a = prob.multinomial(num_samples=1)
            s_gotten, rews, dones, _ = envs.step(a.view(-1).numpy())

            for i in range(num_envs):
                lifespan[i][-1] += 1
                if dones[i]:
                    if lifespan[i][-1] < 500:
                        rews[i] = PENALTY
                    else:  # reached the 500th step (episode cap)
                        accept_sample[i] = False
                        print(lifespan[i][-1],
                              critic(obs_in[[i], :]).view(-1).item())
                    duration.append(lifespan[i][-1])
                    lifespan[i].append(0)

            obs_l.append(obs)
            acts_l.append(a)
    while not d:
        print('-------------------------------------------------')
        print('Current Observation')
        envs.render(0)
        time.sleep(0.1)

        a, v, neg = actor_critic.act(obs, stochastic=True)
        print('')
        print('action: ', actions[a[0]])
        print('value: ', v)
        print('neglogp: ', neg)
        print('pd: ')
        for ac, pd in zip(actions, actor_critic.step_model.logits(obs)[0][0]):
            print('\t', ac, pd)

        obs, r, d, sbo = envs.step(a)
        print('r: ', r)
        envs.render(0)
        time.sleep(0.1)

        if not d:
            im = plt.imshow(cube_gym.onehotToRGB(obs[0]))
            ims.append([im])
        else:
            print('DONE')
            im = plt.imshow(cube_gym.onehotToRGB(sbo[0]))
            ims.append([im])

        d = d[0]
        print(r)