def main():
    learning_rate = 0.001
    discount = 0.995
    beta = 0.4
    eps = 0.05
    K_epoch = 3
    num_steps = 128

    envs = [make_env() for _ in range(num_envs)]
    envs = SubprocVecEnv(envs)
    model = CNNTradingAgent(num_features=envs.reset().shape[-1],
                            n_actions=2 * n_action_intervals + 1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print_interval = 10

    scores_list = []
    loss_list = []
    for n_epi in range(10000):  # run 10,000 episodes
        n_epi += 1  # shift the index so episode counting (logging/saving) starts at 1
        loss = 0.0
        log_probs, states, actions, rewards, next_state, masks, values = collect_trajectories(
            envs, model, num_steps)

        # raise Exception("True" if torch.any(torch.isnan(torch.stack(states))) else "False")
        if beta > 0.01:
            beta *= discount
        for _ in range(K_epoch):
            L = -clipped_surrogate(envs, model, log_probs, states, actions,
                                   rewards, discount, eps, beta)

            optimizer.zero_grad()
            L.backward()
            optimizer.step()

            loss += L.item()
            del L

        score = np.asarray(rewards).sum(axis=0).mean()
        scores_list.append(score)
        loss_list.append(loss)

        if n_epi % print_interval == 0:
            # report averages over the most recent print_interval episodes
            print("# of episode :{}, avg score : {:.4f}, avg loss : {:.6f}".format(
                n_epi,
                np.mean(scores_list[-print_interval:]),
                np.mean(loss_list[-print_interval:])))
            print("actions : ", torch.cat(actions))

        if n_epi % save_interval == 0:
            torch.save(model.state_dict(),
                       os.path.join(save_location, f'TradingGym_{n_epi}.pth'))
            torch.save(scores_list,
                       os.path.join(save_location, f"{n_epi}_scores.pth"))
            # plt.plot(scores_list)
            # plt.title("Reward")
            # plt.grid(True)
            # plt.savefig(os.path.join(save_location,f'{n_epi}_ppo.png'))
            # plt.close()

    del envs
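
# collect_trajectories and clipped_surrogate are not shown in this example.
# Below is a minimal sketch of collect_trajectories consistent with how it is
# called above, assuming the model's forward() returns an action distribution
# and a state value; the internals are an assumption, not the original code.
def collect_trajectories(envs, model, num_steps):
    log_probs, states, actions, rewards, masks, values = [], [], [], [], [], []
    state = envs.reset()
    for _ in range(num_steps):
        state_t = torch.FloatTensor(state).to(device)
        dist, value = model(state_t)
        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())

        log_probs.append(dist.log_prob(action).detach())
        states.append(state_t)
        actions.append(action)
        rewards.append(reward)
        masks.append(1 - done)
        values.append(value.detach())
        state = next_state
    return log_probs, states, actions, rewards, next_state, masks, values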
Example #2
    net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(), nn.Linear(128, 2))
    actor = Actor(4, 128, 2).to(device)
    critic = Critic(4, 128).to(device)
    solver = optim.Adam(
        list(actor.parameters()) + list(critic.parameters()), lr)

    duration = []
    frame_count = 0
    lifespan = [[0] for _ in range(num_envs)]
    s_gotten = None

    while frame_count * n_step < max_frame:
        obs_l, acts_l, rews_l, dones_l, probs_l = [], [], [], [], []
        accept_sample = [True for _ in range(num_envs)]
        for _ in range(n_step):
            obs = envs.reset() if s_gotten is None else s_gotten
            obs_in = torch.FloatTensor(obs).to(device)
            prob = actor(obs_in)

            with torch.no_grad():
                a = prob.multinomial(num_samples=1)
            s_gotten, rews, dones, _ = envs.step(a.view(-1).cpu().numpy())

            for i in range(num_envs):
                lifespan[i][-1] += 1
                if dones[i]:
                    if lifespan[i][-1] < 500:
                        rews[i] = PENALTY
                    else:  # the 500th step (episode reached the cap)
                        accept_sample[i] = False
                        print(lifespan[i][-1],
Example #3
num_inputs  = envs.observation_space.shape[0]
num_outputs = envs.action_space.shape[0]

# Hyper params:
hidden_size = 256
lr          = 3e-2
num_steps   = 20

model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
optimizer = optim.Adam(model.parameters())

max_frames   = 100000
frame_idx    = 0
test_rewards = []

state = envs.reset()

while frame_idx < max_frames:

    log_probs = []
    values    = []
    rewards   = []
    masks     = []
    entropy = 0

    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())
Example #4
def train(policy, save_name, load_count=0, summarize=True, load_path=None, log_path='./logs'):
    
    #Minigrid maze env
    env_name = "MiniGrid-BlockMaze-v0"
    def make_env(env_name):
        return lambda: gym_minigrid.wrappers.PadImgObsWrapper(gym.make(env_name))

    envs = [make_env(env_name) for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    with tf.Session() as sess:
        actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space,
                ac_space, policy, summarize)
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (N_ENVS*N_STEPS, nw, nh, nc)

        dones = [False for _ in range(N_ENVS)]
        nbatch = N_ENVS * N_STEPS

        episode_rewards = np.zeros((N_ENVS, ))
        final_rewards   = np.zeros((N_ENVS, ))

        for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
            # mb stands for mini batch
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
            for n in range(N_STEPS):
                actions, values, _ = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, _ = envs.step(actions)

                #print(obs[0:3, :,:,0])

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)

            mb_dones.append(dones)

            #batch of steps to batch of rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            #discount/bootstrap off value fn
            for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards+[value], d+[0], GAMMA)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, GAMMA)
                mb_rewards[n] = rewards

            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()

            if summarize:
                loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(mb_obs,
                        mb_rewards, mb_masks, mb_actions, mb_values, update,
                        summary_op)
                writer.add_summary(summary, update)
            else:
                loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(mb_obs,
                        mb_rewards, mb_masks, mb_actions, mb_values, update)

            if update % LOG_INTERVAL == 0 or update == 1:
                print('%i) policy loss: %.4f, value loss: %.4f, policy entropy: %.4f'
                      % (update, policy_loss, value_loss, policy_entropy))
                print('mean final reward:', final_rewards.mean())

            if update % SAVE_INTERVAL == 0:
                print('Saving model')
                actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')

        actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
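
# Sketch of the discount_with_dones helper used above, modelled on the
# OpenAI-baselines utility of the same name: n-step discounted returns that
# are cut off wherever an episode terminated (treat this as an assumption,
# not necessarily the exact implementation used by this project).
def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)
        discounted.append(r)
    return discounted[::-1]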
Example #5
    agent2 = make_cuda(agent2)

    rollout1 = RolloutStorage(num_steps, num_envs, state_shape)
    rollout2 = RolloutStorage(num_steps, num_envs, state_shape)

    if USE_CUDA:        
        rollout1.cuda()
        rollout2.cuda()

    all_rewards1 = []
    all_losses1  = []

    all_rewards2 = []
    all_losses2  = []

    observations = envs.reset() 

    state = featurize(observations)
    state = make_cuda(torch.FloatTensor(state))

    rollout1.states[0].copy_(state)
    rollout2.states[0].copy_(state)

    episode_rewards1 = torch.zeros(num_envs, 1)
    final_rewards1   = torch.zeros(num_envs, 1)    

    episode_rewards2 = torch.zeros(num_envs, 1)
    final_rewards2   = torch.zeros(num_envs, 1)    

    timer.update(time.time())
    switch_variable = 0
Example #6
    # this entropy term steers new_probs towards 0.5;
    # it prevents the policy from becoming exactly 0 or 1, which helps exploration
    # add 1.e-10 inside the log to avoid log(0), which gives nan
    entropy = -(new_probs*torch.log(old_probs+1.e-10) +
                (1.0-new_probs)*torch.log(1.0-old_probs+1.e-10))

    return torch.mean(clipped_surrogate.add(entropy.mul(beta)))


model = ActorCritic().to(device)  # forward() returns (dist, value)
if args.load_weight:
    model.load_state_dict(
        torch.load(f'PongDeterministic-v4_{load_weight_n}.pth'))
optimizer = optim.Adam(model.parameters(), lr=lr)

f1 = envs.reset()
f2 = envs.step([0] * num_envs)

if __name__ == "__main__":
    while not early_stop and frame_idx < max_frames:
        frame_idx += 1
        print(frame_idx)
        if frame_idx % 100 == 0:
            num_steps += args.additional_num_step
        log_probs, states, actions, rewards, next_state, masks, values = collect_trajectories(
            envs, model, num_steps)
        scores = np.asarray(rewards).sum(axis=0)
        scores_list.append(scores.mean())
        print("Mean:", scores.mean(), "\nRaw:", scores)

        # stop if any of the trajectories is done
Example #7
alpha = 0.99

#Init a2c and rmsprop
actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n)
optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)
    
if USE_CUDA:
    actor_critic = actor_critic.cuda()

rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
# rollout.cuda()

all_rewards = []
all_losses  = []

state = envs.reset()
state = torch.FloatTensor(np.float32(state))

rollout.states[0].copy_(state)

episode_rewards = torch.zeros(num_envs, 1)
final_rewards   = torch.zeros(num_envs, 1)

for i_update in range(num_batch):

    for step in range(num_steps):
        action = actor_critic.act(Variable(state))
        next_state, reward, done, _ = envs.step(action.squeeze(1).cpu().data.numpy())

        reward = torch.FloatTensor(reward).unsqueeze(1)
        episode_rewards += reward
Example #8
def main():
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n

    model = ActorCritic(num_inputs, num_outputs, hidden_size,
                        hd2_size).to(device)
    optimizer = optim.Adam(model.parameters())

    max_frames = 10000
    frame_idx = 0
    test_rewards = []

    state = envs.reset()

    while frame_idx < max_frames:

        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)

            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values

        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        print(f'\rframe: {frame_idx}\t loss: {loss}', end='')
        if frame_idx % 100 == 0:
            rewards, scores = map(
                list, zip(*((test_env(model, False) for _ in range(10)))))
            avg_rewards = np.mean(rewards)
            avg_scores = np.mean(scores)
            print(
                f'\rframe: {frame_idx}\t avg_rewards: {avg_rewards:.2f}\t avg_scores: {avg_scores:.2f}\t loss: {loss}'
            )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # run the final visual evaluation episodes
    # (the original bare generator expression was never consumed, so nothing ran)
    [test_env(model, True) for _ in range(10)]
    envs.close()
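
# Sketch of the compute_returns helper used above: a standard discounted-return
# computation over the collected rollout. The gamma default is an assumption,
# not a value taken from this example.
def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns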
Example #9
def train(env_fn=None,
          spectrum=False,
          a2c_arch=None,
          nenvs=16,
          nsteps=100,
          max_iters=1e6,
          gamma=0.99,
          pg_coeff=1.0,
          vf_coeff=0.5,
          ent_coeff=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          alpha=0.99,
          epsilon=1e-5,
          log_interval=100,
          summarize=True,
          load_path=None,
          log_path=None,
          cpu_cores=1):

    # Construct the vectorized parallel environments
    envs = [env_fn for _ in range(nenvs)]
    envs = SubprocVecEnv(envs)

    # Set some random seeds for the environment
    envs.seed(0)
    if spectrum:
        envs.spectrum()

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=cpu_cores,
                               intra_op_parallelism_threads=cpu_cores)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:

        actor_critic = ActorCritic(sess, a2c_arch, ob_space, ac_space,
                                   pg_coeff, vf_coeff, ent_coeff,
                                   max_grad_norm, lr, alpha, epsilon,
                                   summarize)

        load_count = 0
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (-1, nw, nh, nc)

        dones = [False for _ in range(nenvs)]

        episode_rewards = np.zeros((nenvs, ))
        final_rewards = np.zeros((nenvs, ))

        print('a2c Training Start!')
        print('Model will be saved on intervals of %i' % (log_interval))
        for i in tqdm(range(load_count + 1,
                            int(max_iters) + 1),
                      ascii=True,
                      desc='ActorCritic'):

            # Create the minibatch lists
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_depth = [], [], [], [], [], []
            total_reward = 0

            for n in range(nsteps):

                # Get the actions and values from the actor critic; neglogp is returned but unused here
                actions, values, neglogp = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, info = envs.step(actions)
                total_reward += np.sum(rewards)

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)
                mb_depth.append(
                    np.array(
                        [info_item['scramble_depth'] for info_item in info]))

            mb_dones.append(dones)

            # Convert batch steps to batch rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(
                1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards,
                                    dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=np.float32).swapaxes(1, 0)
            mb_depth = np.asarray(mb_depth, dtype=np.int32).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            # discounting
            for n, (rewards, d,
                    value) in enumerate(zip(mb_rewards, mb_dones,
                                            last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards + [value], d + [0],
                                                  gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, gamma)
                mb_rewards[n] = rewards

            # Flatten the whole minibatch
            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()
            mb_depth = mb_depth.flatten()

            # Save the information to tensorboard
            if summarize:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _, summary = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    mb_depth, i, summary_op)
                writer.add_summary(summary, i)
            else:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _ = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    mb_depth, i)

            if i % log_interval == 0:
                actor_critic.save(log_path, i)

        actor_critic.save(log_path, 'final')
        print('a2c model is finished training')
def train(env_fn=None,
          spectrum=False,
          vae_arch=None,
          a2c_arch=None,
          nenvs=16,
          nsteps=100,
          max_iters=1e6,
          kl_coeff=0.5,
          lr=7e-4,
          log_interval=100,
          summarize=True,
          vae_load_path=None,
          a2c_load_path=None,
          log_path=None,
          cpu_cores=1):

    # Construct the vectorized parallel environments
    envs = [env_fn for _ in range(nenvs)]
    envs = SubprocVecEnv(envs)

    # Set some random seeds for the environment
    envs.seed(0)
    if spectrum:
        envs.spectrum()

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=cpu_cores,
                               intra_op_parallelism_threads=cpu_cores)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:

        actor_critic = RandomActorCritic(sess, a2c_arch, ob_space, ac_space,
                                         nenvs, nsteps)

        if a2c_load_path is not None:
            actor_critic.load(a2c_load_path)
            print('Loaded a2c')
        else:
            actor_critic.epsilon = -1
            print('WARNING: No Actor Critic Model loaded. Using Random Agent')

        vae = VariationalAutoEncoder(sess, vae_arch, ob_space, ac_space, lr,
                                     kl_coeff, summarize)

        load_count = 0
        if vae_load_path is not None:
            vae.load(vae_load_path)

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        print('VAE Training Start!')
        print('Model will be saved on intervals of %i' % (log_interval))
        for i in tqdm(range(load_count + 1,
                            int(max_iters) + 1),
                      ascii=True,
                      desc='VarAutoEncoder'):

            mb_s, mb_a, mb_r, mb_ns, mb_d = [], [], [], [], []

            for s, a, r, ns, d in model_play_games(actor_critic, envs, nsteps):
                mb_s.append(s)
                mb_a.append(a)
                mb_r.append(r)
                mb_ns.append(ns)
                mb_d.append(d)

            mb_s = np.concatenate(mb_s)
            mb_a = np.concatenate(mb_a)
            mb_r = np.concatenate(mb_r)
            mb_ns = np.concatenate(mb_ns)
            mb_d = np.concatenate(mb_d)

            if summarize:
                loss, recon_loss, kl_loss, _, smy = vae.train(
                    mb_s, mb_a, mb_ns, mb_r, summary_op)
                writer.add_summary(smy, i)
            else:
                loss, recon_loss, kl_loss, _ = vae.train(
                    mb_s, mb_a, mb_ns, mb_r)

            if i % log_interval == 0:
                vae.save(log_path, i)

        vae.save(log_path, 'final')
        print('Variational AutoEncoder is finished training')
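
# Sketch of the model_play_games generator consumed above. This is an
# assumption based on how its output is used (each yielded field is a
# per-environment batch that gets np.concatenate'd); the original
# implementation and the exact act() signature of RandomActorCritic may differ.
def model_play_games(actor_critic, envs, nsteps):
    obs = envs.reset()
    for _ in range(nsteps):
        actions, _, _ = actor_critic.act(obs)
        next_obs, rewards, dones, _ = envs.step(actions)
        yield obs, actions, rewards, next_obs, dones
        obs = next_obs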
Example #11
class Ppo:
    
    def __init__(self, numOfEnvs):
        
        self.testRewards = []
        
#         self.num_envs = 16
#         self.num_envs = numOfEnvs
        self.num_envs = 6
        
        self.env_name = "Pendulum-v0"
        self.env = gym.make(self.env_name)
        
        self.envs = [self.make_env() for i in range(self.num_envs)]
        self.envs = SubprocVecEnv(self.envs)
        
        self.num_inputs  = self.envs.observation_space.shape[0]
        self.num_outputs = self.envs.action_space.shape[0]

        #Hyper params:
        self.hidden_size      = 256
        self.lr               = 3e-3

        self.model = ActorCritic(self.num_inputs, self.num_outputs, self.hidden_size).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

            
    def make_env(self):
        def _thunk():
            env = gym.make(self.env_name)
            return env

        return _thunk        

#     def compute_gae(self, next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    def compute_gae(self, next_value, rewards, masks, values, g, t):
        
        gamma = float(g)
        tau = float(t)

        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns
    
    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2):
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):
                dist, value = self.model(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

                actor_loss  = - torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()

                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        return loss
                
    def plot(self, frame_idx, rewards):
        clear_output(True)
        plt.figure(figsize=(20,5))
        plt.subplot(131)
        plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
        plt.plot(rewards)
        plt.show()
#         plt.savefig("{0}/{1}_rewardGraph.png".format(saveGraphPath, frame_idx))
        
    def test_env(self, vis=False):
        state = self.env.reset()
        if vis: self.env.render()
        done = False
        total_reward = 0
        while not done:
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            dist, _ = self.model(state)
            next_state, reward, done, _ = self.env.step(dist.sample().cpu().numpy()[0])
            state = next_state
            if vis: self.env.render()
            total_reward += reward
        return total_reward
                
    def main(self, inputVals):
        gam = inputVals[0]
        lam = inputVals[1]
        
        print ("Gam: ", gam)
        print ("Lam: ", lam)
        
        num_inputs  = self.envs.observation_space.shape[0]
        num_outputs = self.envs.action_space.shape[0]

        #Hyper params:
#         hidden_size      = 256
#         lr               = 3e-3
        num_steps        = 20
        mini_batch_size  = 5
        ppo_epochs       = 4
        threshold_reward = -200

#         model = a.ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
#         optimizer = optim.Adam(self.model.parameters(), lr=lr)
        
        max_frames = 12000
#         max_frames = 2000
        frame_idx  = 0
        self.test_rewards = []
        
        state = self.envs.reset()
        early_stop = False

        while frame_idx < max_frames and not early_stop:

            log_probs = []
            values    = []
            states    = []
            actions   = []
            rewards   = []
            masks     = []
            entropy = 0

            for _ in range(num_steps):
                state = torch.FloatTensor(state).to(device)
                dist, value = self.model(state)

                action = dist.sample()
                next_state, reward, done, _ = self.envs.step(action.cpu().numpy())

                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
                masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

                states.append(state)
                actions.append(action)

                state = next_state
                frame_idx += 1

                if frame_idx % 1000 == 0:
                    test_reward = np.mean([self.test_env() for _ in range(10)])
                    self.test_rewards.append(test_reward)
                    self.plot(frame_idx, self.test_rewards)
                    if test_reward > threshold_reward: early_stop = True
                    print ("rewards: ", test_reward)


            next_state = torch.FloatTensor(next_state).to(device)
            _, next_value = self.model(next_state)
            returns = self.compute_gae(next_value, rewards, masks, values, gam, lam)

            returns   = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values    = torch.cat(values).detach()
            states    = torch.cat(states)
            actions   = torch.cat(actions)
            advantage = returns - values

            lastLoss = self.ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)
#             print ("loss: ", [lastLoss])
            
#         re = rewards[-1].cpu()
#         print ("RE: ", np.asarray(re))
#         return (np.asarray(re))
        return lastLoss.item()
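
# Example usage of the Ppo class defined above. The environment count and the
# [gamma, lambda] values passed to main() are arbitrary illustrative choices.
if __name__ == "__main__":
    ppo = Ppo(numOfEnvs=6)
    final_loss = ppo.main([0.99, 0.95])  # inputVals = [gamma, lambda] for GAE
    print("final PPO loss:", final_loss)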
Example #12
global_brain = Brain(actor_critic)

# create variables for storing information
obs_shape = envs.observation_space.shape  # (1, 84, 84)
obs_shape = (obs_shape[0] * NUM_STACK_FRAME,
                *obs_shape[1:])  # (4, 84, 84)
# torch.Size([16, 4, 84, 84])
current_obs = torch.zeros(NUM_PROCESSES, *obs_shape).to(device)
rollouts = RolloutStorage(
    NUM_ADVANCED_STEP, NUM_PROCESSES, obs_shape)  # rollouts object
episode_rewards = torch.zeros([NUM_PROCESSES, 1])  # stores rewards received during the current episode
final_rewards = torch.zeros([NUM_PROCESSES, 1])  # stores the total reward of the last completed episode

if __name__ == "__main__":
    # start from the initial state
    obs = envs.reset()
    obs = torch.from_numpy(obs).float()  # torch.Size([16, 1, 84, 84])
    current_obs[:, -1:] = obs  # store the most recent observation in the 4th (last) frame

    # store the current state as the first state in rollouts, the object used for advanced (n-step) learning
    rollouts.observations[0].copy_(current_obs)

    # main loop
    for j in tqdm(range(NUM_UPDATES)):
        # repeat for each step of the advanced (n-step) learning horizon
        for step in range(NUM_ADVANCED_STEP):

            # decide the action
            with torch.no_grad():
                action = actor_critic.act(rollouts.observations[step])
Example #13
File: main.py  Project: km01/myrl
    a_solver = optim.Adam(actor.parameters(), lr=actor_lr)
    c_solver = optim.Adam(critic.parameters(), lr=critic_lr)

    frame_count = 0
    rewards = [[0.] for _ in range(num_envs)]
    global_rewards = []

    obs_gotten = None

    while frame_count < max_frame:

        cache = {'obs': [], 'acts': [], 'rews': [], 'dones': []}
        probs_cache = {'mu': [], 'sig': []}

        for _ in range(n_steps):
            obs = envs.reset() if obs_gotten is None else obs_gotten
            obs_in = torch.FloatTensor(obs).to(device)
            mu, sig = actor(obs_in)
            with torch.no_grad():
                a = Normal(mu, sig).sample()
                a.clamp_(-2.0 + 1e-7, 2.0 - 1e-7)

            obs_gotten, rews, dones, _ = envs.step(a.cpu().numpy())

            for i in range(num_envs):
                rewards[i][-1] += rews[i]
                if dones[i]:
                    global_rewards.append(rewards[i][-1])
                    rewards[i].append(0.)

            cache['obs'].append(obs)