Example #1
def main(args):
    env = load_env(args)
    model = Model(env.observation_space.shape[0], env.action_space.n, args)
    dq = deque(maxlen=100)
    dq.append(0)

    for episode in range(1, args.episodes + 1):
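        # Recurrent hidden state for the policy (presumably an LSTM's hidden and cell tensors of shape [1, 1, 32]), re-initialized to zeros at the start of every episode.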
        h_out = (torch.zeros([1, 1, 32], dtype=torch.float), torch.zeros([1, 1, 32], dtype=torch.float))
        state = env.reset()
        done = False
        score = 0.

        while not done:
            for _ in range(args.n_rollout):
                h_in = h_out
                prob, hidden_out = model._pi(torch.from_numpy(state).float(), h_in)
                prob = prob.squeeze()
                m = Categorical(prob)
                action = m.sample().item()

                next_state, reward, done, _ = env.step(action)
                model._put_data(state, next_state, reward, done, action, prob[action], hidden_in=h_in, hidden_out=hidden_out)
                state = next_state
                score += reward
                if done:
                    break
            model._train()
        dq.append(score)

        if episode % args.print_intervals == 0:
            print('Episode: {}, Score mean: {}'.format(episode, np.mean(dq)))
Example #2
def main(args):
    env = load_env(args)
    model = Model(env.observation_space.shape[0], env.action_space.n, args)
    dq = deque(maxlen=100)
    dq.append(0)

    for episode in range(1, args.episodes + 1):
        state = env.reset()
        done = False
        score = 0.

        while not done:
            state = torch.from_numpy(state).float()

            prob = model(state)
            m = Categorical(prob)
            action = m.sample().item()

            next_state, reward, done, _ = env.step(action)
            score += 1.

            model._put_data(reward, prob[action])
            state = next_state
        model._train()
        dq.append(score)

        if episode % args.print_intervals == 0 and episode != 0:
            print("# of episode :{}, avg score : {}".format(episode, np.mean(dq), axis=-1))
Example #3
def main(args):
    env = load_env(args)
    model = Model(env.observation_space.shape[0], env.action_space.n, args)
    dq = deque(maxlen=100)
    dq.append(0)

    for episode in range(1, args.episodes + 1):
        state = env.reset()
        done = False
        score = 0.

        while not done:
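            # Collect at most n_rollout transitions, then run one training update on that short trajectory.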
            for _ in range(args.n_rollout):
                state = torch.from_numpy(state).float()

                prob = model._pi(state)
                m = Categorical(prob)
                action = m.sample().item()

                next_state, reward, done, _ = env.step(action)
                model._put_data(state.numpy(), next_state, reward, done,
                                action, prob[action])
                state = next_state

                score += 1.
                if done:
                    break
            model._train()
        dq.append(score)

        if episode % args.print_intervals == 0:
            print('Episode: {}, Score Mean: {}'.format(episode, np.mean(dq)))
Example #4
def _train(global_model, rank, args):
    env = load_env(args)
    local_model = Model(env.observation_space.shape[0], env.action_space.n, args)

    local_model.load_state_dict(global_model.state_dict())
    optimizer = optim.Adam(global_model.parameters(), lr=args.lr)
    dq = deque(maxlen=100)
    dq.append(0)

    for episode in range(args.episodes):
        done = False
        state = env.reset()
        score = 0.

        while not done:
            for _ in range(args.n_rollout):
                state = torch.from_numpy(state).float()

                prob = local_model._pi(state)
                m = Categorical(prob)
                action = m.sample().item()

                next_state, reward, done, _ = env.step(action)
                local_model._put_data(state.numpy(), next_state, reward, done, action, prob[action])
                score += 1
                if done:
                    break
                state = next_state
            local_model._train(optimizer)
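            # Point each global parameter's gradient at the worker's local gradient so updates computed here reach the shared model (A3C-style gradient sharing).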
            for global_param, local_param in zip(global_model.parameters(), local_model.parameters()):
                global_param._grad = local_param.grad
        dq.append(score)
        local_model.load_state_dict(global_model.state_dict())
        if episode % args.print_intervals == 0:
            print('[Episode: {}, Rank: {}] score: {}'.format(episode, rank, np.mean(dq)))
Example #5
def main(args):
    env = load_env(args)
    model = Model(env.observation_space.shape[0], env.action_space.n, args)
    memory = ReplayBuffer(args)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    for episode in range(1, args.episodes + 1):
        state = env.reset()
        done = False

        while not done:
            seq_data = []
            for _ in range(args.n_rollout):
                prob = model._pi(torch.from_numpy(state).float())
                m = Categorical(prob)
                action = m.sample().item()

                next_state, reward, done, _ = env.step(action)
                seq_data.append((state, next_state, reward / 100., done, action, prob))
                state = next_state

                if done:
                    break
            memory._put_data(seq_data)
            if memory._size() > 500:
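                # Once the buffer is warm, update on the freshest rollout (on_policy=True) and then once more on replayed data.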
                _train(model, optimizer, memory, on_policy=True)
                _train(model, optimizer, memory)
Example #6
def main(args):
    env = load_env(args)

    in_channels, out_channels = env.observation_space.shape[0], env.action_space.n
    print(in_channels, out_channels)
    dq = deque(maxlen=100)
    dq.append(0)

    model = ActorCritic(in_channels, out_channels, args)
    if args.is_cuda:
        model = model.cuda()

    rewards_list = []
    episodes_list = []
    losses_list = []
    for episode in range(args.episodes):
        score = 0.
        done = False
        state = env.reset()

        loss = 0.
        while not done:
            for t in range(args.n_rollout):
                state = torch.from_numpy(state).float()
                if args.is_cuda:
                    state = state.cuda()

                prob = model._pi(state)
                m = Categorical(prob)
                action = m.sample().item()

                next_state, reward, done, _ = env.step(action)
                model._history(state, next_state, reward, done, action,
                               prob[action])
                state = next_state

                score += reward
                if done:
                    break
            loss += model._train()
        rewards_list.append(score)
        dq.append(score)
        losses_list.append(loss)
        episodes_list.append(episode)
        if episode % args.print_intervals == 0 and episode != 0:
            print("# of episode :{0}, avg score : {1:.2f}".format(episode,
                                                                  np.mean(dq),
                                                                  axis=-1))

        if not os.path.isdir('./actor_critic'):
            os.mkdir('actor_critic')
        draw_plot(episodes_list,
                  losses_list,
                  rewards_list,
                  path='./actor_critic/loss_reward_plot.jpg')
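
The draw_plot helper used above is not defined in these examples; a minimal sketch with matplotlib, inferring the signature from the call (the panel layout and axis labels are assumptions), could look like this:

import matplotlib.pyplot as plt

def draw_plot(episodes, losses, rewards, path):
    # Two stacked panels sharing the episode axis: training loss on top, episode reward below.
    fig, (ax_loss, ax_reward) = plt.subplots(2, 1, sharex=True)
    ax_loss.plot(episodes, losses)
    ax_loss.set_ylabel('loss')
    ax_reward.plot(episodes, rewards)
    ax_reward.set_ylabel('reward')
    ax_reward.set_xlabel('episode')
    fig.savefig(path)
    plt.close(fig)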
Example #7
def main(args):
    lr_mu = 0.0005
    lr_q = 0.001
    gamma = 0.99
    batch_size = 32
    buffer_limit = 50000
    tau = 0.005  # for target network soft update

    args.env = 'Pendulum-v0'
    env = load_env(args)
    in_channels = env.observation_space.shape[0]
    memory = ReplayBuffer(args)

    q = QNet(in_channels, args)
    mu = MuNet(in_channels, args)

    q_target = QNet(in_channels, args)
    mu_target = MuNet(in_channels, args)

    q_target.load_state_dict(q.state_dict())
    mu_target.load_state_dict(mu.state_dict())

    q_optimizer = optim.Adam(q.parameters(), lr=lr_q)
    mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu)
    ou_noise = OrnsteinUhlenbeckNoise(np.zeros(1))

    score = 0.
    for episode in range(1, args.episodes):
        state = env.reset()

        for t in range(300):
            state = torch.from_numpy(state).float()
            action = mu(state)
            action = action.item() + ou_noise()[0]

            next_state, reward, done, info = env.step([action])
            memory._put_data(
                (state.numpy(), next_state, reward / 100., done, action))
            score += reward
            state = next_state

            if done:
                break

        if memory.size() > 2000:
            for i in range(10):
                _train(mu, mu_target, q, q_target, memory, q_optimizer,
                       mu_optimizer, args)
                soft_update(mu, mu_target)
                soft_update(q, q_target)

        if episode % 20 == 0 and episode != 0:
            print("# of episode :{}, avg score : {:.1f}".format(
                episode, score / 20))
            score = 0.0
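
The soft_update helper called above is not shown; a minimal sketch of a Polyak-averaging target update, reusing the tau defined at the top of main (the two-argument signature is inferred from the calls above), might be:

def soft_update(net, net_target, tau=0.005):
    # Blend each target parameter toward its online counterpart: theta_target <- tau * theta + (1 - tau) * theta_target.
    for param, param_target in zip(net.parameters(), net_target.parameters()):
        param_target.data.copy_(tau * param.data + (1.0 - tau) * param_target.data)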
Example #8
def main(args):
    env = load_env(args)
    model = Model(env.observation_space.shape[0], env.action_space.n, args)
    dq = deque(maxlen=100)
    dq.append(0)

    episodes_list = []
    losses_list = []
    rewards_list = []

    for episode in range(1, args.episodes + 1):
        state = env.reset()
        done = False
        score = 0.
        losses = 0.

        while not done:
            for _ in range(args.n_rollout):
                state = torch.from_numpy(state).float()

                prob = model._pi(state)
                m = Categorical(prob)
                action = m.sample().item()

                next_state, reward, done, _ = env.step(action)
                score += 1.
                model._put_data(state.numpy(), next_state, reward, done,
                                action, prob[action])
                state = next_state

                if done:
                    break
            losses += model._train()
        dq.append(score)

        if episode % args.print_intervals == 0 and episode != 0:
            print("# of episode :{}, avg score : {}".format(episode,
                                                            np.mean(dq),
                                                            end=' '))

        episodes_list.append(episode)
        losses_list.append(losses)
        rewards_list.append(score)

        if not os.path.isdir('./ppo'):
            os.mkdir('ppo')
        draw_plot(episodes_list,
                  losses_list,
                  rewards_list,
                  path='./ppo/loss_reward_plot.jpg')
Example #9
def main(args):
    env = load_env(args)

    global_model = Model(env.observation_space.shape[0], env.action_space.n, args)
    global_model.share_memory()
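    # share_memory() places the model's parameters in shared memory so every spawned worker process updates the same underlying tensors.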

    processes = []
    for rank in range(args.world_size):
        p = mp.Process(target=_train, args=(global_model, rank, args))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
Example #10
def main(args):
    env = load_env(args)
    dq = deque(maxlen=100)
    dq.append(0)

    model = Model(env.observation_space.shape[0], env.action_space.n, args)
    if args.is_cuda:
        model = model.cuda()
    episodes_list = []
    losses_list = []
    rewards_list = []

    for episode in range(args.episodes):
        state = env.reset()
        done = False
        score = 0.

        while not done:
            # env.render()
            state = torch.from_numpy(state).float()
            if args.is_cuda:
                state = state.cuda()

            prob = model(state)
            m = Categorical(prob)
            action = m.sample()

            next_state, reward, done, info = env.step(action.item())
            model._put_data(state.numpy(), reward, prob[action], done)
            score += reward

            state = next_state
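        # REINFORCE-style update: train once per episode on the stored rewards and action probabilities.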
        losses = model._train()
        dq.append(score)
        if episode % args.print_intervals == 0 and episode != 0:
            print("# of episode :{}, avg score : {}".format(episode,
                                                            np.mean(dq),
                                                            axis=-1))

        episodes_list.append(episode)
        losses_list.append(losses)
        rewards_list.append(score)

    if not os.path.isdir('./reinforce'):
        os.mkdir('reinforce')
    draw_plot(episodes_list,
              losses_list,
              rewards_list,
              path='./reinforce/loss_reward_plot.jpg')
Example #11
def main(args):
    env = load_env(args)
    in_channels, out_channels = env.observation_space.shape[0], env.action_space.n

    q = Model(in_channels, out_channels, args)
    q_target = Model(in_channels, out_channels, args)

    q_target.load_state_dict(q.state_dict())
    memory = ReplayBuffer(args)

    score = 0.
    optimizer = optim.Adam(q.parameters(), lr=args.lr)

    for episode in range(1, args.episodes + 1):
        epsilon = max(0.01, 0.08 - 0.01 * (episode / 200))

        state = env.reset()
        done = False

        while not done:
            state = torch.from_numpy(state).float()
            action = q._sample_action(state, epsilon)

            next_state, reward, done, _ = env.step(action)
            done_mask = 0. if done else 1.
            memory._put_data(
                (state.numpy(), next_state, reward / 100., action, done_mask))

            state = next_state

            score += 1.
            if done:
                break

        if memory.size() > 2000:
            _train(q, q_target, memory, optimizer, args)

        if episode % args.print_intervals == 0:
            q_target.load_state_dict(q.state_dict())
            print(
                "n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".
                format(episode, score / args.print_intervals, memory.size(),
                       epsilon * 100))
            score = 0.
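
The _sample_action method is not shown in these examples; a minimal epsilon-greedy sketch, written as a method on the Q-network model (only the name and arguments come from the call above, the body is an assumption):

    def _sample_action(self, state, epsilon):
        # With probability epsilon pick a random action index, otherwise act greedily on the Q-values.
        q_values = self.forward(state)
        if torch.rand(1).item() < epsilon:
            return torch.randint(q_values.shape[-1], (1,)).item()
        return q_values.argmax().item()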
Example #12
def main(args):
    dq = deque(maxlen=100)
    dq.append(0)

    env = load_env(args)
    in_channels, out_channels = env.observation_space.shape[0], env.action_space.n
    model = Model(in_channels, out_channels, args)
    if args.is_cuda:
        model = model.cuda()

    rewards_list = []
    episodes_list = []
    losses_list = []
    for episode in range(args.episodes):
        score = 0.
        state = env.reset()
        done = False

        while not done:
            for _ in range(args.n_rollout):
                state = torch.from_numpy(state).float()
                if args.is_cuda:
                    state = state.cuda()

                prob = model._pi(state)
                m = Categorical(prob)
                action = m.sample().item()

                next_state, reward, done, _ = env.step(action)
                model._put_data(state, next_state, reward, done, action, prob[action])
                score += reward

                state = next_state

                if done:
                    break
            loss = model._train()
            losses_list.append(loss)
        dq.append(score)
        episodes_list.append(episode)

        if episode % args.print_intervals == 0 and episode != 0:
            print("# of episode :{}, avg score : {}".format(episode, np.mean(dq), end=' '))
            print(np.mean(np.array(losses_list)))
Example #13
def main(args):
    env = load_env(args)
    env.seed(500)
    torch.manual_seed(500)

    in_channels, out_channels = env.observation_space.shape[0], env.action_space.shape[0]
    print(in_channels, out_channels)

    actor = Actor(in_channels, out_channels, args)
    critic = Critic(in_channels, out_channels, args)
    target_critic = Critic(in_channels, out_channels, args)

    actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr)

    hard_target_update(critic, target_critic)

    target_entropy = -torch.prod(torch.tensor(out_channels)).item()
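    # Entropy target heuristic: the negative of the action dimension; log_alpha is optimized below so the policy's entropy is pushed toward this target.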
    log_alpha = torch.zeros(1, requires_grad=True)
    alpha = torch.exp(log_alpha)
    alpha_optimizer = optim.Adam([log_alpha], lr=args.alpha_lr)

    replay_buffer = deque(maxlen=100000)
    recent_rewards = deque(maxlen=100)
    steps = 0

    for episode in range(args.max_iter_num):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, in_channels])

        while not done:
            steps += 1

            mu, std = actor(torch.tensor(state).float())
            action = get_action(mu, std)

            next_state, reward, done, _ = env.step(action)

            next_state = np.reshape(next_state, [1, in_channels])
            mask = 0. if done else 1.

            replay_buffer.append((state, action, reward, next_state, mask))

            state = next_state
            score += reward

            if steps > args.batch_size:
                mini_batch = random.sample(replay_buffer, args.batch_size)

                actor.train(), critic.train(), target_critic.train()
                alpha = _train_model(actor, critic, target_critic, mini_batch,
                                     actor_optimizer, critic_optimizer,
                                     alpha_optimizer, target_entropy,
                                     log_alpha, alpha, args)

                soft_target_update(critic, target_critic, args.tau)

            if done:
                recent_rewards.append(score)

        if episode % args.log_interval == 0:
            print('{} episode | score_avg: {:.2f}'.format(
                episode, np.mean(recent_rewards)))

        if np.mean(recent_rewards) > args.goal_score:
            # if not os.path.isdir(args.save_path):
            #     os.makedirs(args.save_path)

            # ckpt_path = args.save_path + 'model.pth.tar'
            # torch.save(actor.state_dict(), ckpt_path)
            print('Recent average reward exceeds the goal score, so training stops here.')
            break