Example #1
def run_batch_episode_exp(total_eps: int, update_every: int,
                          wandb_project: str, wandb_group: str):
    # NOTE:
    # This code doesn't run properly on Windows 10.
    # The result can be reproduced on Ubuntu and Mac OS.

    config = dict()
    config['update_every'] = update_every

    wandb.init(project=wandb_project,
               entity='junyoung-park',
               reinit=True,
               group=wandb_group,
               config=config)

    env = gym.make('CartPole-v1')
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.n

    policy_net = MLP(s_dim, a_dim, [128])
    value_net = MLP(s_dim, 1, [128])
    agent = TDActorCritic(policy_net, value_net)
    memory = EpisodicMemory(max_size=100, gamma=1.0)
    n_update = 0

    wandb.watch(agent)

    for ep in range(total_eps):
        s = env.reset()
        cum_r = 0

        while True:
            s = to_tensor(s, size=(1, 4))
            a = agent.get_action(s)
            ns, r, done, info = env.step(a.item())

            # preprocess data
            r = torch.ones(1, 1) * r
            done = torch.ones(1, 1) * done

            memory.push(s, a.view(-1, 1), r, to_tensor(ns, size=(1, 4)), done)

            s = ns
            cum_r += r.item()  # r is a (1, 1) tensor here; accumulate the scalar return
            if done:
                break
        if ep % update_every == 0:
            s, a, r, ns, done, _ = memory.get_samples()
            agent.update(state=s.float(),
                         action=a.float(),
                         reward=r.float(),
                         next_state=ns.float(),
                         done=done)
            memory.reset()
            n_update += 1
        wandb.log({"episode return": cum_r, "num_update": n_update})
Example #2
    def __init__(self):
        super(Critic, self).__init__()
        self.state_encoder = MLP(3, 64, num_neurons=[],
                                 out_act='ReLU')  # single layer model
        self.action_encoder = MLP(1, 64, num_neurons=[],
                                  out_act='ReLU')  # single layer model
        # q-value head over the concatenated state/action encodings (64 + 64 = 128)
        self.q_estimator = MLP(128,
                               1,
                               num_neurons=[32],
                               hidden_act='ReLU',
                               out_act='Identity')
Example #3
    def __init__(self):
        super(Actor, self).__init__()
        self.mlp = MLP(3,
                       1,
                       num_neurons=[128, 64],
                       hidden_act='ReLU',
                       out_act='Identity')
Example #4
def run_minibatch_fullbatch(num_reps: int,
                            n_samples: int,
                            batch_size: int,
                            epoch: int):
    criteria = torch.nn.MSELoss()
    
    sgd_losses = []
    gd_losses = []
    
    for _ in range(num_reps):
        mlp = MLP(input_dim=1, 
                  output_dim=1,
                  num_neurons=[64],
                  hidden_act='Identity',
                  out_act='Identity')
        
        opt = torch.optim.Adam(params=mlp.parameters(), lr=1e-3)

        mlp2 = MLP(input_dim=1, 
                   output_dim=1,
                   num_neurons=[64],
                   hidden_act='Identity',
                   out_act='Identity')
        mlp2.load_state_dict(mlp.state_dict())
        opt2 = torch.optim.Adam(params=mlp2.parameters(), lr=1e-3)

        xs, ys = generate_samples(n_samples)
        ds = torch.utils.data.TensorDataset(xs, ys)
        data_loader = torch.utils.data.DataLoader(ds, batch_size=batch_size)
        full_loader = torch.utils.data.DataLoader(ds, batch_size=n_samples)
        
        # SGD - Mini batch
        sgd_loss = train_model(mlp, opt, data_loader, epoch, criteria, xs, ys)        
        sgd_losses.append(sgd_loss)
        
        # GD - Full batch
        gd_loss = train_model(mlp2, opt2, full_loader, epoch, criteria, xs, ys)        
        gd_losses.append(gd_loss)
    
    sgd_losses = np.stack(sgd_losses)
    gd_losses = np.stack(gd_losses)
    return sgd_losses, gd_losses
Example #5
def run_exp(total_eps: int, wandb_project: str, wandb_group: str):
    # NOTE:
    # This code doesn't run properly on Windows 10
    # The result can be reproduced on Ubuntu and Mac OS.

    config = dict()
    config['sample_update'] = True
    wandb.init(project=wandb_project,
               entity='junyoung-park',
               reinit=True,
               group=wandb_group,
               config=config)

    env = gym.make('CartPole-v1')
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.n

    policy_net = MLP(s_dim, a_dim, [128])
    value_net = MLP(s_dim, 1, [128])
    agent = TDActorCritic(policy_net, value_net)
    n_update = 0

    for ep in range(total_eps):
        s = env.reset()
        cum_r = 0

        while True:
            s = to_tensor(s, size=(1, 4))
            a = agent.get_action(s)
            ns, r, done, info = env.step(a.item())

            ns = to_tensor(ns, size=(1, 4))
            agent.update(s, a.view(-1, 1), r, ns, done)

            s = ns.numpy()
            cum_r += r
            n_update += 1
            if done:
                break

        wandb.log({"episode return": cum_r, "num_update": n_update})
Example #6
            self.s = None
            self.alpha = alpha

        def update(self, y):
            if self.s is None:
                self.s = y
            else:
                self.s = self.alpha * y + (1 - self.alpha) * self.s

    env = gym.make('CartPole-v1')
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.n

    qnet = MLP(input_dim=s_dim,
               output_dim=a_dim,
               num_neurons=[128],
               hidden_act='ReLU',
               out_act='Identity')

    agent = NaiveDQN(state_dim=s_dim,
                     action_dim=a_dim,
                     qnet=qnet,
                     lr=1e-4,
                     gamma=1.0,
                     epsilon=1.0)

    n_eps = 10000
    print_every = 500
    ema_factor = 0.5
    ema = EMAMeter(ema_factor)
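    # A quick numeric sketch of the smoothing rule implemented by EMAMeter above
    # (the meter and values here are only for illustration): with alpha = 0.5, each
    # update moves the running value halfway toward the new sample.
    demo_meter = EMAMeter(0.5)
    for y in [0.0, 10.0, 10.0]:
        demo_meter.update(y)
        print(demo_meter.s)  # 0.0 -> 5.0 -> 7.5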
Example #7
            dist = Categorical(logits=self.policy(s))
            prob = dist.probs[a]

            # Don't forget the leading '-' on pg_loss:
            # PyTorch optimizers minimize their objective by default,
            # while this loss should be maximized, so we negate it.
            # 'self._eps' is added to avoid numerical issues with the logarithm.
            pg_loss = - torch.log(prob + self._eps) * g

            self.opt.zero_grad()

            pg_loss.backward()
            self.opt.step()
```
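
The sign convention and the epsilon term can be sanity-checked on a toy distribution.
The snippet below is a minimal standalone sketch, not part of the agent class; the logits,
action, return `g`, and epsilon value are illustrative:

```python
import torch
from torch.distributions import Categorical

logits = torch.zeros(2, requires_grad=True)  # toy 2-action policy
dist = Categorical(logits=logits)
a, g, eps = torch.tensor(1), 1.0, 1e-25      # illustrative action, return, and epsilon

# maximizing log pi(a|s) * g is the same as minimizing its negative
pg_loss = -torch.log(dist.probs[a] + eps) * g
pg_loss.backward()
print(logits.grad)  # a gradient descent step raises the probability of action `a`
```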
'''
net = MLP(s_dim, a_dim, [128])
agent = REINFORCE(net)
ema = EMAMeter()

n_eps = 10000
print_every = 500

for ep in range(n_eps):
    s = env.reset()
    cum_r = 0

    states = []
    actions = []
    rewards = []

    while True:
Example #8
        self.optimizer.step()


if __name__ == '__main__':
    import gym
    import torch

    from src.part3.MLP import MultiLayerPerceptron as MLP
    from src.part4.ActorCritic import TDActorCritic
    from src.common.train_utils import EMAMeter, to_tensor

    env = gym.make('CartPole-v1')
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.n

    policy_net = MLP(s_dim, a_dim, [128])
    value_net = MLP(s_dim, 1, [128])

    agent = TDActorCritic(policy_net, value_net)
    ema = EMAMeter()

    n_eps = 10000
    print_every = 500

    for ep in range(n_eps):
        s = env.reset()
        cum_r = 0

        while True:
            s = to_tensor(s, size=(1, 4))
            a = agent.get_action(s).view(-1, 1)
Example #9
The hyperparameters used in this exercise follow the values suggested in https://github.com/seungeunrho/minimalRL/blob/master/dqn.py.
'''
lr = 1e-4 * 5
batch_size = 256
gamma = 1.0
memory_size = 50000
total_eps = 3000
# epsilon is decayed over time
eps_max = 0.08
eps_min = 0.01
# collect 2000 samples before training starts (to avoid instability in the early stage)
sampling_only_until = 2000
target_update_interval = 10

# implement the Q-network and the target Q-network as multilayer perceptrons
qnet = MLP(4, 2, num_neurons=[128])
qnet_target = MLP(4, 2, num_neurons=[128])

# initialize the target network to be identical to the main network
qnet_target.load_state_dict(qnet.state_dict())
agent = DQN(4,
            1,
            qnet=qnet,
            qnet_target=qnet_target,
            lr=lr,
            gamma=gamma,
            epsilon=1.0)
env = gym.make('CartPole-v1')
memory = ReplayMemory(memory_size)
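
# The comment above says epsilon should decay over time. A minimal sketch of the
# linear schedule these eps_max / eps_min values are meant for; the divisor 200,
# which sets the decay speed, is illustrative here:
def linear_epsilon(n_epi: int) -> float:
    return max(eps_min, eps_max - eps_min * (n_epi / 200))

print([round(linear_epsilon(e), 3) for e in (0, 500, 1000, 2000)])  # [0.08, 0.055, 0.03, 0.01]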
Example #10
def run_DQN(batch_size: int, target_update_interval: int, wandb_project: str):

    # the hyperparameters are taken from 'minimalRL' implementation
    # https://github.com/seungeunrho/minimalRL/blob/master/dqn.py
    # the usage is under agreement with the original author.

    lr = 1e-4 * 5
    batch_size = batch_size
    gamma = 1.0
    memory_size = 50000
    total_eps = 3000
    eps_max = 0.08
    eps_min = 0.01
    sampling_only_until = 2000

    config = dict()
    config['lr'] = lr
    config['batch_size'] = batch_size
    config['target_update_interval'] = target_update_interval
    config['total_eps'] = total_eps
    config['eps_max'] = eps_max
    config['eps_min'] = eps_min
    config['sampling_only_until'] = sampling_only_until

    wandb.init(project=wandb_project,
               entity='junyoung-park',
               reinit=True,
               config=config)

    qnet = MLP(4, 2, num_neurons=[128])
    qnet_target = MLP(4, 2, num_neurons=[128])
    # initialize target network same as the main network.
    qnet_target.load_state_dict(qnet.state_dict())

    agent = DQN(4,
                1,
                qnet=qnet,
                qnet_target=qnet_target,
                lr=lr,
                gamma=gamma,
                epsilon=1.0)
    wandb.watch(agent)

    env = gym.make('CartPole-v1')
    memory = ReplayMemory(memory_size)

    for n_epi in range(total_eps):
        # epsilon scheduling
        # slowly decaying epsilon
        epsilon = max(eps_min, eps_max - eps_min * (n_epi / 200))
        agent.epsilon = torch.tensor(epsilon)
        s = env.reset()
        cum_r = 0

        while True:
            s = to_tensor(s, size=(1, 4))
            a = agent.get_action(s)
            ns, r, done, info = env.step(a)

            experience = (s,
                          torch.tensor(a).view(1, 1),
                          torch.tensor(r / 100.0).view(1, 1),
                          torch.tensor(ns).view(1, 4),
                          torch.tensor(done).view(1, 1))
            memory.push(experience)

            s = ns
            cum_r += r
            if done:
                break

        if len(memory) >= sampling_only_until:
            # train agent
            sampled_exps = memory.sample(batch_size)
            sampled_exps = prepare_training_inputs(sampled_exps)
            agent.update(*sampled_exps)

        if n_epi % target_update_interval == 0:
            qnet_target.load_state_dict(qnet.state_dict())

        log_dict = dict()
        log_dict['cum_r'] = cum_r
        log_dict['epsilon'] = epsilon

        wandb.log(log_dict)

    torch.save(agent.state_dict(), join(wandb.run.dir, "agent.pt"))
    wandb.join()
Example #11
> Switch the model to train mode
```python
model.train()
```

> Switch the model to eval mode
```python
model.eval()
```
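
> The mode flag matters for layers whose behavior differs between training and inference,
> such as dropout or batch normalization. A quick illustrative check (the `Dropout` layer
> below is only an example; it is not part of the MLP built in this notebook):
```python
import torch

drop = torch.nn.Dropout(p=0.5)
x = torch.ones(1, 4)

drop.train()
print(drop(x))  # roughly half of the entries are zeroed, the rest scaled by 1 / (1 - p)

drop.eval()
print(drop(x))  # identity: dropout is disabled in eval mode
```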
'''
## Building a multilayer perceptron model

mlp = MLP(input_dim=1, 
          output_dim=1,
          num_neurons=[64, 32, 32],
          hidden_act='ReLU',
          out_act='Identity')

mlp

## model.state_dict()
'''
`model.state_dict()` is a dictionary that stores the current values of the model's parameters and buffers.
We will use it frequently later, in <5. Deep Reinforcement Learning>.
In most cases it is used to grab the parameters of a model, during or after training, and save them to disk.
Here, let's use it to create two identical models.
'''
mlp.state_dict()
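
# As mentioned above, the most common use of `state_dict()` is to save a model to disk
# and restore it later. A minimal sketch (the file name 'mlp.pt' is illustrative, and
# `torch` is assumed to be imported as elsewhere in this notebook):
torch.save(mlp.state_dict(), 'mlp.pt')

mlp_restored = MLP(input_dim=1,
                   output_dim=1,
                   num_neurons=[64, 32, 32],
                   hidden_act='ReLU',
                   out_act='Identity')
mlp_restored.load_state_dict(torch.load('mlp.pt'))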

mlp2 = MLP(input_dim=1,