Example No. 1
import numpy as np

# ReplayBuffer, Transition and verify_output_path are assumed to be provided
# elsewhere in the same project.


class Agent:
    def __init__(self,
                 state_size,
                 num_actions,
                 mode,
                 buffer_size=None,
                 **_kwargs):
        assert mode in ('train', 'test')
        self.mode = mode
        self.state_size = state_size
        self.num_actions = num_actions

        self.buffer = ReplayBuffer(buffer_size)

    def get_action(self, state: np.ndarray):
        raise NotImplementedError

    def get_q_values(self, state: np.ndarray) -> np.ndarray:
        raise NotImplementedError

    def optimize(self):
        raise NotImplementedError

    def save_model(self, model_save_path: str):
        raise NotImplementedError

    def store_transition(self, transition):
        self.buffer.append(transition)

    def save_history(self, path):
        verify_output_path(path)
        history = {
            field: np.array([
                getattr(transition, field)
                for transition in self.buffer.data
            ])
            for field in Transition._fields
        }
        np.savez(path, **history)
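To show how this abstract interface is meant to be used, here is a minimal sketch of a concrete subclass. RandomAgent and the CartPole-like sizes are hypothetical, and the sketch assumes the Agent base class above and the project's ReplayBuffer are importable.

class RandomAgent(Agent):
    """Hypothetical subclass that fills in the abstract methods with trivial logic."""

    def get_action(self, state: np.ndarray):
        # uniform random action, ignoring the state
        return np.random.randint(self.num_actions)

    def get_q_values(self, state: np.ndarray) -> np.ndarray:
        # a learned agent would run its network here; zeros are a placeholder
        return np.zeros(self.num_actions)

    def optimize(self):
        pass  # the random baseline does not learn

    def save_model(self, model_save_path: str):
        pass  # nothing to save


# usage sketch: state_size=4 and num_actions=2 as in CartPole
agent = RandomAgent(state_size=4, num_actions=2, mode='test', buffer_size=1000)
action = agent.get_action(np.zeros(4))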
Example No. 2
import argparse
import os

import gym
import numpy as np
import tensorflow as tf

# VariationalQNetwork, NoiseSampler, ReplayBuffer and update_target are
# assumed to be provided by the surrounding VDQN project.


def main():

    parser = argparse.ArgumentParser(description='VDQN')
    parser.add_argument('--seed', type=int, default=100)
    parser.add_argument('--env',
                        type=str,
                        default='CartPole-v0',
                        help='Name of the OpenAI Gym environment')
    parser.add_argument('--logdir', type=str, default='')
    parser.add_argument('--episodes', type=int, default=200)
    parser.add_argument('--target-update-period', type=int, default=100)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=.99)
    args = parser.parse_args()

    #### HYPERPARAMETERS
    episodes = args.episodes  #1000

    envname = args.env

    seed = args.seed
    tf.set_random_seed(seed)
    np.random.seed(seed)

    hiddendict = [100, 100]
    sigma = 0.01
    Wpriorsigma = [10000] * 2
    bpriorsigma = [10000] * 2
    batchsize = 64
    buffersize = 1000000
    initialsize = 500
    tau = 1.0
    target_update_period = args.target_update_period
    lr_VI = args.lr
    gamma = args.gamma
    totalstep = 0
    reward_scale = 1
    ############
    #### MAIN ITERATIONS
    ###########
    logdir = args.logdir + 'VDQN/' + envname + '/lr_' + str(
        args.lr) + '_episodes' + str(args.episodes)

    if not os.path.exists(logdir):
        os.makedirs(logdir)

    with tf.Session() as sess:

        ### INITIALIZATION
        env = gym.make(envname)
        obssize = env.observation_space.low.size
        actsize = env.action_space.n
        replaybuffer = ReplayBuffer(buffersize)
        Qactionnet = VariationalQNetwork(
            obssize,
            actsize,
            hiddendict,
            sess=sess,
            scope='principle',
            optimizer=tf.train.AdamOptimizer(lr_VI))
        Qtargetnet = VariationalQNetwork(obssize,
                                         actsize,
                                         hiddendict,
                                         sess=sess,
                                         scope='target')
        noisesampler = NoiseSampler(Qactionnet.Wshape, Qactionnet.bshape)

        sess.run(tf.global_variables_initializer())

        update_target(Qtargetnet, Qactionnet)

        ### RECORD
        VIlossrecord = []
        Bellmanlossrecord = []
        rewardrecord = []

        ### ITERATIONS
        for episode in range(episodes):

            # start
            obs = env.reset()
            done = False

            rsum = 0
            while not done:
                # sample noise for the network weights
                Wnoise, bnoise = noisesampler.sample(1)
                # compute Q value
                Qvalue = Qactionnet.compute_Qvalue(obs[None], Wnoise, bnoise)
                # select action
                action = np.argmax(Qvalue.flatten())
                # step
                nextobs, reward, done, _ = env.step(action)
                # record experience
                done_ = 1 if done else 0
                reward_ = reward * reward_scale
                experience = [(obs, action, reward_, done_, nextobs)]
                # append experience to buffer
                replaybuffer.append(experience)
                replaybuffer.popleft()
                # update
                obs = nextobs
                totalstep += 1
                rsum += reward

                if replaybuffer.currentsize >= initialsize:
                    # sample minibatch
                    batch_obs, batch_act, batch_reward, batch_done, batch_nextobs = replaybuffer.sample(
                        batchsize)
                    # sample noise for computing target
                    Wnoise, bnoise = noisesampler.sample(batchsize)
                    # compute target value
                    Qall = Qtargetnet.compute_Qvalue(batch_nextobs, Wnoise,
                                                     bnoise)
                    Qtarget = gamma * np.max(
                        Qall, axis=1) * (1 - batch_done) + batch_reward
                    # update the principal network by VI
                    VIloss = Qactionnet.train_on_sample(
                        batch_obs, batch_act, Qtarget)
                    # compute the Bellman error, reusing the same noise samples
                    Wnoise_new, bnoise_new = Wnoise, bnoise
                    Qpred = Qactionnet.compute_Qvalue(batch_obs, Wnoise_new,
                                                      bnoise_new)
                    Qpredact = Qpred[np.arange(batchsize), batch_act]
                    Bellmanloss = np.mean((Qpredact - Qtarget)**2)
                    # record losses
                    VIlossrecord.append(VIloss['loss'])
                    Bellmanlossrecord.append(Bellmanloss)

                if (totalstep + 1) % target_update_period == 0:
                    update_target(Qtargetnet, Qactionnet)
                    print("update target")

                if done:
                    break

            # record
            rewardrecord.append(rsum)

            ### LOGGING

            meanVIloss = (np.mean(VIlossrecord[-10:])
                          if len(VIlossrecord) > 10 else float('nan'))
            meanbellmanloss = (np.mean(Bellmanlossrecord[-10:])
                               if len(Bellmanlossrecord) > 10 else float('nan'))
            meanreward = np.mean(rewardrecord[-10:])

            print(
                "episode %d buffer size %d meanVIloss %f meanbellmanloss %f meanreward %f"
                % (episode, replaybuffer.currentsize, meanVIloss,
                   meanbellmanloss, meanreward))

            if (1 + episode) % 5 == 0:
                np.save(logdir + '/VIloss_' + str(seed), VIlossrecord)
                np.save(logdir + '/bellmanloss_' + str(seed),
                        Bellmanlossrecord)
                np.save(logdir + '/reward_' + str(seed), rewardrecord)
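The example calls update_target without showing it. The sketch below is one plausible TF1-style implementation, assumed rather than taken from the project: a hard copy of trainable variables from the 'principle' scope into the 'target' scope, consistent with tau = 1.0 above. update_target_sketch is a hypothetical name.

def update_target_sketch(sess, target_scope='target', source_scope='principle'):
    # collect each network's variables, aligned by their (scoped) names
    target_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope=target_scope),
                         key=lambda v: v.name)
    source_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope=source_scope),
                         key=lambda v: v.name)
    # hard update: overwrite every target variable with its source counterpart
    sess.run([t.assign(s) for t, s in zip(target_vars, source_vars)])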
Example No. 3

import copy

import gym
import torch

# Agent, ReplayBuffer and DefaultNN are assumed to be provided by the
# surrounding project.


class DDPGAgent(Agent):
    def __init__(self,
                 state_space,
                 action_space,
                 device,
                 actor_lr=0.000025,
                 critic_lr=0.00025,
                 tau=0.001,
                 gamma=0.99,
                 max_size=1000000,
                 layer1_size=200,
                 layer2_size=150,
                 batch_size=64,
                 noise_std=0.1,
                 name="DDPG"):
        assert isinstance(
            action_space,
            gym.spaces.Box)  ### NEW: The action space is now continuous
        super().__init__(state_space, action_space, device=device, name=name)
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(max_size, self.device)
        self.batch_size = batch_size

        self.actor = DefaultNN(actor_lr,
                               self.state_size,
                               layer1_size,
                               layer2_size,
                               self.nb_actions,
                               self.device,
                               last_activation=torch.tanh)
        self.critic = DefaultNN(critic_lr, self.state_size + self.nb_actions,
                                layer1_size, layer2_size, 1, self.device)

        self.target_actor = copy.deepcopy(self.actor)
        self.target_critic = copy.deepcopy(self.critic)

        self.normal_distribution = torch.distributions.normal.Normal(
            torch.zeros(self.nb_actions),
            torch.full((self.nb_actions, ), noise_std))

    def action(self, observation):
        with torch.no_grad():
            observation = torch.tensor(observation,
                                       dtype=torch.float).to(self.device)
            actor_output = self.actor(observation)
            # exploration noise is sampled on the CPU, so move it to the
            # actor's device before adding it
            noise = self.normal_distribution.sample().to(self.device)
            action = actor_output + noise
        return action.cpu().numpy()

    def on_action_stop(self, action, new_state, reward, done):
        self.replay_buffer.append(self.last_state, action, reward, new_state,
                                  done)
        self.learn()
        super().on_action_stop(action, new_state, reward, done)

    def learn(self):
        if len(self.replay_buffer) > self.batch_size:
            states, actions, rewards, new_states, dones = self.replay_buffer.sample(
                self.batch_size)

            with torch.no_grad():
                target_actions = self.target_actor.forward(new_states)
                critic_value_ = self.target_critic.forward(
                    torch.concat((new_states, target_actions), dim=-1))
            critic_value = self.critic.forward(
                torch.concat((states, actions), dim=-1))
            # one-step TD target: r + gamma * (1 - done) * Q_target(s', pi_target(s'))
            target = (rewards + self.gamma * (1 - dones) *
                      critic_value_.squeeze()).view(self.batch_size, 1)
            self.critic.optimizer.zero_grad()
            critic_loss = torch.nn.functional.mse_loss(target, critic_value)
            critic_loss.backward()
            self.critic.optimizer.step()

            self.actor.optimizer.zero_grad()
            # re-evaluate the policy on the sampled states for the actor update
            policy_actions = self.actor(states)
            actor_loss = -self.critic(
                torch.concat((states, policy_actions), dim=-1)).mean()
            actor_loss.backward()
            self.actor.optimizer.step()

            self.target_critic.converge_to(self.critic, tau=self.tau)
            self.target_actor.converge_to(self.actor, tau=self.tau)
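DDPGAgent relies on a converge_to method on its networks for the target updates. The sketch below shows the soft ("Polyak") update that call is assumed to perform, written as a free function under the hypothetical name soft_update: each target parameter moves a fraction tau toward the corresponding online parameter.

def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float):
    # theta_target <- (1 - tau) * theta_target + tau * theta_online
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)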
Example No. 4
# Fragment of a DDPG-style training loop; it appears to target an older
# PyTorch API (Variable, Normal(mean=..., std=...)). hparams, exp, actor, env,
# buffer, Step, np_to_var, A (the action dimension), format_batch and
# get_critic_train_data are defined elsewhere in the original script.
for hparam in hparams.trials(5):
    exp.add_argparse_meta(hparam)
    s = np_to_var(env.reset())  # assumed initialisation, omitted in the fragment
    rews = []
    for timestep in range(hparam.num_steps):
        noise = Normal(
            mean=Variable(torch.zeros(A)),
            std=hparam.noise_factor * Variable(torch.ones(A)),
        )

        # halve the exploration noise scale every 1000 steps
        if timestep % 1000 == 0:
            hparam.noise_factor /= 2

        # deterministic policy output plus Gaussian exploration noise
        a = actor(s) + noise.sample()
        succ, r, done, _ = env.step(a.data.numpy())
        succ = np_to_var(succ)
        buffer.append(Step(s, a, r, succ, done))
        rews.append(r)
        s = np_to_var(env.reset()) if done else succ
        if done:
            # log the negated episode return and reset the per-episode rewards
            exp.add_metric_row({"Timestep": timestep + 1, "Loss": -sum(rews)})
            rews = []

        if len(buffer) >= hparam.batch_size:
            states, actions, rewards, succ_states, dones = format_batch(
                buffer.sample(hparam.batch_size)
            )

            td_estims = get_critic_train_data(succ_states, rewards, dones)
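The fragment stops after computing td_estims, and get_critic_train_data itself is not shown. The sketch below gives one plausible reading of what it computes, using the hypothetical names target_actor, target_critic and GAMMA: the one-step TD targets r + gamma * (1 - done) * Q_target(s', pi_target(s')).

import torch

GAMMA = 0.99  # hypothetical discount factor for this sketch

def td_targets_sketch(succ_states, rewards, dones):
    # bootstrap from the target networks; no gradients flow through the targets
    with torch.no_grad():
        succ_actions = target_actor(succ_states)
        succ_q = target_critic(succ_states, succ_actions).squeeze(-1)
        return rewards + GAMMA * (1.0 - dones) * succ_q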