Example #1
class DoubleDQN:
    def __init__(self, n_action, init_epsilon, final_epsilon, gamma,
                 buffer_size, batch_size, replace_iter, annealing,
                 learning_rate, ctx):
        self.n_action = n_action
        self.epsilon = init_epsilon
        self.init_epsilon = init_epsilon
        self.final_epsilon = final_epsilon
        # discount factor
        self.gamma = gamma
        # memory buffer size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        # replace the parameters of the target network every T time steps
        self.replace_iter = replace_iter
        # The number of steps over which epsilon is linearly annealed to its minimum value
        self.annealing = annealing
        self.learning_rate = learning_rate
        self.ctx = ctx

        self.total_steps = 0
        self.replay_buffer = MemoryBuffer(self.buffer_size, ctx)  # use deque

        # build the network
        self.target_network = DoubleQNetwork(n_action)
        self.main_network = DoubleQNetwork(n_action)
        self.target_network.collect_params().initialize(
            init.Xavier(), ctx=ctx)  # initialize the params
        self.main_network.collect_params().initialize(init.Xavier(), ctx=ctx)

        # optimize the main network
        self.optimizer = gluon.Trainer(self.main_network.collect_params(),
                                       'adam',
                                       {'learning_rate': self.learning_rate})

    def choose_action(self, state):
        state = nd.array([state], ctx=self.ctx)
        if nd.random.uniform(0, 1) > self.epsilon:
            # choose the best action
            q_value = self.main_network(state)
            action = int(nd.argmax(q_value, axis=1).asnumpy())
        else:
            # random choice
            action = random.choice(range(self.n_action))
        # anneal
        self.epsilon = max(
            self.final_epsilon, self.epsilon -
            (self.init_epsilon - self.final_epsilon) / self.annealing)
        self.total_steps += 1
        return action

    def update(self):
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.replay_buffer.sample(
            self.batch_size)
        with autograd.record():
            # Q(s, a) from the main network
            all_current_q_value = self.main_network(state_batch)
            main_q_value = nd.pick(all_current_q_value, action_batch)

            # different from DQN:
            # select the next action with the main network, then evaluate it with the target network
            next_q_main = self.main_network(next_state_batch).detach()
            max_action = nd.argmax(next_q_main, axis=1)
            all_next_q_value = self.target_network(
                next_state_batch).detach()  # gradients flow only through the main network
            target_q_value = nd.pick(all_next_q_value, max_action)

            target_q_value = reward_batch + (
                1 - done_batch) * self.gamma * target_q_value

            # L2 loss between the online estimate and the (detached) target
            loss = gloss.L2Loss()
            value_loss = loss(main_q_value, target_q_value.detach())
        self.main_network.collect_params().zero_grad()
        value_loss.backward()
        self.optimizer.step(batch_size=self.batch_size)

    def replace_parameters(self):
        self.main_network.save_parameters('Double_DQN_temp_params')
        self.target_network.load_parameters('Double_DQN_temp_params')
        print('Double_DQN parameters replaced')

    def save_parameters(self):
        self.target_network.save_parameters(
            'Double_DQN_target_network_parameters')
        self.main_network.save_parameters('Double_DQN_main_network_parameters')

    def load_parameters(self):
        self.target_network.load_parameters(
            'Double_DQN_target_network_parameters')
        self.main_network.load_parameters('Double_DQN_main_network_parameters')
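
Below is a minimal sketch of how this DoubleDQN class could be driven from a classic-Gym control task. The environment id, the hyperparameter values, and the replay_buffer.store() call are assumptions for illustration only; the MemoryBuffer interface is not shown in the snippet above.

import gym
import mxnet as mx

# hypothetical driver loop; hyperparameters and the buffer API are assumptions
env = gym.make('CartPole-v1')
agent = DoubleDQN(n_action=env.action_space.n,
                  init_epsilon=1.0,
                  final_epsilon=0.05,
                  gamma=0.99,
                  buffer_size=10000,
                  batch_size=64,
                  replace_iter=500,
                  annealing=10000,
                  learning_rate=1e-3,
                  ctx=mx.cpu())

for episode in range(300):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        # assumed MemoryBuffer method for storing one transition
        agent.replay_buffer.store(state, action, reward, next_state, done)
        state = next_state
        if agent.total_steps > agent.batch_size:
            agent.update()
        if agent.total_steps % agent.replace_iter == 0:
            agent.replace_parameters()
agent.save_parameters()
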
Example #2
class DDPG:
    def __init__(self, action_dim, action_bound, actor_learning_rate,
                 critic_learning_rate, batch_size, memory_size, gamma, tau,
                 explore_steps, explore_noise, noise_clip, ctx):
        self.action_dim = action_dim
        self.action_bound = nd.array(action_bound, ctx=ctx)
        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.gamma = gamma
        self.tau = tau
        self.explore_steps = explore_steps
        self.explore_noise = explore_noise
        self.noise_clip = noise_clip
        self.ctx = ctx
        self.total_steps = 0

        self.memory_buffer = MemoryBuffer(self.memory_size, ctx=ctx)

        self.target_actor_network = ActorNetwork(self.action_dim,
                                                 self.action_bound)
        self.main_actor_network = ActorNetwork(self.action_dim,
                                               self.action_bound)
        self.target_critic_network = CriticNetwork()
        self.main_critic_network = CriticNetwork()

        self.target_actor_network.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.target_critic_network.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.main_actor_network.collect_params().initialize(init=init.Xavier(),
                                                            ctx=ctx)
        self.main_critic_network.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)

        self.actor_optimizer = gluon.Trainer(
            self.main_actor_network.collect_params(), 'adam',
            {'learning_rate': self.actor_learning_rate})
        self.critic_optimizer = gluon.Trainer(
            self.main_critic_network.collect_params(), 'adam',
            {'learning_rate': self.critic_learning_rate})

    def choose_action_train(self, state):
        state = nd.array([state], ctx=self.ctx)
        action = self.main_actor_network(state)
        # no noise clip
        noise = nd.normal(loc=0,
                          scale=self.explore_noise,
                          shape=action.shape,
                          ctx=self.ctx)
        action += noise
        clipped_action = self.action_clip(action)
        return clipped_action

    def choose_action_evaluate(self, state):
        state = nd.array([state], ctx=self.ctx)
        action = self.main_actor_network(state)
        return action

    def action_clip(self, action):
        low_bound = [
            float(self.action_bound[i][0].asnumpy())
            for i in range(self.action_dim)
        ]
        high_bound = [
            float(self.action_bound[i][1].asnumpy())
            for i in range(self.action_dim)
        ]
        bound = list(zip(low_bound, high_bound))
        # clip and reshape
        action_list = [
            nd.clip(action[:, i], bound[i][0], bound[i][1]).reshape(-1, 1)
            for i in range(self.action_dim)
        ]
        # concat
        clipped_action = reduce(nd.concat, action_list)
        return clipped_action.squeeze()

    def soft_update(self, target_network, main_network):
        target_parameters = target_network.collect_params().keys()
        main_parameters = main_network.collect_params().keys()
        d = zip(target_parameters, main_parameters)
        for x, y in d:
            target_network.collect_params()[x].data()[:] = \
                target_network.collect_params()[x].data() * \
                (1 - self.tau) + main_network.collect_params()[y].data() * self.tau

    def update(self):
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory_buffer.sample(
            self.batch_size)

        # ---------------optimize critic------------------
        with autograd.record():
            next_action_batch = self.target_actor_network(next_state_batch)
            next_q = self.target_critic_network(next_state_batch,
                                                next_action_batch).squeeze()
            target_q = reward_batch + (1 - done_batch) * self.gamma * next_q

            current_q = self.main_critic_network(state_batch,
                                                 action_batch).squeeze()
            loss = gloss.L2Loss()
            value_loss = loss(current_q, target_q.detach())
        self.main_critic_network.collect_params().zero_grad()
        value_loss.backward()
        self.critic_optimizer.step(self.batch_size)

        # ---------------optimize actor-------------------
        with autograd.record():
            pred_action_batch = self.main_actor_network(state_batch)
            actor_loss = -nd.mean(
                self.main_critic_network(state_batch, pred_action_batch))
        self.main_actor_network.collect_params().zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step(1)

        self.soft_update(self.target_actor_network, self.main_actor_network)
        self.soft_update(self.target_critic_network, self.main_critic_network)

    def save(self):
        self.main_actor_network.save_parameters(
            'DDPG Pendulum Main Actor.params')
        self.target_actor_network.save_parameters(
            'DDPG Pendulum Target Actor.params')
        self.main_critic_network.save_parameters(
            'DDPG Pendulum Main Critic.params')
        self.target_critic_network.save_parameters(
            'DDPG Pendulum Target Critic.params')

    def load(self):
        self.main_actor_network.load_parameters(
            'DDPG Pendulum Main Actor.params')
        self.target_actor_network.load_parameters(
            'DDPG Pendulum Target Actor.params')
        self.main_critic_network.load_parameters(
            'DDPG Pendulum Main Critic.params')
        self.target_critic_network.load_parameters(
            'DDPG Pendulum Target Critic.params')
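
A minimal driver loop for the DDPG class above, assuming a continuous-control Gym environment (the saved file names suggest Pendulum). The environment id, the hyperparameters, and the memory_buffer.store() call are placeholders, since the MemoryBuffer interface is not shown here.

import gym
import mxnet as mx

# hypothetical usage sketch; environment, hyperparameters, and buffer API are assumptions
env = gym.make('Pendulum-v1')
action_bound = [[-2.0, 2.0]]            # one [low, high] pair per action dimension
agent = DDPG(action_dim=1, action_bound=action_bound,
             actor_learning_rate=1e-4, critic_learning_rate=1e-3,
             batch_size=64, memory_size=100000, gamma=0.99, tau=0.005,
             explore_steps=1000, explore_noise=0.1, noise_clip=0.5,
             ctx=mx.cpu())

state = env.reset()
for step in range(50000):
    if agent.total_steps < agent.explore_steps:
        action = env.action_space.sample()                  # pure exploration at the start
    else:
        action = agent.choose_action_train(state).asnumpy().reshape(-1)
    next_state, reward, done, _ = env.step(action)
    agent.memory_buffer.store(state, action, reward, next_state, done)  # assumed API
    agent.total_steps += 1
    if agent.total_steps > agent.explore_steps:
        agent.update()
    state = env.reset() if done else next_state
agent.save()
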
Example #3
class Trainer:
    def __init__(
            self,
            capacity_per_level=500000,
            warmup_steps=100000,
            n_frames=4,
            n_atoms=51,
            v_min=-1,
            v_max=0,
            gamma=.99,
            device='cuda',
            batch_size=48,
            lr=0.0000625 * 2,
            lr_decay=0.99,
            update_target_net_every=25000,
            train_every=6,
            frame_skip=4,
            disable_noisy_after=2000000,
            super_hexagon_path='C:\\Program Files (x86)\\Steam\\steamapps\\common\\Super Hexagon\\superhexagon.exe',
            run_afap=True):

        # training objects
        self.memory_buffer = MemoryBuffer(
            capacity_per_level,
            SuperHexagonInterface.n_levels,
            n_frames,
            SuperHexagonInterface.frame_size,
            SuperHexagonInterface.frame_size_cropped,
            gamma,
            device=device)
        self.net = Network(n_frames, SuperHexagonInterface.n_actions,
                           n_atoms).to(device)
        self.target_net = Network(n_frames, SuperHexagonInterface.n_actions,
                                  n_atoms).to(device)
        self.target_net.load_state_dict(self.net.state_dict())
        self.optimizer = torch.optim.Adam(self.net.parameters(),
                                          lr=lr,
                                          eps=1.5e-4)
        self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer, ExpLrDecay(lr_decay, min_factor=.1))

        # parameters
        self.batch_size = batch_size
        self.update_target_net_every = update_target_net_every
        self.train_every = train_every
        self.frame_skip = frame_skip
        self.disable_noisy_after = disable_noisy_after
        self.warmup_steps = warmup_steps
        self.gamma = gamma
        self.device = device

        # parameters for distributional
        self.n_atoms = n_atoms
        self.v_min = v_min
        self.v_max = v_max
        self.delta_z = (v_max - v_min) / (n_atoms - 1)
        self.support = torch.linspace(v_min,
                                      v_max,
                                      n_atoms,
                                      dtype=torch.float,
                                      device=device)
        self.offset = torch.arange(0,
                                   batch_size * n_atoms,
                                   n_atoms,
                                   device=device).view(-1, 1)
        self.m = torch.empty((batch_size, n_atoms), device=device)

        # debug and logging stuff
        self.list_steps_alive = [[]
                                 for _ in range(SuperHexagonInterface.n_levels)
                                 ]
        self.longest_run = [(0, 0)] * SuperHexagonInterface.n_levels
        self.total_simulated_steps = [0] * SuperHexagonInterface.n_levels
        self.losses = []
        self.kls = []
        self.times = []
        self.iteration = 0

        self.super_hexagon_path = super_hexagon_path
        self.run_afap = run_afap

    def warmup(self, game, log_every):
        t = True
        for i in range(1, self.warmup_steps + 1):
            if i % log_every == 0:
                print('Warmup', i)
            if t:
                self.total_simulated_steps[game.level] += game.simulated_steps
                if self.total_simulated_steps[
                        game.level] > self.total_simulated_steps[game.level -
                                                                 1]:
                    game.select_level((game.level + 1) % 6)
                f, fc = game.reset()
                self.memory_buffer.insert_first(game.level, f, fc)
            a = np.random.randint(0, 3)
            (f, fc), r, t = game.step(a)
            self.memory_buffer.insert(game.level, a, r, t, f, fc)
        return t

    def train(
        self,
        save_every=50000,
        save_name='trainer',
        log_every=1000,
    ):

        game = SuperHexagonInterface(self.frame_skip,
                                     self.super_hexagon_path,
                                     run_afap=self.run_afap,
                                     allow_game_restart=True)

        # if trainer was loaded, select the level that was played the least
        if any(x != 0 for x in self.total_simulated_steps):
            game.select_level(np.argmin(self.total_simulated_steps).item())

        # init state
        f = np.zeros(game.frame_size, dtype=bool)
        fc = np.zeros(game.frame_size_cropped, dtype=bool)
        sf = torch.zeros((1, 4, *game.frame_size), device=self.device)
        sfc = torch.zeros((1, 4, *game.frame_size_cropped), device=self.device)
        t = True

        # run warmup if necessary
        if self.iteration == 0:
            if os.path.exists('warmup_buffer.npz'):
                self.memory_buffer.load_warmup('warmup_buffer.npz')
            else:
                t = self.warmup(game, log_every)
                self.memory_buffer.save_warmup('warmup_buffer.npz')

        # training loop
        last_time = time()
        save_when_terminal = False
        while True:

            self.iteration += 1

            # disable noisy
            if self.iteration == self.disable_noisy_after:
                self.net.eval()
                self.target_net.eval()

            # log
            if self.iteration % log_every == 0 and all(
                    len(l) > 0 for l in self.list_steps_alive):
                print(
                    f'{self.iteration} | '
                    f'{[round(np.mean(np.array(l[-100:])[:, 1]) / 60, 2) for l in self.list_steps_alive]}s | '
                    f'{[round(r[1] / 60, 2) for r in self.longest_run]}s | '
                    f'{self.total_simulated_steps} | '
                    f'{time() - last_time:.2f}s | '
                    f'{np.mean(self.losses[-log_every:])} | '
                    f'{np.mean(self.kls[-log_every:])} | '
                    f'{self.lr_scheduler.get_last_lr()[0]} | '
                    f'{game.level}')

            # indicate that the trainer should be saved the next time the agent dies
            if self.iteration % save_every == 0:
                save_when_terminal = True

            # update target net
            if self.iteration % self.update_target_net_every == 0:
                self.lr_scheduler.step()
                self.target_net.load_state_dict(self.net.state_dict())

            # if terminal
            if t:
                # select next level if this level was played at least as long as the previous level
                if self.total_simulated_steps[
                        game.level] > self.total_simulated_steps[game.level -
                                                                 1]:
                    game.select_level((game.level + 1) % 6)
                f, fc = game.reset()
                self.memory_buffer.insert_first(game.level, f, fc)
                sf.zero_()
                sfc.zero_()

            # update state
            sf[0, 1:] = sf[0, :-1].clone()
            sfc[0, 1:] = sfc[0, :-1].clone()
            sf[0, 0] = torch.from_numpy(f).to(self.device)
            sfc[0, 0] = torch.from_numpy(fc).to(self.device)

            # train
            if self.iteration % self.train_every == 0:
                loss, kl = self.train_batch()
                self.losses.append(loss)
                self.kls.append(kl)

            # act
            with torch.no_grad():
                self.net.reset_noise()
                a = (self.net(sf, sfc) *
                     self.support).sum(dim=2).argmax(dim=1).item()
            (f, fc), r, t = game.step(a)
            self.memory_buffer.insert(game.level, a, r, t, f, fc)

            # if terminal
            if t:
                if game.steps_alive > self.longest_run[game.level][1]:
                    self.longest_run[game.level] = (self.iteration,
                                                    game.steps_alive)
                self.list_steps_alive[game.level].append(
                    (self.iteration, game.steps_alive))
                self.total_simulated_steps[game.level] += game.simulated_steps
                self.times.append(time() - last_time)

                if save_when_terminal:
                    print('saving...')
                    for _ in range(60):
                        game.game.step(False)
                    self.save(save_name)
                    for _ in range(60):
                        game.game.step(False)
                    save_when_terminal = False

    def train_batch(self):

        # sample minibatch
        f, fc, a, r, t, f1, fc1 = self.memory_buffer.make_batch(
            self.batch_size)

        # compute target q distribution
        with torch.no_grad():
            self.target_net.reset_noise()
            qdn = self.target_net(f1, fc1)
            an = (qdn * self.support).sum(dim=2).argmax(dim=1)

        Tz = (r.unsqueeze(1) +
              t.logical_not().unsqueeze(1) * self.gamma * self.support).clamp_(
                  self.v_min, self.v_max)
        b = (Tz - self.v_min) / self.delta_z
        l = b.floor().long()
        u = b.ceil().long()

        l[(u > 0) & (l == u)] -= 1
        u[(l == u)] += 1

        vdn = qdn.gather(
            1,
            an.view(-1, 1,
                    1).expand(self.batch_size, -1,
                              self.n_atoms)).view(self.batch_size,
                                                  self.n_atoms)
        self.m.zero_()
        self.m.view(-1).index_add_(0, (l + self.offset).view(-1),
                                   (vdn * (u - b)).view(-1))
        self.m.view(-1).index_add_(0, (u + self.offset).view(-1),
                                   (vdn * (b - l)).view(-1))

        # forward and backward pass
        qld = self.net(f, fc, log=True)
        vld = qld.gather(
            1,
            a.view(-1, 1,
                   1).expand(self.batch_size, -1,
                             self.n_atoms)).view(self.batch_size, self.n_atoms)
        loss = -torch.sum(self.m * vld, dim=1).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        kl = F.kl_div(vld.detach(), self.m, reduction='batchmean')
        return loss.detach().item(), kl.item()

    def save(self, file_name='trainer'):

        # first backup the last save file
        # in case anything goes wrong
        file_name_backup = file_name + '_backup'
        if os.path.exists(file_name):
            os.rename(file_name, file_name_backup)

        # save this object
        with open(file_name, 'wb') as f:
            pickle.dump(self, f)

        # remove backup if nothing went wrong
        if os.path.exists(file_name_backup):
            os.remove(file_name_backup)

    @staticmethod
    def load(file_name='trainer'):
        with open(file_name, 'rb') as f:
            ret = pickle.load(f)
            assert ret.memory_buffer.last_was_terminal
            return ret
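
A possible entry point for this trainer: resume from the pickled checkpoint if one exists, otherwise start a fresh run. The checkpoint file name, the CUDA device, and the default Super Hexagon install path are environment-specific assumptions.

import os

# hypothetical entry point; file name and device are assumptions
if __name__ == '__main__':
    if os.path.exists('trainer'):
        trainer = Trainer.load('trainer')     # resume from a pickled checkpoint
    else:
        trainer = Trainer(device='cuda')      # fresh run with the default hyperparameters
    trainer.train(save_every=50000, save_name='trainer', log_every=1000)
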
Example #4
class TD3:
    def __init__(self, action_dim, action_bound, actor_learning_rate,
                 critic_learning_rate, batch_size, memory_size, gamma, tau,
                 explore_steps, policy_update, policy_noise, explore_noise,
                 noise_clip, ctx):
        self.action_dim = action_dim
        self.action_bound = nd.array(action_bound, ctx=ctx)

        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.gamma = gamma
        self.tau = tau
        self.explore_steps = explore_steps
        self.policy_update = policy_update
        self.policy_noise = policy_noise
        self.explore_noise = explore_noise
        self.noise_clip = noise_clip
        self.ctx = ctx

        self.main_actor_network = Actor(action_dim, self.action_bound)
        self.target_actor_network = Actor(action_dim, self.action_bound)
        self.main_critic_network1 = Critic()
        self.target_critic_network1 = Critic()
        self.main_critic_network2 = Critic()
        self.target_critic_network2 = Critic()

        self.main_actor_network.collect_params().initialize(init=init.Xavier(),
                                                            ctx=ctx)
        self.target_actor_network.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.main_critic_network1.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.target_critic_network1.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.main_critic_network2.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.target_critic_network2.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)

        self.actor_optimizer = gluon.Trainer(
            self.main_actor_network.collect_params(), 'adam',
            {'learning_rate': self.actor_learning_rate})
        self.critic1_optimizer = gluon.Trainer(
            self.main_critic_network1.collect_params(), 'adam',
            {'learning_rate': self.critic_learning_rate})
        self.critic2_optimizer = gluon.Trainer(
            self.main_critic_network2.collect_params(), 'adam',
            {'learning_rate': self.critic_learning_rate})

        self.total_steps = 0
        self.total_train_steps = 0

        self.memory_buffer = MemoryBuffer(buffer_size=self.memory_size,
                                          ctx=ctx)

    def choose_action_train(self, state):
        state = nd.array([state], ctx=self.ctx)
        action = self.main_actor_network(state)
        # no noise clip
        noise = nd.normal(loc=0,
                          scale=self.explore_noise,
                          shape=action.shape,
                          ctx=self.ctx)
        action += noise
        clipped_action = self.action_clip(action)
        return clipped_action

    # use this to choose actions when evaluating the agent (no exploration noise).
    def choose_action_evaluate(self, state):
        state = nd.array([state], ctx=self.ctx)
        action = self.main_actor_network(state)
        return action

    # after adding noise to the action, clip it back into the valid action bounds.
    # there may well be a simpler way to express this.
    def action_clip(self, action):
        low_bound = [
            float(self.action_bound[i][0].asnumpy())
            for i in range(self.action_dim)
        ]
        high_bound = [
            float(self.action_bound[i][1].asnumpy())
            for i in range(self.action_dim)
        ]
        bound = list(zip(low_bound, high_bound))
        # clip and reshape
        action_list = [
            nd.clip(action[:, i], bound[i][0], bound[i][1]).reshape(-1, 1)
            for i in range(self.action_dim)
        ]
        # concat
        clipped_action = reduce(nd.concat, action_list)
        return clipped_action.squeeze()

    def soft_update(self, target_network, main_network):
        target_parameters = target_network.collect_params().keys()
        main_parameters = main_network.collect_params().keys()
        d = zip(target_parameters, main_parameters)
        for x, y in d:
            target_network.collect_params()[x].data()[:] = \
                target_network.collect_params()[x].data() * \
                (1 - self.tau) + main_network.collect_params()[y].data() * self.tau

    def update(self):
        self.total_train_steps += 1
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory_buffer.sample(
            self.batch_size)

        # --------------optimize the critic network--------------------
        with autograd.record():
            # choose next action according to target policy network
            next_action_batch = self.target_actor_network(next_state_batch)
            noise = nd.normal(loc=0,
                              scale=self.policy_noise,
                              shape=next_action_batch.shape,
                              ctx=self.ctx)
            # with noise clip
            noise = nd.clip(noise,
                            a_min=-self.noise_clip,
                            a_max=self.noise_clip)
            next_action_batch = next_action_batch + noise
            clipped_action = self.action_clip(next_action_batch)

            # get target q value
            target_q_value1 = self.target_critic_network1(
                next_state_batch, clipped_action)
            target_q_value2 = self.target_critic_network2(
                next_state_batch, clipped_action)
            target_q_value = nd.minimum(target_q_value1,
                                        target_q_value2).squeeze()
            target_q_value = reward_batch + (1.0 - done_batch) * (
                self.gamma * target_q_value)

            # get current q value
            current_q_value1 = self.main_critic_network1(
                state_batch, action_batch)
            current_q_value2 = self.main_critic_network2(
                state_batch, action_batch)
            loss = gloss.L2Loss()

            value_loss1 = loss(current_q_value1, target_q_value.detach())
            value_loss2 = loss(current_q_value2, target_q_value.detach())

        self.main_critic_network1.collect_params().zero_grad()
        value_loss1.backward()
        self.critic1_optimizer.step(self.batch_size)

        self.main_critic_network2.collect_params().zero_grad()
        value_loss2.backward()
        self.critic2_optimizer.step(self.batch_size)

        # ---------------optimize the actor network-------------------------
        if self.total_train_steps % self.policy_update == 0:
            with autograd.record():
                pred_action_batch = self.main_actor_network(state_batch)
                actor_loss = -nd.mean(
                    self.main_critic_network1(state_batch, pred_action_batch))

            self.main_actor_network.collect_params().zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step(1)

            self.soft_update(self.target_actor_network,
                             self.main_actor_network)
            self.soft_update(self.target_critic_network1,
                             self.main_critic_network1)
            self.soft_update(self.target_critic_network2,
                             self.main_critic_network2)

    def save(self):
        self.main_actor_network.save_parameters(
            'TD3 LunarLander main actor network.params')
        self.target_actor_network.save_parameters(
            'TD3 LunarLander target actor network.params')
        # the two critics must be written to distinct files,
        # otherwise the second save overwrites the first
        self.main_critic_network1.save_parameters(
            'TD3 LunarLander main critic network 1.params')
        self.main_critic_network2.save_parameters(
            'TD3 LunarLander main critic network 2.params')
        self.target_critic_network1.save_parameters(
            'TD3 LunarLander target critic network 1.params')
        self.target_critic_network2.save_parameters(
            'TD3 LunarLander target critic network 2.params')

    def load(self):
        self.main_actor_network.load_parameters(
            'TD3 LunarLander main actor network.params')
        self.target_actor_network.load_parameters(
            'TD3 LunarLander target actor network.params')
        self.main_critic_network1.load_parameters(
            'TD3 LunarLander main critic network 1.params')
        self.main_critic_network2.load_parameters(
            'TD3 LunarLander main critic network 2.params')
        self.target_critic_network1.load_parameters(
            'TD3 LunarLander target critic network 1.params')
        self.target_critic_network2.load_parameters(
            'TD3 LunarLander target critic network 2.params')
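
A minimal driver loop for the TD3 class above, assuming LunarLanderContinuous to match the file names used in save()/load(). The hyperparameters and the memory_buffer.store() call are placeholders, since the MemoryBuffer interface is not shown here.

import gym
import mxnet as mx

# hypothetical usage sketch; environment, hyperparameters, and buffer API are assumptions
env = gym.make('LunarLanderContinuous-v2')
action_bound = [[-1.0, 1.0], [-1.0, 1.0]]    # [low, high] per action dimension
agent = TD3(action_dim=2, action_bound=action_bound,
            actor_learning_rate=3e-4, critic_learning_rate=3e-4,
            batch_size=100, memory_size=100000, gamma=0.99, tau=0.005,
            explore_steps=1000, policy_update=2, policy_noise=0.2,
            explore_noise=0.1, noise_clip=0.5, ctx=mx.cpu())

state = env.reset()
for step in range(200000):
    if agent.total_steps < agent.explore_steps:
        action = env.action_space.sample()                   # random warm-up actions
    else:
        action = agent.choose_action_train(state).asnumpy().reshape(-1)
    next_state, reward, done, _ = env.step(action)
    agent.memory_buffer.store(state, action, reward, next_state, done)  # assumed API
    agent.total_steps += 1
    if agent.total_steps > agent.explore_steps:
        agent.update()
    state = env.reset() if done else next_state
agent.save()
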
Example #5
class DDPG:
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class"""

    def __init__(self, act_dim, env_dim, act_range, buffer_size=20000, gamma=0.99, lr=0.00005, tau=0.001):
        """Initialization"""
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = env_dim
        self.gamma = gamma
        self.lr = lr
        # Create actor and critic networks
        self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.critic = Critic(self.env_dim, act_dim, lr, tau)
        self.buffer = MemoryBuffer(buffer_size)

    def policy_action(self, s):
        """Use the actor to predict value"""
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """Use the Bellman Equation to compute the critic target"""
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer"""
        self.buffer.memorize(state, action, reward, done, new_state)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience"""
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions, np.array(
            grads).reshape((-1, self.act_dim)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def train(self, env, summary_writer, nb_episodes=12, batch_size=64, render=False, gather_train_stats=False):
        results = []

        # First, gather experience
        tqdm_e = tqdm(range(nb_episodes),
                      desc='Score', leave=True, unit=" episodes")
        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            actions, states, rewards = [], [], []
            noise = OrnsteinUhlenbeckProcess(size=self.act_dim)
            while not done:
                if render:
                    env.render()
                # Actor picks an action (following the deterministic policy)
                a = self.policy_action(old_state)
                # Clip continuous values to be valid w.r.t. environment
                a = np.clip(a + noise.generate(time),
                            -self.act_range, self.act_range)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Add outputs to memory buffer
                self.memorize(old_state, a, r, done, new_state)
                # Sample experience from buffer
                states, actions, rewards, dones, new_states, _ = self.sample_batch(
                    batch_size)
                # Predict target q-values using target networks
                q_values = self.critic.target_predict(
                    [new_states, self.actor.target_predict(new_states)])
                # Compute critic target
                critic_target = self.bellman(rewards, q_values, dones)
                # Train both networks on sampled batch, update target networks
                self.update_models(states, actions, critic_target)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1

            # Gather stats every episode for plotting
            if gather_train_stats:
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tf_summary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()
            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def save_weights(self, path):
        path += '_LR_{}'.format(self.lr)
        self.actor.save(path)
        self.critic.save(path)

    def load_weights(self, path_actor, path_critic):
        self.critic.load_weights(path_critic)
        self.actor.load_weights(path_actor)
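
A minimal usage sketch for this Keras/TF1-style DDPG helper, assuming a continuous-action Gym environment and a TF1-compatible FileWriter for the score summaries. How env_dim must be shaped depends on the Actor and Critic constructors, which are not shown, so the value passed below is an assumption.

import gym
import tensorflow as tf

# hypothetical usage sketch; environment, log directory, and env_dim convention are assumptions
env = gym.make('Pendulum-v1')
act_dim = env.action_space.shape[0]
act_range = float(env.action_space.high[0])

agent = DDPG(act_dim=act_dim,
             env_dim=env.observation_space.shape,   # assumed state-shape convention
             act_range=act_range)
writer = tf.compat.v1.summary.FileWriter('./logs')  # TF1-style writer, matching add_summary above
results = agent.train(env, writer, nb_episodes=200, batch_size=64)
agent.save_weights('./models/ddpg')
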