class SAC(algorithms):
    def __init__(self, args):
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = GaussianPolicy(state_dim, action_dim, 64,
                                    self.env.action_space).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          self.args.lr)

        self.critic_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_1 = optim.Adam(self.critic_1.parameters(),
                                             self.args.lr)
        self.critic_target_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_1.load_state_dict(self.critic_1.state_dict())

        self.critic_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_2 = optim.Adam(self.critic_2.parameters(),
                                             self.args.lr)
        self.critic_target_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_2.load_state_dict(self.critic_2.state_dict())

        self.replay_buffer = ReplayBuffer(self.args.capacity)

        self.global_steps = 0

    def update(self):
        for it in range(self.args.update_iteration):
            # sample from replay buffer
            x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # get the next action and compute target Q
            with torch.no_grad():
                next_action, log_prob, _ = self.actor.sample(next_state)
                target_Q1 = self.critic_target_1(next_state, next_action)
                target_Q2 = self.critic_target_2(next_state, next_action)
                target_Q = torch.min(target_Q1,
                                     target_Q2) - self.args.alpha * log_prob
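                # entropy-regularized soft Bellman target: y = r + gamma * (1 - done) * (min(Q1', Q2') - alpha * log_prob)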
                y_Q = reward + self.args.gamma * (1 - done) * target_Q

            # update critic
            current_Q1 = self.critic_1(state, action)
            critic_loss1 = F.mse_loss(current_Q1, y_Q)
            self.critic_optimizer_1.zero_grad()
            critic_loss1.backward()
            self.critic_optimizer_1.step()

            current_Q2 = self.critic_2(state, action)
            critic_loss2 = F.mse_loss(current_Q2, y_Q)
            self.critic_optimizer_2.zero_grad()
            critic_loss2.backward()
            self.critic_optimizer_2.step()

            # update actor
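            # minimize -E[min(Q1, Q2) - alpha * log_prob], i.e. maximize the entropy-regularized soft Q-value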
            actor_action, actor_log_prob, _ = self.actor.sample(state)
            Q1 = self.critic_1(state, actor_action)
            Q2 = self.critic_2(state, actor_action)
            actor_loss = -(torch.min(Q1, Q2) -
                           self.args.alpha * actor_log_prob).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target network
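            # Polyak averaging: target <- (1 - tau) * target + tau * online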
            for param, target_param in zip(self.critic_1.parameters(),
                                           self.critic_target_1.parameters()):
                target_param.data.copy_((1 - self.args.tau) *
                                        target_param.data +
                                        self.args.tau * param.data)

            for param, target_param in zip(self.critic_2.parameters(),
                                           self.critic_target_2.parameters()):
                target_param.data.copy_((1 - self.args.tau) *
                                        target_param.data +
                                        self.args.tau * param.data)

    def train(self):
        for i in range(self.args.max_episode):
            state = self.env.reset()
            ep_r = 0
            for t in count():
                action, _, _ = self.actor.sample(
                    torch.FloatTensor([state]).to(device))
                action = action.cpu().detach().numpy()[0]
                next_state, reward, done, info = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                self.replay_buffer.push(
                    (state, next_state, action, reward, float(done)))
                state = next_state

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print(
                            "Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                            .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    ep_r = 0
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity - 1:
                self.update()

        self.save(i + 1)

    def evaluate(self, number=1, render=True):
        rewards = []
        for _ in range(number):
            state = self.env.reset()
            done = False
            total_rews = 0
            time_step = 0
            while not done:
                with torch.no_grad():
                    # use the mean action for evaluation (third value returned by sample)
                    _, _, action = self.actor.sample(
                        torch.FloatTensor([state]).to(device))
                    action = action.cpu().detach().numpy()[0]
                if render:
                    self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_rews += reward
                time_step += 1

            if render:
                print("total reward of this episode is " + str(total_rews))
            rewards.append(total_rews)
        rewards = np.array(rewards)
        if not render:
            pickle.dump((self.global_steps, rewards), self.log_file)
        return rewards.max(), rewards.min(), rewards.mean()

    def save(self, episode):
        file_name = self.weights_file(episode)
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic_1': self.critic_1.state_dict(),
                'critic_2': self.critic_2.state_dict(),
                'critic_target_1': self.critic_target_1.state_dict(),
                'critic_target_2': self.critic_target_2.state_dict()
            }, file_name)
        print("save model to " + file_name)

    def load(self, episode):
        file_name = self.weights_file(episode)
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic_1.load_state_dict(checkpoint['critic_1'])
        self.critic_2.load_state_dict(checkpoint['critic_2'])
        self.critic_target_1.load_state_dict(checkpoint['critic_target_1'])
        self.critic_target_2.load_state_dict(checkpoint['critic_target_2'])
        print("successfully load model from " + file_name)
Example #2
class BaseAgent:
    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']

        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.det_net = Network(features, self.h1, self.h2, actions).to(device)
        self.bpolicy_net = Network(features, self.h1, self.h2,
                                   actions).to(device)
        self.bpolicy_net.load_state_dict(
            torch.load(
                "/home/soumyadeep/Action_Imbalance/RLGTD/experiments/prediction_SARSA/agents/net_params.pt"
            ))

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0

        # initialize the weights of the target network to match the weights of policy network
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        # act greedily w.r.t. the behaviour-policy network, except that a greedy "stay" action (1)
        # is replaced by a random back/forward action (0 or 2) about epsilon percent of the time
        q_s, _ = self.bpolicy_net(x)

        if q_s.shape[0] == 3:
            q_s = q_s.unsqueeze(0)
            # act = q_s.argmax().detach()
        # else:
        act = torch.max(q_s, 1).indices.detach().numpy()

        for i in range(act.shape[0]):
            action = act[i]
            if action == 1:
                if np.random.rand() < self.epsilon:
                    act[i] = np.random.choice([0, 2])

        # if act.cpu().numpy() == 1:
        #     if np.random.rand() < self.epsilon:
        #         a = np.random.randint(self.actions-1)

        # if np.random.rand() < self.epsilon:
        #     a = np.random.randint(self.actions)
        #     return torch.tensor(a, device=device)

        # # otherwise take a greedy action
        # q_s, _ = self.bpolicy_net(x)
        # # print(q_s)
        # return q_s.argmax().detach()
        act_tensor = torch.from_numpy(act).detach().to(device)

        return act_tensor

    def updateNetwork(self, samples):
        pass

    def update(self, s, a, sp, r, gamma):
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer) > 200:
            samples, idcs = self.buffer.sample(200)
            self.updateNetwork(samples)
Example #3
class NAF:

    MODEL_NAME = "NAF"
    TARGET_MODEL_NAME = "target-NAF"

    class Build(Enum):
        SINGLE = 1
        MULTIPLE = 2
        HYDRA = 3

    def __init__(self,
                 prep,
                 build,
                 policy,
                 state_dim,
                 action_dim,
                 monitor_directory,
                 buffer_size=10000,
                 batch_size=32,
                 steps_before_train=100,
                 train_freq=1,
                 num_steps=1000000,
                 learning_rate=1e-3,
                 update_rate=1e-3,
                 max_reward=None,
                 detailed_summary=False):

        self.prep = prep
        self.build_mode = build
        self.policy = policy
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detailed_summary = detailed_summary

        self.discount = 0.99
        self.learning_rate = learning_rate
        self.target_update_rate = update_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.steps_before_train = steps_before_train
        self.train_freq = train_freq
        self.max_reward = max_reward
        self.max_iters = num_steps

        self.step = 0
        self.solved = False

        self.state_layers = [64, 32]

        self.mu_layers = [16, 8, self.action_dim]

        self.l_layers = [16, 8, (self.action_dim * (self.action_dim + 1)) // 2]

        self.v_layers = [16, 8, 1]

        self.action_inputs = None
        self.reward_inputs = None
        self.done = None
        self.state_inputs = None
        self.state_outputs = None
        self.mu_outputs = None
        self.l_outputs = None
        self.value_outputs = None
        self.next_state_inputs = None
        self.next_state_outputs = None
        self.target_value_outputs = None
        self.target = None
        self.advantages = None
        self.q_values = None
        self.loss = None
        self.global_step = None
        self.inc_global_step = None
        self.train_op = None
        self.target_update = None

        self.buffer = ReplayBuffer(buffer_size, self.state_dim,
                                   self.action_dim)

        self.build()

        self.merged = tf.summary.merge_all()

        self.session = tf.Session()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        utils.log_params(
            self.summary_dir, {
                "learning rate": self.learning_rate,
                "batch size": self.batch_size,
                "update rate": self.target_update_rate,
                "buffer size": self.buffer_size,
                "build": self.build_mode.name,
                "train frequency": self.train_freq
            })
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.session.graph)

        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session.run(init_op)

    def build(self):
        self.action_inputs = tf.placeholder(tf.float32,
                                            (None, self.action_dim))
        self.reward_inputs = tf.placeholder(tf.float32, (None, ))
        self.done = tf.placeholder(tf.float32, (None, ))

        self.state_inputs, self.state_outputs, self.mu_outputs, self.l_outputs, self.value_outputs = \
          self.build_network(self.MODEL_NAME)

        self.next_state_inputs, self.next_state_outputs, _, _, self.target_value_outputs = \
          self.build_network(self.TARGET_MODEL_NAME)

        self.target = tf.expand_dims(self.reward_inputs, 1) + self.discount * (
            1 - tf.expand_dims(self.done, 1)) * self.target_value_outputs

        # taken from https://github.com/carpedm20/NAF-tensorflow/blob/master/src/network.py
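        # rebuild the lower-triangular matrix L from the flat l_outputs vector; diagonal entries are exponentiated so they stay positive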
        pivot = 0
        rows = []
        for idx in range(self.action_dim):
            count = self.action_dim - idx

            diag_elem = tf.exp(tf.slice(self.l_outputs, (0, pivot), (-1, 1)))
            non_diag_elems = tf.slice(self.l_outputs, (0, pivot + 1),
                                      (-1, count - 1))
            row = tf.pad(tf.concat((diag_elem, non_diag_elems), 1),
                         ((0, 0), (idx, 0)))
            rows.append(row)

            pivot += count

        L = tf.transpose(tf.stack(rows, axis=1), (0, 2, 1))
        P = tf.matmul(L, tf.transpose(L, (0, 2, 1)))
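        # P = L L^T is positive definite, so the quadratic advantage below is always <= 0 and peaks at a = mu(s)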

        adv_term = tf.expand_dims(self.action_inputs - self.mu_outputs, -1)
        self.advantages = -tf.matmul(tf.transpose(adv_term, [0, 2, 1]),
                                     tf.matmul(P, adv_term)) / 2
        self.advantages = tf.reshape(self.advantages, [-1, 1])
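        # NAF decomposition: Q(s, a) = V(s) + A(s, a), with A(s, mu(s)) = 0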

        self.q_values = self.advantages + self.value_outputs

        self.loss = tf.reduce_mean(
            architect.huber_loss(self.q_values -
                                 tf.stop_gradient(self.target)))

        tf.summary.scalar("training_loss", self.loss)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step,
                                         tf.add(self.global_step, 1))

        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

        self.create_target_update_op()

    def build_network(self, name):

        detailed_summary = self.detailed_summary
        if name == self.TARGET_MODEL_NAME:
            detailed_summary = False

        with tf.variable_scope(name):

            state_inputs = tf.placeholder(tf.float32,
                                          shape=(None, self.state_dim))

            if self.build_mode == self.Build.SINGLE:
                state_outputs = architect.dense_block(
                    state_inputs,
                    self.state_layers,
                    name="state_branch",
                    detailed_summary=detailed_summary)
                mu_outputs = architect.dense_block(
                    state_outputs, [self.mu_layers[-1]],
                    "mu_branch",
                    detailed_summary=detailed_summary)
                l_outputs = architect.dense_block(
                    state_outputs, [self.l_layers[-1]],
                    "l_branch",
                    detailed_summary=detailed_summary)
                value_outputs = architect.dense_block(
                    state_outputs, [self.v_layers[-1]],
                    "value_branch",
                    detailed_summary=detailed_summary)
            elif self.build_mode == self.Build.MULTIPLE:
                state_outputs = None
                mu_state = architect.dense_block(
                    state_inputs,
                    self.state_layers,
                    name="mu_state",
                    detailed_summary=detailed_summary)
                l_state = architect.dense_block(
                    state_inputs,
                    self.state_layers,
                    name="l_state",
                    detailed_summary=detailed_summary)
                value_state = architect.dense_block(
                    state_inputs,
                    self.state_layers,
                    name="value_state",
                    detailed_summary=detailed_summary)

                mu_outputs = architect.dense_block(
                    mu_state, [self.mu_layers[-1]],
                    "mu_branch",
                    detailed_summary=detailed_summary)
                l_outputs = architect.dense_block(
                    l_state, [self.l_layers[-1]],
                    "l_branch",
                    detailed_summary=detailed_summary)
                value_outputs = architect.dense_block(
                    value_state, [self.v_layers[-1]],
                    "value_branch",
                    detailed_summary=detailed_summary)
            elif self.build_mode == self.Build.HYDRA:
                state_outputs = architect.dense_block(
                    state_inputs,
                    self.state_layers,
                    name="state_branch",
                    detailed_summary=detailed_summary)
                mu_outputs = architect.dense_block(
                    state_outputs,
                    self.mu_layers,
                    "mu_branch",
                    detailed_summary=detailed_summary)
                l_outputs = architect.dense_block(
                    state_outputs,
                    self.l_layers,
                    "l_branch",
                    detailed_summary=detailed_summary)
                value_outputs = architect.dense_block(
                    state_outputs,
                    self.v_layers,
                    "value_branch",
                    detailed_summary=detailed_summary)
            else:
                raise ValueError("Wrong build type.")

            return state_inputs, state_outputs, mu_outputs, l_outputs, value_outputs

    def create_target_update_op(self):
        # inspired by: https://github.com/yukezhu/tensorflow-reinforce/blob/master/rl/neural_q_learner.py
        net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=self.MODEL_NAME)
        target_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            scope=self.TARGET_MODEL_NAME)

        self.target_update = []
        for v_source, v_target in zip(net_vars, target_net_vars):
            # this is equivalent to target = (1-alpha) * target + alpha * source
            update_op = v_target.assign_sub(self.target_update_rate *
                                            (v_target - v_source))
            self.target_update.append(update_op)

        self.target_update = tf.group(*self.target_update)

    def learn(self):
        # learn
        batch = self.buffer.sample(self.batch_size)

        merged, targets, _ = self.session.run(
            [self.merged, self.target, self.train_op],
            feed_dict={
                self.state_inputs: batch["states"],
                self.action_inputs: batch["actions"],
                self.reward_inputs: batch["rewards"],
                self.next_state_inputs: batch["next_states"],
                self.done: batch["done"]
            })

        self.summary_writer.add_summary(merged, global_step=self.step)

        # target update
        self.session.run(self.target_update)

    def run_episode(self, env):

        self.policy.reset()

        state = env.reset()
        state, skip = self.prep.process(state)

        total_reward = 0

        while True:
            # play
            if skip:
                action = env.action_space.sample()
            else:
                action = self.session.run(self.mu_outputs,
                                          feed_dict={self.state_inputs:
                                                     state})[0]
                action = self.policy.add_noise(action)

            tmp_state = state
            tmp_skip = skip

            state, reward, done, _ = env.step(action)
            state, skip = self.prep.process(state)

            total_reward += reward

            # store the transition only if neither the previous nor the current observation was skipped
            if not tmp_skip and not skip:
                self.buffer.add({
                    "state": tmp_state[0],
                    "action": action,
                    "reward": reward,
                    "next_state": state[0],
                    "done": int(done)
                })

            if self.step >= self.steps_before_train and not self.solved:
                # learn
                for _ in range(self.train_freq):
                    self.learn()
                    _, self.step = self.session.run(
                        [self.inc_global_step, self.global_step])
            else:
                _, self.step = self.session.run(
                    [self.inc_global_step, self.global_step])

            if done:
                break

        summary_value = summary_pb2.Summary.Value(tag="episode_reward",
                                                  simple_value=total_reward)
        summary_2 = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(summary_2, global_step=self.step)

        if self.max_reward is not None:
            if total_reward >= self.max_reward:
                self.solved = True
            else:
                self.solved = False

        if self.step == self.max_iters:
            self.saver.save(self.session,
                            self.summary_dir,
                            global_step=self.step)

        return total_reward, self.step

    def close(self):
        self.session.close()
class DQN(BaseAgent):
    def __init__(self, features, actions, state_array, params):
        super(DQN, self).__init__(features, actions, params)
        self.buffer_BACK = ReplayBuffer(1000)
        self.buffer_STAY = ReplayBuffer(1000)
        self.buffer_FORWARD = ReplayBuffer(1000)

        self.back_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.back_target_q_net = Network(
            features, self.h1, self.h2, 1).to(device)
        self.back_q_net.cloneWeightsTo(self.back_target_q_net)

        self.stay_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.stay_target_q_net = Network(
            features, self.h1, self.h2, 1).to(device)
        self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)

        self.forward_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.forward_target_q_net = Network(
            features, self.h1, self.h2, 1).to(device)
        self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        self.optimizerBack = torch.optim.Adam(self.back_q_net.parameters(), lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerStay = torch.optim.Adam(self.stay_q_net.parameters(), lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerForward = torch.optim.Adam(self.forward_q_net.parameters(), lr=self.alpha, betas=(0.9, 0.999))

        self.back_values = []
        self.stay_values = []
        self.forward_values = []

        self.back_values_baseline = []
        self.stay_values_baseline = []
        self.forward_values_baseline = []

        self.td_loss = []
        self.state_array = state_array
        self.penultimate_features = []

        self.ratioMap = params['ratioMap']
        self.sampleSize = params['sampleSize']

    def updateNetwork(self, samples):
        # organize the mini-batch so that we can request "columns" from the data
        # e.g. we can get all of the actions, or all of the states with a single call
        batch = getBatchColumns(samples)

        # compute Q(s, a) for each sample in mini-batch
        Qs, x = self.policy_net(batch.states)
        Qsa = Qs.gather(1, batch.actions).squeeze()

        self.penultimate_features.append(x)

        # by default Q(s', a') = 0 unless the next states are non-terminal

        Qspap = torch.zeros(batch.size, device=device)
        # for i in range(len(batch.actions.numpy())):
        #     if batch.actions.numpy()[i][0] == 0:
        #         self.back_values.append(Qsa.detach().numpy()[i])
        #     elif batch.actions.numpy()[i][0] == 1:
        #         self.stay_values.append(Qsa.detach().numpy()[i])
        #     elif batch.actions.numpy()[i][0] == 2:
        #         self.forward_values.append(Qsa.detach().numpy()[i])

        # if we don't have any non-terminal next states, then no need to bootstrap
        if batch.nterm_sp.shape[0] > 0:
            Qsp, _ = self.target_net(batch.nterm_sp)

            # bootstrapping term is the max Q value for the next-state
            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        # compute the empirical MSBE for this mini-batch and let torch auto-diff to optimize
        # don't worry about detaching the bootstrapping term for semi-gradient Q-learning
        # the target network handles that
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure we have no gradients left over from previous update
        self.optimizer.zero_grad()
        self.target_net.zero_grad()

        # compute the entire gradient of the network using only the td error
        td_loss.backward()

        self.td_loss.append(td_loss.detach().numpy())

        # self.td_loss = self.td_loss + list(td_loss.detach().numpy())

        Qs_state_array, _ = self.policy_net(self.state_array)

        Qsa_mean_states = torch.mean(Qs_state_array, 0)

        self.back_values.append(Qsa_mean_states[0].detach().numpy())
        self.stay_values.append(Qsa_mean_states[1].detach().numpy())
        self.forward_values.append(Qsa_mean_states[2].detach().numpy())

        # update the *policy network* using the combined gradients
        self.optimizer.step()

    def updateActionNet(self, samples, q_net, target_q_net, optimizer, storeList):
        batch = getBatchColumns(samples)
        Qs, x = q_net(batch.states)

        Qsa = Qs.squeeze()  # needed for the TD loss below
        # for i in range(len(batch.actions)):
        #     storeList.append(Qsa.detach().numpy()[i])
        Qspap = torch.zeros(batch.size, device=device)

        ############  ============  CHECK ================= ###############################
        if batch.nterm_sp.shape[0] > 0:
            ##  Qsp, _ = target_q_net(batch.nterm_sp) #### Is this correct ????

            Qsp_back, _ = self.back_target_q_net(batch.nterm_sp)
            Qsp_stay, _ = self.stay_target_q_net(batch.nterm_sp)
            Qsp_forward, _ = self.forward_target_q_net(batch.nterm_sp)
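            # stack the three single-output heads into a (batch, 3) tensor so the max over actions below works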

            Qsp = torch.hstack([Qsp_back, Qsp_stay, Qsp_forward])

            # bootstrapping term is the max Q value for the next-state
            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        ############  ============  CHECK ================= ###############################
        # compute the empirical MSBE for this mini-batch and let torch auto-diff to optimize
        # don't worry about detaching the bootstrapping term for semi-gradient Q-learning
        # the target network handles that
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure we have no gradients left over from previous update
        optimizer.zero_grad()
        target_q_net.zero_grad()
        self.back_target_q_net.zero_grad()
        self.stay_target_q_net.zero_grad()
        self.forward_target_q_net.zero_grad()

        # compute the entire gradient of the network using only the td error
        td_loss.backward()

        Qs_state_array, _ = q_net(self.state_array)
        Qsa_mean_states = torch.mean(Qs_state_array, 0)
        storeList.append(Qsa_mean_states[0].detach().numpy())

        # update the *policy network* using the combined gradients
        optimizer.step()

    def update(self, s, a, sp, r, gamma):
        if a.cpu().numpy() == 0:
            self.buffer_BACK.add((s, a, sp, r, gamma))
        elif a.cpu().numpy() == 1:
            self.buffer_STAY.add((s, a, sp, r, gamma))
        elif a.cpu().numpy() == 2:
            self.buffer_FORWARD.add((s, a, sp, r, gamma))

        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)
            self.back_q_net.cloneWeightsTo(self.back_target_q_net)
            self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)
            self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        back_sample_count = math.floor(
            self.ratioMap.backward_ratio * self.sampleSize)
        stay_sample_count = math.floor(
            self.ratioMap.stay_ratio * self.sampleSize)
        forward_sample_count = math.floor(
            self.ratioMap.forward_ratio * self.sampleSize)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer_BACK) > back_sample_count \
                and len(self.buffer_STAY) > stay_sample_count \
                and len(self.buffer_FORWARD) > forward_sample_count:

            samplesBack, idcs = self.buffer_BACK.sample(back_sample_count)
            samplesStay, idcs = self.buffer_STAY.sample(stay_sample_count)
            samplesForward, idcs = self.buffer_FORWARD.sample(forward_sample_count)
            self.updateActionNet(samplesBack, self.back_q_net, self.back_target_q_net, self.optimizerBack,
                                 self.back_values_baseline)
            self.updateActionNet(samplesStay, self.stay_q_net, self.stay_target_q_net, self.optimizerStay,
                                 self.stay_values_baseline)
            self.updateActionNet(samplesForward, self.forward_q_net, self.forward_target_q_net, self.optimizerForward,
                                 self.forward_values_baseline)
            samples = samplesBack + samplesStay + samplesForward

            self.updateNetwork(samples)
Example #5
class DDPG():
	def __init__(self, args, env = None):
		self.args = args
		# actor
		self.actor = DeterministicPolicy(128).to(device)
		self.actor_target = DeterministicPolicy(128).to(device)
		self.actor_target.load_state_dict(self.actor.state_dict())
		self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)
		# critics
		self.critic = QNetwork(128).to(device)
		self.critic_target = QNetwork(128).to(device)
		self.critic_target.load_state_dict(self.critic.state_dict())
		self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr)

		self.replay_buffer = ReplayBuffer(self.args.capacity)
		self.num_critic_update_iteration = 0
		self.num_actor_update_iteration = 0
		self.num_training = 0
		self.global_steps = 0

		self.action_scale = torch.FloatTensor([[20, 1]]).to(device)
		self.env = env
		#self.load()

	def update(self):
		for it in range(self.args.update_iteration):
			# sample from replay buffer
			obs, local_goal, next_obs, next_goal, action, reward, done = self.replay_buffer.sample(self.args.batch_size)
			obs = torch.FloatTensor(obs).to(device)
			local_goal = torch.FloatTensor(local_goal).to(device)
			next_obs = torch.FloatTensor(next_obs).to(device)
			next_goal = torch.FloatTensor(next_goal).to(device)
			action = torch.FloatTensor(action).to(device)
			reward = torch.FloatTensor(reward).to(device)
			done = torch.FloatTensor(done).to(device)

			# compute the target Q value
			next_action, _ = self.actor_target.sample(next_obs, next_goal)
			target_Q = self.critic_target(next_obs, next_goal, next_action / self.action_scale)
			target_Q = reward + ((1-done) * self.args.gamma * target_Q).detach()

			# get current Q estimate
			current_Q = self.critic(obs, local_goal, action)

			# compute critic loss and update
			critic_loss = F.mse_loss(current_Q, target_Q)
			self.critic_optimizer.zero_grad()
			critic_loss.backward()
			self.critic_optimizer.step()

			# compute actor loss
			actor_action, _ = self.actor.sample(obs, local_goal)
			actor_loss = -self.critic(obs, local_goal, actor_action / self.action_scale).mean()
			self.actor_optimizer.zero_grad()
			actor_loss.backward()
			self.actor_optimizer.step()

			# update target model 
			for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
				target_param.data.copy_(self.args.tau * param.data + (1 - self.args.tau) * target_param.data)

			for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
				target_param.data.copy_(self.args.tau * param.data + (1 - self.args.tau) * target_param.data)

			self.num_actor_update_iteration += 1
			self.num_critic_update_iteration += 1

	def train(self):
		for i in range(self.args.max_episode):
			obs, local_goal = self.env.reset()
			ep_r = 0

			for t in count():
				action, _ = self.actor.sample(torch.FloatTensor(obs).to(device), torch.FloatTensor(local_goal).to(device))
				action = action.cpu().detach().numpy()[0]

				next_obs, next_goal, done, reward = self.env.step(action)
				self.global_steps += 1
				ep_r += reward
				self.replay_buffer.push((obs / 4.0, local_goal / 20., next_obs / 4.0, next_goal / 20., action / np.array([20, 1]), reward, float(done)))
				obs = next_obs
				local_goal = next_goal

				if done or t > self.args.max_length_trajectory:
					if i % self.args.print_log == 0:
						print("Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}".format(i, ep_r, t, self.global_steps))
						self.evaluate(10, False)
					break

			if len(self.replay_buffer.storage) >= self.args.capacity * 0.2:
				self.update()

		self.save()

	def evaluate(self, number = 1, render = True):
		rewards = []
		for _ in range(number):
			total_rews = 0
			time_step = 0
			done = False
			obs, local_goal = self.env.reset()
			while not done:
				action = self.predict(obs / 4., local_goal / 20.)
				# with torch.no_grad():
				# 	# use the mean action
				# 	_, action = self.actor.sample(torch.FloatTensor(obs).to(device) / 4., torch.FloatTensor(local_goal).to(device) / 20)
				# 	action = action.cpu().detach().numpy()[0]

				obs, local_goal, done, reward = self.env.step(action)
				
				if render:
					self.env.render()
				total_rews += reward
				time_step += 1
				if time_step > self.args.max_length_trajectory:
					break
				#print(str(action) + "  " + str(local_goal))
				if done:
					break

			rewards.append(total_rews)
		rewards = np.array(rewards)
		print("mean reward {}, max reward {}, min reward {}".format(rewards.mean(), rewards.max(), rewards.min()))

	def predict(self, obs, local_goal):
		with torch.no_grad():
			action = self.actor.forward(torch.FloatTensor(obs).to(device), torch.FloatTensor(local_goal).to(device))
		action = action.cpu().detach().numpy()[0]
		return action

	def load(self, episode = None):
		file_name = "weights/DDPG.pt"
		checkpoint = torch.load(file_name)
		self.actor.load_state_dict(checkpoint['actor'])
		self.actor_target.load_state_dict(checkpoint['actor_target'])
		self.critic.load_state_dict(checkpoint['critic'])
		self.critic_target.load_state_dict(checkpoint['critic_target'])
		print("successfully load model from " + file_name)

	def save(self, episode = None):
		file_name = "weights/DDPG.pt"
		torch.save({'actor' : self.actor.state_dict(),
					'critic' : self.critic.state_dict(),
					'actor_target' : self.actor_target.state_dict(),
					'critic_target' : self.critic_target.state_dict()}, file_name)
		print("save model to " + file_name)
Example #6
class DDPG_Agent(Agent):
    """Interacts with and learns from the environment."""
    policy_type = "DDPG"

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = DDPG_Actor(state_size, action_size,
                                      random_seed).to(device)
        self.actor_target = DDPG_Actor(state_size, action_size,
                                       random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPG_Critic(state_size, action_size,
                                        random_seed).to(device)
        self.critic_target = DDPG_Critic(state_size, action_size,
                                         random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        #Statistics
        self.stats = {
            "actor_loss": [],
            "critic_loss": [],
            "reward_sum": [],
        }

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        action = self.actor_local.select_action(state)
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        #tmp = np.array((critic_loss.item(), actor_loss.item()))
        #print(tmp)
        # --------------------------- for the plot ----------------------------- #

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)
        with torch.no_grad():
            actions_pred_target = self.actor_target(states)
            actor_loss_target = -self.critic_target(
                states, actions_pred_target).mean()
            Q_expected_target = self.critic_target(states, actions)
            critic_loss_target = F.mse_loss(Q_expected_target, Q_targets)
            with open("saveDDPG_critic-actor_loss.csv", "a") as f:
                tmp = str(critic_loss_target.item()) + "," + str(
                    actor_loss_target.item()) + "\n"
                f.write(tmp)
            self.save_stats(actor_loss=actor_loss.item(),
                            critic_loss=critic_loss.item(),
                            reward_sum=rewards.sum().item())

    def store_policy(self, env_name, score):
        traced = torch.jit.script(self.actor_target)
        torch.jit.save(
            traced, "data/policies/" + "DDPGAgent" + str(env_name) + "#" +
            str(score) + ".zip")

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
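DDPG_Agent assumes an OUNoise helper with reset() and sample(); a minimal Ornstein-Uhlenbeck process matching that interface is sketched below (the mu, theta and sigma defaults are assumptions, not values from the original code).

import copy
import random

import numpy as np


class OUNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        # restart the process at its long-run mean
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated exploration noise
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state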
class BaseAgent:
    def __init__(self, features: int, actions: int, params: Dict, seed: int, collector: Collector):
        self.features = features
        self.actions = actions
        self.params = params
        self.collector = collector
        self.seed = seed

        # define parameter contract
        self.gamma = params['gamma']
        self.epsilon = params.get('epsilon', 0)
        # the mellowmax parameter
        self.omega = params.get('omega', 1.0)

        # set up network for estimating Q(s, a)
        self.value_net = Network(features, actions, params, seed).to(device)

        # build the optimizer
        self.optimizer_params = params['optimizer']
        self.optimizer = deserializeOptimizer(self.value_net.parameters(), self.optimizer_params)

        self.steps = 0

        # set up the replay buffer
        self.buffer_size = params['buffer_size']
        self.batch_size = params['batch']
        self.buffer_type = params.get('buffer', 'standard')

        if self.buffer_type == 'per':
            prioritization = params['prioritization']
            self.buffer = PrioritizedReplayMemory(self.buffer_size, prioritization)
        else:
            self.buffer = ReplayBuffer(self.buffer_size)

        # build a target network
        self.target_refresh = params.get('target_refresh', 1)
        self.target_net = copy.deepcopy(self.value_net)
        self.initializeTargetNet()

        def getValues(x: torch.Tensor):
            qs = self.values(x).detach().cpu().squeeze(0).numpy()
            return qs

        self.policy = createEpsilonGreedy(seed, self.epsilon, getValues)

    # return the Q(s, a) values from the value network
    def values(self, x):
        return self.value_net(x)[0]

    # sample an action according to our policy
    def selectAction(self, x):
        return self.policy.selectAction(x)

    def initializeTargetNet(self):
        # if target nets are effectively disabled (refreshed every step), alias the target to the value net to save compute
        if self.target_refresh > 1:
            self.target_net = copy.deepcopy(self.value_net)
            cloneNetworkWeights(self.value_net, self.target_net)
        else:
            self.target_net = self.value_net

    @abstractmethod
    def updateNetwork(self, batch: Batch, predictions: Dict):
        pass

    @abstractmethod
    def forward(self, batch: Batch) -> Dict[str, torch.Tensor]:
        pass

    @abstractmethod
    def bootstrap(self, batch: Batch, next_values: torch.Tensor) -> Dict[str, torch.Tensor]:
        pass

    # a helper method that lets us bypass combining gradients whenever
    # target networks are disabled
    def combineTargetGrads(self):
        if self.target_net == self.value_net:
            return

        addGradients_(self.value_net, self.target_net)

    def update(self, s, a, sp, r, gamma):
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        if self.steps % self.target_refresh == 0 and self.target_refresh > 1:
            cloneNetworkWeights(self.value_net, self.target_net)

        if len(self.buffer) > self.batch_size + 1:
            samples, idcs = self.buffer.sample(self.batch_size)
            batch = getBatchColumns(samples)
            predictions = self.forward(batch)
            tde = self.updateNetwork(batch, predictions)
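            # the TD errors drive the priority update below; for the uniform replay buffer this is presumably a no-op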

            self.buffer.update_priorities(idcs, tde)
Example #8
class BaseAgent:
    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']

        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))
        # self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min')

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0
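        # note: relies on a module-level env for the grid dimensions rather than a constructor argument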
        self.actionCounter = np.zeros((env.width, env.height, env.num_actions))

        # initialize the weights of the target network to match the weights of policy network
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        # take a random action about epsilon percent of the time
        if np.random.rand() < self.epsilon:
            a = np.random.randint(self.actions)
            return torch.tensor(a, device=device)

        # otherwise take a greedy action
        q_s, _ = self.policy_net(x)
        # print(q_s.detach().numpy()[0][3])
        # print(q_s.argmax().detach())  # leftover debug output

        return q_s.argmax().detach()

    def updateNetwork(self, samples):
        pass

    def update(self, s, a, r, sp, gamma):
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, r, sp, gamma))
        self.steps += 1
        a = a.numpy()
        s = s.numpy()

        self.actionCounter[s[0][0]][s[0][1]][a] += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer) > 32:
            samples, idcs = self.buffer.sample(32)
            self.updateNetwork(samples)
Example #9
class DDPG(algorithms):
    def __init__(self, args):
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = DeterministicPolicy(state_dim, action_dim, 64,
                                         self.env.action_space).to(device)
        self.actor_target = DeterministicPolicy(
            state_dim, action_dim, 64, self.env.action_space).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          self.args.lr)

        self.critic = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           self.args.lr)

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0
        self.global_steps = 0

        if self.args.last_episode > 0:
            self.load(self.args.last_episode)

    def update(self):
        for it in range(self.args.update_iteration):
            # sample from replay buffer
            x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # compute the target Q value
            next_action, _, _ = self.actor_target.sample(next_state)
            target_Q = self.critic_target(next_state, next_action)
            target_Q = reward + (
                (1 - done) * self.args.gamma * target_Q).detach()

            # get current Q estimate
            current_Q = self.critic(state, action)

            # compute critic loss and update
            critic_loss = F.mse_loss(current_Q, target_Q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # compute actor loss
            actor_action, _, _ = self.actor.sample(state)
            actor_loss = -self.critic(state, actor_action).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target model
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) *
                                        target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) *
                                        target_param.data)

            self.num_actor_update_iteration += 1
            self.num_critic_update_iteration += 1

    def train(self):
        for i in range(self.args.max_episode):
            state = self.env.reset()
            ep_r = 0
            for t in count():
                action, _, _ = self.actor.sample(
                    torch.FloatTensor([state]).to(device))
                action = action.cpu().detach().numpy()[0]

                next_state, reward, done, info = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                self.replay_buffer.push(
                    (state, next_state, action, reward, float(done)))
                state = next_state

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print(
                            "Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                            .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity - 1:
                self.update()
        self.save(i + 1)

    def evaluate(self, number=1, render=True):
        rewards = []
        for _ in range(number):
            total_rews = 0
            time_step = 0
            done = False
            state = self.env.reset()
            while not done:
                with torch.no_grad():
                    # use the mean action
                    _, _, action = self.actor.sample(
                        torch.FloatTensor([state]).to(device))
                    action = action.cpu().detach().numpy()[0]
                if render:
                    self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_rews += reward
                time_step += 1

            if render:
                print("total reward of this episode is " + str(total_rews))
            rewards.append(total_rews)
        rewards = np.array(rewards)
        if not render:
            pickle.dump((self.global_steps, rewards), self.log_file)
        print("mean reward {}, max reward {}".format(rewards.mean(),
                                                     rewards.max()))

    def load(self, episode=None):
        file_name = self.weights_file(episode)
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])
        print("successfully load model from " + file_name)

    def save(self, episode=None):
        file_name = self.weights_file(episode)
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic': self.critic.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, file_name)
        print("save model to " + file_name)
Example #10
class DDPG:

    CRITIC_NAME = "critic"
    TARGET_CRITIC_NAME = "target_critic"

    ACTOR_NAME = "actor"
    TARGET_ACTOR_NAME = "target_actor"

    def __init__(self,
                 state_dim,
                 action_dim,
                 monitor_directory,
                 actor_learning_rate=1e-5,
                 critic_learning_rate=1e-3,
                 critic_target_update_rate=1e-3,
                 actor_target_update_rate=1e-3,
                 discount=0.99,
                 l2_decay=1e-2,
                 buffer_size=1000000,
                 batch_size=64,
                 detail_summary=False,
                 tanh_action=True,
                 input_batch_norm=True,
                 all_batch_norm=True,
                 log_frequency=10):

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.critic_learning_rate = critic_learning_rate
        self.actor_learning_rate = actor_learning_rate
        self.critic_target_update_rate = critic_target_update_rate
        self.actor_target_update_rate = actor_target_update_rate
        self.discount = discount
        self.batch_size = batch_size
        self.l2_decay = l2_decay
        self.buffer_size = buffer_size
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detail_summary = detail_summary
        self.tanh_action = tanh_action
        self.input_batch_norm = input_batch_norm
        self.all_batch_norm = all_batch_norm
        self.log_frequency = log_frequency

        self.step = 0
        self.solved = False

        self.buffer = ReplayBuffer(buffer_size, self.state_dim,
                                   self.action_dim)

        self.__build()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        utils.log_params(
            self.summary_dir, {
                "actor learning rate": self.actor_learning_rate,
                "critic learning rate": self.critic_learning_rate,
                "batch size": self.batch_size,
                "actor update rate": self.actor_target_update_rate,
                "critic update rate": self.critic_target_update_rate,
                "buffer size": self.buffer_size,
            })

        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session = tf.Session()

        self.merged = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.session.graph)

        self.session.run(init_op)

    """
  PUBLIC
  """

    def learn(self):

        batch = self.buffer.sample(self.batch_size)
        self.__train_critic(batch["states"], batch["actions"],
                            batch["rewards"], batch["next_states"],
                            batch["done"])
        self.__train_actor(batch["states"])

        self.session.run([
            self.target_critic_update, self.target_actor_update,
            self.inc_global_step
        ])

    def act(self, state):
        a = self.session.run(self.action,
                             feed_dict={
                                 self.state_input: state,
                                 self.is_training: False
                             })[0]
        return a

    def perceive(self, transition):
        self.buffer.add(transition)

    def log_scalar(self, name, value, index):
        summary_value = summary_pb2.Summary.Value(tag=name, simple_value=value)
        summary_2 = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(summary_2, global_step=index)

    def save(self):
        self.saver.save(self.session,
                        self.summary_dir,
                        global_step=self.session.run(self.global_step))

    def close(self):
        self.session.close()

    """
  PRIVATE
  """

    def __build_critic(self, name, state_input, action_input):

        bn_training = self.is_training
        if name == self.TARGET_CRITIC_NAME:
            bn_training = False

        with tf.variable_scope(name):

            # weights and biases
            W1 = self.__get_weights((self.state_dim, 400),
                                    self.state_dim,
                                    name="W1")
            b1 = self.__get_weights((400, ), self.state_dim, name="b1")

            W2 = self.__get_weights((400, 300),
                                    400 + self.action_dim,
                                    name="W2")
            b2 = self.__get_weights((300, ), 400 + self.action_dim, name="b2")

            W2_action = self.__get_weights((self.action_dim, 300),
                                           400 + self.action_dim,
                                           name="W2_action")

            W3 = tf.Variable(tf.random_uniform((300, 1), -3e-3, 3e-3),
                             name="W3")
            b3 = tf.Variable(tf.random_uniform((1, ), -3e-3, 3e-3), name="b3")

            # layers
            if self.input_batch_norm:
                state_input = tf.layers.batch_normalization(
                    state_input, training=bn_training)

            layer_1 = tf.matmul(state_input, W1) + b1

            if self.all_batch_norm:
                layer_1 = tf.layers.batch_normalization(layer_1,
                                                        training=bn_training)

            layer_1 = tf.nn.relu(layer_1)

            layer_2 = tf.nn.relu(
                tf.matmul(layer_1, W2) + tf.matmul(action_input, W2_action) +
                b2)

            output_layer = tf.matmul(layer_2, W3) + b3

            # summary
            if name == self.CRITIC_NAME:
                self.critic_summaries = [
                    tf.summary.histogram("W1", W1),
                    tf.summary.histogram("b1", b1),
                    tf.summary.histogram("W2", W2),
                    tf.summary.histogram("b2", b2),
                    tf.summary.histogram("W2_action", W2_action),
                    tf.summary.histogram("W3", W3),
                    tf.summary.histogram("b3", b3),
                    tf.summary.histogram("layer_1", layer_1),
                    tf.summary.histogram("layer_2", layer_2),
                    tf.summary.histogram("output_layer", output_layer)
                ]

            # weight decay
            weights = [W1, b1, W2, b2, W2_action, W3, b3]
            weight_decay = tf.add_n(
                [self.l2_decay * tf.nn.l2_loss(var) for var in weights])

            return output_layer, weight_decay

    def __build_actor(self, name, state_input):

        bn_training = self.is_training
        if name == self.TARGET_ACTOR_NAME:
            bn_training = False

        with tf.variable_scope(name):

            # weights and biases
            W1 = self.__get_weights((self.state_dim, 400),
                                    self.state_dim,
                                    name="W1")
            b1 = self.__get_weights((400, ), self.state_dim, name="b1")

            W2 = self.__get_weights((400, 300), 400, name="W2")
            b2 = self.__get_weights((300, ), 400, name="b2")

            W3 = tf.Variable(tf.random_uniform((300, self.action_dim),
                                               minval=-3e-3,
                                               maxval=3e-3),
                             name="W3")
            b3 = tf.Variable(tf.random_uniform((self.action_dim, ), -3e-3,
                                               3e-3),
                             name="b3")

            # layers
            if self.input_batch_norm:
                state_input = tf.layers.batch_normalization(
                    state_input, training=bn_training)

            layer_1 = tf.matmul(state_input, W1) + b1

            if self.all_batch_norm:
                layer_1 = tf.layers.batch_normalization(layer_1,
                                                        training=bn_training)

            layer_1 = tf.nn.relu(layer_1)

            layer_2 = tf.matmul(layer_1, W2) + b2

            if self.all_batch_norm:
                layer_2 = tf.layers.batch_normalization(layer_2,
                                                        training=bn_training)

            layer_2 = tf.nn.relu(layer_2)

            output_layer = tf.matmul(layer_2, W3) + b3

            # summary
            if name == self.ACTOR_NAME:
                self.actor_summaries = [
                    tf.summary.histogram("W1", W1),
                    tf.summary.histogram("b1", b1),
                    tf.summary.histogram("W2", W2),
                    tf.summary.histogram("b2", b2),
                    tf.summary.histogram("W3", W3),
                    tf.summary.histogram("b3", b3),
                    tf.summary.histogram("layer_1", layer_1),
                    tf.summary.histogram("layer_2", layer_2),
                    tf.summary.histogram("output_layer", output_layer)
                ]

            if self.tanh_action:
                return tf.nn.tanh(output_layer)
            else:
                return output_layer

    def __build(self):

        self.state_input = tf.placeholder(tf.float32,
                                          shape=(None, self.state_dim),
                                          name="state_input")
        self.next_state_input = tf.placeholder(tf.float32,
                                               shape=(None, self.state_dim),
                                               name="next_state_input")
        self.action_input = tf.placeholder(tf.float32,
                                           shape=(None, self.action_dim),
                                           name="action_input")
        self.reward_input = tf.placeholder(tf.float32,
                                           shape=(None, ),
                                           name="reward_input")
        self.done_input = tf.placeholder(tf.float32,
                                         shape=(None, ),
                                         name="done_input")
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        # inputs summary
        if self.detail_summary:
            self.input_summaries = [
                tf.summary.histogram("state", self.state_input),
                tf.summary.histogram("next_state", self.next_state_input),
                tf.summary.histogram("action", self.action_input),
                tf.summary.histogram("reward", self.reward_input),
                tf.summary.histogram("done", self.done_input)
            ]

        self.target_action = self.__build_actor(self.TARGET_ACTOR_NAME,
                                                self.next_state_input)

        self.q_value, weight_decay = self.__build_critic(
            self.CRITIC_NAME, self.state_input, self.action_input)
        self.target_q_value, _ = self.__build_critic(self.TARGET_CRITIC_NAME,
                                                     self.next_state_input,
                                                     self.target_action)

        self.tmp = tf.expand_dims(self.reward_input, 1)

        self.targets = tf.expand_dims(self.reward_input, 1) + self.discount * (
            1 - tf.expand_dims(self.done_input, 1)) * self.target_q_value
        self.diff = self.targets - self.q_value

        self.loss = tf.reduce_mean(
            tf.square(tf.stop_gradient(self.targets) -
                      self.q_value)) + weight_decay
        self.loss_summary = tf.summary.scalar("critic_loss", self.loss)

        self.critic_train_op = tf.train.AdamOptimizer(
            self.critic_learning_rate).minimize(self.loss)

        # add critic batch norm. update
        if self.input_batch_norm or self.all_batch_norm:
            self.critic_bn_update_op = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, scope=self.CRITIC_NAME)
            self.critic_bn_update_op = tf.group(*self.critic_bn_update_op)
            self.critic_train_op = tf.group(self.critic_train_op,
                                            self.critic_bn_update_op)

        self.action = self.__build_actor(self.ACTOR_NAME, self.state_input)
        self.actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                              scope=self.ACTOR_NAME)
        self.action_gradients = tf.gradients(self.q_value,
                                             self.action_input)[0]
        self.actor_params_gradient = tf.gradients(self.action,
                                                  self.actor_params,
                                                  -self.action_gradients)

        # actor gradients summary
        if self.detail_summary:
            self.actor_summaries.append(
                tf.summary.histogram("action_gradient", self.action_gradients))
            for grad in self.actor_params_gradient:
                self.actor_summaries.append(
                    tf.summary.histogram("actor_parameter_gradients", grad))

        self.actor_train_op = tf.train.AdamOptimizer(
            self.actor_learning_rate).apply_gradients(
                zip(self.actor_params_gradient, self.actor_params))

        # add actor batch norm. update
        if self.input_batch_norm or self.all_batch_norm:
            self.actor_bn_update_op = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, scope=self.ACTOR_NAME)
            self.actor_bn_update_op = tf.group(*self.actor_bn_update_op)
            self.actor_train_op = tf.group(self.actor_train_op,
                                           self.actor_bn_update_op)

        self.target_critic_update = architect.create_target_update_ops(
            self.CRITIC_NAME, self.TARGET_CRITIC_NAME,
            self.critic_target_update_rate)
        self.target_actor_update = architect.create_target_update_ops(
            self.ACTOR_NAME, self.TARGET_ACTOR_NAME,
            self.actor_target_update_rate)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step,
                                         tf.add(self.global_step, 1))

        # group summaries
        self.critic_summaries = tf.summary.merge(self.critic_summaries)

        if self.detail_summary:
            self.actor_summaries = tf.summary.merge(self.actor_summaries)
            self.input_summaries = tf.summary.merge(self.input_summaries)

    @staticmethod
    def __get_weights(shape, input_shape, name="var"):
        return tf.Variable(tf.random_uniform(shape,
                                             -1 / math.sqrt(input_shape),
                                             1 / math.sqrt(input_shape)),
                           name=name)

    def __train_actor(self, states):

        actions = self.session.run(self.action,
                                   feed_dict={
                                       self.state_input: states,
                                       self.is_training: True
                                   })

        self.session.run(self.actor_train_op,
                         feed_dict={
                             self.state_input: states,
                             self.action_input: actions,
                             self.is_training: True
                         })

    def __train_critic(self, states, actions, rewards, next_states, done):
        feed_dict = {
            self.state_input: states,
            self.action_input: actions,
            self.reward_input: rewards,
            self.next_state_input: next_states,
            self.done_input: done,
            self.is_training: True
        }
        step = self.session.run(self.global_step)

        if step % self.log_frequency == 0:

            ops = [self.critic_train_op, self.loss_summary]

            if self.detail_summary:
                ops.append(self.actor_summaries)
                ops.append(self.input_summaries)

            res = self.session.run(ops, feed_dict=feed_dict)

            self.summary_writer.add_summary(res[1], global_step=step)

            if self.detail_summary:
                self.summary_writer.add_summary(res[2], global_step=step)
                self.summary_writer.add_summary(res[3], global_step=step)
        else:
            self.session.run(self.critic_train_op, feed_dict=feed_dict)
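# A minimal PyTorch sketch of the deterministic policy gradient trick wired up in
# __build() above: the critic's gradient with respect to its action input is pushed
# back through the actor, which is equivalent to ascending Q(s, actor(s)) in the
# actor parameters. The shapes and single-layer networks are illustrative assumptions.
import torch
import torch.nn as nn

state_dim, action_dim = 3, 2
actor = nn.Linear(state_dim, action_dim)
critic = nn.Linear(state_dim + action_dim, 1)

states = torch.randn(8, state_dim)
actions = actor(states)

# First pass: dQ/da on a detached copy of the actions
# (mirrors tf.gradients(self.q_value, self.action_input)).
action_in = actions.detach().clone().requires_grad_(True)
q = critic(torch.cat([states, action_in], dim=1))
q.sum().backward()
action_grads = action_in.grad

# Second pass: feed -dQ/da as the upstream gradient of the actor output
# (mirrors tf.gradients(self.action, self.actor_params, -self.action_gradients)).
actor.zero_grad()
actions.backward(-action_grads)
# actor.weight.grad and actor.bias.grad now hold the policy-gradient estimate,
# ready for an optimizer step that minimizes -Q, i.e. maximizes Q.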
Example #11
class Agents():
    def __init__(self, args):
        self.args = args
        self.policy = [Q_net(args) for _ in range(args.n_agents)]
        self.hyperNet = HyperNet(args)
        self.policy_target = [copy.deepcopy(p) for p in self.policy]
        self.hyperNet_target = copy.deepcopy(self.hyperNet)
        self.replayBuffer = ReplayBuffer(args)
        self.preference_pool = Preference(args)
        policy_param = [policy.parameters() for policy in self.policy]
        self.optim = torch.optim.Adam(itertools.chain(
            *policy_param, self.hyperNet.parameters()),
                                      lr=self.args.learning_rate)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optim,
                                                            step_size=100,
                                                            gamma=0.9,
                                                            last_epoch=-1)
        self.step = 0

    def choose_action(self, obs, preference, epsilon):
        obs = np.array(obs).transpose((1, 0, 2))
        preference = np.array(preference).transpose((1, 0, 2))
        act = np.array([
            self.policy[i].choose_action(obs[i], preference[i], epsilon)
            for i in range(self.args.n_agents)
        ])

        return act.transpose((2, 0, 1))

    def learn(self):
        def combine(obs, pref):
            ow = []
            n_pref = len(pref)
            for w in range(n_pref):
                ow.append(
                    torch.cat([obs,
                               pref[w]]).unsqueeze(0).to(self.args.device))
            ow = torch.cat(ow, dim=0)
            return ow.unsqueeze(0)

        sample = self.replayBuffer.sample(self.args.batch_size)
        batch_w = self.preference_pool.sample(self.args.batch_size_p,
                                              train=True)
        obs = sample["obs"]
        obs_ = sample["next_obs"]
        act = sample["act"]
        rew = sample["rew"]
        state = sample["state"]
        state_ = sample["next_state"]
        Q_ = []
        ####################################################################
        for i in range(self.args.batch_size):
            Q_.append([])
            for j in range(self.args.batch_size_p):
                Q_[i].append(
                    torch.cat([
                        combine(obs_[a][i], batch_w[a])
                        for a in range(self.args.n_agents)
                    ],
                              dim=0).unsqueeze(0))
            Q_[i] = torch.cat(Q_[i], dim=0)
        Q_ = torch.cat(Q_, dim=0).permute(1, 0, 2, 3)
        ####################################################################
        Q_ = torch.cat([
            self.policy[a].get_target_q(Q_[a], batch_w[a][0]).unsqueeze(0)
            for a in range(self.args.n_agents)
        ],
                       dim=0)
        Q_ = Q_.squeeze(-1).permute(2, 0, 1).view(-1, self.args.n_agents * 3)
        obs = [
            torch.cat([obs[i] for _ in range(self.args.batch_size_p)])
            for i in range(self.args.n_agents)
        ]
        w = copy.deepcopy(batch_w[0])
        batch_w = [
            batch_w[i].data.cpu().numpy().repeat(self.args.batch_size, axis=0)
            for i in range(self.args.n_agents)
        ]
        Q = torch.cat([
            self.policy[i].get_q(obs[i], batch_w[i], act[i])
            for i in range(self.args.n_agents)
        ],
                      dim=-1)
        Q_tot = self.hyperNet.get_Q_tot(state, w, Q)
        Q_tot_target = self.hyperNet_target.get_Q_tot(state_, w, Q_).detach()
        rew = rew.unsqueeze(0).repeat([self.args.batch_size_p, 1,
                                       1]).view(-1, self.args.n_obj)
        loss = self.loss_func(Q_tot, Q_tot_target, rew, w)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        self.lr_scheduler.step()
        # print("learning rate:", self.optim)

    def loss_func(self, Q, Q_target, R, w):
        R = self.convert_type(R)
        w = self.convert_type(w)
        y = R + Q_target
        w = w.repeat([self.args.batch_size, 1]).view(-1, self.args.n_obj)
        La = torch.norm(y - Q, p=2, dim=-1).mean()
        wy = torch.bmm(w.unsqueeze(1), y.unsqueeze(-1))
        wq = torch.bmm(w.unsqueeze(1), Q.unsqueeze(-1))
        Lb = torch.abs(wy - wq).mean()
        # loss = La + Lb
        loss = La
        return loss

    def push(self, traj):
        self.replayBuffer.push(traj["obs"], traj["acts"], traj["rew"],
                               traj["next_obs"], traj["done"], traj["state"],
                               traj["next_state"], traj["pref"])

    def update_target(self):
        self.step += 1
        if self.step % 1000 == 0:
            print("updating target nets")
            self.hyperNet_target.load_state_dict(self.hyperNet.state_dict())
            for i in range(len(self.policy)):
                self.policy_target[i].load_state_dict(
                    self.policy[i].state_dict())

    def convert_type(self, input):
        if not isinstance(input, torch.Tensor):
            input = torch.Tensor(input)
        if input.device != torch.device(self.args.device):
            input = input.to(self.args.device)

        return input

    def save_model(self, ep, path='./model/MOQMIX/'):
        print("saving model")
        state = {}
        for i in range(len(self.policy)):
            state['policy{0}'.format(i)] = self.policy[i].state_dict()
            state['target_policy{0}'.format(
                i)] = self.policy_target[i].state_dict()
        state['hyperNet'] = self.hyperNet.state_dict()
        state['target_hyperNet'] = self.hyperNet_target.state_dict()
        state['optim'] = self.optim.state_dict()
        state['lr_scheduler'] = self.lr_scheduler.state_dict()
        state['epoch'] = ep
        torch.save(state, path + "model.pth")

    def load_model(self, path='./model/MOQMIX/', device='cpu'):
        # restore the state_dicts written by save_model instead of overwriting the
        # modules and optimizer with raw dictionaries
        state = torch.load(path + "model.pth", map_location=device)
        for i in range(len(self.policy)):
            self.policy[i].load_state_dict(state['policy{0}'.format(i)])
            self.policy_target[i].load_state_dict(
                state['target_policy{0}'.format(i)])
        self.hyperNet.load_state_dict(state['hyperNet'])
        self.hyperNet_target.load_state_dict(state['target_hyperNet'])
        self.optim.load_state_dict(state['optim'])
        self.lr_scheduler.load_state_dict(state['lr_scheduler'])
        return state['epoch']
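# A minimal sketch of the two terms computed in loss_func() above, on illustrative
# shapes: La is the L2 distance between the vector-valued targets y = R + Q_target
# and Q, while Lb compares the preference-scalarized values w*y and w*Q via batched
# dot products. batch and n_obj below are assumptions, not values from self.args.
import torch

batch, n_obj = 4, 3
Q = torch.randn(batch, n_obj)          # per-objective Q_tot estimates
y = torch.randn(batch, n_obj)          # R + Q_tot_target
w = torch.rand(batch, n_obj)           # preference weights

La = torch.norm(y - Q, p=2, dim=-1).mean()
wy = torch.bmm(w.unsqueeze(1), y.unsqueeze(-1))    # (batch, 1, 1) scalarized targets
wq = torch.bmm(w.unsqueeze(1), Q.unsqueeze(-1))    # (batch, 1, 1) scalarized estimates
Lb = torch.abs(wy - wq).mean()
loss = La + Lb    # the class above currently optimizes La alone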
Example #12
class DeepQNetwork:

  ACTION_VALUE_NET_NAME = "q-network"
  TARGET_ACTION_VALUE_NET_NAME = "target-q-network"

  def __init__(self, network, prep, exp_policy, state_dim, action_dim, name, learning_rate=1e-3,
               hard_update_frequency=500, soft_update_rate=None, buffer_size=50000, batch_size=32, num_steps=200000,
               discount=0.99, use_huber_loss=True, detailed_summary=False, max_reward=200, steps_before_learn=1000,
               train_freq=1, save_end=True):

    self.network = network
    self.prep = prep
    self.exp_policy = exp_policy
    self.greedy_policy = policy.Greedy()
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.discount = discount
    self.summary_dir = os.path.join(name, "summary")
    self.use_huber_loss = use_huber_loss
    self.detailed_summary = detailed_summary

    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.hard_update_frequency = hard_update_frequency
    self.soft_update_rate = soft_update_rate
    self.num_steps = num_steps
    self.step = 0
    self.steps_before_learn = steps_before_learn
    self.train_freq = train_freq
    self.solved = False
    self.max_reward = max_reward
    self.save_end = save_end

    self.actions = None
    self.rewards = None
    self.done = None
    self.action_q_values = None
    self.max_target_q_values = None
    self.targets = None
    self.global_step = None
    self.inc_global_step = None
    self.train_op = None
    self.states = None
    self.q_values = None
    self.next_states = None
    self.target_q_values = None
    self.target_update = None

    self.build_all()
  
    self.merged = tf.summary.merge_all()

    self.session = tf.Session()

    self.summary_dir = utils.new_summary_dir(self.summary_dir)
    self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.session.graph)

    self.saver = tf.train.Saver(max_to_keep=None)

    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    self.buffer = ReplayBuffer(buffer_size, self.state_dim, self.action_dim)

  def build_all(self):

    self.actions = tf.placeholder(tf.float32, (None, self.action_dim), name="actions")
    self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")
    self.done = tf.placeholder(tf.float32, (None,), name="done")

    self.build_network()
    self.build_target_network()

    if self.soft_update_rate is not None:
      self.create_soft_target_update_op()
    else:
      self.create_hard_target_update_op()

    self.action_q_values = tf.reduce_sum(self.q_values * self.actions, axis=1)
    self.max_target_q_values = tf.reduce_max(self.target_q_values, axis=1)

    self.targets = self.rewards + (1 - self.done) * (self.discount * self.max_target_q_values)

    if self.detailed_summary:
      architect.variable_summaries(self.targets, name="targets")

    td_diff = self.action_q_values - tf.stop_gradient(self.targets)

    if self.use_huber_loss:
      loss = tf.reduce_mean(architect.huber_loss(td_diff))
    else:
      loss = tf.reduce_mean(tf.pow(td_diff, 2))

    tf.summary.scalar("loss", loss)

    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.inc_global_step = tf.assign(self.global_step, tf.add(self.global_step, 1))
    self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)

  def build_network(self):
    self.states, self.q_values = self.network.build(self.state_dim, self.action_dim, self.ACTION_VALUE_NET_NAME)

  def build_target_network(self):
    self.next_states, self.target_q_values = self.network.build(self.state_dim, self.action_dim, self.TARGET_ACTION_VALUE_NET_NAME)

  def create_soft_target_update_op(self):
    # inspired by: https://github.com/yukezhu/tensorflow-reinforce/blob/master/rl/neural_q_learner.py
    net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.ACTION_VALUE_NET_NAME)
    target_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.TARGET_ACTION_VALUE_NET_NAME)

    self.target_update = []
    for v_source, v_target in zip(net_vars, target_net_vars):
      # this is equivalent to target = (1-alpha) * target + alpha * source
      update_op = v_target.assign_sub(self.soft_update_rate * (v_target - v_source))
      self.target_update.append(update_op)

    self.target_update = tf.group(*self.target_update)

  def create_hard_target_update_op(self):
    net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.ACTION_VALUE_NET_NAME)
    target_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.TARGET_ACTION_VALUE_NET_NAME)

    self.target_update = []
    for v_source, v_target in zip(net_vars, target_net_vars):
      update_op = v_target.assign(v_source)
      self.target_update.append(update_op)

    self.target_update = tf.group(*self.target_update)

  def learn(self):
    # learn
    batch = self.buffer.sample(self.batch_size)

    merged, _ = self.session.run([self.merged, self.train_op], feed_dict={
      self.states: batch["states"],
      self.actions: batch["actions"],
      self.rewards: batch["rewards"],
      self.next_states: batch["next_states"],
      self.done: batch["done"]
    })

    self.summary_writer.add_summary(merged, global_step=self.step)

    # target update
    if self.soft_update_rate is not None:
      self.session.run(self.target_update)
    elif self.step % self.hard_update_frequency == 0:
      self.session.run(self.target_update)

  def run_episode(self, env):

    state = env.reset()
    state, skip = self.prep.process(state)

    total_reward = 0

    while True:
      # play
      if skip:
        action = env.action_space.sample()
      else:
        q_values = self.session.run(self.q_values, feed_dict={self.states: state})[0]

        if self.solved:
          action = self.greedy_policy.select_action(q_values)
        else:
          action = self.exp_policy.select_action(q_values)

      action_one_hot = np.zeros(self.action_dim)
      action_one_hot[action] = 1

      tmp_state = state
      tmp_skip = skip

      state, reward, done, info = env.step(action)
      state, skip = self.prep.process(state)

      total_reward += reward

      if not tmp_skip and not skip:
        self.buffer.add({
            "state": tmp_state[0],
            "action": action_one_hot,
            "reward": reward,
            "next_state": state[0],
            "done": int(done)
          })

      if self.step >= self.steps_before_learn and self.step % self.train_freq == 0 and not self.solved:
        # learn
        self.learn()

      _, self.step = self.session.run([self.inc_global_step, self.global_step])

      if done:
        break

    summary_value = summary_pb2.Summary.Value(tag="episode_reward", simple_value=total_reward)
    summary_2 = summary_pb2.Summary(value=[summary_value])
    self.summary_writer.add_summary(summary_2, global_step=self.step)

    if total_reward >= self.max_reward:
      self.solved = True
    else:
      self.solved = False

    if self.step == self.num_steps:
      self.saver.save(self.session, self.summary_dir, global_step=self.step)

    return total_reward, self.step

  def close(self):
    self.session.close()
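# A minimal NumPy sketch of the one-step targets assembled in build_all() above:
# target = r + (1 - done) * discount * max_a' Q_target(s', a'). The numbers are
# illustrative; the point is the masking of terminal transitions.
import numpy as np

discount = 0.99
rewards = np.array([1.0, 0.0, -1.0])
done = np.array([0.0, 0.0, 1.0])
target_q_values = np.array([[0.5, 1.5],
                            [2.0, 0.1],
                            [0.3, 0.7]])   # Q_target(s', .) per transition

max_target_q = target_q_values.max(axis=1)
targets = rewards + (1.0 - done) * discount * max_target_q
# the terminal transition (done == 1) does not bootstrap: targets[2] == -1.0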
Example #13
class BaseDQN(bp.Policy):
    """
    An abstract base class that implements Deep Q-Learning and allows for customization - to be extended by other
    policies that we wrote
    """
    def cast_string_args(self, policy_args):
        policy_args['epsilon'] = float(
            policy_args['epsilon']) if 'epsilon' in policy_args else EPSILON
        policy_args['gamma'] = float(
            policy_args['gamma']) if 'gamma' in policy_args else GAMMA
        self.huber_loss = False
        self.use_softmax_sampling = True

        self.epsilon_decay = 0.90
        self.min_epsilon = MIN_EPSILON

        self.learning_rate = 1e-4
        self.batch_size = 96

        self.state_radius = 5
        self.state_rep = SQUARE
        self.step_forward = True
        self.flatten = FULL

        self.doubleDQN = False

        self.save_model_round = 250

        self.augment_after_normaliztion = False
        policy_args = self._additional_args(policy_args)

        return policy_args

    def _save_model(self):
        self.old_model.set_weights(self.model.get_weights())

    def init_run(self):
        self.log("Starting init")
        self.r_sum = 0

        if self.state_rep == SQUARE:
            self.state_proc = SquareAroundHeadState(
                radius=self.state_radius,
                step_forward=self.step_forward,
                flatten=self.flatten)
        elif self.state_rep == DIAMOND:
            self.state_proc = DiamondAroundHeadState(
                radius=self.state_radius,
                step_forward=self.step_forward,
                flatten=self.flatten)
        elif self.state_rep == RADAR:
            self.state_proc = RadarState(num_per_type=NUM_PER_TYPE)

        self.input_shape = self.state_proc.get_shape()

        self.model = self._build_model()
        self.model.summary()

        if self.huber_loss:
            loss = huber_loss
        else:
            loss = 'mse'

        opt = Adam(self.learning_rate)
        self.model.compile(loss=loss, optimizer=opt)

        self.old_model = keras.models.clone_model(self.model)
        self._save_model()

        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.log("Init finished!")

        self.num_of_samples = 0
        self.sum_of_loss = 0

    def learn(self, round, prev_state, prev_action, reward, new_state,
              too_slow):
        try:
            if round % 100 == 0:
                if round > self.game_duration - self.score_scope:
                    self.log(
                        "Rewards in last 100 rounds which counts towards the score: {}, eps={:.2f}, "
                        "db_size={}".format(str(self.r_sum), self.epsilon,
                                            len(self.memory)), 'VALUE')
                else:
                    total_loss = self.sum_of_loss / self.num_of_samples
                    self.num_of_samples = self.sum_of_loss = 0
                    self.log(
                        "Rewards in last 100 rounds: {}, eps={:.2f}, db_size={}, loss={:.3f}"
                        .format(str(self.r_sum), self.epsilon,
                                len(self.memory), total_loss), 'VALUE')
                self.r_sum = 0
            else:
                self.r_sum += reward
        except Exception as e:
            self.log("Something Went Wrong...", 'EXCEPTION')
            self.log(e, 'EXCEPTION')

        prev, actions, rewards, new = self.memory.sample(self.batch_size)

        if self.doubleDQN:
            target = rewards + self.gamma * self.old_model.predict(new)[
                range(len(new)),
                np.argmax(self.model.predict(new), axis=1)]
        else:
            target = rewards + self.gamma * np.amax(
                self.old_model.predict(new), axis=1)
        target_f = self.model.predict(prev)

        try:
            target_f[range(len(actions)), actions] = target
            hist = self.model.fit(prev,
                                  target_f,
                                  epochs=1,
                                  verbose=0,
                                  batch_size=len(prev),
                                  shuffle=True)
            self.sum_of_loss += np.sum(hist.history['loss'])
            self.num_of_samples += len(hist.history['loss'])
        except Exception as e:
            print(e)

        if round % self.save_model_round == 0 and round > 0:
            self._save_model()
        if round % 200 == 0 and round > 0 and self.epsilon > 0:
            self.epsilon = max(self.epsilon * self.epsilon_decay,
                               self.min_epsilon)

    def act(self, round, prev_state, prev_action, reward, new_state, too_slow):
        if round > self.game_duration - self.score_scope:
            # cancel exploration during "money-time"
            self.use_softmax_sampling = False
            self.epsilon = 0

        new_state_repr = self.state_proc.get_state_repr(new_state)

        if prev_state is not None:
            prev_state_repr = self.state_proc.get_state_repr(prev_state)
            self.memory.record(prev_state_repr,
                               bp.Policy.ACTIONS.index(prev_action), reward,
                               new_state_repr)
            if self.augment_after_normaliztion and prev_state[1][
                    1] == new_state[1][1]:
                self.memory.record(*augment_after_normaliztion(
                    prev_state_repr, prev_state[1][1],
                    bp.Policy.ACTIONS.index(prev_action), reward,
                    new_state_repr, new_state[1][1], self.state_radius))

        if self.use_softmax_sampling:
            return np.random.choice(
                bp.Policy.ACTIONS,
                p=softmax(
                    self.model.predict(new_state_repr[np.newaxis]) /
                    self.epsilon).squeeze())
        else:  # use epsilon-greedy
            if np.random.rand() < self.epsilon:
                return np.random.choice(bp.Policy.ACTIONS)
            else:
                prediction = self.model.predict(new_state_repr[np.newaxis])[0]
                action = bp.Policy.ACTIONS[np.argmax(prediction)]
                return action

    @abstractmethod
    def _build_model(self) -> Model:
        raise NotImplementedError

    @abstractmethod
    def _additional_args(self, policy_args):
        raise NotImplementedError
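# A minimal NumPy sketch of the two target rules in learn() above. Vanilla DQN takes
# the max of the old (target) model's predictions on the next states; Double DQN picks
# the argmax action with the online model and evaluates it with the old model, which
# damps the overestimation caused by the max operator. The numbers are illustrative.
import numpy as np

gamma = 0.95
rewards = np.array([0.0, 1.0])
q_online_next = np.array([[3.0, 1.0],     # stand-in for self.model.predict(new)
                          [2.0, 0.5]])
q_target_next = np.array([[0.8, 2.0],     # stand-in for self.old_model.predict(new)
                          [1.5, 0.9]])

vanilla = rewards + gamma * q_target_next.max(axis=1)
double = rewards + gamma * q_target_next[np.arange(len(rewards)),
                                         np.argmax(q_online_next, axis=1)]
# first transition: vanilla bootstraps from 2.0 while Double DQN bootstraps from 0.8,
# because the online network prefers action 0 but the target network's max is action 1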