Example #1
    def training_with_policy(self, expert_policy, max_imitation_learning_step):

        self.step = 0
        s = self.env.reset()
        buffer = ReplayMemory(self.batch_size, ["value", "logp"])
        expert_action_set, generator_action_set = [], []
        while self.step < max_imitation_learning_step:
            self.step += 1
            expert_action = expert_policy(s)
            generator_action = self.policy_network.forward(s)
            s_, r, done, info = self.env.step(generator_action.cpu().squeeze(0).numpy())
            Q = self.value_network.forward(s)
            IL_reward = self.Discriminator_training(s, expert_action, generator_action)
            sample_ = {
                "s": s,
                "a": generator_action.squeeze(0),
                "r": IL_reward,
                "tr": torch.tensor([int(done)]),
                "s_":torch.from_numpy(s_),
                "logp": -1.9189,
                "value": Q}

            buffer.push(sample_)
            s = self.env.reset() if done else s_
            # expert_action_set.append(expert_action)
            # generator_action_set.append(generator_action)

            if self.step % self.batch_size == 0 and self.step > 1:
                self.base_algorithm.update(buffer.memory)
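Note: the example above delegates the imitation reward to self.Discriminator_training(s, expert_action, generator_action), which is not shown. A minimal GAIL-style sketch of such a reward is given below; the discriminator module (assumed to end in a sigmoid), its optimizer, the batched tensor inputs, and the -log(1 - D) reward form are illustrative assumptions, not the library's actual implementation.

import torch
import torch.nn as nn

def discriminator_reward(discriminator, optim, s, expert_a, generator_a):
    # Train the discriminator to score expert (s, a) pairs high and generator pairs low,
    # then read off -log(1 - D(s, a_generator)) as the imitation reward.
    bce = nn.BCELoss()
    expert_score = discriminator(torch.cat((s, expert_a), dim=-1))
    gen_score = discriminator(torch.cat((s, generator_a.detach()), dim=-1))
    d_loss = bce(expert_score, torch.ones_like(expert_score)) + \
             bce(gen_score, torch.zeros_like(gen_score))
    optim.zero_grad()
    d_loss.backward()
    optim.step()
    with torch.no_grad():
        reward = -torch.log(1.0 - discriminator(torch.cat((s, generator_a), dim=-1)) + 1e-8)
    return reward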
Example #2
    def __init__(self, env, actor_model, critic_model,
                 actor_lr=1e-4, critic_lr=3e-4,
                 actor_target_network_update_freq=0.1, critic_target_network_update_freq=0.1,
                 actor_training_freq=2, critic_training_freq=1,
                 ## hyper-parameter
                 gamma=0.99, batch_size=32, buffer_size=50000, learning_starts=1000,
                 ## decay
                 decay=False, decay_rate=0.9, l2_regulization=0.01,
                 ##
                 path=None):

        self.gpu = False
        self.env = env
        self.gamma = gamma
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.actor_training_freq, self.critic_training_freq = actor_training_freq, critic_training_freq
        self.actor_target_network_update_freq = actor_target_network_update_freq
        self.critic_target_network_update_freq = critic_target_network_update_freq

        self.replay_buffer = ReplayMemory(buffer_size)
        self.actor = actor_model
        self.critic = critic_build(critic_model)

        self.actor_critic = actor_critic(self.actor, self.critic)

        self.target_actor = deepcopy(self.actor)
        self.target_critic = deepcopy(self.critic)

        actor_optim = Adam(self.actor.parameters(), lr=actor_lr)
        critic_optim = Adam(self.critic.parameters(), lr=critic_lr, weight_decay=l2_regulization)
        if decay:
            self.actor_optim = torch.optim.lr_scheduler.ExponentialLR(actor_optim, decay_rate, last_epoch=-1)
            self.critic_optim = torch.optim.lr_scheduler.ExponentialLR(critic_optim, decay_rate, last_epoch=-1)
        else:
            self.actor_optim = actor_optim
            self.critic_optim = critic_optim

        # NOTE: clip_grad_norm_ here only clips gradients that exist at construction time
        # (there are none yet); to clip during training it has to be called after each backward().
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1, norm_type=2)
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1, norm_type=2)


        super(TD3_Agent, self).__init__(path)
        example_input = Variable(torch.rand(100, self.env.observation_space.shape[0]))
        self.writer.add_graph(self.actor_critic, input_to_model=example_input)
        self.forward_step_show_list = []
        self.backward_step_show_list = []
        self.forward_ep_show_list = []
        self.backward_ep_show_list = []
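Note: the fractional actor_target_network_update_freq=0.1 and critic_target_network_update_freq=0.1 defaults above suggest soft (Polyak) target updates rather than periodic hard copies. A minimal sketch of such an update, with tau standing in for that fraction, could look like this; it is an assumption about how the value is consumed, not the library's code.

import torch

def soft_update(target_net, source_net, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    with torch.no_grad():
        for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
            t_param.data.mul_(1.0 - tau).add_(tau * s_param.data)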
Example #3
    def Imitation_Learning(self, step_time, data=None, policy=None, learning_start=1000,
                           buffer_size=5000, value_training_round=10, value_training_fre=2500,
                           verbose=2, render=False):
        '''
        :param data:   a list whose elements are dicts with the 5 keys s, a, r, s_, tr, i.e.
                       sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
        :param policy: a callable that maps the environment to an expert action, used to collect demonstrations
        :return:
        '''
        if data is not None and policy is not None:
            raise Exception("Imitation learning needs exactly one guidance source: pass either data or policy, not both.")

        if data is not None:
            for time in range(step_time):
                self.step += 1
                loss = self.backward(data[time])
                if verbose == 1:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.dumpkvs()

        if policy is not None:
            buffer = ReplayMemory(buffer_size)
            s = self.env.reset()
            loss_BC = 0
            ep_step, ep_reward = 0, 0
            for _ in range(step_time):
                self.step += 1
                ep_step += 1
                a = policy(self.env)
                s_, r, done, info = self.env.step(a)
                #print(r,info)
                ep_reward += r
                if render:
                    self.env.render()
                sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
                buffer.push(sample)
                s = s_[:]
                if self.step > learning_start:
                    sample_ = buffer.sample(self.batch_size)
                    loss = self.policy_behavior_clone(sample_)
                    if self.step % value_training_fre == 0:
                        record_sample = {}
                        for key in buffer.memory.keys():
                            record_sample[key] = np.array(buffer.memory[key]).astype(np.float32)[-value_training_fre:]
                        record_sample["value"] = self.value.forward(torch.from_numpy(record_sample["s"]))
                        returns, advants = get_gae(record_sample["r"], record_sample["tr"], record_sample["value"],
                                                   self.gamma, self.lam)
                        record_sample["advs"] = advants
                        record_sample["return"] = returns
                        for round_ in range(value_training_round):
                            loss_value = self.value_pretrain(record_sample, value_training_fre)
                            print(round_, loss_value)

                    if verbose == 1:
                        logger.record_tabular("learning_steps", self.step)
                        logger.record_tabular("loss", loss)
                        logger.record_tabular("rewrad",r)
                        logger.dumpkvs()
                    loss_BC += loss
                if done:
                    if verbose == 2:
                        logger.record_tabular("learning_steps", self.step)
                        logger.record_tabular("step_used", ep_step)
                        logger.record_tabular("loss", loss_BC/ep_step)
                        logger.record_tabular("ep_reward",ep_reward )
                        logger.dumpkvs()

                    s = self.env.reset()
                    loss_BC = 0
                    ep_step, ep_reward = 0, 0
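Note: get_gae(...) is called above with (rewards, terminals, values, gamma, lam) and is expected to return (returns, advantages). A minimal sketch of Generalized Advantage Estimation with that signature, assuming flat 1-D arrays and bootstrapping with zero after the last collected step, is:

import numpy as np

def get_gae(rewards, terminals, values, gamma, lam):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    # A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
    values = np.asarray(values, dtype=np.float32).reshape(-1)
    advantages = np.zeros_like(values)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        next_value = 0.0 if t == len(rewards) - 1 else values[t + 1]
        mask = 1.0 - float(terminals[t])
        delta = rewards[t] + gamma * next_value * mask - values[t]
        gae = delta + gamma * lam * mask * gae
        advantages[t] = gae
    returns = advantages + values
    return returns, advantages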
Example #4
    def runner(self, sample_step=None, sample_ep=None, max_ep_step=2000, record_ep_inter=None, lstm_enable=False):
        if sample_step is not None:
            buffer = ReplayMemory(sample_step, ["value", "logp","info"])
        else:
            buffer = ReplayMemory(sample_ep*max_ep_step, ["value", "logp","info"])
        s = self.env.reset()
        ep_reward, ep_Q_value, ep_step_used = [], [], []
        ep_r, ep_q, ep_cycle = 0, 0, 0
        while True:
            s = torch.from_numpy(s.astype(np.float32))
            with torch.no_grad():
                outcome = self.policy.forward(s.unsqueeze(0))
                Q = self.value.forward(s.unsqueeze(0))
            pd = self.dist(outcome)
            a = pd.sample()
            s_, r, done, info = self.env.step(a.cpu().squeeze(0).numpy())
            if self.render:
                self.env.render()
            ep_r += r
            ep_q += Q
            ep_cycle += 1
            self.step += 1
            logp = pd.log_prob(a)
            sample_ = {
                "s": s,
                "a": a.squeeze(0),
                "r": torch.tensor(np.array([r]).astype(np.float32)),
                "tr": torch.tensor([int(done)]),
                "s_":torch.from_numpy(s_),
                "logp": logp.squeeze(0),
                "value": Q.squeeze(0),
                "info": info}
            buffer.push(sample_)
            s = deepcopy(s_)

            if record_ep_inter is not None:
                if self.episode % record_ep_inter == 0:
                    kvs = {"s": s, "a": a, "s_": s_, "r": r,
                           "tr": done, "ep": self.episode, "step": self.step, "ep_step": ep_cycle}
                    self.csvwritter.writekvs(kvs)

            if done:
                s = self.env.reset()
                self.episode += 1
                ep_reward.append(ep_r)
                ep_Q_value.append(ep_q)
                ep_step_used.append(ep_cycle)
                ep_r, ep_q, ep_cycle = 0, 0, 0
                if lstm_enable:
                    self.policy.reset_h()

            if sample_step is not None:
                if self.step > 0 and self.step % sample_step==0:
                    s_ = torch.from_numpy(s_[np.newaxis,:].astype(np.float32))
                    with torch.no_grad():
                        last_Q = self.value.forward(s_).squeeze()
                    #print("now is we have sampled for :", self.step , "and" , self.episode,"\n",
                    #      "this round have sampled for " + str(sample_step) + " steps, ", len(ep_reward), "episode",
                    #      "and the mean reward per step is",  np.mean(buffer.memory["r"]),
                    #      "the mean ep reward is ", np.mean(ep_reward))
                    yield {"buffer": buffer.memory,
                           "ep_reward": ep_reward,
                           "ep_Q_value": ep_Q_value,
                           "ep_step_used": ep_step_used,
                           "ep_used": len(ep_reward),
                           "step_used": sample_step,
                           "last_Q" : last_Q
                           }
                    ep_reward, ep_Q_value, ep_step_used = [], [], []
                    buffer = ReplayMemory(sample_step, ["value", "logp", "info"])

            else:
                if self.step > 0 and self.episode % sample_ep==0:
                    s_ = torch.from_numpy(s_.astype(np.float32))
                    last_Q = self.value.forward(s_)
                    #print("now is we have sampled for :", self.step , "and" , self.episode,"\n",
                    #      "this round have sampled for " + str(sample_step) + " steps, ", len(ep_reward), "episode",
                    #      "and the mean reward per step is",  np.mean(buffer.memory["r"]),
                     #     "the mean ep reward is ", np.mean(ep_reward))
                    yield {"buffer": buffer.memory,
                           "ep_reward": ep_reward,
                           "ep_Q_value": ep_Q_value,
                           "ep_step_used": ep_step_used,
                           "ep_used": sample_ep,
                           "step_used": len(buffer.memory["tr"]),
                           "last_Q": last_Q
                           }
                    ep_reward, ep_Q_value, ep_step_used = [], [], []
                    buffer = ReplayMemory(sample_ep * max_ep_step, ["value", "logp", "info"])
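Note: ReplayMemory is used throughout these examples as a dict of per-key lists ("s", "a", "r", "tr", "s_" plus any extra keys such as "value", "logp", "info"), read back through buffer.memory. A minimal stand-in consistent with the push/memory usage in the runner is sketched below; the real class also provides sample and recent_step_sample, which are not modeled here.

from collections import defaultdict

class ReplayMemorySketch:
    def __init__(self, capacity, other_record=None):
        # other_record is accepted only for interface parity; the defaultdict
        # stores whatever keys are pushed.
        self.capacity = capacity
        self.memory = defaultdict(list)

    def push(self, sample):
        # Append each field and drop the oldest entry once capacity is exceeded.
        for key, value in sample.items():
            self.memory[key].append(value)
            if len(self.memory[key]) > self.capacity:
                self.memory[key].pop(0)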
Example #5
class PPO_Agent(Agent_policy_based):
    def __init__(
            self,
            env,
            policy_model,
            value_model,
            lr=5e-4,
            ent_coef=0.01,
            vf_coef=0.5,
            ## hyper-parameter
            gamma=0.99,
            lam=0.95,
            cliprange=0.2,
            batch_size=64,
            value_train_round=200,
            running_step=2048,
            running_ep=20,
            value_regular=0.01,
            buffer_size=50000,
            ## decay
            decay=False,
            decay_rate=0.9,
            lstm_enable=False,
            ##
            path=None):
        self.gpu = False
        self.env = env
        self.gamma = gamma
        self.lam = lam
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.cliprange = cliprange

        self.value_train_step = value_train_round

        self.sample_rollout = running_step
        self.sample_ep = running_ep
        self.batch_size = batch_size
        self.lstm_enable = lstm_enable
        self.replay_buffer = ReplayMemory(buffer_size,
                                          other_record=["value", "return"])

        self.loss_cal = torch.nn.SmoothL1Loss()

        self.policy = policy_model
        if value_model == "shared":
            self.value = policy_model
        elif value_model == "copy":
            self.value = deepcopy(policy_model)
        else:
            self.value = value_model

        self.dist = make_pdtype(env.action_space, policy_model)

        self.policy_model_optim = Adam(self.policy.parameters(), lr=lr)
        self.value_model_optim = Adam(self.value.parameters(),
                                      lr=lr,
                                      weight_decay=value_regular)
        if decay:
            self.policy_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(
                self.policy_model_optim, decay_rate, last_epoch=-1)
            self.value_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(
                self.value_model_optim, decay_rate, last_epoch=-1)

        #torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2)
        #torch.nn.utils.clip_grad_norm_(self.value.parameters(), 1, norm_type=2)

        super(PPO_Agent, self).__init__(path)
        #example_input = Variable(torch.rand((100,)+self.env.observation_space.shape))
        #self.writer.add_graph(self.policy, input_to_model=example_input)

        self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
        self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]

        self.training_round = 0
        self.running_step = 0
        self.record_sample = None
        self.training_step = 0

    def update(self, sample):
        step_len = len(sample["s"])
        for ki in range(step_len):
            sample_ = {
                "s": sample["s"][ki].cpu().numpy(),
                "a": sample["a"][ki].cpu().numpy(),
                "r": sample["r"][ki].cpu().numpy(),
                "tr": sample["tr"][ki].cpu().numpy(),
                "s_": sample["s_"][ki].cpu().numpy(),
                "value": sample["value"][ki].cpu().numpy(),
                "return": sample["return"][ki].cpu().numpy()
            }
            self.replay_buffer.push(sample_)
        '''
        train the value part
        '''
        vfloss_re = []
        for _ in range(self.value_train_step):
            train_value_sample = self.replay_buffer.sample(self.batch_size)
            if self.gpu:
                for key in train_value_sample.keys():
                    train_value_sample[key] = train_value_sample[key].cuda()
            old_value = train_value_sample["value"]
            training_s = train_value_sample["s"]
            R = train_value_sample["return"]
            value_now = self.value.forward(training_s).squeeze()
            # value loss
            value_clip = old_value + torch.clamp(
                value_now - old_value, min=-self.cliprange,
                max=self.cliprange)  # Clipped value
            vf_loss1 = self.loss_cal(value_now, R)  # Unclipped loss
            vf_loss2 = self.loss_cal(value_clip, R)  # Clipped loss
            vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
            self.value_model_optim.zero_grad()
            vf_loss.backward()
            self.value_model_optim.step()
            vfloss_re.append(vf_loss.cpu().detach().numpy())
        '''
        train the policy part
        '''

        for key in sample.keys():
            temp = torch.stack(list(sample[key]), 0).squeeze()
            if self.gpu:
                sample[key] = temp.cuda()
            else:
                sample[key] = temp

        time_round = np.ceil(step_len / self.batch_size)
        time_left = time_round * self.batch_size - step_len
        array = list(range(step_len)) + list(range(int(time_left)))
        array_index = []
        for train_time in range(int(time_round)):
            array_index.append(
                array[train_time * self.batch_size:(train_time + 1) *
                      self.batch_size])

        loss_re, pgloss_re, enloss_re = [], [], []
        for train_time in range(int(time_round)):
            index = array_index[train_time]
            # for index in range(step_len):
            training_s = sample["s"][index].detach()
            training_a = sample["a"][index].detach()
            old_neglogp = sample["logp"][index].detach()
            advs = sample["advs"][index].detach()

            " CALCULATE THE LOSS"
            " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss"

            #generate Policy gradient loss
            outcome = self.policy.forward(training_s).squeeze()
            # new_neg_lop = torch.empty(size=(self.batch_size,))
            # for time in range(self.batch_size):
            #     new_policy = self.dist(outcome[time])
            #     new_neg_lop[time] = new_policy.log_prob(training_a[time])
            new_policy = self.dist(outcome)
            new_logp = new_policy.log_prob(training_a)
            ratio = torch.exp(new_logp - old_logp)
            pg_loss1 = -advs * ratio
            pg_loss2 = -advs * torch.clamp(ratio, 1.0 - self.cliprange,
                                           1.0 + self.cliprange)
            pg_loss = .5 * torch.max(pg_loss1, pg_loss2).mean()

            # entropy
            entropy = new_policy.entropy().mean()
            # loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef
            loss = pg_loss - entropy * self.ent_coef
            self.policy_model_optim.zero_grad()
            loss.backward()
            self.policy_model_optim.step()
            # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"])
            # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange)
            loss_re.append(loss.cpu().detach().numpy())
            pgloss_re.append(pg_loss.cpu().detach().numpy())
            enloss_re.append(entropy.cpu().detach().numpy())

        return np.sum(loss_re), {
            "pg_loss": np.sum(pgloss_re),
            "entropy": np.sum(enloss_re),
            "vf_loss": np.sum(vfloss_re)
        }

    def load_weights(self, filepath):
        model = torch.load(filepath + "/PPO.pkl")
        self.policy.load_state_dict(model["policy"].state_dict())
        self.value.load_state_dict(model["value"].state_dict())

    def save_weights(self, filepath, overwrite=False):
        torch.save({
            "policy": self.policy,
            "value": self.value
        }, filepath + "/PPO.pkl")

    def policy_behavior_clone(self, sample_):
        action_label = sample_["a"].squeeze()
        if self.gpu:
            action_predict = self.policy(sample_["s"].cuda())
            action_label = action_label.cuda()
        else:
            action_predict = self.policy(sample_["s"])
        loss_bc = self.loss_cal(action_label, action_predict)
        del action_label
        del action_predict
        loss = loss_bc
        self.policy_model_optim.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2)
        self.policy_model_optim.step()
        return loss.cpu().detach().numpy()

    def value_pretrain(self, record_sample, new_sample_len):
        train_times = int(np.floor(new_sample_len / 128))
        round_loss = 0
        for io in range(train_times - 1):
            index = list(range(128 * io, 128 * (io + 1)))
            if self.gpu:
                predict = torch.from_numpy(
                    np.array(record_sample["s"])[index]).cuda()
                label = torch.from_numpy(np.array(
                    record_sample["return"]))[index].cuda()
            else:
                predict = torch.from_numpy(np.array(record_sample["s"])[index])
                label = torch.from_numpy(np.array(
                    record_sample["return"]))[index]
            value_now = self.value.forward(predict)
            # value loss
            vf_loss = self.loss_cal(value_now, label)  # Unclipped loss
            del predict
            del label
            self.value_model_optim.zero_grad()
            vf_loss.backward()
            self.value_model_optim.step()
            round_loss += vf_loss.cpu().detach().numpy()
        return round_loss

    def cuda(self, device=None):
        self.policy.to_gpu(device)
        self.value.to_gpu(device)
        self.loss_cal = self.loss_cal.cuda(device)
        self.gpu = True
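Note: the policy term in update() above is the standard PPO clipped surrogate (the code applies an extra 0.5 factor). Restated as a standalone sketch over log-probabilities and advantages:

import torch

def ppo_clip_loss(new_logp, old_logp, advantages, cliprange):
    # r = exp(new_logp - old_logp); loss = E[max(-A * r, -A * clip(r, 1 - eps, 1 + eps))]
    ratio = torch.exp(new_logp - old_logp)
    pg_loss1 = -advantages * ratio
    pg_loss2 = -advantages * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)
    return torch.max(pg_loss1, pg_loss2).mean()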
Example #6
    def __init__(
            self,
            env,
            policy_model,
            value_model,
            lr=5e-4,
            ent_coef=0.01,
            vf_coef=0.5,
            ## hyper-parameter
            gamma=0.99,
            lam=0.95,
            cliprange=0.2,
            batch_size=64,
            value_train_round=200,
            running_step=2048,
            running_ep=20,
            value_regular=0.01,
            buffer_size=50000,
            ## decay
            decay=False,
            decay_rate=0.9,
            lstm_enable=False,
            ##
            path=None):
        self.gpu = False
        self.env = env
        self.gamma = gamma
        self.lam = lam
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.cliprange = cliprange

        self.value_train_step = value_train_round

        self.sample_rollout = running_step
        self.sample_ep = running_ep
        self.batch_size = batch_size
        self.lstm_enable = lstm_enable
        self.replay_buffer = ReplayMemory(buffer_size,
                                          other_record=["value", "return"])

        self.loss_cal = torch.nn.SmoothL1Loss()

        self.policy = policy_model
        if value_model == "shared":
            self.value = policy_model
        elif value_model == "copy":
            self.value = deepcopy(policy_model)
        else:
            self.value = value_model

        self.dist = make_pdtype(env.action_space, policy_model)

        self.policy_model_optim = Adam(self.policy.parameters(), lr=lr)
        self.value_model_optim = Adam(self.value.parameters(),
                                      lr=lr,
                                      weight_decay=value_regular)
        if decay:
            self.policy_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(
                self.policy_model_optim, decay_rate, last_epoch=-1)
            self.value_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(
                self.value_model_optim, decay_rate, last_epoch=-1)

        #torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2)
        #torch.nn.utils.clip_grad_norm_(self.value.parameters(), 1, norm_type=2)

        super(PPO_Agent, self).__init__(path)
        #example_input = Variable(torch.rand((100,)+self.env.observation_space.shape))
        #self.writer.add_graph(self.policy, input_to_model=example_input)

        self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
        self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]

        self.training_round = 0
        self.running_step = 0
        self.record_sample = None
        self.training_step = 0
Example #7
    def __init__(
            self,
            env,
            model,
            policy,
            ## hyper-parameter
            gamma=0.90,
            lr=1e-3,
            batch_size=32,
            buffer_size=50000,
            learning_starts=1000,
            target_network_update_freq=500,
            ## decay
            decay=False,
            decay_rate=0.9,
            ## DDqn && DuelingDQN
            double_dqn=True,
            dueling_dqn=False,
            dueling_way="native",
            ## prioritized_replay
            prioritized_replay=False,
            prioritized_replay_alpha=0.6,
            prioritized_replay_beta0=0.4,
            prioritized_replay_beta_iters=None,
            prioritized_replay_eps=1e-6,
            param_noise=False,
            ##
            path=None):
        """

        :param env:      the GYM environment
        :param model:    the Torch NN model
        :param policy:   the policy when choosing action
        :param ep:       the MAX episode time
        :param step:     the MAX step time
         .........................hyper-parameter..................................
        :param gamma:
        :param lr:
        :param batch_size:
        :param buffer_size:
        :param target_network_update_freq:
        .........................further improve way..................................
        :param double_dqn:  whether to enable Double DQN
        :param dueling_dqn: whether to enable Dueling DQN
        :param dueling_way: the Dueling DQN combining method;
            it can be one of the following three ways
            `avg`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-Avg_a(A(s,a;theta)))
            `max`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-max_a(A(s,a;theta)))
            `naive`: Q(s,a;theta) = V(s;theta) + A(s,a;theta)
        .........................prioritized-part..................................
        :param prioritized_replay: (bool) if True, a prioritized replay buffer will be used.
        :param prioritized_replay_alpha: (float) alpha parameter for the prioritized replay buffer.
            It determines how much prioritization is used, with alpha=0 corresponding to the uniform case.
        :param prioritized_replay_beta0: (float) initial value of beta for the prioritized replay buffer
        :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from its initial
            value to 1.0. If set to None, it equals max_timesteps.
        :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
        .........................imitation_learning_part..................................
        :param imitation_learning_policy:     used to initialize the network with the given policy,
            which is a supervised way of training the network
        :param IL_time:    supervised training times
        :param network_kwargs:
        """
        self.gpu = False
        self.env = env
        self.policy = policy

        self.gamma = gamma
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.target_network_update_freq = target_network_update_freq
        self.double_dqn = double_dqn

        if dueling_dqn:
            self.Q_net = Dueling_dqn(model, dueling_way)
        else:
            self.Q_net = model

        self.target_Q_net = deepcopy(self.Q_net)

        q_net_optim = Adam(self.Q_net.parameters(), lr=lr)
        if decay:
            self.optim = torch.optim.lr_scheduler.ExponentialLR(q_net_optim,
                                                                decay_rate,
                                                                last_epoch=-1)
        else:
            self.optim = q_net_optim

        self.replay_buffer = ReplayMemory(buffer_size)
        self.learning = False
        super(DQN_Agent, self).__init__(path)
        example_input = Variable(
            torch.rand((100, ) + self.env.observation_space.shape))
        self.writer.add_graph(self.Q_net, input_to_model=example_input)
        self.forward_step_show_list = []
        self.backward_step_show_list = []
        self.forward_ep_show_list = []
        self.backward_ep_show_list = []
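Note: the three dueling_way formulas listed in the docstring correspond to a value/advantage head combined into Q-values. A minimal sketch of such a head is shown below; the layer sizes and the internals of the real Dueling_dqn wrapper are assumptions.

import torch
import torch.nn as nn

class DuelingHeadSketch(nn.Module):
    # avg:   Q = V + (A - mean_a A)
    # max:   Q = V + (A - max_a A)
    # naive: Q = V + A
    def __init__(self, feature_dim, n_actions, way="avg"):
        super().__init__()
        self.way = way
        self.value = nn.Linear(feature_dim, 1)
        self.advantage = nn.Linear(feature_dim, n_actions)

    def forward(self, features):
        V = self.value(features)
        A = self.advantage(features)
        if self.way == "avg":
            return V + (A - A.mean(dim=1, keepdim=True))
        if self.way == "max":
            return V + (A - A.max(dim=1, keepdim=True).values)
        return V + A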
Example #8
class DQN_Agent(Agent_value_based):
    def __init__(
            self,
            env,
            model,
            policy,
            ## hyper-parameter
            gamma=0.90,
            lr=1e-3,
            batch_size=32,
            buffer_size=50000,
            learning_starts=1000,
            target_network_update_freq=500,
            ## decay
            decay=False,
            decay_rate=0.9,
            ## DDqn && DuelingDQN
            double_dqn=True,
            dueling_dqn=False,
            dueling_way="native",
            ## prioritized_replay
            prioritized_replay=False,
            prioritized_replay_alpha=0.6,
            prioritized_replay_beta0=0.4,
            prioritized_replay_beta_iters=None,
            prioritized_replay_eps=1e-6,
            param_noise=False,
            ##
            path=None):
        """

        :param env:      the GYM environment
        :param model:    the Torch NN model
        :param policy:   the policy when choosing action
        :param ep:       the MAX episode time
        :param step:     the MAX step time
         .........................hyper-parameter..................................
        :param gamma:
        :param lr:
        :param batch_size:
        :param buffer_size:
        :param target_network_update_freq:
        .........................further improve way..................................
        :param double_dqn:  whether to enable Double DQN
        :param dueling_dqn: whether to enable Dueling DQN
        :param dueling_way: the Dueling DQN combining method;
            it can be one of the following three ways
            `avg`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-Avg_a(A(s,a;theta)))
            `max`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-max_a(A(s,a;theta)))
            `naive`: Q(s,a;theta) = V(s;theta) + A(s,a;theta)
        .........................prioritized-part..................................
        :param prioritized_replay: (bool) if True, a prioritized replay buffer will be used.
        :param prioritized_replay_alpha: (float) alpha parameter for the prioritized replay buffer.
            It determines how much prioritization is used, with alpha=0 corresponding to the uniform case.
        :param prioritized_replay_beta0: (float) initial value of beta for the prioritized replay buffer
        :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from its initial
            value to 1.0. If set to None, it equals max_timesteps.
        :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
        .........................imitation_learning_part..................................
        :param imitation_learning_policy:     used to initialize the network with the given policy,
            which is a supervised way of training the network
        :param IL_time:    supervised training times
        :param network_kwargs:
        """
        self.gpu = False
        self.env = env
        self.policy = policy

        self.gamma = gamma
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.target_network_update_freq = target_network_update_freq
        self.double_dqn = double_dqn

        if dueling_dqn:
            self.Q_net = Dueling_dqn(model, dueling_way)
        else:
            self.Q_net = model

        self.target_Q_net = deepcopy(self.Q_net)

        q_net_optim = Adam(self.Q_net.parameters(), lr=lr)
        if decay:
            self.optim = torch.optim.lr_scheduler.ExponentialLR(q_net_optim,
                                                                decay_rate,
                                                                last_epoch=-1)
        else:
            self.optim = q_net_optim

        self.replay_buffer = ReplayMemory(buffer_size)
        self.learning = False
        super(DQN_Agent, self).__init__(path)
        example_input = Variable(
            torch.rand((100, ) + self.env.observation_space.shape))
        self.writer.add_graph(self.Q_net, input_to_model=example_input)
        self.forward_step_show_list = []
        self.backward_step_show_list = []
        self.forward_ep_show_list = []
        self.backward_ep_show_list = []

    def forward(self, observation):
        observation = observation[np.newaxis, :].astype(np.float32)
        observation = torch.from_numpy(observation)
        Q_value = self.Q_net.forward(observation)
        Q_value = Q_value.cpu().squeeze().detach().numpy()
        if self.policy is not None:
            action = self.policy.select_action(Q_value)
        else:
            action = np.argmax(Q_value)
        return action, np.max(Q_value), {}

    def backward(self, sample_):
        self.replay_buffer.push(sample_)
        if self.step > self.learning_starts and self.learning:
            sample = self.replay_buffer.sample(self.batch_size)
            if self.gpu:
                for key in sample.keys():
                    sample[key] = sample[key].cuda()
            assert len(sample["s"]) == self.batch_size
            a = sample["a"].long().unsqueeze(1)
            Q = self.Q_net(sample["s"]).gather(1, a)
            if self.double_dqn:
                _, next_actions = self.Q_net(sample["s_"]).max(1, keepdim=True)
                targetQ = self.target_Q_net(sample["s_"]).gather(
                    1, next_actions)
            else:
                _, next_actions = self.target_Q_net(sample["s_"]).max(
                    1, keepdim=True)
                targetQ = self.target_Q_net(sample["s_"]).gather(
                    1, next_actions)
            targetQ = targetQ.squeeze(1)
            Q = Q.squeeze(1)
            expected_q_values = sample["r"] + self.gamma * targetQ * (
                1.0 - sample["tr"])
            loss = torch.mean(huber_loss(expected_q_values - Q))
            self.optim.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.Q_net.parameters(),
                                           1,
                                           norm_type=2)
            self.optim.step()
            if self.step % self.target_network_update_freq == 0:
                self.target_net_update()
            loss = loss.cpu().detach().numpy()
            return loss, {}
        return 0, {}

    def target_net_update(self):
        self.target_Q_net.load_state_dict(self.Q_net.state_dict())

    def load_weights(self, filepath):
        model = torch.load(filepath + 'DQN.pkl')
        self.Q_net.load_state_dict(model["Q_net"].state_dict())
        self.target_Q_net.load_state_dict(model["target_Q_net"].state_dict())
        # self.optim.load_state_dict(model["optim"])

    def save_weights(self, filepath, overwrite=True):
        torch.save(
            {
                "Q_net": self.Q_net,
                "target_Q_net": self.target_Q_net,
                "optim": self.optim
            }, filepath + "DQN.pkl")

    def cuda(self):
        self.Q_net = gpu_foward(self.Q_net)
        self.target_Q_net = deepcopy(self.Q_net)
        self.gpu = True
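Note: backward() above applies huber_loss to the TD residual element-wise before taking the mean. That helper is not shown; a common form, assuming delta=1.0, is:

import torch

def huber_loss(residual, delta=1.0):
    # Quadratic for |x| <= delta, linear beyond, which keeps large TD errors from dominating.
    abs_res = torch.abs(residual)
    quadratic = torch.clamp(abs_res, max=delta)
    linear = abs_res - quadratic
    return 0.5 * quadratic ** 2 + delta * linear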
Example #9
    def __init__(
            self,
            env,
            actor_model,
            critic_model,
            actor_lr=1e-4,
            critic_lr=1e-3,
            actor_target_network_update_freq=1000,
            critic_target_network_update_freq=1000,
            actor_training_freq=1,
            critic_training_freq=1,
            sperate_critic=False,
            ## hyper-parameter
            gamma=0.99,
            batch_size=32,
            buffer_size=50000,
            learning_starts=1000,
            ## lr_decay
            decay=False,
            decay_rate=0.9,
            critic_l2_reg=1e-2,
            clip_norm=None,
            ##
            path=None):

        self.gpu = False
        self.env = env
        self.sperate_critic = sperate_critic
        self.gamma = gamma
        self.batch_size = batch_size
        self.learning_starts = learning_starts

        self.replay_buffer = ReplayMemory(buffer_size)

        self.actor_training_freq, self.critic_training_freq = actor_training_freq, critic_training_freq
        self.actor_target_network_update_freq = actor_target_network_update_freq
        self.critic_target_network_update_freq = critic_target_network_update_freq
        self.actor = actor_model
        self.critic = critic_model
        self.target_actor = deepcopy(actor_model)
        self.target_critic = deepcopy(critic_model)

        self.actor_critic = actor_critic(self.actor, self.critic, self.GCN)

        actor_optim = Adam(self.actor.parameters(), lr=actor_lr)
        critic_optim = Adam(self.critic.parameters(),
                            lr=critic_lr,
                            weight_decay=critic_l2_reg)
        if decay:
            self.actor_optim = torch.optim.lr_scheduler.ExponentialLR(
                actor_optim, decay_rate, last_epoch=-1)
            self.critic_optim = torch.optim.lr_scheduler.ExponentialLR(
                critic_optim, decay_rate, last_epoch=-1)
        else:
            self.actor_optim = actor_optim
            self.critic_optim = critic_optim

        super(DDPG_Agent, self).__init__(path)
        #example_input = Variable(torch.rand(100, self.env.observation_space.shape[0]))
        #self.writer.add_graph(self.actor_critic, input_to_model=example_input)
        self.forward_step_show_list = []
        self.backward_step_show_list = []
        self.forward_ep_show_list = []
        self.backward_ep_show_list = []
Example #10
class DDPG_Agent(Agent_value_based):
    def __init__(
            self,
            env,
            actor_model,
            critic_model,
            actor_lr=1e-4,
            critic_lr=1e-3,
            actor_target_network_update_freq=1000,
            critic_target_network_update_freq=1000,
            actor_training_freq=1,
            critic_training_freq=1,
            sperate_critic=False,
            ## hyper-parameter
            gamma=0.99,
            batch_size=32,
            buffer_size=50000,
            learning_starts=1000,
            ## lr_decay
            decay=False,
            decay_rate=0.9,
            critic_l2_reg=1e-2,
            clip_norm=None,
            ##
            path=None):

        self.gpu = False
        self.env = env
        self.sperate_critic = sperate_critic
        self.gamma = gamma
        self.batch_size = batch_size
        self.learning_starts = learning_starts

        self.replay_buffer = ReplayMemory(buffer_size)

        self.actor_training_freq, self.critic_training_freq = actor_training_freq, critic_training_freq
        self.actor_target_network_update_freq = actor_target_network_update_freq
        self.critic_target_network_update_freq = critic_target_network_update_freq
        self.actor = actor_model
        self.critic = critic_model
        self.target_actor = deepcopy(actor_model)
        self.target_critic = deepcopy(critic_model)

        self.actor_critic = actor_critic(self.actor, self.critic, self.GCN)

        actor_optim = Adam(self.actor.parameters(), lr=actor_lr)
        critic_optim = Adam(self.critic.parameters(),
                            lr=critic_lr,
                            weight_decay=critic_l2_reg)
        if decay:
            self.actor_optim = torch.optim.lr_scheduler.ExponentialLR(
                actor_optim, decay_rate, last_epoch=-1)
            self.critic_optim = torch.optim.lr_scheduler.ExponentialLR(
                critic_optim, decay_rate, last_epoch=-1)
        else:
            self.actor_optim = actor_optim
            self.critic_optim = critic_optim

        super(DDPG_Agent, self).__init__(path)
        #example_input = Variable(torch.rand(100, self.env.observation_space.shape[0]))
        #self.writer.add_graph(self.actor_critic, input_to_model=example_input)
        self.forward_step_show_list = []
        self.backward_step_show_list = []
        self.forward_ep_show_list = []
        self.backward_ep_show_list = []

    def forward(self, observation):
        observation = observation[np.newaxis, :].astype(np.float32)
        observation = torch.from_numpy(observation)
        action = self.actor.forward(observation)
        action = torch.normal(action, torch.ones_like(action))
        if self.sperate_critic:
            Q = self.critic.forward(observation,
                                    action).squeeze().detach().numpy()
        else:
            Q = self.critic(torch.cat((observation, action),
                                      dim=1)).squeeze().detach().numpy()
        return action.cpu().squeeze(0).detach().numpy(), Q, {}

    def backward(self, sample_):
        self.replay_buffer.push(sample_)
        if self.step > self.learning_starts and self.learning:
            sample = self.replay_buffer.sample(self.batch_size)
            if self.gpu:
                for key in sample.keys():
                    sample[key] = sample[key].cuda()
            assert len(sample["s"]) == self.batch_size
            "update the critic "
            if self.step % self.critic_training_freq == 0:
                if self.sperate_critic:
                    Q = self.critic.forward(sample["s"], sample["a"])
                else:
                    input = torch.cat((sample["s"], sample["a"]), -1)
                    Q = self.critic.forward(input)
                target_a = self.target_actor(sample["s_"])
                if self.sperate_critic:
                    targetQ = self.target_critic(sample["s_"], target_a)
                else:
                    target_input = torch.cat((sample["s_"], target_a), -1)
                    targetQ = self.target_critic(target_input)
                targetQ = targetQ.squeeze(1)
                Q = Q.squeeze(1)
                expected_q_values = sample["r"] + self.gamma * targetQ * (
                    1.0 - sample["tr"])
                loss = torch.mean(huber_loss(expected_q_values - Q))
                self.critic_optim.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                               1,
                                               norm_type=2)
                self.critic_optim.step()
            "training the actor"
            if self.step % self.actor_training_freq == 0:
                Q = self.actor_critic.forward(sample["s"])
                Q = -torch.mean(Q)
                self.actor_optim.zero_grad()
                Q.backward()
                torch.nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                               1,
                                               norm_type=2)
                self.actor_optim.step()
            if self.step % self.actor_target_network_update_freq == 0:
                self.target_actor_net_update()
            if self.step % self.critic_target_network_update_freq == 0:
                self.target_critic_net_update()
            loss = loss.cpu().detach().numpy()
            return loss, {}
        return 0, {}

    def target_actor_net_update(self):
        self.target_actor.load_state_dict(self.actor.state_dict())

    def target_critic_net_update(self):
        self.target_critic.load_state_dict(self.critic.state_dict())

    def load_weights(self, filepath):
        model = torch.load(filepath + "DDPG.pkl")
        self.actor.load_state_dict(model["actor"].state_dict())
        self.critic.load_state_dict(model["critic"].state_dict())
        self.target_actor.load_state_dict(model["target_actor"].state_dict())
        self.target_critic.load_state_dict(model["target_critic"].state_dict())
        self.actor_optim.load_state_dict(model["actor_optim"].state_dict())
        self.critic_optim.load_state_dict(model["critic_optim"].state_dict())

    def save_weights(self, filepath, overwrite=False):
        torch.save(
            {
                "actor": self.actor,
                "critic": self.critic,
                "target_actor": self.target_actor,
                "target_critic": self.target_critic,
                "actor_optim": self.actor_optim,
                "critic_optim": self.critic_optim
            }, filepath + "DDPG.pkl")

    def cuda(self):
        self.actor.to_gpu()
        self.critic.to_gpu()
        self.target_actor = deepcopy(self.actor)
        self.target_critic = deepcopy(self.critic)
        self.gpu = True
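Note: the actor update in backward() relies on self.actor_critic(sample["s"]) returning Q(s, actor(s)), which is then negated and minimized. A minimal sketch of such a wrapper for the concatenated-input critic used here is given below; the real actor_critic helper, including the self.GCN argument passed in __init__, may differ.

import torch
import torch.nn as nn

class ActorCriticSketch(nn.Module):
    def __init__(self, actor, critic):
        super().__init__()
        self.actor = actor
        self.critic = critic

    def forward(self, state):
        # Deterministic policy gradient: evaluate the critic at the actor's own action.
        action = self.actor(state)
        return self.critic(torch.cat((state, action), dim=-1))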
Example #11
    def __init__(self, env, policy_model, value_model,
                 lr=1e-4, ent_coef=0.01, vf_coef=0.5,
                 ## hyper-parameter
                 gamma=0.99, lam=0.95, cliprange=0.2,
                 buffer_size=50000, learning_starts=1000, running_step=2048, batch_training_round=10,
                 value_regular=0.01,
                 ## decay
                 decay=False, decay_rate=0.9,
                 ##
                 path=None):

        self.env = env
        self.gamma = gamma
        self.lam = lam
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.cliprange = cliprange

        self.learning_starts = learning_starts
        self.batch_training_round = batch_training_round
        self.run_step = running_step
        self.sample_training_step = self.batch_training_round * self.run_step

        self.replay_buffer = ReplayMemory(buffer_size, ["value", "logp"])
        self.loss_cal = torch.nn.MSELoss()

        self.dist = make_pdtype(env.action_space, policy_model)

        self.policy_model = policy_model
        if value_model == "shared":
            self.value_model = policy_model
        elif value_model == "copy":
            self.value_model = deepcopy(policy_model)
        else:
            self.value_model = value_model

        policy_model_optim = Adam(self.policy_model.parameters(), lr=lr)
        value_model_optim = Adam(self.value_model.parameters(), lr=lr, weight_decay=value_regular)
        if decay:
            self.policy_model_optim = torch.optim.lr_scheduler.ExponentialLR(
                policy_model_optim, decay_rate, last_epoch=-1)
            self.value_model_optim = torch.optim.lr_scheduler.ExponentialLR(
                value_model_optim, decay_rate, last_epoch=-1)
        else:
            self.policy_model_optim = policy_model_optim
            self.value_model_optim = value_model_optim

        torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1, norm_type=2)
        torch.nn.utils.clip_grad_norm_(self.value_model.parameters(), 1, norm_type=2)

        self.run_policy = deepcopy(self.policy_model)
        self.run_value = deepcopy(self.value_model)

        super(PPO_Agent, self).__init__(path)
        example_input = Variable(torch.rand(100, self.env.observation_space.shape[0]))
        self.writer.add_graph(self.policy_model, input_to_model=example_input)
        self.forward_step_show_list = []
        self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
        self.forward_ep_show_list = []
        self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]

        self.training_round = 0
        self.training_step = 0
        self.running_step = 0
        self.record_sample = None
        self.train_ticks = np.tile(np.arange(self.run_step), self.batch_training_round)
Example #12
class PPO_Agent(Agent_value_based):
    def __init__(self, env, policy_model, value_model,
                 lr=1e-4, ent_coef=0.01, vf_coef=0.5,
                 ## hyper-parameter
                 gamma=0.99, lam=0.95, cliprange=0.2,
                 buffer_size=50000, learning_starts=1000, running_step=2048, batch_training_round=10,
                 value_regular=0.01,
                 ## decay
                 decay=False, decay_rate=0.9,
                 ##
                 path=None):

        self.env = env
        self.gamma = gamma
        self.lam = lam
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.cliprange = cliprange

        self.learning_starts = learning_starts
        self.batch_training_round = batch_training_round
        self.run_step = running_step
        self.sample_training_step = self.batch_training_round * self.run_step

        self.replay_buffer = ReplayMemory(buffer_size, ["value", "logp"])
        self.loss_cal = torch.nn.MSELoss()

        self.dist = make_pdtype(env.action_space, policy_model)

        self.policy_model = policy_model
        if value_model == "shared":
            self.value_model = policy_model
        elif value_model == "copy":
            self.value_model = deepcopy(policy_model)
        else:
            self.value_model = value_model

        policy_model_optim = Adam(self.policy_model.parameters(), lr=lr)
        value_model_optim = Adam(self.value_model.parameters(), lr=lr, weight_decay=value_regular)
        if decay:
            self.policy_model_optim = torch.optim.lr_scheduler.ExponentialLR(
                policy_model_optim, decay_rate, last_epoch=-1)
            self.value_model_optim = torch.optim.lr_scheduler.ExponentialLR(
                value_model_optim, decay_rate, last_epoch=-1)
        else:
            self.policy_model_optim = policy_model_optim
            self.value_model_optim = value_model_optim

        torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1, norm_type=2)
        torch.nn.utils.clip_grad_norm_(self.value_model.parameters(), 1, norm_type=2)

        self.run_policy = deepcopy(self.policy_model)
        self.run_value = deepcopy(self.value_model)

        super(PPO_Agent, self).__init__(path)
        example_input = Variable(torch.rand(100, self.env.observation_space.shape[0]))
        self.writer.add_graph(self.policy_model, input_to_model=example_input)
        self.forward_step_show_list = []
        self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
        self.forward_ep_show_list = []
        self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]

        self.training_round = 0
        self.training_step = 0
        self.running_step = 0
        self.record_sample = None
        self.train_ticks = np.tile(np.arange(self.run_step), self.batch_training_round)

    def forward(self, observation):
        observation = observation[np.newaxis,:].astype(np.float32)
        observation = torch.from_numpy(observation)
        with torch.no_grad():
            outcome = self.run_policy.forward(observation)
            self.pd = self.dist(outcome)
            self.action = self.pd.sample()
            self.Q = self.run_value.forward(observation)
        return self.action.squeeze(0).detach().numpy(), self.Q.squeeze(0).data.numpy(), {}

    def backward(self, sample_):
        sample_["logp"] = self.pd.log_prob(self.action)
        sample_["value"] = self.Q
        self.replay_buffer.push(sample_)
        self.running_step += 1
        """"""""""""""
        "training part"
        "in each step, we train for batch batch_training_times"
        """"""""""""""
        if self.step > self.learning_starts:
            if self.running_step % self.run_step == 0 and self.training_step == 0:
                " sample advantage generate "
                with torch.no_grad():
                    sample = self.replay_buffer.recent_step_sample(self.running_step)
                    last_value = self.value_model.forward(sample["s_"][-1])
                    self.record_sample = gae(sample, last_value, self.gamma, self.lam)
                self.running_step = 0

            if self.training_step < self.sample_training_step and self.record_sample is not None:
                pg_loss_re = 0
                entropy_re = 0
                vf_loss_re = 0
                loss_re = 0
                for _ in range(self.batch_training_round):
                    index = self.train_ticks[self.training_step]
                    S = self.record_sample["s"][index].detach()
                    A = self.record_sample["a"][index].detach()
                    old_log = self.record_sample["logp"][index].detach()
                    advs = self.record_sample["advs"][index].detach()
                    value = self.record_sample["value"][index].detach()
                    returns = self.record_sample["return"][index].detach()
                    # generate Policy gradient loss
                    outcome = self.run_policy.forward(S)
                    new_policy = self.dist(outcome)
                    new_lop = new_policy.log_prob(A)
                    ratio = torch.exp(new_lop - old_log)
                    pg_loss1 = advs * ratio
                    pg_loss2 = advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
                    pg_loss = -.5 * torch.min(pg_loss1, pg_loss2).mean()
                    # value loss
                    value_now = self.run_value.forward(S)
                    value_clip = value + torch.clamp(value_now - value, min=-self.cliprange,
                                                     max=self.cliprange)  # Clipped value
                    vf_loss1 = self.loss_cal(value_now, returns)  # Unclipped loss
                    vf_loss2 = self.loss_cal(value_clip, returns)  # clipped loss
                    vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
                    # vf_loss = 0.5 * vf_loss1
                    # entropy
                    entropy = new_policy.entropy().mean()
                    loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef
                    # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"])
                    # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange)

                    self.value_model_optim.zero_grad()
                    loss.backward(retain_graph=True)
                    self.value_model_optim.step()

                    self.policy_model_optim.zero_grad()
                    loss.backward()
                    self.policy_model_optim.step()

                    self.training_step += 1
                    pg_loss_re += pg_loss.data.numpy()
                    entropy_re += entropy.data.numpy()
                    vf_loss_re += vf_loss.data.numpy()
                    loss_re += loss.data.numpy()

                if self.training_step == self.sample_training_step:
                    print("the" + str(self.episode) + " round have training finished")
                    self.run_policy.load_state_dict(self.policy_model.state_dict())
                    self.run_value.load_state_dict(self.value_model.state_dict())
                    self.training_step = 0
                    self.record_sample = None
                return loss_re, {"pg_loss": pg_loss_re, "entropy": entropy_re, "vf_loss": vf_loss_re}
        return 0, {"pg_loss": 0, "entropy": 0, "vf_loss": 0}

    def load_weights(self, filepath):
        model = torch.load(filepath+"ppo.pkl")
        self.policy_model.load_state_dict(model["policy_model"].state_dict())
        self.value_model.load_state_dict(model["value_model"].state_dict())

    def save_weights(self, filepath, overwrite=False):
        torch.save({"policy_model": self.policy_model,"value_model": self.value_model}, filepath + "PPO.pkl")
Example #13
class TD3_Agent(Agent_value_based):
    def __init__(
            self,
            env,
            actor_model,
            critic_model,
            actor_lr=1e-4,
            critic_lr=3e-4,
            actor_target_network_update_freq=0.1,
            critic_target_network_update_freq=0.1,
            actor_training_freq=2,
            critic_training_freq=1,
            ## hyper-parameter
            gamma=0.99,
            batch_size=32,
            buffer_size=50000,
            learning_starts=1000,
            ## decay
            decay=False,
            decay_rate=0.9,
            l2_regulization=0.01,
            ##
            path=None):

        self.gpu = False
        self.env = env
        self.gamma = gamma
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.actor_training_freq, self.critic_training_freq = actor_training_freq, critic_training_freq
        self.actor_target_network_update_freq = actor_target_network_update_freq
        self.critic_target_network_update_freq = critic_target_network_update_freq

        self.replay_buffer = ReplayMemory(buffer_size)
        self.actor = actor_model
        self.critic = critic_build(critic_model)

        self.actor_critic = actor_critic(self.actor, self.critic)

        self.target_actor = deepcopy(self.actor)
        self.target_critic = deepcopy(self.critic)

        actor_optim = Adam(self.actor.parameters(), lr=actor_lr)
        critic_optim = Adam(self.critic.parameters(),
                            lr=critic_lr,
                            weight_decay=l2_regulization)
        if decay:
            self.actor_optim = torch.optim.lr_scheduler.ExponentialLR(
                actor_optim, decay_rate, last_epoch=-1)
            self.critic_optim = torch.optim.lr_scheduler.ExponentialLR(
                critic_optim, decay_rate, last_epoch=-1)
        else:
            self.actor_optim = actor_optim
            self.critic_optim = critic_optim

        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1, norm_type=2)
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                       1,
                                       norm_type=2)

        super(TD3_Agent, self).__init__(path)
        example_input = Variable(
            torch.rand(100, self.env.observation_space.shape[0]))
        self.writer.add_graph(self.actor_critic, input_to_model=example_input)
        self.forward_step_show_list = []
        self.backward_step_show_list = []
        self.forward_ep_show_list = []
        self.backward_ep_show_list = []

    def forward(self, observation):
        observation = observation.astype(np.float32)
        observation = torch.from_numpy(observation)
        action = self.actor.forward(observation)
        csv_record(action.detach().numpy(), "./")
        action = torch.normal(action, torch.ones_like(action))
        Q, _ = self.critic(torch.cat((observation, action), axis=0))
        action = action.data.numpy()
        return action, Q.detach().numpy(), {}

    def backward(self, sample_):
        self.replay_buffer.push(sample_)
        if self.step > self.learning_starts and self.learning:
            sample = self.replay_buffer.sample(self.batch_size)
            if self.gpu:
                for key in sample.keys():
                    sample[key] = sample[key].cuda()
            assert len(sample["s"]) == self.batch_size
            "update the critic "
            if self.step % self.critic_training_freq == 0:
                target_a = self.target_actor(sample["s_"])
                target_input = torch.cat((sample["s_"], target_a), -1)
                Q1, Q2 = self.target_critic(target_input)
                target_Q = torch.min(Q1, Q2)
                expected_q_values = sample["r"] + self.gamma * target_Q * (
                    1.0 - sample["tr"])

                input = torch.cat((sample["s"], sample["a"]), -1)
                Q1, Q2 = self.critic(input)
                loss = torch.mean(
                    huber_loss(expected_q_values - Q1)) + torch.mean(
                        huber_loss(expected_q_values - Q2))
                self.critic.zero_grad()
                loss.backward()
                self.critic_optim.step()
            "training the actor"
            if self.step % self.actor_training_freq == 0:
                Q = self.actor_critic(sample["s"])
                Q = -torch.mean(Q)
                self.actor.zero_grad()
                Q.backward()
                self.actor_optim.step()
            self.target_net_update()
            # guard against the case where the critic was not updated this step
            loss = loss.data.numpy() if self.step % self.critic_training_freq == 0 else 0.0
            return loss, {}
        return 0, {}

    def target_net_update(self):
        if self.actor_target_network_update_freq > 1:
            if self.step % self.actor_target_network_update_freq == 0:
                self.target_actor.load_state_dict(self.actor.state_dict())
        else:
            for param, target_param in zip(self.actor.parameters(),
                                           self.target_actor.parameters()):
                target_param.data.copy_(
                    self.actor_target_network_update_freq * param.data +
                    (1 - self.actor_target_network_update_freq) *
                    target_param.data)
        if self.critic_target_network_update_freq > 1:
            if self.step % self.critic_target_network_update_freq == 0:
                self.target_critic.load_state_dict(self.critic.state_dict())
        else:
            for param, target_param in zip(self.critic.parameters(),
                                           self.target_critic.parameters()):
                target_param.data.copy_(
                    self.critic_target_network_update_freq * param.data +
                    (1 - self.critic_target_network_update_freq) *
                    target_param.data)

    def load_weights(self, filepath):
        model = torch.load(filepath + "TD3.pkl")
        # the checkpoint stores whole modules/optimizers, so extract their state dicts
        self.actor.load_state_dict(model["actor"].state_dict())
        self.critic.load_state_dict(model["critic"].state_dict())
        self.target_actor.load_state_dict(model["target_actor"].state_dict())
        self.target_critic.load_state_dict(model["target_critic"].state_dict())
        self.actor_optim.load_state_dict(model["actor_optim"].state_dict())
        self.critic_optim.load_state_dict(model["critic_optim"].state_dict())

    def save_weights(self, filepath, overwrite=False):
        torch.save(
            {
                "actor": self.actor,
                "critic": self.critic,
                "target_actor": self.target_actor,
                "target_critic": self.target_critic,
                "actor_optim": self.actor_optim,
                "critic_optim": self.critic_optim
            }, filepath + "TD3.pkl")

    def cuda(self):
        self.actor = gpu_foward(self.actor)
        self.critic = gpu_foward(self.critic)
        self.target_actor = deepcopy(self.actor)
        self.target_critic = deepcopy(self.critic)

        self.gpu = True
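TD3_Agent.backward bootstraps from the minimum of the two target critics, and target_net_update does either a hard copy or Polyak averaging depending on whether the update-frequency value is above or below 1. A minimal standalone sketch of those two pieces, assuming the critic returns a (Q1, Q2) pair, could read:

import torch

@torch.no_grad()
def td3_target(target_actor, target_critic, s_, r, done, gamma=0.99):
    # twin-critic target: bootstrap from the smaller of the two target Q values
    a_ = target_actor(s_)
    q1, q2 = target_critic(torch.cat((s_, a_), dim=-1))
    return r + gamma * torch.min(q1, q2) * (1.0 - done)

@torch.no_grad()
def soft_update(net, target_net, tau=0.1):
    # Polyak averaging; tau plays the role of *_target_network_update_freq when it is < 1
    for p, tp in zip(net.parameters(), target_net.parameters()):
        tp.data.copy_(tau * p.data + (1.0 - tau) * tp.data)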
Example #14
0
class PPO_Agent(Agent_value_based):
    def __init__(self, env, policy_model, value_model,
                 lr=1e-4, ent_coef=0.01, vf_coef=0.5,
                 ## hyper-parameter
                 gamma=0.99, lam=0.95, cliprange=0.2, batch_size = 32,
                 buffer_size=50000, learning_starts=1000, running_step="synchronization", batch_training_round=10,
                 value_regular=0.01, train_value_round = 1,
                 ## decay
                 decay=False, decay_rate=0.9,
                 ##
                 path=None):

        self.env = env
        self.gamma = gamma
        self.lam = lam
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.cliprange = cliprange
        self.batch_size = batch_size
        self.batch_training_round = batch_training_round
        self.learning_starts = learning_starts
        self.train_value_round = train_value_round
        if running_step == "synchronization":
            self.run_step = 1
        else:
            self.run_step = running_step


        self.replay_buffer = ReplayMemory(buffer_size)
        self.loss_cal = torch.nn.MSELoss()

        self.policy_model = policy_model
        if value_model == "shared":
            self.value_model = policy_model
        elif value_model == "copy":
            self.value_model = deepcopy(policy_model)
        else:
            self.value_model = value_model

        self.run_policy_model,self.run_value_model = deepcopy(self.policy_model), deepcopy(self.value_model)

        self.dist = make_pdtype(env.action_space, policy_model)

        policy_model_optim = Adam(self.policy_model.parameters(), lr=lr)
        value_model_optim = Adam(self.value_model.parameters(), lr=lr, weight_decay=value_regular)
        if decay:
            self.policy_model_optim = torch.optim.lr_scheduler.ExponentialLR(policy_model_optim, decay_rate,
                                                                            last_epoch=-1)
            self.value_model_optim = torch.optim.lr_scheduler.ExponentialLR(value_model_optim, decay_rate,
                                                                             last_epoch=-1)
        else:
            self.policy_model_optim = policy_model_optim
            self.value_model_optim = value_model_optim

        torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1, norm_type=2)
        torch.nn.utils.clip_grad_norm_(self.value_model.parameters(), 1, norm_type=2)

        super(PPO_Agent, self).__init__(path)
        example_input = Variable(torch.rand(100, self.env.observation_space.shape[0]))
        self.writer.add_graph(self.policy_model, input_to_model=example_input)
        self.forward_step_show_list = []
        self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
        self.forward_ep_show_list = []
        self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]

        self.training_round = 0
        self.running_step = 0
        self.record_sample = None
        self.loss_record = {"pg_loss": [], "entropy": [], "vf_loss": [], "loss": []}

    def forward(self, observation):
        observation = observation[np.newaxis, :].astype(np.float32)
        observation = torch.from_numpy(observation)
        outcome = self.policy_model.forward(observation)
        self.pd = self.dist(outcome)
        self.action = self.pd.sample()
        self.Q = self.value_model.forward(observation).squeeze()
        return self.action.squeeze(0).detach().numpy(), self.Q.squeeze(0).detach().numpy(), {}

    def backward(self, sample_):
        self.replay_buffer.push(sample_)
        self.running_step += 1
        """"""""""""""
        "training part"
        """"""""""""""
        if self.step > self.learning_starts and self.learning:
            if self.record_sample is None and self.running_step > self.run_step:
                print("***************************************")
                print("In the ", self.episode, "ep")
                sample = self.replay_buffer.recent_step_sample(self.running_step)
                " sample advantage generate "
                sample["value"] = self.value_model.forward(sample["s"]).squeeze()
                last_value = self.value_model.forward(sample["s_"][-1])
                self.record_sample = gae(sample, last_value, self.gamma, self.lam)
                " sample log_probabilty generate"
                outcome = self.policy_model.forward(sample["s"])
                self.pd = self.dist(outcome)
                sample["logp"] = self.pd.log_prob(sample["a"])
                self.loss_record = {"pg_loss": [], "entropy": [], "vf_loss": [], "loss": []}
                self.running_step = 0
            if self.record_sample is not None:
                print("the learning has started...")
                while self.training_round < self.batch_training_round:
                    start = (self.batch_size * self.training_round) % self.record_sample["s"].size()[0]
                    if start+self.batch_size >= self.record_sample["s"].size()[0]:
                        end = self.record_sample["s"].size()[0]
                    else:
                        end = start+self.batch_size
                    index = np.arange(start, end)
                    S = self.record_sample["s"][index]
                    A = self.record_sample["a"][index]
                    old_log = self.record_sample["logp"][index].detach()
                    advs = self.record_sample["advs"][index].detach()
                    value = self.record_sample["value"][index].detach()
                    returns = self.record_sample["return"][index].detach()

                    " training the value model"

                    value_now = self.value_model.forward(S)
                    value_clip = value + torch.clamp(value_now - value, min=-self.cliprange, max=self.cliprange) # Clipped value
                    vf_loss1 = self.loss_cal(value_now, returns)   # Unclipped loss
                    vf_loss2 = self.loss_cal(value_clip, returns)  # clipped loss
                    vf_loss = .5 * torch.max(vf_loss1, vf_loss2)  # clipped value loss
                    # vf_loss = 0.5 * vf_loss1
                    " CALCULATE THE LOSS"
                    " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss"

                    #generate Policy gradient loss
                    outcome = self.policy_model.forward(S)
                    new_policy = self.dist(outcome)
                    new_logp = new_policy.log_prob(A)
                    ratio = torch.exp(new_logp - old_log)
                    pg_loss1 = advs * ratio
                    pg_loss2 = advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
                    pg_loss = -.5 * torch.min(pg_loss1, pg_loss2).mean()

                    # entropy
                    entropy = new_policy.entropy().mean()
                    loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef

                    self.value_model_optim.zero_grad()
                    loss.backward(retain_graph=True)
                    self.value_model_optim.step()

                    self.policy_model_optim.zero_grad()
                    loss.backward()
                    self.policy_model_optim.step()


                    # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"])
                    # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange)
                    self.training_round += 1
                    print("round:", self.training_round,
                          "pg_loss:", pg_loss.data.numpy(), "entropy:", entropy.data.numpy(), "vf_loss", vf_loss.data.numpy())
                    self.loss_record["pg_loss"].append(pg_loss.data.numpy())
                    self.loss_record["entropy"].append(entropy.data.numpy())
                    self.loss_record["vf_loss"].append(vf_loss.data.numpy())
                    self.loss_record["loss"].append(loss.data.numpy())
                self.training_round = 0
                self.record_sample = None

        if self.loss_record["loss"] and self.running_step < self.batch_training_round:
            return self.loss_record["loss"][self.running_step], \
                   {"pg_loss": self.loss_record["pg_loss"][self.running_step],
                    "entropy": self.loss_record["entropy"][self.running_step],
                    "vf_loss": self.loss_record["vf_loss"][self.running_step]}
        else:
            return 0, {"pg_loss": 0, "entropy": 0, "vf_loss": 0}

    def load_weights(self, filepath):
        model = torch.load(filepath + "PPO.pkl")
        # keys and filename must match what save_weights below writes out
        self.policy_model.load_state_dict(model["policy_model"].state_dict())
        self.policy_model_optim.load_state_dict(model["policy_model_optim"].state_dict())
        self.value_model.load_state_dict(model["value_model"].state_dict())
        self.value_model_optim.load_state_dict(model["value_model_optim"].state_dict())

    def save_weights(self, filepath, overwrite=False):
        torch.save({"policy_model": self.policy_model,"value_model": self.value_model,
                    "policy_model_optim": self.policy_model_optim,"value_model_optim": self.value_model_optim,
                    }, filepath + "PPO.pkl")
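PPO_Agent.backward relies on a gae(sample, last_value, gamma, lam) helper that is not reproduced in this file. Assuming sample holds flat 1-D tensors "r" (rewards), "tr" (terminal flags) and "value", a minimal sketch of generalized advantage estimation that produces the "advs" and "return" fields consumed above might look like this (gae_sketch is a hypothetical name, not the library's actual helper):

import torch

def gae_sketch(sample, last_value, gamma=0.99, lam=0.95):
    # backward recursion over the rollout:
    #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    #   A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
    r, done, value = sample["r"], sample["tr"], sample["value"]
    T = r.size(0)
    advs = torch.zeros(T)
    last_adv = torch.tensor(0.0)
    for t in reversed(range(T)):
        next_value = last_value if t == T - 1 else value[t + 1]
        nonterminal = 1.0 - done[t]
        delta = r[t] + gamma * next_value * nonterminal - value[t]
        last_adv = delta + gamma * lam * nonterminal * last_adv
        advs[t] = last_adv
    sample["advs"] = advs
    sample["return"] = advs + value  # targets for the value function
    return sample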