Example #1
class PPO_Agent(Agent_value_based):
    def __init__(self, env, policy_model, value_model,
                 lr=1e-4, ent_coef=0.01, vf_coef=0.5,
                 ## hyper-parameters
                 gamma=0.99, lam=0.95, cliprange=0.2,
                 buffer_size=50000, learning_starts=1000, running_step=2048, batch_training_round=10,
                 value_regular=0.01,
                 ## decay
                 decay=False, decay_rate=0.9,
                 ##
                 path=None):

        self.env = env
        self.gamma = gamma
        self.lam = lam
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.cliprange = cliprange

        self.learning_starts = learning_starts
        self.batch_training_round = batch_training_round
        self.run_step = running_step
        self.sample_training_step = self.batch_training_round * self.run_step

        self.replay_buffer = ReplayMemory(buffer_size, ["value", "logp"])
        self.loss_cal = torch.nn.MSELoss()

        self.dist = make_pdtype(env.action_space, policy_model)

        self.policy_model = policy_model
        if value_model == "shared":
            self.value_model = policy_model
        elif value_model == "copy":
            self.value_model = deepcopy(policy_model)
        else:
            self.value_model = value_model

        policy_model_optim = Adam(self.policy_model.parameters(), lr=lr)
        value_model_optim = Adam(self.value_model.parameters(), lr=lr, weight_decay=value_regular)
        self.policy_model_optim = policy_model_optim
        self.value_model_optim = value_model_optim
        if decay:
            # keep the LR schedulers separately: the *_optim attributes must stay plain
            # optimizers so that zero_grad()/step() in backward() keep working
            # (call scheduler.step() periodically to apply the decay)
            self.policy_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(policy_model_optim, decay_rate,
                                                                              last_epoch=-1)
            self.value_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(value_model_optim, decay_rate,
                                                                             last_epoch=-1)

        # gradient norms are clipped to 1 in backward(), right before each optimizer step

        self.run_policy = deepcopy(self.policy_model)
        self.run_value = deepcopy(self.value_model)

        super(PPO_Agent, self).__init__(path)
        example_input = Variable(torch.rand(100, self.env.observation_space.shape[0]))
        self.writer.add_graph(self.policy_model, input_to_model=example_input)
        self.forward_step_show_list = []
        self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
        self.forward_ep_show_list = []
        self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]

        self.training_round = 0
        self.training_step = 0
        self.running_step = 0
        self.record_sample = None
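        # train_ticks repeats the indices 0..run_step-1 so that the rollout is replayed batch_training_round times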
        self.train_ticks = np.tile(np.arange(self.run_step), self.batch_training_round)

    def forward(self, observation):
        observation = observation[np.newaxis,:].astype(np.float32)
        observation = torch.from_numpy(observation)
        with torch.no_grad():
            outcome = self.run_policy.forward(observation)
            self.pd = self.dist(outcome)
            self.action = self.pd.sample()
            self.Q = self.run_value.forward(observation)
        return self.action.squeeze(0).detach().numpy(), self.Q.squeeze(0).data.numpy(), {}

    def backward(self, sample_):
        sample_["logp"] = self.pd.log_prob(self.action)
        sample_["value"] = self.Q
        self.replay_buffer.push(sample_)
        self.running_step += 1
        """"""""""""""
        "training part"
        "in each step, we train for batch batch_training_times"
        """"""""""""""
        if self.step > self.learning_starts:
            if self.running_step % self.run_step == 0 and self.training_step == 0:
                " sample advantage generate "
                with torch.no_grad():
                    sample = self.replay_buffer.recent_step_sample(self.running_step)
                    last_value = self.value_model.forward(sample["s_"][-1])
                    self.record_sample = gae(sample, last_value, self.gamma, self.lam)
                self.running_step = 0

            if self.training_step < self.sample_training_step and self.record_sample is not None:
                pg_loss_re = 0
                entropy_re = 0
                vf_loss_re = 0
                loss_re = 0
                for _ in range(self.batch_training_round):
                    index = self.train_ticks[self.training_step]
                    S = self.record_sample["s"][index].detach()
                    A = self.record_sample["a"][index].detach()
                    old_log = self.record_sample["logp"][index].detach()
                    advs = self.record_sample["advs"][index].detach()
                    value = self.record_sample["value"][index].detach()
                    returns = self.record_sample["return"][index].detach()
                    # policy-gradient loss (PPO clipped surrogate objective)
                    outcome = self.policy_model.forward(S)
                    new_policy = self.dist(outcome)
                    new_logp = new_policy.log_prob(A)
                    ratio = torch.exp(new_logp - old_log)
                    pg_loss1 = advs * ratio
                    pg_loss2 = advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
                    pg_loss = -.5 * torch.min(pg_loss1, pg_loss2).mean()
                    # value loss (with value clipping)
                    value_now = self.value_model.forward(S)
                    value_clip = value + torch.clamp(value_now - value, min=-self.cliprange,
                                                     max=self.cliprange)  # clipped value estimate
                    vf_loss1 = self.loss_cal(value_now, returns)  # Unclipped loss
                    vf_loss2 = self.loss_cal(value_clip, returns)  # clipped loss
                    vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
                    # vf_loss = 0.5 * vf_loss1
                    # entropy
                    entropy = new_policy.entropy().mean()
                    loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef
                    # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"])
                    # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange)

                    # single backward pass, then clip gradients and step both optimizers
                    self.policy_model_optim.zero_grad()
                    self.value_model_optim.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1, norm_type=2)
                    torch.nn.utils.clip_grad_norm_(self.value_model.parameters(), 1, norm_type=2)
                    self.policy_model_optim.step()
                    self.value_model_optim.step()

                    self.training_step += 1
                    pg_loss_re += pg_loss.data.numpy()
                    entropy_re += entropy.data.numpy()
                    vf_loss_re += vf_loss.data.numpy()
                    loss_re += loss.data.numpy()

                if self.training_step == self.sample_training_step:
                    print("the" + str(self.episode) + " round have training finished")
                    self.run_policy.load_state_dict(self.policy_model.state_dict())
                    self.run_value.load_state_dict(self.value_model.state_dict())
                    self.training_step = 0
                    self.record_sample = None
                return loss_re, {"pg_loss": pg_loss_re, "entropy": entropy_re, "vf_loss": vf_loss_re}
        return 0, {"pg_loss": 0, "entropy": 0, "vf_loss": 0}

    def load_weights(self, filepath):
        model = torch.load(filepath + "PPO.pkl")
        self.policy_model.load_state_dict(model["policy_model"].state_dict())
        self.value_model.load_state_dict(model["value_model"].state_dict())

    def save_weights(self, filepath, overwrite=False):
        torch.save({"policy_model": self.policy_model,"value_model": self.value_model}, filepath + "PPO.pkl")
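Both examples rely on a gae(...) helper that is not shown in the snippets. As a rough, hypothetical sketch (not the library's actual implementation), a generalized-advantage-estimation routine with the same call signature could look like the code below; the keys "r" (rewards) and "done" (terminal flags) are assumptions, while "value", "advs" and "return" match how record_sample is indexed above.

import torch

def gae(sample, last_value, gamma, lam):
    """Hypothetical GAE helper matching gae(sample, last_value, gamma, lam) above."""
    rewards = sample["r"].view(-1)            # assumed reward key
    dones = sample["done"].view(-1).float()   # assumed terminal-flag key
    values = sample["value"].view(-1)
    steps = rewards.size(0)

    advs = torch.zeros(steps)
    last_gae = 0.0
    next_value = last_value.view(-1)[0]
    for t in reversed(range(steps)):
        nonterminal = 1.0 - dones[t]
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        # GAE recursion: A_t = delta_t + gamma * lam * A_{t+1}
        last_gae = delta + gamma * lam * nonterminal * last_gae
        advs[t] = last_gae
        next_value = values[t]

    sample["advs"] = advs
    sample["return"] = advs + values          # returns used as value-function targets
    return sample

Returning the mutated sample dict (rather than a fresh copy) matches how the examples keep indexing record_sample with the rollout's other keys.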
Example #2
class PPO_Agent(Agent_value_based):
    def __init__(self, env, policy_model, value_model,
                 lr=1e-4, ent_coef=0.01, vf_coef=0.5,
                 ## hyper-parameters
                 gamma=0.99, lam=0.95, cliprange=0.2, batch_size = 32,
                 buffer_size=50000, learning_starts=1000, running_step="synchronization", batch_training_round=10,
                 value_regular=0.01, train_value_round = 1,
                 ## decay
                 decay=False, decay_rate=0.9,
                 ##
                 path=None):

        self.env = env
        self.gamma = gamma
        self.lam = lam
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.cliprange = cliprange
        self.batch_size = batch_size
        self.batch_training_round = batch_training_round
        self.learning_starts = learning_starts
        self.train_value_round = train_value_round
        if running_step =="synchronization":
            self.run_step = 1
        else:
            self.run_step = running_step


        self.replay_buffer = ReplayMemory(buffer_size)
        self.loss_cal = torch.nn.MSELoss()

        self.policy_model = policy_model
        if value_model == "shared":
            self.value_model = policy_model
        elif value_model == "copy":
            self.value_model = deepcopy(policy_model)
        else:
            self.value_model = value_model

        self.run_policy_model,self.run_value_model = deepcopy(self.policy_model), deepcopy(self.value_model)

        self.dist = make_pdtype(env.action_space, policy_model)

        policy_model_optim = Adam(self.policy_model.parameters(), lr=lr)
        value_model_optim = Adam(self.value_model.parameters(), lr=lr, weight_decay=value_regular)
        self.policy_model_optim = policy_model_optim
        self.value_model_optim = value_model_optim
        if decay:
            # keep the LR schedulers separately: the *_optim attributes must stay plain
            # optimizers so that zero_grad()/step() in backward() keep working
            # (call scheduler.step() periodically to apply the decay)
            self.policy_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(policy_model_optim, decay_rate,
                                                                              last_epoch=-1)
            self.value_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(value_model_optim, decay_rate,
                                                                             last_epoch=-1)

        # gradient norms are clipped to 1 in backward(), right before each optimizer step

        super(PPO_Agent, self).__init__(path)
        example_input = Variable(torch.rand(100, self.env.observation_space.shape[0]))
        self.writer.add_graph(self.policy_model, input_to_model=example_input)
        self.forward_step_show_list = []
        self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
        self.forward_ep_show_list = []
        self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]

        self.training_round = 0
        self.running_step = 0
        self.record_sample = None
        self.loss_record = {"pg_loss": [], "entropy": [], "vf_loss": [], "loss": []}

    def forward(self, observation):
        observation = observation[np.newaxis, :].astype(np.float32)
        observation = torch.from_numpy(observation)
        outcome = self.policy_model.forward(observation)
        self.pd = self.dist(outcome)
        self.action = self.pd.sample()
        self.Q = self.value_model.forward(observation).squeeze()
        return self.action.squeeze(0).detach().numpy(), self.Q.squeeze(0).detach().numpy(), {}

    def backward(self, sample_):
        self.replay_buffer.push(sample_)
        self.running_step += 1
        """"""""""""""
        "training part"
        """"""""""""""
        if self.step > self.learning_starts and self.learning:
            if self.record_sample is None and self.running_step > self.run_step:
                print("***************************************")
                print("In the ", self.episode, "ep")
                sample = self.replay_buffer.recent_step_sample(self.running_step)
                # value estimates and log-probabilities for the collected rollout
                sample["value"] = self.value_model.forward(sample["s"]).squeeze()
                outcome = self.policy_model.forward(sample["s"])
                self.pd = self.dist(outcome)
                sample["logp"] = self.pd.log_prob(sample["a"])
                # generate advantages (GAE) for the rollout
                last_value = self.value_model.forward(sample["s_"][-1])
                self.record_sample = gae(sample, last_value, self.gamma, self.lam)
                self.loss_record = {"pg_loss": [], "entropy": [], "vf_loss": [], "loss": []}
                self.running_step = 0
            if self.record_sample is not None:
                print("the learning has start...........")
                while self.training_round < self.batch_training_round:
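                    # take consecutive mini-batches of size batch_size from the recorded rollout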
                    start = (self.batch_size * self.training_round) % self.record_sample["s"].size()[0]
                    if start+self.batch_size >= self.record_sample["s"].size()[0]:
                        end = self.record_sample["s"].size()[0]
                    else:
                        end = start+self.batch_size
                    index = np.arange(start, end)
                    S = self.record_sample["s"][index]
                    A = self.record_sample["a"][index]
                    old_log = self.record_sample["logp"][index].detach()
                    advs = self.record_sample["advs"][index].detach()
                    value = self.record_sample["value"][index].detach()
                    returns = self.record_sample["return"][index].detach()

                    " traning the value model"

                    value_now = self.value_model.forward(S)
                    value_clip = value + torch.clamp(value_now - value, min=-self.cliprange, max=self.cliprange)  # clipped value estimate
                    vf_loss1 = self.loss_cal(value_now, returns)   # unclipped loss
                    vf_loss2 = self.loss_cal(value_clip, returns)  # clipped loss
                    vf_loss = .5 * torch.max(vf_loss1, vf_loss2)   # clipped value loss (overridden on the next line)
                    vf_loss = 0.5 * vf_loss1                       # the unclipped loss is actually used
                    # total loss = policy gradient loss
                    #              - entropy * entropy coefficient
                    #              + value coefficient * value loss

                    # policy-gradient loss (PPO clipped surrogate objective)
                    outcome = self.policy_model.forward(S)
                    new_policy = self.dist(outcome)
                    new_logp = new_policy.log_prob(A)
                    ratio = torch.exp(new_logp - old_log)
                    pg_loss1 = advs * ratio
                    pg_loss2 = advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
                    pg_loss = -.5 * torch.min(pg_loss1, pg_loss2).mean()

                    # entropy
                    entropy = new_policy.entropy().mean()
                    loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef

                    # single backward pass, then clip gradients and step both optimizers
                    self.policy_model_optim.zero_grad()
                    self.value_model_optim.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1, norm_type=2)
                    torch.nn.utils.clip_grad_norm_(self.value_model.parameters(), 1, norm_type=2)
                    self.policy_model_optim.step()
                    self.value_model_optim.step()


                    # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"])
                    # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange)
                    self.training_round += 1
                    print("round:", self.training_round,
                          "pg_loss:", pg_loss.data.numpy(), "entropy:", entropy.data.numpy(), "vf_loss", vf_loss.data.numpy())
                    self.loss_record["pg_loss"].append(pg_loss.data.numpy())
                    self.loss_record["entropy"].append(entropy.data.numpy())
                    self.loss_record["vf_loss"].append(vf_loss.data.numpy())
                    self.loss_record["loss"].append(loss.data.numpy())
                self.training_round = 0
                self.record_sample = None

        if self.loss_record["loss"] and self.running_step < self.batch_training_round:
            return self.loss_record["loss"][self.running_step], \
                   {"pg_loss": self.loss_record["pg_loss"][self.running_step],
                    "entropy": self.loss_record["entropy"][self.running_step],
                    "vf_loss": self.loss_record["vf_loss"][self.running_step]}
        else:
            return 0, {"pg_loss": 0, "entropy": 0, "vf_loss": 0}

    def load_weights(self, filepath):
        model = torch.load(filepath + "PPO.pkl")
        self.policy_model.load_state_dict(model["policy_model"].state_dict())
        self.policy_model_optim.load_state_dict(model["policy_model_optim"].state_dict())
        self.value_model.load_state_dict(model["value_model"].state_dict())
        self.value_model_optim.load_state_dict(model["value_model_optim"].state_dict())

    def save_weights(self, filepath, overwrite=False):
        torch.save({"policy_model": self.policy_model,"value_model": self.value_model,
                    "policy_model_optim": self.policy_model_optim,"value_model_optim": self.value_model_optim,
                    }, filepath + "PPO.pkl")
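As a rough usage sketch (not taken from the library), either agent might be wired up as follows; PolicyNet, ValueNet, the CartPole-v1 environment, and the "./logs/" path are illustrative assumptions, and the training loop itself is assumed to be driven by the Agent_value_based base class.

import gym
import torch.nn as nn

class PolicyNet(nn.Module):
    # maps observations to action-distribution parameters (here: logits)
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(obs_dim, 64), nn.Tanh(), nn.Linear(64, act_dim))

    def forward(self, x):
        return self.net(x)

class ValueNet(nn.Module):
    # maps observations to a scalar state-value estimate
    def __init__(self, obs_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(obs_dim, 64), nn.Tanh(), nn.Linear(64, 1))

    def forward(self, x):
        return self.net(x)

env = gym.make("CartPole-v1")
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n

agent = PPO_Agent(env,
                  policy_model=PolicyNet(obs_dim, act_dim),
                  value_model=ValueNet(obs_dim),   # or "shared" / "copy"
                  gamma=0.99, lam=0.95, cliprange=0.2,
                  path="./logs/")
# forward()/backward() are then called by the base class's run loop (not shown here).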