# These snippets assume the usual PyTorch imports, e.g. torch, torch.nn as nn,
# torch.nn.functional as F, Tensor, and torch.distributions.{Normal, Categorical}.
def __init__(self, env, hidden=64):

    super().__init__()

    input_size = env.observation_space.shape[0]
    output_size = env.action_space.shape[0]

    # Action bounds, kept as tensors for later clipping/scaling.
    self.low = Tensor(env.action_space.low)
    self.high = Tensor(env.action_space.high)

    # Policy (actor) head.
    self.action_1 = nn.Linear(input_size, hidden)
    self.action_2 = nn.Linear(hidden, hidden)
    self.mean = nn.Linear(hidden, output_size)

    # Value (critic) head.
    self.value_1 = nn.Linear(input_size, hidden)
    self.value_2 = nn.Linear(hidden, hidden)
    self.value = nn.Linear(hidden, 1)

    # Kaiming-initialize all weights; zero the biases.
    for name, param in self.named_parameters():
        if "weight" in name:
            nn.init.kaiming_normal_(param, mode='fan_out', nonlinearity='tanh')
        else:
            param.data.fill_(0)

    # State-independent log standard deviation for the Gaussian policy.
    self.log_std = nn.Parameter(torch.zeros(1, output_size))

    self.policy_params = [self.action_1, self.action_2, self.mean, self.log_std]
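
# A matching forward pass is not shown in this snippet. A minimal sketch,
# assuming tanh activations and the state-independent log_std defined above
# (this forward method is an assumption, not part of the original code):
def forward(self, x):
    h = torch.tanh(self.action_1(x))
    h = torch.tanh(self.action_2(h))
    mean = self.mean(h)
    std = self.log_std.expand_as(mean).exp()

    v = torch.tanh(self.value_1(x))
    v = torch.tanh(self.value_2(v))
    value = self.value(v)
    return mean, std, value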
    def _generate_one_episode(self, env, model):
        """
        generate one episode data and save them on memory
        """
        total_reward = 0

        observations, actions, rewards, values = [], [], [], []

        observation = env.reset()

        current_time_step = 0

        while current_time_step <= self.max_episode_time_step:

            observations.append(observation)

            with torch.no_grad():
                observation_tensor = Tensor(observation).unsqueeze(0)
                probs, value = model(observation_tensor)

            probs = F.softmax(probs, dim=1)
            act_dis = Categorical(probs)

            try:
                action = act_dis.sample()
            except RuntimeError:
                # Sampling can fail if the probabilities contain NaN/Inf; dump
                # the distribution for debugging, rerun the forward pass, and
                # sample again from the fresh output.
                print(probs)
                probs, value = model(observation_tensor)
                print(probs)
                probs = F.softmax(probs, dim=1)
                act_dis = Categorical(probs)
                action = act_dis.sample()

            action = action.cpu().numpy()
            actions.append(action)

            observation, reward, done, _ = env.step(action[0])

            values.append(value.item())
            rewards.append(reward)
            total_reward += reward
            if done:
                break

            current_time_step += 1

        last_value = 0
        if not done:
            observation_tensor = Tensor(observation).unsqueeze(0)
            with torch.no_grad():
                _, last_value = model(observation_tensor)
            last_value = last_value.item()

        advantages, estimated_returns = self.reward_processor(
            rewards, values, last_value)

        return observations, actions, advantages, estimated_returns, total_reward, current_time_step
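
# self.reward_processor is not defined in this snippet. A minimal sketch of
# what such a processor could look like, assuming plain discounted returns
# bootstrapped with last_value (the function name and gamma=0.99 are assumptions):
def simple_reward_processor(rewards, values, last_value, gamma=0.99):
    advantages, estimated_returns = [], []
    running_return = last_value
    for reward, value in zip(reversed(rewards), reversed(values)):
        # Discounted return, bootstrapped from the value of the last state.
        running_return = reward + gamma * running_return
        estimated_returns.insert(0, running_return)
        # Advantage = observed return minus the critic's estimate.
        advantages.insert(0, running_return - value)
    return advantages, estimated_returns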
    def _optimize(self, obs, acts, advs, est_rs):

        self.optim.zero_grad()

        obs = Tensor(obs)
        acts = Tensor(acts)
        advs = Tensor(advs).unsqueeze(1)
        est_rs = Tensor(est_rs).unsqueeze(1)

        if self.continuous:
            mean, std, values = self.model(obs)

            dis = Normal(mean, std)

            log_prob = dis.log_prob(acts).sum(-1, keepdim=True)

            ent = dis.entropy().sum(-1, keepdim=True)

        else:

            probs, values = self.model(obs)

            # Normalize the logits into probabilities before building the
            # categorical distribution.
            probs = F.softmax(probs, dim=1)

            acts = acts.long()

            dis = Categorical(probs)

            log_prob = dis.log_prob(acts).unsqueeze(1)

            ent = dis.entropy()

        # Normalize the advantage
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        policy_loss = -log_prob * advs
        policy_loss = policy_loss.mean() - self.entropy_para * ent.mean()

        criterion = nn.MSELoss()
        critic_loss = criterion(values, est_rs)

        self.writer.add_scalar("Training/Critic_Loss", critic_loss.item(),
                               self.step_count)
        loss = policy_loss + self.value_loss_coeff * critic_loss

        loss.backward()

        self.optim.step()

        self.step_count += 1
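
# A rough usage sketch for the two methods above (the agent and env names
# are assumptions; the original driver loop is not shown):
#   obs, acts, advs, est_rs, total_reward, steps = \
#       agent._generate_one_episode(env, agent.model)
#   agent._optimize(obs, acts, advs, est_rs)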
    @staticmethod
    def _generate_one_episode(env, model, horizon, reward_processor):
        """
        generate one episode data and save them on memory
        """
        total_reward = 0

        observations, actions, rewards, values = [], [], [], []

        observation = env.reset()

        current_time_step = 0

        while current_time_step <= horizon:

            observations.append(observation)

            with torch.no_grad():
                observation_tensor = Tensor(observation).unsqueeze(0)
                mean, std, value = model(observation_tensor)

            act_dis = Normal(mean, std)
            action = act_dis.sample()
            action = action.squeeze(0).cpu().numpy()
            actions.append(action)

            observation, reward, done, _ = env.step(action)
            # print(reward)
            values.append(value.item())
            rewards.append(reward)
            total_reward += reward
            if done:
                break

            current_time_step += 1

        last_value = 0
        if not done:
            observation_tensor = Tensor(observation).unsqueeze(0)
            with torch.no_grad():
                _, _, last_value = model(observation_tensor)
            last_value = last_value.item()

        advantages, estimated_returns = reward_processor(
            rewards, values, last_value)

        return (observations, actions, advantages, estimated_returns,
                total_reward, current_time_step)
 def sample_action_from_policy(self, observation):
     """
 Given an observation, return the action sampled from the policy model as well as the probabilities associated with each action
 """
     observation_tensor = Tensor(observation).unsqueeze(0)
     probabilities = self.policy_model(
         Variable(observation_tensor, requires_grad=True))
     action = probabilities.multinomial(1)
     return action, probabilities
 def mean_kl_divergence(self, model):
   """
   Returns an estimate of the average KL divergence between a given model and self.policy_model
   """
   observations_tensor = torch.cat(
       [Variable(Tensor(observation)).unsqueeze(0) for observation in self.observations])
   actprob = model(observations_tensor).detach() + 1e-8
   old_actprob = self.policy_model(observations_tensor)
   return torch.sum(old_actprob * torch.log(old_actprob / actprob), 1).mean()
 def surrogate_loss(self, theta):
   """
   Returns the surrogate loss w.r.t. the given parameter vector theta
   """
   new_model = copy.deepcopy(self.policy_model)
   vector_to_parameters(theta, new_model.parameters())
   observations_tensor = torch.cat(
       [Variable(Tensor(observation)).unsqueeze(0) for observation in self.observations])
   prob_new = new_model(observations_tensor).gather(
       1, torch.cat(self.actions)).data
   prob_old = self.policy_model(observations_tensor).gather(
       1, torch.cat(self.actions)).data + 1e-8
   return -torch.mean((prob_new / prob_old) * self.advantage)
    def _optimize(self, obs, acts, rews):

        self.optim.zero_grad()

        obs = Tensor(obs)
        acts = Tensor(acts)
        rews = Tensor(rews).unsqueeze(1)

        if self.continuous:
            mean, std = self.model(obs)

            dis = Normal(mean, std)

            log_prob = dis.log_prob(acts).sum(-1, keepdim=True)

            ent = dis.entropy().sum(-1, keepdim=True)

        else:

            probs = self.model(obs)

            dis = F.softmax(probs, dim=1)
            dis = Categorical(dis)

            acts = acts.long()

            log_prob = dis.log_prob(acts)
            ent = dis.entropy()

        rews = (rews - rews.mean()) / (rews.std() + 1e-8)

        actor_loss = -log_prob * rews

        actor_loss = actor_loss.mean() - self.entropy_para * ent.mean()

        actor_loss.backward()

        self.optim.step()
    def _optimize(self, obs, acts, advs, est_rs):

        self.obs, self.acts, self.advs, self.est_rs = obs, acts, advs, est_rs

        self.obs = Tensor(self.obs)
        self.acts = Tensor(self.acts)
        self.advs = Tensor(self.advs).unsqueeze(1)
        self.est_rs = Tensor(self.est_rs).unsqueeze(1)

        # Calculate Advantage & Normalize it
        self.advs = (self.advs - self.advs.mean()) / (self.advs.std() + 1e-8)

        # Surrogate loss with Entropy

        if self.continuous:
            mean, std, values = self.model(self.obs)

            dis = Normal(mean, std)

            log_prob = dis.log_prob(self.acts).sum(-1, keepdim=True)

            ent = dis.entropy().sum(-1, keepdim=True)

            probs_new = torch.exp(log_prob)
            probs_old = probs_new.detach() + 1e-8

        else:

            probs, values = self.model(self.obs)

            dis = F.softmax(probs, dim=1)

            self.acts = self.acts.long()

            probs_new = dis.gather(1, self.acts)
            probs_old = probs_new + 1e-8

            ent = -(dis.log() * dis).sum(-1)

        ratio = probs_new / probs_old

        surrogate_loss = -torch.mean(
            ratio * self.advs) - self.entropy_para * ent.mean()

        # criterion = torch.nn.MSELoss()
        # empty_value_loss = criterion( values, values.detach() )

        # Calculate the gradient of the surrogate loss
        self.model.zero_grad()
        surrogate_loss.backward()
        policy_gradient = parameters_to_vector([
            p.grad for p in self.model.policy_parameters()
        ]).squeeze(0).detach()

        # ensure gradient is not zero
        if policy_gradient.nonzero().size()[0]:
            # Use Conjugate gradient to calculate step direction
            step_direction = self.conjugate_gradient(-policy_gradient)
            # line search for step
            shs = .5 * step_direction.dot(
                self.hessian_vector_product(step_direction))

            lm = torch.sqrt(shs / self.max_kl)
            fullstep = step_direction / lm

            gdotstepdir = -policy_gradient.dot(step_direction)
            theta = self.linesearch(
                parameters_to_vector(self.model.policy_parameters()).detach(),
                fullstep, gdotstepdir / lm)
            # Update parameters of policy model
            old_model = copy.deepcopy(self.model)
            old_model.load_state_dict(self.model.state_dict())

            if any(np.isnan(theta.cpu().detach().numpy())):
                print("NaN detected. Skipping update...")
            else:
                # for param in self.model.policy_parameters():
                #     print(param)
                vector_to_parameters(theta, self.model.policy_parameters())

            kl_old_new = self.mean_kl_divergence(old_model)
            print('KL:{:10} , Entropy:{:10}'.format(kl_old_new.item(),
                                                    ent.mean().item()))

        else:
            print("Policy gradient is 0. Skipping update...")
            print(policy_gradient.shape)

        self.model.zero_grad()

        if self.continuous:
            _, _, values = self.model(self.obs)
        else:
            _, values = self.model(self.obs)

        criterion = torch.nn.MSELoss()
        critic_loss = self.value_loss_coeff * criterion(values, self.est_rs)
        critic_loss.backward()
        self.optim.step()
        print("MSELoss for Value Net:{}".format(critic_loss.item()))
    def step(self):
        """
    Executes an iteration of TRPO
    """
        # Generate rollout
        (all_observations, all_discounted_rewards, total_reward, all_actions,
         all_action_dists, self.entropy) = self.sample_trajectories()

        num_batches = (len(all_actions) // self.batch_size
                       if len(all_actions) % self.batch_size == 0
                       else len(all_actions) // self.batch_size + 1)
        for batch_num in range(num_batches):
            print("Processing batch number {}".format(batch_num + 1))
            start = batch_num * self.batch_size
            end = (batch_num + 1) * self.batch_size
            self.observations = all_observations[start:end]
            self.discounted_rewards = all_discounted_rewards[start:end]
            self.actions = all_actions[start:end]
            self.action_dists = all_action_dists[start:end]

            # Calculate the advantage of each step by taking the actual discounted rewards seen
            # and subtracting the estimated value of each state
            baseline = self.value_function_model.predict(
                self.observations).data
            discounted_rewards_tensor = Tensor(
                self.discounted_rewards).unsqueeze(1)
            advantage = discounted_rewards_tensor - baseline

            # Normalize the advantage
            self.advantage = (advantage -
                              advantage.mean()) / (advantage.std() + 1e-8)

            # Calculate the surrogate loss as the elementwise product of the advantage and the probability ratio of actions taken
            new_p = torch.cat(self.action_dists).gather(
                1, torch.cat(self.actions))
            old_p = new_p.detach() + 1e-8
            prob_ratio = new_p / old_p
            surrogate_loss = -torch.mean(prob_ratio * Variable(
                self.advantage)) - (self.ent_coeff * self.entropy)

            # Calculate the gradient of the surrogate loss
            self.policy_model.zero_grad()
            surrogate_loss.backward(retain_graph=True)
            policy_gradient = parameters_to_vector(
                [v.grad for v in self.policy_model.parameters()]).squeeze(0)

            if policy_gradient.nonzero().size()[0]:
                # Use conjugate gradient algorithm to determine the step direction in theta space
                step_direction = self.conjugate_gradient(-policy_gradient)
                step_direction_variable = Variable(
                    torch.from_numpy(step_direction))

                # Do line search to determine the stepsize of theta in the direction of step_direction
                shs = .5 * step_direction.dot(
                    self.hessian_vector_product(
                        step_direction_variable).cpu().numpy().T)
                lm = np.sqrt(shs / self.max_kl)
                fullstep = step_direction / lm
                gdotstepdir = -policy_gradient.dot(
                    step_direction_variable).data[0]
                theta = self.linesearch(
                    parameters_to_vector(self.policy_model.parameters()),
                    fullstep, gdotstepdir / lm)

                # Fit the estimated value function to the actual observed discounted rewards
                ev_before = math_utils.explained_variance_1d(
                    baseline.squeeze(1).cpu().numpy(), self.discounted_rewards)
                self.value_function_model.zero_grad()
                value_fn_params = parameters_to_vector(
                    self.value_function_model.parameters())
                self.value_function_model.fit(
                    self.observations,
                    Variable(Tensor(self.discounted_rewards)))
                ev_after = math_utils.explained_variance_1d(
                    self.value_function_model.predict(
                        self.observations).data.squeeze(1).cpu().numpy(),
                    self.discounted_rewards)
                if ev_after < ev_before or np.abs(ev_after) < 1e-4:
                    vector_to_parameters(
                        value_fn_params,
                        self.value_function_model.parameters())

                # Update parameters of policy model
                old_model = copy.deepcopy(self.policy_model)
                old_model.load_state_dict(self.policy_model.state_dict())
                if any(np.isnan(theta.data.cpu().numpy())):
                    print("NaN detected. Skipping update...")
                else:
                    vector_to_parameters(theta, self.policy_model.parameters())

                kl_old_new = self.mean_kl_divergence(old_model)
                diagnostics = collections.OrderedDict([
                    ('Total Reward', total_reward),
                    ('KL Old New', kl_old_new.data[0]),
                    ('Entropy', self.entropy.data[0]),
                    ('EV Before', ev_before), ('EV After', ev_after)
                ])
                for key, value in diagnostics.items():
                    print("{}: {}".format(key, value))

            else:
                print("Policy gradient is 0. Skipping update...")

        return total_reward
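
    # self.linesearch is not shown above. A standard backtracking sketch,
    # assuming theta and fullstep are flat parameter vectors and that
    # self.surrogate_loss(theta) returns a scalar (the backtrack count and
    # acceptance ratio are assumptions):
    def linesearch(self, theta, fullstep, expected_improve_rate,
                   max_backtracks=10, accept_ratio=0.1):
        fval = self.surrogate_loss(theta)
        for n in range(max_backtracks):
            stepfrac = 0.5 ** n
            theta_new = theta + stepfrac * fullstep
            newfval = self.surrogate_loss(theta_new)
            actual_improve = fval - newfval
            expected_improve = expected_improve_rate * stepfrac
            # Accept the first step whose actual improvement is a large enough
            # fraction of the improvement predicted by the linear model.
            if actual_improve > 0 and actual_improve / expected_improve > accept_ratio:
                return theta_new
        return theta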
    def _optimize(self, obs, acts, advs, est_rs):

        self.optim.zero_grad()
        
        obs = Tensor(obs)
        acts = Tensor(acts)
        advs = Tensor(advs).unsqueeze(1)
        est_rs = Tensor(est_rs).unsqueeze(1)

        if self.continuous:
            mean, std, values = self.model(obs)
            with torch.no_grad():
                mean_old, std_old, _ = self.model_old(obs)

            dis = Normal(mean, std)
            dis_old = Normal(mean_old, std_old)

            log_prob = dis.log_prob(acts).sum(-1, keepdim=True)
            log_prob_old = dis_old.log_prob(acts).sum(-1, keepdim=True)

            ent = dis.entropy().sum(-1, keepdim=True)

            ratio = torch.exp(log_prob - log_prob_old)

        else:

            probs, values = self.model(obs)
            with torch.no_grad():
                probs_old, _ = self.model_old(obs)

            dis = F.softmax(probs, dim=1)
            dis_old = F.softmax(probs_old, dim=1)

            acts = acts.long()

            probs = dis.gather(1, acts)
            probs_old = dis_old.gather(1, acts)

            # dis = Categorical(probs)
            # dis_old = Categorical(probs_old)
            ratio = probs / (probs_old + 1e-8)

            # log_prob     = dis.log_prob(acts).unsqueeze(1)
            # log_prob_old = dis_old.log_prob(acts).unsqueeze(1)

            ent = -(dis.log() * dis).sum(-1)


        # Normalize the advantage
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        surrogate_loss_pre_clip = ratio * advs
        surrogate_loss_clip = torch.clamp(ratio,
                                          1.0 - self.clip_para,
                                          1.0 + self.clip_para) * advs

        # print("ratio min:{} max:{}".format(ratio.detach().min().item(), ratio.detach().max().item()))

        surrogate_loss = -torch.mean(torch.min(surrogate_loss_clip, surrogate_loss_pre_clip))

        policy_loss = surrogate_loss - self.entropy_para * ent.mean()

        criterion = nn.MSELoss()
        critic_loss = criterion(values, est_rs)
        # print("Critic Loss:{}".format(critic_loss.item()))

        self.writer.add_scalar("Training/Critic_Loss", critic_loss.item(),
                               self.step_count)
        loss = policy_loss + self.value_loss_coeff * critic_loss

        loss.backward()

        self.optim.step()

        self.step_count += 1
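
# For reference, the clipped surrogate objective implemented above is
#   L_clip(theta) = E_t[ min(r_t(theta) * A_t,
#                            clip(r_t(theta), 1 - eps, 1 + eps) * A_t) ]
# with r_t(theta) = pi_theta(a_t | s_t) / pi_theta_old(a_t | s_t) and
# eps = self.clip_para. The loss above is its negation, minus an entropy
# bonus weighted by self.entropy_para, plus the value-function MSE weighted
# by self.value_loss_coeff.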