# Shared imports for the examples below; Gaussian_Policy, Softmax_Policy and
# ValueNetwork are assumed to be the policy/value network modules defined
# elsewhere in the source project.
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical, Normal


class REINFORCE:
    '''
    Implementation of the basic online REINFORCE algorithm for Gaussian policies.
    '''

    def __init__(self, num_inputs, hidden_size, action_space, lr_pi = 3e-4,\
                 lr_vf = 1e-3, baseline = False, gamma = 0.99, train_v_iters = 1):

        self.gamma = gamma
        self.action_space = action_space
        self.policy = Gaussian_Policy(num_inputs, hidden_size, action_space)  # use a different policy depending on whether the action space is continuous or not
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr = lr_pi)
        self.baseline = baseline
        self.train_v_iters = train_v_iters # how many times to run the value-network update loop

        # create value network if we want to use baseline/advantage
        if self.baseline:

            self.value_function = ValueNetwork(num_inputs, hidden_size)
            self.value_optimizer = optim.Adam(self.value_function.parameters(), lr = lr_vf)


    def select_action(self,state):

        state = torch.from_numpy(state).float().unsqueeze(0) # just to make it a Tensor obj
        # get mean and std
        mean, std = self.policy(state)

        # create normal distribution
        normal = Normal(mean, std)

        # sample action
        action = normal.sample()

        # get log prob of that action
        ln_prob = normal.log_prob(action)
        ln_prob = ln_prob.sum()
        # squash action into [-1, 1]
        action = torch.tanh(action)
        # convert the action to a numpy array for the environment
        action = action.numpy()

        return action[0], ln_prob

    def train(self, trajectory):

        '''
        The training is done using the rewards-to-go formulation of the REINFORCE policy gradient update.
        If we are using a baseline, the value network is also trained.
        trajectory: a list of the form [( state , action , lnP(a_t|s_t), reward ), ...  ]
        '''

        log_probs = [item[2] for item in trajectory]
        rewards = [item[3] for item in trajectory]
        states = [item[0] for item in trajectory]
        actions = [item[1] for item in trajectory]

        # calculate rewards-to-go
        R = 0
        returns = []
        for r in rewards[::-1]:
            R = r + self.gamma * R
            returns.insert(0, R)

        returns = torch.tensor(returns)

        # train the Value Network and calculate Advantage
        if self.baseline:

            # loop over this a couple of times
            for _ in range(self.train_v_iters):
                # calculate loss of value function using mean squared error
                value_estimates = []
                for state in states:
                    state = torch.from_numpy(state).float().unsqueeze(0) # just to make it a Tensor obj
                    value_estimates.append( self.value_function(state) )

                value_estimates = torch.stack(value_estimates).squeeze() # V(s_t) for each step of the trajectory

                v_loss = F.mse_loss(value_estimates, returns)
                # update the weights
                self.value_optimizer.zero_grad()
                v_loss.backward()
                self.value_optimizer.step()

            # calculate the advantage and detach it so the policy update
            # does not backpropagate into the value network
            advantage = (returns - value_estimates).detach()

            # calculate policy loss
            policy_loss = []
            for log_prob, adv in zip(log_probs, advantage):
                policy_loss.append(-log_prob * adv)


        else:
            policy_loss = []
            for log_prob, R in zip(log_probs, returns):
                policy_loss.append( - log_prob * R)


        policy_loss = torch.stack( policy_loss ).sum()
        # update policy weights
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()


        if self.baseline:
            return policy_loss, v_loss

        else:
            return policy_loss
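A minimal usage sketch for the REINFORCE agent above, assuming the classic gym reset/step API, Pendulum-v1 as the environment, and that action_space here is the action dimensionality expected by Gaussian_Policy; hidden size and episode count are arbitrary choices.

# Usage sketch (assumptions: classic gym API, Pendulum-v1, arbitrary hyperparameters;
# scaling the tanh-squashed action to the env's torque bounds is omitted here).
import gym

env = gym.make("Pendulum-v1")
agent = REINFORCE(num_inputs=env.observation_space.shape[0],
                  hidden_size=128,
                  action_space=env.action_space.shape[0],
                  baseline=True)

for episode in range(500):
    state, done, trajectory = env.reset(), False, []
    while not done:
        action, log_prob = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, action, log_prob, reward))
        state = next_state

    # one update per collected episode; with baseline=True both losses are returned
    policy_loss, v_loss = agent.train(trajectory)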
class Actor_Critic_discrete:
    '''
    Implementation of the basic one step actor-critic algorithm for discrete
    action spaces with baseline
    '''
    def __init__(self, num_inputs, hidden_size, action_space, lr_pi = 3e-4,\
                 lr_vf = 1e-3, baseline = False, gamma = 0.99, train_v_iters = 1):

        self.gamma = gamma
        self.action_space = action_space
        self.policy = Softmax_Policy(num_inputs, hidden_size, action_space)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr = lr_pi)
        self.baseline = baseline
        self.train_v_iters = train_v_iters # how many times to run the value-network update loop
        self.critic = ValueNetwork(num_inputs, hidden_size)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr = lr_vf)

    def select_action(self,state):

        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.policy(state)
        m = Categorical(probs)
        action = m.sample()
        log_prob = m.log_prob(action)

        return action.item(), log_prob


    def train(self, trajectory):
        '''
        Training is done using a one-step bootstrap estimate from the value network.
        trajectory: a list of the form [( state , action , lnP(a_t|s_t), reward ), ...  ]
        '''

        log_probs = [item[2] for item in trajectory]
        rewards = [item[3] for item in trajectory]
        states = [item[0] for item in trajectory]
        actions = [item[1] for item in trajectory]

        value_estimates = []
        for state in states:
            state = torch.from_numpy(state).float().unsqueeze(0) # just to make it a Tensor obj
            value_estimates.append( self.critic(state) )

        next_state_estimates = []
        for indx in range(1, len(value_estimates)):
            next_state_estimates.append(value_estimates[indx].detach().item())

        next_state_estimates.append(0)  # V(s_{T+1}) = 0 after the terminal step
        value_estimates = torch.stack(value_estimates).squeeze()

        bootstrap_estimate = []
        for indx in range(len(rewards)):
            G = rewards[indx] + self.gamma * next_state_estimates[indx]
            bootstrap_estimate.append(G)
        bootstrap_estimate = torch.Tensor(bootstrap_estimate)

        # calculate the advantage and the policy loss
        if self.baseline:

            # advantage is detached so the policy update does not
            # backpropagate into the critic
            advantage = (bootstrap_estimate - value_estimates).detach()

            policy_loss = []
            for log_prob, adv in zip(log_probs, advantage):
                policy_loss.append(-log_prob * adv)

        else:
            policy_loss = []
            for log_prob, R in zip(log_probs, bootstrap_estimate):
                policy_loss.append(-log_prob * R)

        # update value network; the critic forward pass is re-run each
        # iteration so the graph stays fresh when train_v_iters > 1
        for _ in range(self.train_v_iters):
            value_estimates = torch.stack(
                [self.critic(torch.from_numpy(s).float().unsqueeze(0)) for s in states]
            ).squeeze()
            v_loss = F.mse_loss(value_estimates, bootstrap_estimate)
            # update the weights
            self.critic_optimizer.zero_grad()
            v_loss.backward()
            self.critic_optimizer.step()

        # update policy network
        policy_loss = torch.stack( policy_loss ).sum()
        # update policy weights
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()


        return policy_loss, v_loss
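A similar sketch for the discrete actor-critic, assuming CartPole-v1 with the classic gym API and that action_space is the number of discrete actions expected by Softmax_Policy; select_action returns a Python int that can be passed straight to env.step.

# Usage sketch (assumptions: classic gym API, CartPole-v1, arbitrary hyperparameters).
import gym

env = gym.make("CartPole-v1")
agent = Actor_Critic_discrete(num_inputs=env.observation_space.shape[0],
                              hidden_size=64,
                              action_space=env.action_space.n,
                              baseline=True)

for episode in range(300):
    state, done, trajectory = env.reset(), False, []
    while not done:
        action, log_prob = agent.select_action(state)        # action is an int
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, action, log_prob, reward))
        state = next_state
    policy_loss, v_loss = agent.train(trajectory)             # one-step bootstrap update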
class Actor_Critic:
    '''
    Implementation of the basic one step actor-critic algorithm for Gaussian
    policies with baseline
    '''

    def __init__(self, num_inputs, hidden_size, action_space, lr_pi = 3e-4,\
                 lr_vf = 1e-3, baseline = False, gamma = 0.99, train_v_iters = 1):

        self.gamma = gamma
        self.action_space = action_space
        self.policy = Gaussian_Policy(
            num_inputs, hidden_size, action_space
        )  # use a different policy depending on whether the action space is continuous or not
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=lr_pi)
        self.baseline = baseline
        self.train_v_iters = train_v_iters  # how many times to run the value-network update loop
        self.critic = ValueNetwork(num_inputs, hidden_size)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_vf)

    def select_action(self, state):

        state = torch.from_numpy(state).float().unsqueeze(
            0)  # just to make it a Tensor obj
        # get mean and std
        mean, std = self.policy(state)

        # create normal distribution
        normal = Normal(mean, std)

        # sample action
        action = normal.sample()

        # get log prob of that action
        ln_prob = normal.log_prob(action)
        ln_prob = ln_prob.sum()
        # squash action into [-1, 1]
        action = torch.tanh(action)
        # turn actions into numpy array
        action = action.numpy()

        return action[0], ln_prob

    def train(self, trajectory):
        '''
        Training is done using a one-step bootstrap estimate from the critic network.
        If we are using a baseline, the advantage (bootstrap target minus the critic's value estimate) weights the policy gradient.
        trajectory: a list of the form [( state , action , lnP(a_t|s_t), reward ), ...  ]
        '''

        log_probs = [item[2] for item in trajectory]
        rewards = [item[3] for item in trajectory]
        states = [item[0] for item in trajectory]
        actions = [item[1] for item in trajectory]

        value_estimates = []
        for state in states:
            state = torch.from_numpy(state).float().unsqueeze(
                0)  # just to make it a Tensor obj
            value_estimates.append(self.critic(state))

        next_state_estimates = []
        for indx in range(1, len(value_estimates)):
            next_state_estimates.append(value_estimates[indx].detach().item())

        next_state_estimates.append(0)  # V(s_{T+1}) = 0 after the terminal step

        value_estimates = torch.stack(value_estimates).squeeze()

        bootstrap_estimate = []
        for indx in range(len(rewards)):
            G = rewards[indx] + self.gamma * next_state_estimates[indx]
            bootstrap_estimate.append(G)
        bootstrap_estimate = torch.Tensor(bootstrap_estimate)

        # calculate the advantage and the policy loss
        if self.baseline:

            # advantage is detached so the policy update does not
            # backpropagate into the critic
            advantage = (bootstrap_estimate - value_estimates).detach()

            policy_loss = []
            for log_prob, adv in zip(log_probs, advantage):
                policy_loss.append(-log_prob * adv)

        else:
            policy_loss = []
            for log_prob, R in zip(log_probs, bootstrap_estimate):
                policy_loss.append(-log_prob * R)

        # update value network; the critic forward pass is re-run each
        # iteration so the graph stays fresh when train_v_iters > 1
        for _ in range(self.train_v_iters):
            value_estimates = torch.stack(
                [self.critic(torch.from_numpy(s).float().unsqueeze(0)) for s in states]
            ).squeeze()
            v_loss = F.mse_loss(value_estimates, bootstrap_estimate)
            # update the weights
            self.critic_optimizer.zero_grad()
            v_loss.backward()
            self.critic_optimizer.step()

        # update policy network
        policy_loss = torch.stack(policy_loss).sum()
        # update policy weights
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        return policy_loss, v_loss
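To make the one-step bootstrap target used by both actor-critic classes concrete, here is a small standalone check with made-up numbers (my own toy example, not from the source): G_t = r_t + gamma * V(s_{t+1}), with V taken as 0 after the terminal step, and the advantage A_t = G_t - V(s_t) is what weights the policy gradient when baseline=True.

# Toy check of the bootstrap target and advantage (made-up values).
import torch

gamma = 0.99
rewards = torch.tensor([1.0, 1.0, 1.0])                 # toy 3-step episode
values = torch.tensor([0.5, 0.4, 0.3])                  # pretend critic outputs V(s_t)
next_values = torch.cat([values[1:], torch.zeros(1)])   # V(s_{t+1}), zero at terminal

targets = rewards + gamma * next_values                 # bootstrap estimates G_t
advantage = targets - values                            # policy-gradient weights

print(targets)    # tensor([1.3960, 1.2970, 1.0000])
print(advantage)  # tensor([0.8960, 0.8970, 0.7000])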