Code Example #1
for i_episode in range(MAX_EPISODE):
    observation = env.reset()
    t = 0
    track_reward = []
    while True:
        if RENDER: env.render()

        action = actor.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        if done: reward = -20

        track_reward.append(reward)

        td_error = critic.learn(observation, reward, observation_)
        actor.learn(observation, action, td_error)

        observation = observation_
        t += 1

        if done or t > MAX_EP_STEP:
            ep_rs_sum = sum(track_reward)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_THRESHOLD: RENDER = True
            print "Episode: %d | Reward: %d" % (i_episode, running_reward)
            break
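
The loop above assumes an Actor/Critic pair in which critic.learn(s, r, s_) returns the one-step TD error and actor.learn(s, a, td_error) takes a policy-gradient step weighted by that error. A minimal sketch of that contract, using hypothetical tabular NumPy stand-ins rather than the original TensorFlow classes (GAMMA and the learning rates are assumed values):

import numpy as np

GAMMA = 0.9  # assumed discount factor

class TabularCritic:
    """Hypothetical stand-in illustrating the TD(0) update the loop expects."""
    def __init__(self, n_states, lr=0.01):
        self.V = np.zeros(n_states)
        self.lr = lr

    def learn(self, s, r, s_):
        td_error = r + GAMMA * self.V[s_] - self.V[s]  # r + gamma*V(s') - V(s)
        self.V[s] += self.lr * td_error                # move V(s) toward the TD target
        return td_error

class TabularActor:
    """Hypothetical stand-in: action preferences with a softmax policy."""
    def __init__(self, n_states, n_actions, lr=0.001):
        self.prefs = np.zeros((n_states, n_actions))
        self.lr = lr

    def _policy(self, s):
        p = np.exp(self.prefs[s] - self.prefs[s].max())
        return p / p.sum()

    def choose_action(self, s):
        p = self._policy(s)
        return np.random.choice(len(p), p=p)

    def learn(self, s, a, td_error):
        # grad of log pi(a|s) w.r.t. the preferences is one_hot(a) - pi(.|s)
        grad_log_pi = -self._policy(s)
        grad_log_pi[a] += 1.0
        self.prefs[s] += self.lr * td_error * grad_log_pi
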
Code Example #2
Action_dim = 4

sess = tf.Session()

actor = Actor(sess, State_dim=State_dim, Action_dim=Action_dim, lr=Actor_lr)
critic = Critic(sess, State_dim=State_dim, lr=Critic_lr)

sess.run(tf.global_variables_initializer())

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = 0
    total_action = []
    done = False
    while not done and t < 200:

        a = actor.choose_action(s)

        s_, r, done = env.step(env.t_action[a])
        total_action.append(env.t_action[a])
        if done: r = -200
        td_error = critic.learn(s, -r, s_)
        actor.learn(s, a, td_error)

        s = s_
        track_r += r
        t += 1
    print("episode:", i_episode, "  tracked actions to attempt goal:",
          total_action)
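
Here the loop assumes actor.choose_action samples a discrete action index from the softmax output of the policy network, and that env.t_action maps that index to an environment-specific action. A hedged TF1-style sketch of the sampling step (the op and placeholder names are assumptions, not taken from the original Actor class):

import numpy as np

def choose_action(sess, acts_prob_op, state_ph, observation):
    """Sample an action index from the policy's softmax probabilities."""
    probs = sess.run(acts_prob_op,
                     feed_dict={state_ph: observation[np.newaxis, :]})
    return np.random.choice(probs.shape[1], p=probs.ravel())
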
Code Example #3
class Agent:
    def __init__(self, input_dim, output_dim, lr, gamma, tau, clipnorm,
                 verbose):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.actions = range(output_dim)
        self.lr = lr
        self.gamma = gamma
        self.tau = tau

        #Buffer for experience replay
        self.S = []
        self.A = []
        self.R = []
        self.S1 = []
        self.D = []
        self.memory_size = 10**3

        #Make actor and critic
        self.actor = Actor(input_dim, output_dim, lr, gamma, tau, clipnorm,
                           verbose)
        self.critic = Critic(input_dim, output_dim, lr, gamma, tau, clipnorm,
                             verbose)

    def learn(self):

        #get batch
        S, A, R, S1, D = self.get_batch()

        #Find advantage
        #G = self.find_discounted_return(R)
        #V = self.critic.model.predict(S)
        #V.resize(len(V))
        #adv = G - V

        #Train critic
        #Reshape the done flags and rewards into column vectors for the critic
        D0 = np.array(D).reshape(-1, 1)
        R0 = np.array(R).reshape(-1, 1)
        V1 = self.critic.model.predict(S1)
        self.critic.learn(S, R0, D0, V1)
        self.soft_update_target_network(self.critic)

        #Find advantage
        V, V1 = self.critic.model.predict(S), self.critic.model.predict(S1)
        adv = R0 + self.gamma * V1 - V
        adv = adv.flatten()

        #train actor
        self.actor.learn(S, A, adv)
        self.soft_update_target_network(self.actor)

        #Clear memory
        self.S, self.A, self.R, self.S1, self.D = [], [], [], [], []

    def find_discounted_return(self, rewards):
        R = np.zeros_like(rewards, dtype=np.float64)  #float buffer so integer rewards aren't truncated
        rolling_sum = 0
        for t in reversed(range(len(R))):
            rolling_sum = rolling_sum * self.gamma + rewards[t]
            R[t] = rolling_sum

        #Normalize rewards
        R -= np.mean(R)
        R /= np.std(R)

        return np.array(R)

    def remember(self, state, action, reward, next_state, done):
        """ Add experience to buffer """

        self.S.append(state)
        #the optimizers expect one-hot encoded actions
        action_onehot = to_categorical(action, self.output_dim)
        self.A.append(action_onehot)
        self.R.append(reward)
        self.S1.append(next_state)
        self.D.append(done * 1.0)

    def get_batch(self):
        #indices = np.random.choice(range(len(self.S)), self.batchsize)
        indices = range(len(self.S))
        S = np.array(self.S)[indices]
        A = np.array(self.A)[indices]
        R = np.array(self.R)[indices]
        S1 = np.array(self.S1)[indices]
        D = np.array(self.D)[indices]
        return S, A, R, S1, D

    def act(self, state):
        """ Choose action according to softmax """

        probs = self.actor.model.predict(state)[0]
        action = np.random.choice(self.actions, p=probs)
        return action

    def make_tensor(self, vec):
        """ Turns a 1D array, x, into a 2d array / tensor =  [x]
            So that keras can read it
        """

        vec = np.reshape(vec, (1, len(vec)))
        return vec

    def save_target_weights(self):
        """ Saves the weights of the target 
            networks (only use the target during
            testing, so don't need to save the
            behavior network)
        """

        #Create directory if it doesn't exist
        dir_name = 'network_weights/'
        if not os.path.exists(os.path.dirname(dir_name)):
            os.makedirs(os.path.dirname(dir_name))

        #Now save the weights. I'm choosing to ID the files by gamma, lr, tau
        #(np.save appends the .npy extension itself)
        pars_tag = '_gamma_' + str(self.gamma) + '_lr_' + str(self.lr) \
                   + '_tau_' + str(self.tau)

        #Actor target network
        filename = 'network_weights/actor_target'
        actor_pars = self.actor.target_model.get_weights()
        np.save(filename + pars_tag, actor_pars)

        #Critic target network
        filename = 'network_weights/critic_target'
        critic_pars = self.critic.target_model.get_weights()
        np.save(filename + pars_tag, critic_pars)

    def load_target_weights(self, gamma, lr, tau):
        """ Loads the weights of the target 
            networks, previously created using
            the save_target_wieghts() function
        """

        #Now save the weights. I'm choosing ID by gamma, lr, tau
        pars_tag = '_gamma_' + str(gamma) + '_lr_' + str(lr) + '_tau_' + str(
            tau) + '.npy'

        #Actor target network
        filename = 'network_weights/actor_target'
        actor_pars = np.load(filename + pars_tag)
        self.actor.target_model.set_weights(actor_pars)

        #Critic target network
        filename = 'network_weights/critic_target'
        critic_pars = np.load(filename + pars_tag)
        self.critic.target_model.set_weights(critic_pars)

    def soft_update_target_network(self, net):
        """
        Updates the parameters of the target network according to the following,
        where tau is a hyperparameter:

        theta_target = (1 - tau) * theta_target + tau * theta_behavior

        Input: net = an Actor or Critic instance
        """

        # Weight lists have the form [W1, b1, W2, b2, ...],
        # where Wi / bi are the weights / biases of layer i
        pars_behavior = net.model.get_weights()
        pars_target = net.target_model.get_weights()

        ctr = 0
        for par_behavior, par_target in zip(pars_behavior, pars_target):
            par_target = par_target * (1 - self.tau) + par_behavior * self.tau
            pars_target[ctr] = par_target
            ctr += 1

        net.target_model.set_weights(pars_target)
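
This Agent stores a full episode of transitions, trains once on all of them (get_batch returns every stored transition), and then clears its buffer, i.e. one on-policy update per episode. A hedged sketch of a driving loop, assuming a Gym-style CartPole-v0 environment and illustrative hyperparameter values:

import gym

env = gym.make('CartPole-v0')  # assumed environment
agent = Agent(input_dim=env.observation_space.shape[0],
              output_dim=env.action_space.n,
              lr=1e-3, gamma=0.99, tau=0.1, clipnorm=1.0, verbose=0)

for episode in range(500):
    state = env.reset()
    done, ep_reward = False, 0.0
    while not done:
        action = agent.act(agent.make_tensor(state))          # sample from softmax policy
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        state, ep_reward = next_state, ep_reward + reward
    agent.learn()                                             # one update per episode
    print("episode %d | reward %.1f" % (episode, ep_reward))
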
Code Example #4
class Agent:
    def __init__(self, input_dim, output_dim, lr, gamma, seed_num=False):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.actions = range(output_dim)
        self.lr = lr
        self.gamma = gamma
        self.tau = 0.1
        self.seed_num = seed_num

        #For experience replay
        self.memory = []
        self.memory_size = 10000
        self.batchsize = 32

        #Actor & critic
        self.actor = Actor(input_dim, output_dim, self.lr)
        self.critic = Critic(input_dim, output_dim, self.lr, self.gamma)

        if seed_num != False:
            set_random_seed(seed_num)  #seed tensorflow
            seed(seed_num)  #seed numpy

    def remember(self, state, action, reward, next_state, done):
        event = (state, action, reward, next_state, done)
        if len(self.memory) >= self.memory_size:
            self.memory.pop(0)  #drop the oldest experience once the buffer is full
        self.memory.append(event)

    def act(self, state):

        #The Gumbel-softmax trick outputs an almost one-hot vector (i.e. the elements
        #sum to one, with one element much larger than the others).
        #I need to turn this into a 'hard' one-hot vector.
        action_soft_onehot = self.actor.model.predict(state)[0]
        action_index = np.argmax(action_soft_onehot)
        action_hard_onehot = np.array([
            1 if i == action_index else 0
            for i in range(len(action_soft_onehot))
        ])
        return action_hard_onehot

    def extract_from_batch(self, batch):
        states, actions = [], []
        for event in batch:
            state, action, reward, next_state, done = event
            states.append(state)
            actions.append(action)
        return np.array(states), np.array(actions)

    def train_models(self):

        #Do experience replay
        if len(self.memory) < self.batchsize:
            minibatch = self.memory
        else:
            minibatch = random.sample(self.memory, self.batchsize)

        #Actor update
        states, actions = self.extract_from_batch(minibatch)
        grad_actions = self.critic.find_action_grads([states, actions])[0]
        self.actor.learn(states, grad_actions)
        self.soft_update_target_network(self.actor)

        #Critic update
        self.critic.learn(minibatch)
        self.soft_update_target_network(self.critic)

    def soft_update_target_network(self, net):
        """
        Updates the parameters of the target network according to the following,
        where tau is a hyperparameter:

        theta_target = (1 - tau) * theta_target + tau * theta_behavior

        Input: net = an Actor or Critic instance
        """

        # Weight lists have the form [W1, b1, W2, b2, ...],
        # where Wi / bi are the weights / biases of layer i
        pars_behavior = net.model.get_weights()
        pars_target = net.target_model.get_weights()

        ctr = 0
        for par_behavior, par_target in zip(pars_behavior, pars_target):
            par_target = par_target * (1 - self.tau) + par_behavior * self.tau
            pars_target[ctr] = par_target
            ctr += 1

        net.target_model.set_weights(pars_target)

    def save_target_weights(self):
        """ Saves the weights of the target 
            network (only use the target during
            testing, so don't need to save tje
            behavior)
        """

        #Create directory if it doesn't exist
        dir_name = 'network_weights/'
        if not os.path.exists(os.path.dirname(dir_name)):
            os.makedirs(os.path.dirname(dir_name))

        #Now save the weights. I'm choosing ID by gamma, lr, tau
        if self.seed_num == False:
            pars_tag = '_gamma_' + str(self.gamma) + '_lr_' + str(self.lr) \
                       + '_tau_' + str(self.tau)
        else:
            pars_tag = '_gamma_' + str(self.gamma) + '_lr_' + str(self.lr) \
                       + '_tau_' + str(self.tau) + '_seed_' + str(self.seed_num)

        #Actor target network
        filename = 'network_weights/actor_target'
        actor_pars = self.actor.target_model.get_weights()
        np.save(filename + pars_tag, actor_pars)

        #Critic target network
        filename = 'network_weights/critic_target'
        critic_pars = self.critic.target_model.get_weights()
        np.save(filename + pars_tag, critic_pars)

    def load_target_weights(self, gamma, lr, tau):
        """ Loads the weights of the target 
            network, previously created using
            the save_target_wieghts() function
        """

        #Now save the weights. I'm choosing ID by gamma, lr, tau
        #Now save the weights. I'm choosing ID by gamma, lr, tau
        if self.seed_num == False:
            pars_tag = '_gamma_' + str(self.gamma) + '_lr_' + str(
                self.lr) + '_tau_' + str(self.tau) + '.npy'
        else:
            pars_tag = '_gamma_' + str(self.gamma)+'_lr_'+str(self.lr)+'_tau_'+str(self.tau)+'_seed_' \
            +str(self.seed_num)+ '.npy'

        #Actor target network
        filename = 'network_weights/actor_target'
        actor_pars = np.load(filename + pars_tag)
        self.actor.target_model.set_weights(actor_pars)

        #Critic target network
        filename = 'network_weights/critic_target'
        critic_pars = np.load(filename + pars_tag)
        self.critic.target_model.set_weights(critic_pars)
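
The actor update in train_models relies on Critic.find_action_grads, which is assumed to return the gradients dQ(s, a)/da used for a deterministic-policy-gradient step. A hedged Keras/TF1-style sketch of how such a function could be built (the layer sizes and variable names are assumptions, not taken from the original Critic class):

import keras.backend as K
from keras.layers import Concatenate, Dense, Input
from keras.models import Model

def build_critic_with_action_grads(state_dim, action_dim):
    """Q(s, a) network plus a backend function returning dQ/da."""
    state_in = Input(shape=(state_dim,))
    action_in = Input(shape=(action_dim,))
    x = Dense(64, activation='relu')(Concatenate()([state_in, action_in]))
    q_out = Dense(1, activation='linear')(x)
    model = Model([state_in, action_in], q_out)
    model.compile(optimizer='adam', loss='mse')

    # Gradient of Q with respect to the action input, evaluated on a batch
    find_action_grads = K.function([state_in, action_in],
                                   K.gradients(q_out, action_in))
    return model, find_action_grads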