Example 1
class DDPG:
    """
        Deep Deterministic Policy Gradient Algorithm.
        Sourced By: https://github.com/stevenpjg/ddpg-aigym/blob/master/ddpg.py
    """
    def __init__(self, num_states, num_actions, action_space_high,
                 action_space_low, is_batch_norm):

        self.num_states = num_states
        self.num_actions = num_actions
        self.action_space_high = action_space_high
        self.action_space_low = action_space_low

        # Batch normalisation disabled: is_batch_norm is accepted but not used here.
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        self.actor_net = ActorNet(self.num_states, self.num_actions)

        # Initialize replay memory
        self.replay_memory = deque()

        # Initialize time step
        self.time_step = 0
        self.counter = 0

        action_max = np.array(action_space_high).tolist()
        action_min = np.array(action_space_low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)

    # observation_1 = state at time t
    # observation_2 = state at time (t + 1)
    def add_experience(self, observation_1, observation_2, action, reward,
                       done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done

        self.replay_memory.append((self.observation_1, self.observation_2,
                                   self.action, self.reward, self.done))
        self.time_step = self.time_step + 1

        # When the replay memory is full, discard the oldest entry
        if (len(self.replay_memory) > REPLAY_MEMORY_SIZE):
            self.replay_memory.popleft()

    def minibatches(self):
        # Sample BATCH_SIZE transitions from the replay memory.
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        # Store S(t), S(t + 1), action, reward, and done as separate batches.
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array(self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(
            self.action_batch, [len(self.action_batch), self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)

    def train(self):
        print "######## Starting to train..."
        # batch 뽑기
        self.minibatches()
        # S(t + 1) 정보를 가지고 time (t + 1)에서의 action batch 생성
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(
            self.state_t_1_batch)
        # Q`(S(t + 1), a(t + 1))
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,
                                                       self.action_t_1_batch)
        print "#### Evaluated ciritic value(Q value)"
        print q_t_1
        self.y_i_batch = []  # reward batch 의 item 을 가공하여 저장하는 곳

        for i in range(0, BATCH_SIZE):

            # If done == True the transition reached a terminal state,
            # so the target is just the reward.
            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            # Otherwise add (GAMMA * target Q value) to the reward.
            else:
                self.y_i_batch.append(self.reward_batch[i] +
                                      GAMMA * q_t_1[i][0])

        self.y_i_batch = np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch, [len(self.y_i_batch), 1])

        # Update the critic network by minimizing the loss: make the Q value
        # predicted from (state_t_batch, action_batch) match y_i_batch.
        self.critic_net.train_critic(self.state_t_batch, self.action_batch,
                                     self.y_i_batch)

        # Update the actor according to the gradients
        action_for_delQ = self.evaluate_actor(self.state_t_batch)

        if is_grad_inverter:
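            # grad_inv.invert presumably applies the "inverting gradients" trick
            # (Hausknecht & Stone): components of dQ/da that would push an action
            # outside its [low, high] bounds are scaled down or sign-flipped.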
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)
            self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ)
        else:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)[0]

        # Train the actor network
        self.actor_net.train_actor(self.state_t_batch, self.del_Q_a)

        # Update the target critic and target actor networks
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
        self.critic_net.save_critic("model/critic_model.ckpt")
        self.actor_net.save_actor("model/actor_model.ckpt")
        print "######## Finish to train ..."
Example 2
class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''

    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self, env, is_batch_norm=False, is_grad_inverter=True):
        super().__init__(env)
        assert isinstance(env.action_space, Box), "action space must be continuous"
        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size,
                                           self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size,
                                         self.action_space_size)

        else:
            self.critic_net = CriticNet(self.observation_space_size,
                                        self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size,
                                      self.action_space_size)

        self.is_grad_inverter = is_grad_inverter
        self.replay_memory = deque()

        self.time_step = 0

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    def add_data_fetch(self, df):
        self.data_fetch = df
        self.data_fetch.add_timers(['ev_p_t', 'ev_q_t', 'y',
                                    'train_q', 'train_p',
                                    'up_q_t', 'up_p_t'],
                                   prefix='t_agent_training_')
        self.data_fetch.add_array('actors_result')

    def get_name(self):
        return 'DDPG' + super().get_name()

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)
        self.data_fetch.add_to_array('actors_result', result)
        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)
        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)

        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]), True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]), False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]), True)
        # done
        done = np.array([item['done'] for item in batch])

        return state, action, reward, state_2, done

    def train(self):
        # sample a random minibatch of N transitions from R
        state, action, reward, state_2, done = self.minibatches()

        actual_batch_size = len(state)

        self.data_fetch.reset_timers()
        target_action = self.actor_net.evaluate_target_actor(state)
        self.data_fetch.sample_timer('ev_p_t')  # ------

        # Q'(s_i+1,a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)
        self.data_fetch.sample_timer('ev_q_t')  # ------

        y = []
        for i in range(0, actual_batch_size):

            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] + type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t

        y = np.reshape(np.array(y), [len(y), 1])
        self.data_fetch.sample_timer('y')  # ------

        # Update critic by minimizing the loss
        self.critic_net.train_critic(state, action, y)
        self.data_fetch.sample_timer('train_q')  # ------
        # Update actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(state)  # don't need wolp action

        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)[0]  # /BATCH_SIZE

        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)
        self.data_fetch.sample_timer('train_p')  # ------

        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.data_fetch.sample_timer('up_q_t')  # ------
        self.actor_net.update_target_actor()
        self.data_fetch.sample_timer('up_p_t')  # ------
Example 3
class DDPG:
    """ Deep Deterministic Policy Gradient Algorithm"""
    def __init__(self,env):
        self.env = env 
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]
        
        #Initialize Actor Network:
        action_bound = env.action_space.high
        self.critic_net = CriticNet(self.num_states, self.num_actions) #self.actor_net is an object
        self.actor_net = ActorNet(self.num_states, self.num_actions, action_bound)
        
        #Initialize replay buffer:
        self.replay_memory = deque()
        
        #Initialize time step:
        self.time_step = 0
        
        #Invert gradients (soft thresholding)
        action_bounds = [[3], [-3]]  #upper and lower bounds of the action space
        #action_bounds structure for higher-dimensional actions:
        #[[max_of_action_dim_0, max_of_action_dim_1, ..., max_of_action_dim_10],
        # [min_of_action_dim_0, min_of_action_dim_1, ..., min_of_action_dim_10]]
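        # For example (hypothetical values), a 2-dimensional action space with
        # highs [2, 1] and lows [-2, -1] would use:
        #     action_bounds = [[2, 1], [-2, -1]]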
        
        
        self.grad_inv = grad_inverter(action_bounds)
        
        
    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)
    
    def add_experience(self, observation_1, observation_2, action, reward, done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2, self.action, self.reward,self.done))
        self.time_step = self.time_step + 1
        if(len(self.replay_memory)>REPLAY_MEMORY_SIZE):
            self.replay_memory.popleft()
            
        
    def minibatches(self):
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        #state t
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        #state t+1        
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array( self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(self.action_batch,[len(self.action_batch),self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)  
                  
                 
    def train(self):
        #sample a random minibatch of N transitions from R
        self.minibatches()
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(self.state_t_1_batch)
        #Q'(s_i+1,a_i+1)        
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,self.action_t_1_batch) 
        
        
        self.y_i_batch=[]         
        for i in range(0,BATCH_SIZE):
                           
            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            else:
                
                self.y_i_batch.append(self.reward_batch[i] + GAMMA*q_t_1[i][0])                 
        
        self.y_i_batch=np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch,[len(self.y_i_batch),1])
        # Update critic by minimizing the loss
        self.critic_net.train_critic(self.state_t_batch, self.action_batch,self.y_i_batch)
        
        # Update actor proportional to the gradients:
        
        #Actions used for computing delQ/dela
        action_for_delQ = self.evaluate_actor(self.state_t_batch) #consider whether to use this action or the action_t_batch itself
        self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch,action_for_delQ)#/BATCH_SIZE
        self.del_Q_a = self.grad_inv.invert(self.del_Q_a,action_for_delQ)
        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
               
        
        self.actor_net.train_actor(self.state_t_batch,self.del_Q_a)
 
        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
Example 4
class DDPG:
    """ Deep Deterministic Policy Gradient Algorithm"""
    def __init__(self, env, is_batch_norm):
        self.env = env
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.num_states, self.num_actions)
            self.actor_net = ActorNet_bn(self.num_states, self.num_actions)

        else:
            self.critic_net = CriticNet(self.num_states, self.num_actions)
            self.actor_net = ActorNet(self.num_states, self.num_actions)

        #Initialize replay buffer:
        self.replay_memory = deque()

        #Initialize time step:
        self.time_step = 0
        self.counter = 0

        action_max = np.array(env.action_space.high).tolist()
        action_min = np.array(env.action_space.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)

    def add_experience(self, observation_1, observation_2, action, reward,
                       done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2,
                                   self.action, self.reward, self.done))
        self.time_step = self.time_step + 1
        if (len(self.replay_memory) > REPLAY_MEMORY_SIZE):
            self.replay_memory.popleft()

    def minibatches(self):
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        #state t
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        #state t+1
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array(self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(
            self.action_batch, [len(self.action_batch), self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)

    def train(self):
        #sample a random minibatch of N transitions from R
        self.minibatches()
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(
            self.state_t_1_batch)
        #Q'(s_i+1,a_i+1)
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,
                                                       self.action_t_1_batch)
        self.y_i_batch = []
        for i in range(0, BATCH_SIZE):

            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            else:

                self.y_i_batch.append(self.reward_batch[i] +
                                      GAMMA * q_t_1[i][0])

        self.y_i_batch = np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch, [len(self.y_i_batch), 1])

        # Update critic by minimizing the loss
        self.critic_net.train_critic(self.state_t_batch, self.action_batch,
                                     self.y_i_batch)

        # Update actor proportional to the gradients:
        action_for_delQ = self.evaluate_actor(self.state_t_batch)

        if is_grad_inverter:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)  #/BATCH_SIZE
            self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ)
        else:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)[0]  #/BATCH_SIZE

        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(self.state_t_batch, self.del_Q_a)

        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
Example 5
class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''

    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self,
                 env,
                 dir,
                 is_batch_norm=False,
                 is_grad_inverter=True,
                 training_flag=True):
        super().__init__(env, dir)
        assert isinstance(env.action_space,
                          Box), "action space must be continuous"

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size,
                                           self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size,
                                         self.action_space_size)

        else:
            self.critic_net = CriticNet(self.observation_space_size,
                                        self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size,
                                      self.action_space_size)

        self.is_grad_inverter = is_grad_inverter
        self.training_flag = training_flag
        self.replay_memory = deque()

        self.time_step = 0

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

        self.data_fetch = None

    def add_data_fetch(self, df):
        self.data_fetch = df

    def get_short_name(self):
        return 'DDPG'

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)

        if self.data_fetch:
            self.data_fetch.set_actors_action(result[0].tolist())

        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)

        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)

        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]),
                                 True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]),
                                  False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]),
                                   True)
        # done
        done = np.array([item['done'] for item in batch])

        return state, action, reward, state_2, done

    def train(self):
        if not self.training_flag:
            return

        # sample a random minibatch of N transitions from R
        state, action, reward, state_2, done = self.minibatches()

        actual_batch_size = len(state)

        target_action = self.actor_net.evaluate_target_actor(state)

        # Q'(s_i+1,a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)

        y = []
        for i in range(0, actual_batch_size):

            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] +
                         type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t

        y = np.reshape(np.array(y), [len(y), 1])

        # Update critic by minimizing the loss
        self.critic_net.train_critic(state, action, y)

        # Update actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(
            state)  # don't need wolp action

        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)[0]  # /BATCH_SIZE

        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)

        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()

    def save_agent(self, force=False, comment="default"):
        path = "{}/weights/{}".format(self.get_dir(), comment)
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
            print("Saving agent in", path)
            self.actor_net.save_model(path + '/actor.ckpt')
            self.critic_net.save_model(path + '/critic.ckpt')
        else:
            if force:
                print("Overwrite old agent in", path)
                self.actor_net.save_model(path + '/actor.ckpt')
                self.critic_net.save_model(path + '/critic.ckpt')
            else:
                print("Save aborted. An agent is already saved in ", path)

    def load_agent(self, agent_name=None, comment="default"):
        if agent_name is None:
            path = "{}/weights/{}".format(self.get_dir(), comment)
        else:
            path = "{}/{}/{}/weights/{}".format(self.result_dir, agent_name,
                                                self.env.spec.id, comment)
        if os.path.exists(path):
            print("Loading agent saved in", path)
            self.actor_net.load_model(path + '/actor.ckpt')
            self.critic_net.load_model(path + '/critic.ckpt')
        else:
            print("Agent not found in", path)

    def close_session(self):
        self.actor_net.close()
        self.critic_net.close()
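A brief usage sketch of the persistence methods above (hypothetical values; env must be a continuous-action Gym environment, and the directory argument is whatever the surrounding Agent base class expects):
agent = DDPGAgent(env, "results/run1")        # env and result directory as the constructor expects
# ... interact and train via agent.act(...) / agent.observe(...) ...
agent.save_agent(force=True, comment="best")  # writes actor.ckpt / critic.ckpt under get_dir()/weights/best/
agent.load_agent(comment="best")              # restores the checkpoints from the same location
agent.close_session()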
Example 6
class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''

    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self, env, is_batch_norm=False, is_grad_inverter=True):
        super().__init__(env)

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size,
                                           self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size,
                                         self.action_space_size)

        else:
            self.critic_net = CriticNet(self.observation_space_size,
                                        self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size,
                                      self.action_space_size)

        self.is_grad_inverter = is_grad_inverter
        self.replay_memory = deque()

        self.time_step = 0

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    # EREZ ADDED
    def save(self, path):

        # TODO -- robust handling of where to put things

        # Everything from the super class can be pickled easily
        attrs = Container()
        saved_critic = self.critic_net.save(path)
        saved_actor = self.actor_net.save(path)

        i_vars = vars(self)
        keys = i_vars.keys()
        for key in keys:
            tmp = getattr(self, key)

            if not (isinstance(tmp, (CriticNet, ActorNet, grad_inverter))):
                setattr(attrs, key, tmp)

        file = os.path.join(
            path, "agent_data.pkl")  # TODO -- come up with a not stupid name
        with open(file, "wb") as f:
            pickle.dump(attrs, f, pickle.HIGHEST_PROTOCOL)

    # EREZ ADDED
    # def restore(self, file): -- right now it's just in the subclass
    def restore(self, path):
        print("restoring the agent")

        file = os.path.join(
            path, "agent_data.pkl")  # TODO -- come up with a not stupid name

        with open(file, "rb") as f:
            dump = pickle.load(f)

            i_vars = vars(dump)
            keys = i_vars.keys()
            for key in keys:
                tmp = getattr(dump, key)
                setattr(self, key, tmp)

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

        # Now replace the networks
        # IGNORE THE "IS BATCH " CONDITION FOR NOW
        saved_critic_net = CriticNet(self.observation_space_size,
                                     self.action_space_size)
        saved_actor_net = ActorNet(self.observation_space_size,
                                   self.action_space_size)

        # Load in the saved graphs
        critic_file = os.path.join(path, "critic_net.ckpt")
        saved_critic_net.restore(critic_file)
        actor_file = os.path.join(path, "actor_net.ckpt")
        saved_actor_net.restore(actor_file)

        self.critic_net = saved_critic_net
        self.actor_net = saved_actor_net

    def add_data_fetch(self, df):
        self.data_fetch = df

    def get_name(self):
        return 'DDPG' + super().get_name()

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)
        self.data_fetch.set_actors_action(result[0].tolist())
        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)
        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)

        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]),
                                 True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]),
                                  False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]),
                                   True)
        # done
        done = np.array([item['done'] for item in batch])

        return state, action, reward, state_2, done

    def train(self):
        # sample a random minibatch of N transitions from R
        state, action, reward, state_2, done = self.minibatches()

        actual_batch_size = len(state)

        target_action = self.actor_net.evaluate_target_actor(state)

        # Q'(s_i+1,a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)

        y = []
        for i in range(0, actual_batch_size):

            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] +
                         type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t

        y = np.reshape(np.array(y), [len(y), 1])

        # Update critic by minimizing the loss
        self.critic_net.train_critic(state, action, y)

        # Update actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(
            state)  # don't need wolp action

        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)[0]  # /BATCH_SIZE

        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)

        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
Example 7
class DDPG:
    
    """ Deep Deterministic Policy Gradient Algorithm"""
    def __init__(self,env, is_batch_norm):
        self.env = env 
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]
        
        
        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.num_states, self.num_actions) 
            self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
            
        else:
            self.critic_net = CriticNet(self.num_states, self.num_actions) 
            self.actor_net = ActorNet(self.num_states, self.num_actions)
        
        #Initialize replay buffer:
        self.replay_memory = deque()
        
        #Initialize time step:
        self.time_step = 0
        self.counter = 0
        
        action_max = np.array(env.action_space.high).tolist()
        action_min = np.array(env.action_space.low).tolist()        
        action_bounds = [action_max,action_min] 
        self.grad_inv = grad_inverter(action_bounds)
        
        
    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)
    
    def add_experience(self, observation_1, observation_2, action, reward, done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2, self.action, self.reward,self.done))
        self.time_step = self.time_step + 1
        if(len(self.replay_memory)>REPLAY_MEMORY_SIZE):
            self.replay_memory.popleft()
            
        
    def minibatches(self):
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        #state t
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        #state t+1        
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array( self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(self.action_batch,[len(self.action_batch),self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)  
                  
                 
    def train(self):
        #sample a random minibatch of N transitions from R
        self.minibatches()
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(self.state_t_1_batch)
        #Q'(s_i+1,a_i+1)        
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,self.action_t_1_batch) 
        self.y_i_batch=[]         
        for i in range(0,BATCH_SIZE):
                           
            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            else:
                
                self.y_i_batch.append(self.reward_batch[i] + GAMMA*q_t_1[i][0])                 
        
        self.y_i_batch=np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch,[len(self.y_i_batch),1])
        
        # Update critic by minimizing the loss
        self.critic_net.train_critic(self.state_t_batch, self.action_batch,self.y_i_batch)
        
        # Update actor proportional to the gradients:
        action_for_delQ = self.evaluate_actor(self.state_t_batch) 
        
        if is_grad_inverter:        
            self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch,action_for_delQ)#/BATCH_SIZE            
            self.del_Q_a = self.grad_inv.invert(self.del_Q_a,action_for_delQ) 
        else:
            self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch,action_for_delQ)[0]#/BATCH_SIZE
        
        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(self.state_t_batch,self.del_Q_a)
 
        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
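Every snippet above ends a training step with update_target_critic() and update_target_actor(). In DDPG these are typically soft ("Polyak") target updates; a framework-agnostic sketch of the rule, assuming a small mixing factor TAU, is:
TAU = 0.001  # assumed mixing factor

def soft_update(target_params, source_params, tau=TAU):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    return [tau * s + (1.0 - tau) * t
            for t, s in zip(target_params, source_params)]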