Code example #1
File: ddpg.py  Project: YoonHan/deep-learning-project
class DDPG:
    """
        Deep Deterministic Policy Gradient Algorithm.
        Sourced By: https://github.com/stevenpjg/ddpg-aigym/blob/master/ddpg.py
    """
    def __init__(self, num_states, num_actions, action_space_high,
                 action_space_low, is_batch_norm):

        self.num_states = num_states
        self.num_actions = num_actions
        self.action_space_high = action_space_high
        self.action_space_low = action_space_low

        # Batch normalisation disabled.
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        self.actor_net = ActorNet(self.num_states, self.num_actions)

        # Initialize the replay memory
        self.replay_memory = deque()

        # Initialize the time step
        self.time_step = 0
        self.counter = 0

        action_max = np.array(action_space_high).tolist()
        action_min = np.array(action_space_low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)

    # observation_1 = state at time t
    # observation_2 = state at time (t + 1)
    def add_experience(self, observation_1, observation_2, action, reward,
                       done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done

        self.replay_memory.append((self.observation_1, self.observation_2,
                                   self.action, self.reward, self.done))
        self.time_step = self.time_step + 1

        # When the replay memory is full, drop the oldest entry
        if (len(self.replay_memory) > REPLAY_MEMORY_SIZE):
            self.replay_memory.popleft()

    def minibatches(self):
        # Sample BATCH_SIZE transitions from the replay memory.
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        # Store the batches for S(t), S(t+1), action, reward, and done
        # separately
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array(self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(
            self.action_batch, [len(self.action_batch), self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)

    def train(self):
        print "######## Starting to train..."
        # batch 뽑기
        self.minibatches()
        # Use S(t+1) to build the action batch at time (t+1)
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(
            self.state_t_1_batch)
        # Q`(S(t + 1), a(t + 1))
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,
                                                       self.action_t_1_batch)
        print "#### Evaluated ciritic value(Q value)"
        print q_t_1
        self.y_i_batch = []  # holds the processed reward targets

        for i in range(0, BATCH_SIZE):

            # If done == True we have reached a terminal state,
            # so the reward alone serves as the target value
            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            # Otherwise it is not a terminal state, so add (gamma * Q value) to the reward
            else:
                self.y_i_batch.append(self.reward_batch[i] +
                                      GAMMA * q_t_1[i][0])

        self.y_i_batch = np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch, [len(self.y_i_batch), 1])

        # Update the critic network by minimizing the loss:
        # adjust the weights so that (y_i_batch - the y value predicted from (state_t_batch, action_batch)) is minimized
        self.critic_net.train_critic(self.state_t_batch, self.action_batch,
                                     self.y_i_batch)

        # Update the actor according to the gradients
        action_for_delQ = self.evaluate_actor(self.state_t_batch)

        if is_grad_inverter:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)
            self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ)
        else:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)[0]

        # Train the actor network
        self.actor_net.train_actor(self.state_t_batch, self.del_Q_a)

        # Update the target critic and target actor networks
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
        self.critic_net.save_critic("model/critic_model.ckpt")
        self.actor_net.save_actor("model/actor_model.ckpt")
        print "######## Finish to train ..."
Code example #2
class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''

    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self, env, is_batch_norm=False, is_grad_inverter=True):
        super().__init__(env)
        assert isinstance(env.action_space, Box), "action space must be continuous"
        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size,
                                           self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size,
                                         self.action_space_size)

        else:
            self.critic_net = CriticNet(self.observation_space_size,
                                        self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size,
                                      self.action_space_size)

        self.is_grad_inverter = is_grad_inverter
        self.replay_memory = deque()

        self.time_step = 0

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    def add_data_fetch(self, df):
        self.data_fetch = df
        self.data_fetch.add_timers(['ev_p_t', 'ev_q_t', 'y',
                                    'train_q', 'train_p',
                                    'up_q_t', 'up_p_t'],
                                   prefix='t_agent_training_')
        self.data_fetch.add_array('actors_result')

    def get_name(self):
        return 'DDPG' + super().get_name()

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)
        self.data_fetch.add_to_array('actors_result', result)
        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)
        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)

        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]), True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]), False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]), True)
        # done
        done = np.array([item['done'] for item in batch])

        return state, action, reward, state_2, done

    def train(self):
        # sample a random minibatch of N transitions from R
        state, action, reward, state_2, done = self.minibatches()

        actual_batch_size = len(state)

        self.data_fetch.reset_timers()
        target_action = self.actor_net.evaluate_target_actor(state)
        self.data_fetch.sample_timer('ev_p_t')  # ------

        # Q'(s_i+1,a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)
        self.data_fetch.sample_timer('ev_q_t')  # ------

        y = []
        for i in range(0, actual_batch_size):

            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] + type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t

        y = np.reshape(np.array(y), [len(y), 1])
        self.data_fetch.sample_timer('y')  # ------

        # Update critic by minimizing the loss
        self.critic_net.train_critic(state, action, y)
        self.data_fetch.sample_timer('train_q')  # ------
        # Update actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(state)  # don't need the wolp action

        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)[0]  # /BATCH_SIZE

        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)
        self.data_fetch.sample_timer('train_p')  # ------

        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.data_fetch.sample_timer('up_q_t')  # ------
        self.actor_net.update_target_actor()
        self.data_fetch.sample_timer('up_p_t')  # ------
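
Every example constructs grad_inv = grad_inverter(action_bounds) with the bounds laid out as [maxima, minima] and calls invert(gradient, action) before training the actor. The actual class lives in a separate module of these projects and is TensorFlow based; the NumPy sketch below only illustrates the "inverting gradients" rule (Hausknecht and Stone, 2016) that such a class typically implements.

# Minimal NumPy sketch of the inverting-gradients rule assumed behind grad_inverter;
# the real class in these repositories may differ in detail.
import numpy as np

class SimpleGradInverter:
    def __init__(self, action_bounds):
        # action_bounds = [list_of_maxima, list_of_minima], as built in the constructors above
        self.p_max = np.asarray(action_bounds[0], dtype=float)
        self.p_min = np.asarray(action_bounds[1], dtype=float)
        self.p_range = self.p_max - self.p_min

    def invert(self, grad, action):
        grad = np.asarray(grad, dtype=float)
        action = np.asarray(action, dtype=float)
        # Gradients pushing an action up are scaled by the headroom to the upper bound,
        # gradients pushing it down by the headroom to the lower bound.
        up = (self.p_max - action) / self.p_range
        down = (action - self.p_min) / self.p_range
        return np.where(grad > 0, grad * up, grad * down)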
Code example #3
File: agent.py  Project: yejunhong1/DRAG
class DDPG:

    """ Deep Deterministic Policy Gradient Algorithm

    hisar_size: size of the history ar vector/tensor

    action_size: size of the action vector/tensor

    TAU: update rate of target network parameters

    is_batch_norm: if apply batch norm

    write_sum: key/interval for writing summary data to file
    """
    def __init__( self, hisar_size, ar_size, action_size, TAU = 0.001, is_batch_norm = 0, write_sum = 0, net_size_scale=1, max_load=1, beta0=beta):
        self.hisar_size  = hisar_size
        self.load_size   = action_size + 1
        self.ar_size     = ar_size
        self.state_size  = action_size * 2
        self.action_size = action_size
        self.ar_action_size = ar_size + action_size

        #print("net_size_scale: "+str(net_size_scale))
        if is_batch_norm:
            if len(CN_N_HIDDENS)==2:
                self.critic_net   = CriticNet_bn(  self.state_size, self.action_size, TAU, write_sum, net_size_scale  )
            else:
                self.critic_net   = CriticNet_bn_3(  self.state_size, self.action_size, TAU, write_sum, net_size_scale  )
            self.actor_net    = ActorNet_bn(   self.state_size, self.action_size, TAU, write_sum, net_size_scale  )
            self.ar_pred_net  = ARPredNet_bn(  self.hisar_size, self.ar_size,     write_sum, net_size_scale )           # arrival rate prediction network
            self.load_map_net = LoadMapNet_bn( self.ar_size,    self.action_size, self.load_size, write_sum, net_size_scale )           # load mapping network
        else:
            self.critic_net   = CriticNet(  self.state_size, self.action_size, TAU, write_sum, net_size_scale )
            self.actor_net    = ActorNet(   self.state_size, self.action_size, TAU, write_sum, net_size_scale )
            self.ar_pred_net  = ARPredNet(  self.hisar_size, self.ar_size,     write_sum, net_size_scale )           # arrival rate prediction network
            self.load_map_net = LoadMapNet( self.ar_size,    self.action_size, self.load_size, write_sum, net_size_scale )           # load mapping network

        self.env = ENV( action_size, max_load=max_load, beta0=beta0 )

        #self.k_nearest_neighbors = int(max_actions * k_ratio )
        #Initialize Network Buffers:
        self.replay_memory_ac  = deque()
        self.replay_memory_arp = deque()
        self.replay_memory_lm  = deque()

        #Initialize time step:
        self.time_step = 0
        self.counter   = 0
        
        action_max    = np.ones(  ( self.action_size ) ).tolist()
        action_min    = np.zeros( ( self.action_size ) ).tolist()
        action_bounds = [action_max, action_min] 
        self.grad_inv = grad_inverter( action_bounds )
        
    def construct_state( self, pred_ar, pre_action=[] ):
        """Construct a state with the predicted ar and previous action
        """
        num_sbs    = np.max( pred_ar.shape )
        pred_ar    = np.reshape( np.array( pred_ar  ),   (1, num_sbs) )
        pre_action = np.reshape( np.array( pre_action ), (1, num_sbs) )
        state      = np.concatenate( (pred_ar, pre_action), axis=1 )
        return state.tolist()

    def evaluate_actor( self, state_t ):
        """Evaluate the actor network to get an action
        """
        p_action = self.actor_net.evaluate_actor( state_t )
        return p_action
    
    def add_experience_ac( self, state, next_state, action, reward ):
        """Add data sample of the Actor-Critic network
        """
        self.state      = state
        self.next_state = next_state
        self.action     = action
        self.reward     = reward
        #if reward>0:
        self.replay_memory_ac.append( (self.state, self.next_state, self.action, self.reward) )
        
        self.time_step = self.time_step + 1
        if( len(self.replay_memory_ac) > AC_REPLAY_MEMORY_SIZE ):
            self.replay_memory_ac.popleft()
            
  
    def add_experience_arp( self, his_ar, pred_ar ):
        """Add data sample of the arrival rate prediction network
        """
        self.replay_memory_arp.append( (his_ar, pred_ar) )
        if( len(self.replay_memory_arp) > ARP_REPLAY_MEMORY_SIZE ):
            self.replay_memory_arp.popleft()

    def add_experience_lm( self, ar_action, mapped_load ):
        """Add data sample of the load mapping network
        """
        self.replay_memory_lm.append( (ar_action, mapped_load) )
        if( len(self.replay_memory_lm) > LM_REPLAY_MEMORY_SIZE ):
            self.replay_memory_lm.popleft()

    def refine_action(self, state, action_in, imp = 0):
        """ round up the action to [0,1], then if imp>0,
        get the p_action's nearest neighbors, return the one with the max metric value, 
        imp==1, metric = Q value; 
        imp==2, metric = Q value + reward; 
        imp==3, metric = reward.
        """
        action0 = np.round( action_in )
        action  = np.clip(  action0, 0, 1 )
        #print("in refine action: "+str(action))
        if imp>0:
            action  = self.improve_action(state, action, imp)
        return action

    def improve_action(self, state, p_action, greedy=1):
        """ get the p_action's nearest neighbors, return the one with the max metric value
        greedy==1, metric = Q value
        greedy==2, metric = Q value + reward
        greedy==3, metric = reward
        """
        state0   = state[0]
        ac_size  = np.max(p_action.shape)
        ar_size  = len(state0)-ac_size
        p_action = np.array(p_action)

        # if the action would cause outage, greedily modify the action
        pred_ar     = state0[0:ar_size]      # predicted ar 
        pred_ar     = np.reshape( pred_ar,    [1, ar_size] )
        prev_action = state0[ac_size: ]
        #print("p_action: "+str(p_action))
        reward = -1
        while reward < 0:
            map_load    = self.load_map_net.evaluate_load_map( pred_ar, [p_action] )
            map_load[0][-1] += 0.05                      # conservatively estimate the load of the mbs
            reward, _, _, _, _ = self.env.find_reward( map_load[0], p_action, prev_action )
            #print("est reward: "+str(reward))
            if reward < 0:
                t_ar = [a*(1-b) for a,b in zip(pred_ar[0], p_action)]
                if max(t_ar)==0:
                    #print("---------------------tried best, still negative reward, break...")
                    break
                max_index  = np.argmax( t_ar )
                p_action[max_index] = 1
                #print("---------------------negative reward, change the "+str(max_index)+" action to 1")

        # find the nearest neighbors
        actions = np.zeros( (ac_size+1, ac_size) )
        for i in range(0, ac_size):
            t_action    = copy.deepcopy(p_action)
            t_action[i] = 1-t_action[i]
            actions[ i] = t_action
        actions[ac_size] = copy.deepcopy(p_action)
        
        metrics = np.zeros( ( ac_size+1 ) )
        if greedy <=2:
            # make all the (state, action) pairs for the critic
            states   = np.tile(state, [len(actions), 1])
            # evaluate each pair through the critic
            q_values = self.critic_net.evaluate_critic(states, actions)
            # find the index of the pair with the maximum value
            metrics += np.reshape( q_values, ( ac_size+1 ) )
            #print("q values: "+str(metrics))
        if greedy >=2:

            rewards = np.zeros( ( ac_size+1 ) )
            for i in range(0,ac_size+1):
                taction  = np.reshape( actions[i], [1, ac_size] )
                map_load = self.load_map_net.evaluate_load_map( pred_ar, taction )
                map_load[0][-1] += 0.02                      # conservatively estimate the load of the mbs
                rewards[i], _, _, _, _ = self.env.find_reward( map_load[0], actions[i], prev_action )
            metrics = rewards + GAMMA*metrics

        max_index  = np.argmax( metrics )   # 
        action_out = actions[max_index]
        #if max_index != ac_size:
        #    print("Improve "+str(p_action)+" to "+str(action_out))
        # return the best action
        return action_out




    
    def get_minibatch_ac(self):
        """Get mini batch for training of actor-critic network
        """
        batch = random.sample( self.replay_memory_ac, AC_BATCH_SIZE )
        #state t
        self.batch_states = [item[0] for item in batch]
        self.batch_states = np.reshape( np.array(self.batch_states), (AC_BATCH_SIZE, self.state_size ) )
        #state t+1        
        self.batch_next_states = [item[1] for item in batch]
        self.batch_next_states = np.reshape( np.array( self.batch_next_states), (AC_BATCH_SIZE, self.state_size ) )
        
        self.batch_actions = [item[2] for item in batch]
        self.batch_actions = np.reshape( np.array( self.batch_actions), [len(self.batch_actions), self.action_size] )
        
        self.batch_rewards = [item[3] for item in batch]
        self.batch_rewards = np.array( self.batch_rewards )


    
    def get_minibatch_arp(self):
        """Get mini batch for training of arrival rate prediction network
        """
        batch = random.sample( self.replay_memory_arp, ARP_BATCH_SIZE )
        #history ars
        self.his_ars = [item[0] for item in batch]
        #print("his_ars: "+str(np.array(self.his_ars)))
        self.his_ars = np.reshape( np.array(self.his_ars), (ARP_BATCH_SIZE, self.hisar_size) )
        #next arrival rates
        self.next_ars = [item[1] for item in batch]
        self.next_ars = np.reshape( np.array( self.next_ars), (ARP_BATCH_SIZE, self.ar_size ) )
    
    def get_minibatch_lm(self):
        """Get mini batch for training of load mapping network
        """
        batch = random.sample( self.replay_memory_lm, LM_BATCH_SIZE )
        #ar + action inputs
        self.ar_action = [item[0] for item in batch]
        self.ar_action = np.reshape( np.array(self.ar_action), (LM_BATCH_SIZE, self.ar_action_size) )
        #mapped loads
        self.mapped_load = [item[1] for item in batch]
        self.mapped_load = np.reshape( np.array( self.mapped_load), (LM_BATCH_SIZE, self.load_size ) )
    
    
    def train_ac( self, learning_rate=[0.0001, 0.0001], update_target=0):
        """Train actor-critic network with a minibatch from the replay memory
        """
        cerror = 1
        aerror = 1
        if( len( self.replay_memory_ac ) > AC_BATCH_SIZE ):
            # Sample a random minibatch of N transitions from R
            self.get_minibatch_ac()
            self.batch_next_taction = self.actor_net.evaluate_target_actor( self.batch_next_states )

            # Q'(s_i+1,a_i+1)        
            batch_next_tQ = self.critic_net.evaluate_target_critic( self.batch_next_states, self.batch_next_taction ) 
            
            # r + gamma*Q'(s_i+1,a_i+1)     
            self.batch_next_obj_Q = []
            for i in range(0,AC_BATCH_SIZE):
                self.batch_next_obj_Q.append( self.batch_rewards[i] + GAMMA*batch_next_tQ[i][0] )        
        
            self.batch_next_obj_Q = np.array(   self.batch_next_obj_Q )
            self.batch_next_obj_Q = np.reshape( self.batch_next_obj_Q, [len(self.batch_next_obj_Q),1] )
            #print("tQ: "+str(np.reshape( batch_next_tQ, [1, len(batch_next_tQ)]) ) )
            # Update critic by minimizing the loss
            cerror = self.critic_net.train_critic(self.batch_states, self.batch_actions, self.batch_next_obj_Q, learning_rate[1])

            # Find gradients from the Q values (critic network) to the actions
            action_for_grad_Q2a = self.evaluate_actor(self.batch_states)
            if is_grad_inverter:
                self.grad_Q2a = self.critic_net.compute_grad_Q2a( self.batch_states, action_for_grad_Q2a )#/AC_BATCH_SIZE            
                self.grad_Q2a = self.grad_inv.invert( self.grad_Q2a, action_for_grad_Q2a )
            else:
                self.grad_Q2a = self.critic_net.compute_grad_Q2a( self.batch_states, action_for_grad_Q2a )[0]#/AC_BATCH_SIZE

            # Train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
            aerror = self.actor_net.train_actor(self.batch_states, self.batch_actions, self.grad_Q2a, learning_rate[0])
            #print("aerror: "+str(aerror))
            if update_target == 1:
                self.update_target_net()
        return cerror, aerror

    def update_target_net( self ):
        # Update target Critic and Actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()

    def train_arp( self, learning_rate=0.0001):
        """Train the arrival rate prediction network with a minibatch from the replay memory
        """
        lrm = len( self.replay_memory_arp )
        arperror = 1
        if( lrm >= ARP_BATCH_SIZE ):
            #print('in train_arp, lrm: '+str(lrm))
            # Sample a random minibatch of N transitions from R
            self.get_minibatch_arp()
            # Train ar prediction network

            arperror = self.ar_pred_net.train_ar_pred(self.his_ars, self.next_ars, learning_rate)
            #print('in train_arp, arperror: '+str(arperror))
        return arperror

    def train_lm( self, learning_rate=0.0001):
        """Train the load mapping network with a minibatch from the replay memory
        """
        lrm = len( self.replay_memory_lm )
        lmerror = 1
        if( lrm >= LM_BATCH_SIZE ):
            # Sample a random minibatch of N transitions from R
            self.get_minibatch_lm()
            # Train load mapping network
            lmerror = self.load_map_net.train_load_map(self.ar_action, self.mapped_load, learning_rate)
        return lmerror

    def find_action_neigh( self, state, p_action ):
        # get the proto_action's k nearest neighbors
        actions = self.action_space.search_point(p_action, self.k_nearest_neighbors)[0]
        # make all the state, action pairs for the critic
        states = np.tile(state, [len(actions), 1])
        # evaluate each pair through the critic
        actions_evaluation = self.critic_net.evaluate_critic(states, actions)
        # find the index of the pair with the maximum value
        max_index = np.argmax(actions_evaluation)
        # return the best action
        return actions[max_index]


    def close_all(self):
        self.actor_net.close_all()
        self.critic_net.close_all()
        self.ar_pred_net.close_all()
        self.load_map_net.close_all()

    
    def decay(self, i, minr, maxr, istep=1, estep=5000, method=1):
        """
        method=1: log
        method=2: linear
        method=3: inverse
        """
        if method == 1:     #log
            shift = (estep)**(-minr/maxr)
            scale = maxr*math.log(istep+shift)
            a = scale/(math.log(i +shift))
        if method == 2:     #linear
            scale = (maxr-minr)/(istep-estep)
            shift = maxr-scale*istep
            a = scale*i + shift
        if method == 3:      #inverse
            shift = (estep*minr-istep*maxr)/(maxr-minr)
            scale = maxr*(istep+shift)
            a = scale/(i+shift)
        return max(0, a)
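
The TAU argument documented above controls how fast the target networks track the online networks; update_target_critic() and update_target_actor() are assumed to apply the usual soft update inside the network classes. A sketch of that rule on plain NumPy weight lists:

# Sketch of the soft target-network update governed by TAU (assumed form of the
# update_target_* methods; the real code applies it to TensorFlow variables).
import numpy as np

def soft_update(target_weights, online_weights, tau=0.001):
    # theta_target <- tau * theta_online + (1 - tau) * theta_target, layer by layer
    return [tau * w + (1.0 - tau) * tw for w, tw in zip(online_weights, target_weights)]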
Code example #4
File: ddpg.py  Project: zxqzhang/ddpg-aigym
class DDPG:
    """ Deep Deterministic Policy Gradient Algorithm"""
    def __init__(self,env):
        self.env = env 
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]
        
        #Initialize actor and critic networks:
        action_bound = env.action_space.high
        self.critic_net = CriticNet(self.num_states, self.num_actions)  # the critic network object
        self.actor_net = ActorNet(self.num_states, self.num_actions, action_bound)
        
        #Initialize replay buffer:
        self.replay_memory = deque()
        
        #Initialize time step:
        self.time_step = 0
        
        #invert gradients (soft thresholding)
        action_bounds = [[3], [-3]]  #upper bound and lower bound of the action space
        #action_bounds structure for higher-dimensional actions:
        #[[max_of_action_dim_0, max_of_action_dim_1, ..., max_of_action_dim_10],
        # [min_of_action_dim_0, min_of_action_dim_1, ..., min_of_action_dim_10]]
        
        
        self.grad_inv = grad_inverter(action_bounds)
        
        
    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)
    
    def add_experience(self, observation_1, observation_2, action, reward, done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2, self.action, self.reward,self.done))
        self.time_step = self.time_step + 1
        if(len(self.replay_memory)>REPLAY_MEMORY_SIZE):
            self.replay_memory.popleft()
            
        
    def minibatches(self):
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        #state t
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        #state t+1        
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array( self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(self.action_batch,[len(self.action_batch),self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)  
                  
                 
    def train(self):
        #sample a random minibatch of N transitions from R
        self.minibatches()
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(self.state_t_1_batch)
        #Q'(s_i+1,a_i+1)        
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,self.action_t_1_batch) 
        
        
        self.y_i_batch=[]         
        for i in range(0,BATCH_SIZE):
                           
            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            else:
                
                self.y_i_batch.append(self.reward_batch[i] + GAMMA*q_t_1[i][0])                 
        
        self.y_i_batch=np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch,[len(self.y_i_batch),1])
        # Update critic by minimizing the loss
        self.critic_net.train_critic(self.state_t_batch, self.action_batch,self.y_i_batch)
        
        # Update actor proportional to the gradients:
        
        #actions for computing delQ/dela
        action_for_delQ = self.evaluate_actor(self.state_t_batch)  #consider whether to use this action or action_t_batch itself
        self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch,action_for_delQ)#/BATCH_SIZE
        self.del_Q_a = self.grad_inv.invert(self.del_Q_a,action_for_delQ)
        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
               
        
        self.actor_net.train_actor(self.state_t_batch,self.del_Q_a)
 
        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
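
The per-sample loop that builds y_i_batch in train() can also be written as one vectorized NumPy expression; the sketch below is equivalent under the shapes used above (reward_batch and done_batch of shape (BATCH_SIZE,), q_t_1 of shape (BATCH_SIZE, 1)) and is not part of the original file.

# Vectorized form of the y_i_batch loop above (a sketch, not from the original project).
import numpy as np

def td_targets(reward_batch, done_batch, q_t_1, gamma):
    # y_i = r_i                          if the transition is terminal
    # y_i = r_i + gamma * Q'(s_{i+1})    otherwise
    not_done = 1.0 - done_batch.astype(float)
    y = reward_batch + gamma * not_done * q_t_1[:, 0]
    return y.reshape(-1, 1)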
Code example #5
File: ddpg.py  Project: w1368027790/DeepRobotics
class DDPG:
    """ Deep Deterministic Policy Gradient Algorithm"""
    def __init__(self, env, is_batch_norm):
        self.env = env
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.num_states, self.num_actions)
            self.actor_net = ActorNet_bn(self.num_states, self.num_actions)

        else:
            self.critic_net = CriticNet(self.num_states, self.num_actions)
            self.actor_net = ActorNet(self.num_states, self.num_actions)

        #Initialize replay buffer:
        self.replay_memory = deque()

        #Initialize time step:
        self.time_step = 0
        self.counter = 0

        action_max = np.array(env.action_space.high).tolist()
        action_min = np.array(env.action_space.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)

    def add_experience(self, observation_1, observation_2, action, reward,
                       done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2,
                                   self.action, self.reward, self.done))
        self.time_step = self.time_step + 1
        if (len(self.replay_memory) > REPLAY_MEMORY_SIZE):
            self.replay_memory.popleft()

    def minibatches(self):
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        #state t
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        #state t+1
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array(self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(
            self.action_batch, [len(self.action_batch), self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)

    def train(self):
        #sample a random minibatch of N transitions from R
        self.minibatches()
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(
            self.state_t_1_batch)
        #Q'(s_i+1,a_i+1)
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,
                                                       self.action_t_1_batch)
        self.y_i_batch = []
        for i in range(0, BATCH_SIZE):

            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            else:

                self.y_i_batch.append(self.reward_batch[i] +
                                      GAMMA * q_t_1[i][0])

        self.y_i_batch = np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch, [len(self.y_i_batch), 1])

        # Update critic by minimizing the loss
        self.critic_net.train_critic(self.state_t_batch, self.action_batch,
                                     self.y_i_batch)

        # Update actor proportional to the gradients:
        action_for_delQ = self.evaluate_actor(self.state_t_batch)

        if is_grad_inverter:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)  #/BATCH_SIZE
            self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ)
        else:
            self.del_Q_a = self.critic_net.compute_delQ_a(
                self.state_t_batch, action_for_delQ)[0]  #/BATCH_SIZE

        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(self.state_t_batch, self.del_Q_a)

        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
Code example #6
class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''

    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self,
                 env,
                 dir,
                 is_batch_norm=False,
                 is_grad_inverter=True,
                 training_flag=True):
        super().__init__(env, dir)
        assert isinstance(env.action_space,
                          Box), "action space must be continuous"

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size,
                                           self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size,
                                         self.action_space_size)

        else:
            self.critic_net = CriticNet(self.observation_space_size,
                                        self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size,
                                      self.action_space_size)

        self.is_grad_inverter = is_grad_inverter
        self.training_flag = training_flag
        self.replay_memory = deque()

        self.time_step = 0

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

        self.data_fetch = None

    def add_data_fetch(self, df):
        self.data_fetch = df

    def get_short_name(self):
        return 'DDPG'

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)

        if self.data_fetch:
            self.data_fetch.set_actors_action(result[0].tolist())

        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)

        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)

        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]),
                                 True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]),
                                  False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]),
                                   True)
        # done
        done = np.array([item['done'] for item in batch])

        return state, action, reward, state_2, done

    def train(self):
        if not self.training_flag:
            return

        # sample a random minibatch of N transitions from R
        state, action, reward, state_2, done = self.minibatches()

        actual_batch_size = len(state)

        target_action = self.actor_net.evaluate_target_actor(state)

        # Q'(s_i+1,a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)

        y = []
        for i in range(0, actual_batch_size):

            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] +
                         type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t

        y = np.reshape(np.array(y), [len(y), 1])

        # Update critic by minimizing the loss
        self.critic_net.train_critic(state, action, y)

        # Update actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(
            state)  # don't need the wolp action

        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)[0]  # /BATCH_SIZE

        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)

        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()

    def save_agent(self, force=False, comment="default"):
        path = "{}/weights/{}".format(self.get_dir(), comment)
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
            print("Saving agent in", path)
            self.actor_net.save_model(path + '/actor.ckpt')
            self.critic_net.save_model(path + '/critic.ckpt')
        else:
            if force:
                print("Overwrite old agent in", path)
                self.actor_net.save_model(path + '/actor.ckpt')
                self.critic_net.save_model(path + '/critic.ckpt')
            else:
                print("Save aborted. An agent is already saved in ", path)

    def load_agent(self, agent_name=None, comment="default"):
        if agent_name is None:
            path = "{}/weights/{}".format(self.get_dir(), comment)
        else:
            path = "{}/{}/{}/weights/{}".format(self.result_dir, agent_name,
                                                self.env.spec.id, comment)
        if os.path.exists(path):
            print("Loading agent saved in", path)
            self.actor_net.load_model(path + '/actor.ckpt')
            self.critic_net.load_model(path + '/critic.ckpt')
        else:
            print("Agent not found in", path)

    def close_session(self):
        self.actor_net.close()
        self.critic_net.close()
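
The DDPGAgent variants store whole transition dictionaries in the replay memory; judging from the keys read back in minibatches(), each dictionary needs 'obs', 'action', 'reward', 'obs2', and 'done'. The rollout below is only a usage sketch: 'env' stands for a previously created Gym environment with a Box action space, and the results directory is illustrative.

# Hypothetical rollout showing the transition dict expected by observe() above.
import numpy as np

agent = DDPGAgent(env, dir='./results')  # the example #6 constructor also takes a dir argument
obs = env.reset()
done = False
while not done:
    action = agent.act(np.reshape(obs, (1, -1)))
    obs2, reward, done, _ = env.step(action[0])
    # observe() reshapes the arrays, stores the dict, and trains once the
    # buffer holds more than BATCH_SIZE transitions
    agent.observe({'obs': obs, 'action': action,
                   'reward': reward, 'obs2': obs2, 'done': done})
    obs = obs2
agent.save_agent(comment='default')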
Code example #7
File: agent.py  Project: ekrimsk/CS234_Final_Project
class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''

    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self, env, is_batch_norm=False, is_grad_inverter=True):
        super().__init__(env)

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size,
                                           self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size,
                                         self.action_space_size)

        else:
            self.critic_net = CriticNet(self.observation_space_size,
                                        self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size,
                                      self.action_space_size)

        self.is_grad_inverter = is_grad_inverter
        self.replay_memory = deque()

        self.time_step = 0

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    # EREZ ADDED
    def save(self, path):

        # TODO -- robust handling of where to put things

        # Everything from the super class can be pickled easily
        attrs = Container()
        saved_critic = self.critic_net.save(path)
        saved_actor = self.actor_net.save(path)

        i_vars = vars(self)
        keys = i_vars.keys()
        for key in keys:
            tmp = getattr(self, key)

            if not (isinstance(tmp, (CriticNet, ActorNet, grad_inverter))):
                setattr(attrs, key, tmp)

        file = os.path.join(
            path, "agent_data.pkl")  # TODO -- come up with a not stupid name
        with open(file, "wb") as f:
            pickle.dump(attrs, f, pickle.HIGHEST_PROTOCOL)

    # EREZ ADDED
    # def restore(self, file): -- right now it's just in the subclass
    def restore(self, path):
        print("restoring the agent")

        file = os.path.join(
            path, "agent_data.pkl")  # TODO -- come up with a not stupid name

        with open(file, "rb") as f:
            dump = pickle.load(f)

            i_vars = vars(dump)
            keys = i_vars.keys()
            for key in keys:
                tmp = getattr(dump, key)
                setattr(self, key, tmp)

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

        # Now replace the networks
        # IGNORE THE "IS BATCH " CONDITION FOR NOW
        saved_critic_net = CriticNet(self.observation_space_size,
                                     self.action_space_size)
        saved_actor_net = ActorNet(self.observation_space_size,
                                   self.action_space_size)

        # Load in the saved graphs
        critic_file = os.path.join(path, "critic_net.ckpt")
        saved_critic_net.restore(critic_file)
        actor_file = os.path.join(path, "actor_net.ckpt")
        saved_actor_net.restore(actor_file)

        self.critic_net = saved_critic_net
        self.actor_net = saved_actor_net

    def add_data_fetch(self, df):
        self.data_fetch = df

    def get_name(self):
        return 'DDPG' + super().get_name()

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)
        self.data_fetch.set_actors_action(result[0].tolist())
        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)
        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)

        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]),
                                 True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]),
                                  False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]),
                                   True)
        # done
        done = np.array([item['done'] for item in batch])

        return state, action, reward, state_2, done

    def train(self):
        # sample a random minibatch of N transitions from R
        state, action, reward, state_2, done = self.minibatches()

        actual_batch_size = len(state)

        target_action = self.actor_net.evaluate_target_actor(state)

        # Q'(s_i+1,a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)

        y = []
        for i in range(0, actual_batch_size):

            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] +
                         type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t

        y = np.reshape(np.array(y), [len(y), 1])

        # Update critic by minimizing the loss
        self.critic_net.train_critic(state, action, y)

        # Update actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(
            state)  # don't need the wolp action

        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(
                state, action_for_delQ)[0]  # /BATCH_SIZE

        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)

        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
コード例 #8
0
ファイル: ddpg.py プロジェクト: stevenpjg/ddpg-aigym
class DDPG:
    
    """ Deep Deterministic Policy Gradient Algorithm"""
    def __init__(self,env, is_batch_norm):
        self.env = env 
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]
        
        
        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.num_states, self.num_actions) 
            self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
            
        else:
            self.critic_net = CriticNet(self.num_states, self.num_actions) 
            self.actor_net = ActorNet(self.num_states, self.num_actions)
        
        #Initialize replay buffer:
        self.replay_memory = deque()
        
        #Initialize time step:
        self.time_step = 0
        self.counter = 0
        
        action_max = np.array(env.action_space.high).tolist()
        action_min = np.array(env.action_space.low).tolist()        
        action_bounds = [action_max,action_min] 
        self.grad_inv = grad_inverter(action_bounds)
        
        
    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)
    
    def add_experience(self, observation_1, observation_2, action, reward, done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2, self.action, self.reward,self.done))
        self.time_step = self.time_step + 1
        if(len(self.replay_memory)>REPLAY_MEMORY_SIZE):
            self.replay_memory.popleft()
            
        
    def minibatches(self):
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        #state t
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        #state t+1        
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array( self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(self.action_batch,[len(self.action_batch),self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)  
                  
                 
    def train(self):
        #sample a random minibatch of N transitions from R
        self.minibatches()
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(self.state_t_1_batch)
        #Q'(s_i+1,a_i+1)        
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,self.action_t_1_batch) 
        self.y_i_batch=[]         
        for i in range(0,BATCH_SIZE):
                           
            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            else:
                
                self.y_i_batch.append(self.reward_batch[i] + GAMMA*q_t_1[i][0])                 
        
        self.y_i_batch=np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch,[len(self.y_i_batch),1])
        
        # Update critic by minimizing the loss
        self.critic_net.train_critic(self.state_t_batch, self.action_batch,self.y_i_batch)
        
        # Update actor proportional to the gradients:
        action_for_delQ = self.evaluate_actor(self.state_t_batch) 
        
        if is_grad_inverter:        
            self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch,action_for_delQ)#/BATCH_SIZE            
            self.del_Q_a = self.grad_inv.invert(self.del_Q_a,action_for_delQ) 
        else:
            self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch,action_for_delQ)[0]#/BATCH_SIZE
        
        # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(self.state_t_batch,self.del_Q_a)
 
        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
Code example #9
class RDPG:
    """Recurrent Policy Gradient Algorithm"""
    def __init__(self, env, N_STATES, N_ACTIONS, STEPS, BATCH_SIZE):
        self.N_STATES = N_STATES
        self.N_ACTIONS = N_ACTIONS
        self.STEPS = STEPS
        self.BATCH_SIZE = BATCH_SIZE  #mini batch size
        self.critic_net = CriticNet(self.N_STATES, self.N_ACTIONS, self.STEPS,
                                    self.BATCH_SIZE)
        self.actor_net = ActorNet(self.N_STATES, self.N_ACTIONS, self.STEPS,
                                  self.BATCH_SIZE)
        self.R = []

    def evaluate_actor(self, state, t):
        #converting state to a 3D tensor to feed into lstms
        if t == 0:
            self.state_matrix = np.zeros(
                [self.BATCH_SIZE, self.STEPS, self.N_STATES])
            self.state_matrix[0, t, :] = state
        else:
            self.state_matrix[0, t, :] = state
#        print self.state_matrix
#        raw_input('Enter to continue')
        return self.actor_net.evaluate_actor(self.state_matrix)

    def add_to_replay(self, h_t, i):
        ##STORE THE SEQUENCE (o_1,a_1,r_1,....,o_t,a_t,r_t) in R
        self.h_t = h_t
        self.R.append(h_t)
        if (len(self.R) > BUFFER_SIZE):
            self.R.pop(0)

    def sample_mini_batches(self):
        self.indices = np.random.randint(1,
                                         len(self.R),
                                         size=(1, self.BATCH_SIZE))
        self.R_mini_batch = [None] * self.BATCH_SIZE
        for i in range(0, len(self.indices[0, :])):
            self.R_mini_batch[i] = self.R[self.indices[0][i]]

        #reward_t (batchsize x timestep)
        self.r_n_tl = [None] * self.BATCH_SIZE
        for i in range(0, len(self.r_n_tl)):
            self.r_n_tl[i] = self.R_mini_batch[i][:, -1]

        self.r_n_t = np.zeros([self.BATCH_SIZE, self.STEPS])

        for i in range(0, self.BATCH_SIZE):
            self.r_n_t[i, 0:len(self.r_n_tl[i])] = self.r_n_tl[i]

        #observation list (batchsize x timestep)
        self.o_n_tl = [None] * self.BATCH_SIZE
        for i in range(0, len(self.o_n_tl)):
            self.o_n_tl[i] = self.R_mini_batch[i][:, 0:self.N_STATES]

        self.o_n_t = np.zeros([self.BATCH_SIZE, self.STEPS, self.N_STATES])
        for i in range(0, self.BATCH_SIZE):
            self.o_n_t[i, 0:len(self.o_n_tl[i]), :] = self.o_n_tl[i]

        #action list (batchsize x timestep)
        self.a_n_tl = [None] * self.BATCH_SIZE
        for i in range(0, len(self.a_n_tl)):
            self.a_n_tl[i] = self.R_mini_batch[i][:,
                                                  self.N_STATES:self.N_STATES +
                                                  self.N_ACTIONS]

        self.a_n_t = np.zeros([self.BATCH_SIZE, self.STEPS, self.N_ACTIONS])
        for i in range(0, self.BATCH_SIZE):
            self.a_n_t[i, 0:len(self.a_n_tl[i]), :] = self.a_n_tl[i]

    def train(self):
        self.sample_mini_batches()
        #Action at h_t+1:
        self.t_a_ht1 = self.actor_net.evaluate_target_actor(self.o_n_t)
        #State Action value at h_t+1:

        self.t_qht1 = self.critic_net.evaluate_target_critic(
            self.o_n_t, self.t_a_ht1)
        self.check = self.t_qht1

        ##COMPUTE TARGET VALUES FOR EACH SAMPLE EPISODE (y_1,y_2,....y_t) USING THE RECURRENT TARGET NETWORKS
        self.y_n_t = []
        self.r_n_t = np.reshape(self.r_n_t, [self.BATCH_SIZE, self.STEPS, 1])

        for i in range(0, self.STEPS):
            if (i == self.STEPS - 1):
                self.y_n_t.append(self.r_n_t[:, i])
            else:
                self.y_n_t.append(self.r_n_t[:, i, :] +
                                  GAMMA * self.t_qht1[:, i + 1, :])
        self.y_n_t = np.vstack(self.y_n_t)
        self.y_n_t = self.y_n_t.T  #(batchsize x timestep)
        self.y_n_t = np.reshape(self.y_n_t, [
            self.BATCH_SIZE, self.STEPS, 1
        ])  #reshape y_n_t to have shape (batchsize,timestep,no.dimensions)
        ##COMPUTE CRITIC UPDATE (USING BPTT)
        self.critic_net.train_critic(self.o_n_t, self.a_n_t, self.y_n_t)

        #action for computing critic gradient
        self.a_ht = self.actor_net.evaluate_actor_batch(
            self.o_n_t)  #returns output as 3d array
        #critic gradient with respect to action delQ/dela
        self.del_Q_a = self.critic_net.compute_critic_gradient(
            self.o_n_t, self.a_ht)
        ##COMPUTE ACTOR UPDATE (USING BPTT)
        self.actor_net.train_actor(self.o_n_t, self.del_Q_a)
        ##Update the target networks
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
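
sample_mini_batches() slices each stored h_t as [:, 0:N_STATES], [:, N_STATES:N_STATES + N_ACTIONS] and [:, -1], so each replay entry is evidently a (timesteps, N_STATES + N_ACTIONS + 1) array whose rows concatenate (o_t, a_t, r_t). A sketch of assembling one such episode record follows; the helper name is hypothetical.

# Sketch of one episode record for RDPG.add_to_replay(), inferred from the slicing
# in sample_mini_batches(): each row is the concatenation (o_t, a_t, r_t).
import numpy as np

def build_episode(observations, actions, rewards):
    # observations: (T, N_STATES), actions: (T, N_ACTIONS), rewards: (T,)
    observations = np.asarray(observations, dtype=float)
    actions = np.asarray(actions, dtype=float)
    rewards = np.asarray(rewards, dtype=float).reshape(-1, 1)
    return np.hstack([observations, actions, rewards])  # shape (T, N_STATES + N_ACTIONS + 1)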