class DDPG: """ Deep Deterministic Policy Gradient Algorithm. Sourced By: https://github.com/stevenpjg/ddpg-aigym/blob/master/ddpg.py """ def __init__(self, num_states, num_actions, action_space_high, action_space_low, is_batch_norm): self.num_states = num_states self.num_actions = num_actions self.action_space_high = action_space_high self.action_space_low = action_space_low # Batch normalisation disabled. self.critic_net = CriticNet(self.num_states, self.num_actions) self.actor_net = ActorNet(self.num_states, self.num_actions) # Replay Memory 초기화 self.replay_memory = deque() # time 초기화 self.time_step = 0 self.counter = 0 action_max = np.array(action_space_high).tolist() action_min = np.array(action_space_low).tolist() action_bounds = [action_max, action_min] self.grad_inv = grad_inverter(action_bounds) def evaluate_actor(self, state_t): return self.actor_net.evaluate_actor(state_t) # observation_1 = state at time t # observation 2 = state at time (t + 1) def add_experience(self, observation_1, observation_2, action, reward, done): self.observation_1 = observation_1 self.observation_2 = observation_2 self.action = action self.reward = reward self.done = done self.replay_memory.append((self.observation_1, self.observation_2, self.action, self.reward, self.done)) self.time_step = self.time_step + 1 # Replay memory 가 가득차면 맨 첫 번째 memory 를 삭제한다 if (len(self.replay_memory) > REPLAY_MEMORY_SIZE): self.replay_memory.popleft() def minibatches(self): # BATCH_SIZE 만큼 replay memory에서 가져온다. batch = random.sample(self.replay_memory, BATCH_SIZE) # S(t) 와 S(T + 1), action, reward, done 에 대한 batch를 # 각각 따로 저장한다 self.state_t_batch = [item[0] for item in batch] self.state_t_batch = np.array(self.state_t_batch) self.state_t_1_batch = [item[1] for item in batch] self.state_t_1_batch = np.array(self.state_t_1_batch) self.action_batch = [item[2] for item in batch] self.action_batch = np.array(self.action_batch) self.action_batch = np.reshape( self.action_batch, [len(self.action_batch), self.num_actions]) self.reward_batch = [item[3] for item in batch] self.reward_batch = np.array(self.reward_batch) self.done_batch = [item[4] for item in batch] self.done_batch = np.array(self.done_batch) def train(self): print "######## Starting to train..." 
# batch 뽑기 self.minibatches() # S(t + 1) 정보를 가지고 time (t + 1)에서의 action batch 생성 self.action_t_1_batch = self.actor_net.evaluate_target_actor( self.state_t_1_batch) # Q`(S(t + 1), a(t + 1)) q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch, self.action_t_1_batch) print "#### Evaluated ciritic value(Q value)" print q_t_1 self.y_i_batch = [] # reward batch 의 item 을 가공하여 저장하는 곳 for i in range(0, BATCH_SIZE): # done == True 이면 terminal state로 간 것이므로 # 이 때의 reward 를 정답상태로 갔을 때의 reward 라고 할 수 있다 if self.done_batch[i]: self.y_i_batch.append(self.reward_batch[i]) # False 이면 terminal state 는 아니므로 reward에 (감마 * Q value) 값을 더한다 else: self.y_i_batch.append(self.reward_batch[i] + GAMMA * q_t_1[i][0]) self.y_i_batch = np.array(self.y_i_batch) self.y_i_batch = np.reshape(self.y_i_batch, [len(self.y_i_batch), 1]) # loss 를 최소화하여 critic network 를 업데이트 한다 # weight 을 업데이트 하는데 (y_i_batch - (state_t_batch, action_batch) 에서 예측한 y value) 가 최소가 되도록 한다 self.critic_net.train_critic(self.state_t_batch, self.action_batch, self.y_i_batch) # gradient 에 따라 actor 를 업데이트 한다 action_for_delQ = self.evaluate_actor(self.state_t_batch) if is_grad_inverter: self.del_Q_a = self.critic_net.compute_delQ_a( self.state_t_batch, action_for_delQ) self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ) else: self.del_Q_a = self.critic_net.compute_delQ_a( self.state_t_batch, action_for_delQ)[0] # actor network 학습 self.actor_net.train_actor(self.state_t_batch, self.del_Q_a) # target critic, target actor network 업데이트 self.critic_net.update_target_critic() self.actor_net.update_target_actor() self.critic_net.save_critic("model/critic_model.ckpt") self.actor_net.save_actor("model/actor_model.ckpt") print "######## Finish to train ..."
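# ---------------------------------------------------------------------------
# The grad_inverter used above (and in the classes below) is built from
# [action_max, action_min] bounds and applied to delQ/dela before the actor
# update. Its TensorFlow implementation is not shown here; the sketch below is
# a minimal NumPy stand-in for the inverting-gradients idea it appears to
# follow (Hausknecht & Stone), assuming the same [upper, lower] bound layout
# and the same invert(grad, action) interface. Illustrative only.
# ---------------------------------------------------------------------------
import numpy as np


class GradInverterSketch:
    """NumPy sketch of gradient inverting for bounded action spaces."""

    def __init__(self, action_bounds):
        # action_bounds = [upper_bounds, lower_bounds], one entry per action dimension.
        self.pmax = np.asarray(action_bounds[0], dtype=float)
        self.pmin = np.asarray(action_bounds[1], dtype=float)
        self.prange = self.pmax - self.pmin

    def invert(self, grad, action):
        grad = np.asarray(grad, dtype=float)
        action = np.asarray(action, dtype=float)
        # Scale gradients that push the action up by the remaining headroom to
        # the upper bound, and gradients that push it down by the distance to
        # the lower bound, so repeated updates never drive the action out of range.
        up = (self.pmax - action) / self.prange
        down = (action - self.pmin) / self.prange
        return np.where(grad >= 0, grad * up, grad * down)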
class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''
    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self, env, is_batch_norm=False, is_grad_inverter=True):
        super().__init__(env)
        assert isinstance(env.action_space, Box), "action space must be continuous"
        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size, self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size, self.action_space_size)
        else:
            self.critic_net = CriticNet(self.observation_space_size, self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size, self.action_space_size)
        self.is_grad_inverter = is_grad_inverter
        self.replay_memory = deque()
        self.time_step = 0
        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    def add_data_fetch(self, df):
        self.data_fetch = df
        self.data_fetch.add_timers(['ev_p_t', 'ev_q_t', 'y', 'train_q', 'train_p', 'up_q_t', 'up_p_t'],
                                   prefix='t_agent_training_')
        self.data_fetch.add_array('actors_result')

    def get_name(self):
        return 'DDPG' + super().get_name()

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)
        self.data_fetch.add_to_array('actors_result', result)
        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)
        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)
        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()
        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]), True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]), False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]), True)
        # done
        done = np.array([item['done'] for item in batch])
        return state, action, reward, state_2, done

    def train(self):
        # Sample a random minibatch of N transitions from R.
        state, action, reward, state_2, done = self.minibatches()
        actual_batch_size = len(state)
        self.data_fetch.reset_timers()
        target_action = self.actor_net.evaluate_target_actor(state)
        self.data_fetch.sample_timer('ev_p_t')  # ------
        # Q'(s_i+1, a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)
        self.data_fetch.sample_timer('ev_q_t')  # ------
        y = []
        for i in range(0, actual_batch_size):
            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] + type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t
        y = np.reshape(np.array(y), [len(y), 1])
        self.data_fetch.sample_timer('y')  # ------
        # Update critic by minimizing the loss.
        self.critic_net.train_critic(state, action, y)
        self.data_fetch.sample_timer('train_q')  # ------
        # Update actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(state)  # don't need the wolp action
        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)[0]  # /BATCH_SIZE
        # Train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)
        self.data_fetch.sample_timer('train_p')  # ------
        # Update target critic and actor networks.
        self.critic_net.update_target_critic()
        self.data_fetch.sample_timer('up_q_t')  # ------
        self.actor_net.update_target_actor()
        self.data_fetch.sample_timer('up_p_t')  # ------
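# ---------------------------------------------------------------------------
# For context, a minimal driver loop for this agent could look like the
# hypothetical sketch below. It assumes a Gym-style env with the classic
# 4-tuple step() API and that the Agent base class provides _np_shaping;
# env, data_fetch and num_episodes are placeholders, not names from the source.
# ---------------------------------------------------------------------------
# agent = DDPGAgent(env)
# agent.add_data_fetch(data_fetch)          # timer/array collector used in train()
#
# for ep in range(num_episodes):
#     obs = env.reset()
#     done = False
#     while not done:
#         action = agent.act(obs)
#         obs2, reward, done, _ = env.step(action[0])
#         # observe() shapes the arrays and calls train() once the replay
#         # memory holds more than BATCH_SIZE transitions.
#         agent.observe({'obs': obs, 'action': action, 'reward': reward,
#                        'obs2': obs2, 'done': done})
#         obs = obs2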
class DDPG: """ Deep Deterministic Policy Gradient Algorithm hisar_size: size of the history ar vector/tensor action_size: size of the action vector/tensor TAU: update rate of target network parameters is_batch_norm: if apply batch norm write_sum: key/interval for writing summary data to file """ def __init__( self, hisar_size, ar_size, action_size, TAU = 0.001, is_batch_norm = 0, write_sum = 0, net_size_scale=1, max_load=1, beta0=beta): self.hisar_size = hisar_size self.load_size = action_size + 1 self.ar_size = ar_size self.state_size = action_size * 2 self.action_size = action_size self.ar_action_size = ar_size + action_size #print("net_size_scale: "+str(net_size_scale)) if is_batch_norm: if len(CN_N_HIDDENS)==2: self.critic_net = CriticNet_bn( self.state_size, self.action_size, TAU, write_sum, net_size_scale ) else: self.critic_net = CriticNet_bn_3( self.state_size, self.action_size, TAU, write_sum, net_size_scale ) self.actor_net = ActorNet_bn( self.state_size, self.action_size, TAU, write_sum, net_size_scale ) self.ar_pred_net = ARPredNet_bn( self.hisar_size, self.ar_size, write_sum, net_size_scale ) # arrival rate prediction network self.load_map_net = LoadMapNet_bn( self.ar_size, self.action_size, self.load_size, write_sum, net_size_scale ) # load mapping network else: self.critic_net = CriticNet( self.state_size, self.action_size, TAU, write_sum, net_size_scale ) self.actor_net = ActorNet( self.state_size, self.action_size, TAU, write_sum, net_size_scale ) self.ar_pred_net = ARPredNet( self.hisar_size, self.ar_size, write_sum, net_size_scale ) # arrival rate prediction network self.load_map_net = LoadMapNet( self.ar_size, self.action_size, self.load_size, write_sum, net_size_scale ) # load mapping network self.env = ENV( action_size, max_load=max_load, beta0=beta0 ) #self.k_nearest_neighbors = int(max_actions * k_ratio ) #Initialize Network Buffers: self.replay_memory_ac = deque() self.replay_memory_arp = deque() self.replay_memory_lm = deque() #Intialize time step: self.time_step = 0 self.counter = 0 action_max = np.ones( ( self.action_size ) ).tolist() action_min = np.zeros( ( self.action_size ) ).tolist() action_bounds = [action_max, action_min] self.grad_inv = grad_inverter( action_bounds ) def construct_state( self, pred_ar, pre_action=[] ): """Construct a state with the predicted ar and previous action """ num_sbs = np.max( pred_ar.shape ) pred_ar = np.reshape( np.array( pred_ar ), (1, num_sbs) ) pre_action = np.reshape( np.array( pre_action ), (1, num_sbs) ) state = np.concatenate( (pred_ar, pre_action), axis=1 ) return state.tolist() def evaluate_actor( self, state_t ): """Evaluate the actor network to get an action """ p_action = self.actor_net.evaluate_actor( state_t ) return p_action def add_experience_ac( self, state, next_state, action, reward ): """Add data sample of the Actor-Critic network """ self.state = state self.next_state = next_state self.action = action self.reward = reward #if reward>0: self.replay_memory_ac.append( (self.state, self.next_state, self.action, self.reward) ) self.time_step = self.time_step + 1 if( len(self.replay_memory_ac) > AC_REPLAY_MEMORY_SIZE ): self.replay_memory_ac.popleft() def add_experience_arp( self, his_ar, pred_ar ): """Add data sample of the arrival rate prediction network """ self.replay_memory_arp.append( (his_ar, pred_ar) ) if( len(self.replay_memory_arp) > ARP_REPLAY_MEMORY_SIZE ): self.replay_memory_arp.popleft() def add_experience_lm( self, ar_action, mapped_load ): """Add data sample of the load mapping network """ 
self.replay_memory_lm.append( (ar_action, mapped_load) ) if( len(self.replay_memory_lm) > LM_REPLAY_MEMORY_SIZE ): self.replay_memory_lm.popleft() def refine_action(self, state, action_in, imp = 0): """ round up the action to [0,1], then if imp>0, get the p_action's nearest neighbors, return the one with the max metric value, imp==1, metric = Q value; imp==2, metric = Q value + reward; imp==3, metric = reward. """ action0 = np.round( action_in ) action = np.clip( action0, 0, 1 ) #print("in refine action: "+str(action)) if imp>0: action = self.improve_action(state, action, imp) return action def improve_action(self, state, p_action, greedy=1): """ get the p_action's nearest neighbors, return the one with the max metric value greedy==1, metric = Q value greedy==2, metric = Q value + reward greedy==3, metric = reward """ state0 = state[0] ac_size = np.max(p_action.shape) ar_size = len(state0)-ac_size p_action = np.array(p_action) # if the action would cause outage, greedily modify the action pred_ar = state0[0:ar_size] # predicted ar pred_ar = np.reshape( pred_ar, [1, ar_size] ) prev_action = state0[ac_size: ] #print("p_action: "+str(p_action)) reward = -1 while reward < 0: map_load = self.load_map_net.evaluate_load_map( pred_ar, [p_action] ) map_load[0][-1] += 0.05 # conservatively estimate the load of the mbs reward, _, _, _, _ = self.env.find_reward( map_load[0], p_action, prev_action ) #print("est reward: "+str(reward)) if reward < 0: t_ar = [a*(1-b) for a,b in zip(pred_ar[0], p_action)] if max(t_ar)==0: #print("---------------------tried best, still negative reward, break...") break max_index = np.argmax( t_ar ) p_action[max_index] = 1 #print("---------------------negative reward, change the "+str(max_index)+" action to 1") # find the nearest neighbors actions = np.zeros( (ac_size+1, ac_size) ) for i in range(0, ac_size): t_action = copy.deepcopy(p_action) t_action[i] = 1-t_action[i] actions[ i] = t_action actions[ac_size] = copy.deepcopy(p_action) metrics = np.zeros( ( ac_size+1 ) ) if greedy <=2: # make all the (state, action) pairs for the critic states = np.tile(state, [len(actions), 1]) # evaluate each pair through the critic q_values = self.critic_net.evaluate_critic(states, actions) # find the index of the pair with the maximum value metrics += np.reshape( q_values, ( ac_size+1 ) ) #print("q values: "+str(metrics)) if greedy >=2: rewards = np.zeros( ( ac_size+1 ) ) for i in range(0,ac_size+1): taction = np.reshape( actions[i], [1, ac_size] ) map_load = self.load_map_net.evaluate_load_map( pred_ar, taction ) map_load[0][-1] += 0.02 # conservatively estimate the load of the mbs rewards[i], _, _, _, _ = self.env.find_reward( map_load[0], actions[i], prev_action ) metrics = rewards + GAMMA*metrics max_index = np.argmax( metrics ) # action_out = actions[max_index] #if max_index != ac_size: # print("Improve "+str(p_action)+" to "+str(action_out)) # return the best action return action_out def get_minibatch_ac(self): """Get mini batch for training of actor-critic network """ batch = random.sample( self.replay_memory_ac, AC_BATCH_SIZE ) #state t self.batch_states = [item[0] for item in batch] self.batch_states = np.reshape( np.array(self.batch_states), (AC_BATCH_SIZE, self.state_size ) ) #state t+1 self.batch_next_states = [item[1] for item in batch] self.batch_next_states = np.reshape( np.array( self.batch_next_states), (AC_BATCH_SIZE, self.state_size ) ) self.batch_actions = [item[2] for item in batch] self.batch_actions = np.reshape( np.array( self.batch_actions), 
[len(self.batch_actions), self.action_size] ) self.batch_rewards = [item[3] for item in batch] self.batch_rewards = np.array( self.batch_rewards ) def get_minibatch_arp(self): """Get mini batch for training of arrival rate prediction network """ batch = random.sample( self.replay_memory_arp, ARP_BATCH_SIZE ) #history ars self.his_ars = [item[0] for item in batch] #print("his_ars: "+str(np.array(self.his_ars))) self.his_ars = np.reshape( np.array(self.his_ars), (ARP_BATCH_SIZE, self.hisar_size) ) #state t+1 self.next_ars = [item[1] for item in batch] self.next_ars = np.reshape( np.array( self.next_ars), (ARP_BATCH_SIZE, self.ar_size ) ) def get_minibatch_lm(self): """Get mini batch for training of load mapping network """ batch = random.sample( self.replay_memory_lm, LM_BATCH_SIZE ) #history ars self.ar_action = [item[0] for item in batch] self.ar_action = np.reshape( np.array(self.ar_action), (LM_BATCH_SIZE, self.ar_action_size) ) #state t+1 self.mapped_load = [item[1] for item in batch] self.mapped_load = np.reshape( np.array( self.mapped_load), (LM_BATCH_SIZE, self.load_size ) ) def train_ac( self, learning_rate=[0.0001, 0.0001], update_target=0): """Train actor-critic network with a minibatch from the replay memory """ cerror = 1 aerror = 1 if( len( self.replay_memory_ac ) > AC_BATCH_SIZE ): # Sample a random minibatch of N transitions from R self.get_minibatch_ac() self.batch_next_taction = self.actor_net.evaluate_target_actor( self.batch_next_states ) # Q'(s_i+1,a_i+1) batch_next_tQ = self.critic_net.evaluate_target_critic( self.batch_next_states, self.batch_next_taction ) # r + gamma*Q'(s_i+1,a_i+1) self.batch_next_obj_Q = [] for i in range(0,AC_BATCH_SIZE): self.batch_next_obj_Q.append( self.batch_rewards[i] + GAMMA*batch_next_tQ[i][0] ) self.batch_next_obj_Q = np.array( self.batch_next_obj_Q ) self.batch_next_obj_Q = np.reshape( self.batch_next_obj_Q, [len(self.batch_next_obj_Q),1] ) #print("tQ: "+str(np.reshape( batch_next_tQ, [1, len(batch_next_tQ)]) ) ) # Update critic by minimizing the loss cerror = self.critic_net.train_critic(self.batch_states, self.batch_actions, self.batch_next_obj_Q, learning_rate[1]) # Find gradients from the Q values (critic network) to the actions action_for_grad_Q2a = self.evaluate_actor(self.batch_states) if is_grad_inverter: self.grad_Q2a = self.critic_net.compute_grad_Q2a( self.batch_states, action_for_grad_Q2a )#/AC_BATCH_SIZE self.grad_Q2a = self.grad_inv.invert( self.grad_Q2a, action_for_grad_Q2a ) else: self.grad_Q2a = self.critic_net.compute_grad_Q2a( self.batch_states, action_for_grad_Q2a )[0]#/AC_BATCH_SIZE # Train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters: aerror = self.actor_net.train_actor(self.batch_states, self.batch_actions, self.grad_Q2a, learning_rate[0]) #print("aerror: "+str(aerror)) if update_target == 1: self.update_target_net() return cerror, aerror def update_target_net( self ): # Update target Critic and Actor network self.critic_net.update_target_critic() self.actor_net.update_target_actor() def train_arp( self, learning_rate=0.0001): """Train the arrival rate prediction network with a minibatch from the replay memory """ lrm = len( self.replay_memory_arp ) arperror = 1 if( lrm >= ARP_BATCH_SIZE ): #print('in train_arp, lrm: '+str(lrm)) # Sample a random minibatch of N transitions from R self.get_minibatch_arp() # Train ar prediction network arperror = self.ar_pred_net.train_ar_pred(self.his_ars, self.next_ars, learning_rate) #print('in train_arp, arperror: '+str(arperror)) return 
arperror def train_lm( self, learning_rate=0.0001): """Train the load mapping network with a minibatch from the replay memory """ lrm = len( self.replay_memory_lm ) lmerror = 1 if( lrm >= LM_BATCH_SIZE ): # Sample a random minibatch of N transitions from R self.get_minibatch_lm() # Train load mapping network lmerror = self.load_map_net.train_load_map(self.ar_action, self.mapped_load, learning_rate) return lmerror def find_action_neigh( self, state, p_action ): # get the proto_action's k nearest neighbors actions = self.action_space.search_point(p_action, self.k_nearest_neighbors)[0] # make all the state, action pairs for the critic states = np.tile(state, [len(actions), 1]) # evaluate each pair through the critic actions_evaluation = self.critic_net.evaluate_critic(states, actions) # find the index of the pair with the maximum value max_index = np.argmax(actions_evaluation) # return the best action return actions[max_index] def close_all(self): self.actor_net.close_all() self.critic_net.close_all() self.ar_pred_net.close_all() self.load_map_net.close_all() def decay(self, i, minr, maxr, istep=1, estep=5000, method=1): """ method=1: log method=2: linear method=3: inverse """ if method == 1: #log shift = (estep)**(-minr/maxr) scale = maxr*math.log(istep+shift) a = scale/(math.log(i +shift)) if method == 2: #linear scale = (maxr-minr)/(istep-estep) shift = maxr-scale*istep a = scale*i + shift if method == 3: #inverse shift = (estep*minr-istep*maxr)/(maxr-minr) scale = maxr*(istep+shift) a = scale/(i+shift) return max(0, a)
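# ---------------------------------------------------------------------------
# decay() anneals a rate from maxr at istep down to minr at estep. It touches
# no instance state, so the schedules can be sanity-checked in isolation; the
# numbers below follow directly from the linear formula (method=2) and are
# given only as an illustration.
# ---------------------------------------------------------------------------
# d = DDPG.decay
# d(None, 1,    0.0001, 0.01, istep=1, estep=5000, method=2)   # ~0.01    (start)
# d(None, 2500, 0.0001, 0.01, istep=1, estep=5000, method=2)   # ~0.00505 (about halfway)
# d(None, 5000, 0.0001, 0.01, istep=1, estep=5000, method=2)   # ~0.0001  (end)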
class DDPG: """ Deep Deterministic Policy Gradient Algorithm""" def __init__(self,env): self.env = env self.num_states = env.observation_space.shape[0] self.num_actions = env.action_space.shape[0] #Initialize Actor Network: action_bound = env.action_space.high self.critic_net = CriticNet(self.num_states, self.num_actions) #self.actor_net is an object self.actor_net = ActorNet(self.num_states, self.num_actions, action_bound) #Initialize Buffer Network: self.replay_memory = deque() #Intialize time step: self.time_step = 0 #invert gradients (softthresholding) action_bounds = [[3], [-3]] #specify upper bound and lower bound of action space #action_bound structure for higher dimension actions[ #[max_of_action_dim_0, max_of_action_dim_1, ..., max_of_action_dim_10], #[min_of_action_dim_0, min_of_action_dim_1, ..., min_of_action_dim_10] #] self.grad_inv = grad_inverter(action_bounds) def evaluate_actor(self, state_t): return self.actor_net.evaluate_actor(state_t) def add_experience(self, observation_1, observation_2, action, reward, done): self.observation_1 = observation_1 self.observation_2 = observation_2 self.action = action self.reward = reward self.done = done self.replay_memory.append((self.observation_1, self.observation_2, self.action, self.reward,self.done)) self.time_step = self.time_step + 1 if(len(self.replay_memory)>REPLAY_MEMORY_SIZE): self.replay_memory.popleft() def minibatches(self): batch = random.sample(self.replay_memory, BATCH_SIZE) #state t self.state_t_batch = [item[0] for item in batch] self.state_t_batch = np.array(self.state_t_batch) #state t+1 self.state_t_1_batch = [item[1] for item in batch] self.state_t_1_batch = np.array( self.state_t_1_batch) self.action_batch = [item[2] for item in batch] self.action_batch = np.array(self.action_batch) self.action_batch = np.reshape(self.action_batch,[len(self.action_batch),self.num_actions]) self.reward_batch = [item[3] for item in batch] self.reward_batch = np.array(self.reward_batch) self.done_batch = [item[4] for item in batch] self.done_batch = np.array(self.done_batch) def train(self): #sample a random minibatch of N transitions from R self.minibatches() self.action_t_1_batch = self.actor_net.evaluate_target_actor(self.state_t_1_batch) #Q'(s_i+1,a_i+1) q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,self.action_t_1_batch) self.y_i_batch=[] for i in range(0,BATCH_SIZE): if self.done_batch[i]: self.y_i_batch.append(self.reward_batch[i]) else: self.y_i_batch.append(self.reward_batch[i] + GAMMA*q_t_1[i][0]) self.y_i_batch=np.array(self.y_i_batch) self.y_i_batch = np.reshape(self.y_i_batch,[len(self.y_i_batch),1]) # Update critic by minimizing the loss self.critic_net.train_critic(self.state_t_batch, self.action_batch,self.y_i_batch) # Update actor proportional to the gradients: #actions for computing delQ/dela because action_for_delQ = self.evaluate_actor(self.state_t_batch) #think of if you want to take this action or the action_t_batch itself: self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch,action_for_delQ)#/BATCH_SIZE self.del_Q_a = self.grad_inv.invert(self.del_Q_a,action_for_delQ) # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters: self.actor_net.train_actor(self.state_t_batch,self.del_Q_a) # Update target Critic and actor network self.critic_net.update_target_critic() self.actor_net.update_target_actor()
class DDPG: """ Deep Deterministic Policy Gradient Algorithm""" def __init__(self, env, is_batch_norm): self.env = env self.num_states = env.observation_space.shape[0] self.num_actions = env.action_space.shape[0] if is_batch_norm: self.critic_net = CriticNet_bn(self.num_states, self.num_actions) self.actor_net = ActorNet_bn(self.num_states, self.num_actions) else: self.critic_net = CriticNet(self.num_states, self.num_actions) self.actor_net = ActorNet(self.num_states, self.num_actions) #Initialize Buffer Network: self.replay_memory = deque() #Intialize time step: self.time_step = 0 self.counter = 0 action_max = np.array(env.action_space.high).tolist() action_min = np.array(env.action_space.low).tolist() action_bounds = [action_max, action_min] self.grad_inv = grad_inverter(action_bounds) def evaluate_actor(self, state_t): return self.actor_net.evaluate_actor(state_t) def add_experience(self, observation_1, observation_2, action, reward, done): self.observation_1 = observation_1 self.observation_2 = observation_2 self.action = action self.reward = reward self.done = done self.replay_memory.append((self.observation_1, self.observation_2, self.action, self.reward, self.done)) self.time_step = self.time_step + 1 if (len(self.replay_memory) > REPLAY_MEMORY_SIZE): self.replay_memory.popleft() def minibatches(self): batch = random.sample(self.replay_memory, BATCH_SIZE) #state t self.state_t_batch = [item[0] for item in batch] self.state_t_batch = np.array(self.state_t_batch) #state t+1 self.state_t_1_batch = [item[1] for item in batch] self.state_t_1_batch = np.array(self.state_t_1_batch) self.action_batch = [item[2] for item in batch] self.action_batch = np.array(self.action_batch) self.action_batch = np.reshape( self.action_batch, [len(self.action_batch), self.num_actions]) self.reward_batch = [item[3] for item in batch] self.reward_batch = np.array(self.reward_batch) self.done_batch = [item[4] for item in batch] self.done_batch = np.array(self.done_batch) def train(self): #sample a random minibatch of N transitions from R self.minibatches() self.action_t_1_batch = self.actor_net.evaluate_target_actor( self.state_t_1_batch) #Q'(s_i+1,a_i+1) q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch, self.action_t_1_batch) self.y_i_batch = [] for i in range(0, BATCH_SIZE): if self.done_batch[i]: self.y_i_batch.append(self.reward_batch[i]) else: self.y_i_batch.append(self.reward_batch[i] + GAMMA * q_t_1[i][0]) self.y_i_batch = np.array(self.y_i_batch) self.y_i_batch = np.reshape(self.y_i_batch, [len(self.y_i_batch), 1]) # Update critic by minimizing the loss self.critic_net.train_critic(self.state_t_batch, self.action_batch, self.y_i_batch) # Update actor proportional to the gradients: action_for_delQ = self.evaluate_actor(self.state_t_batch) if is_grad_inverter: self.del_Q_a = self.critic_net.compute_delQ_a( self.state_t_batch, action_for_delQ) #/BATCH_SIZE self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ) else: self.del_Q_a = self.critic_net.compute_delQ_a( self.state_t_batch, action_for_delQ)[0] #/BATCH_SIZE # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters: self.actor_net.train_actor(self.state_t_batch, self.del_Q_a) # Update target Critic and actor network self.critic_net.update_target_critic() self.actor_net.update_target_actor()
class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''
    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self, env, dir, is_batch_norm=False, is_grad_inverter=True, training_flag=True):
        super().__init__(env, dir)
        assert isinstance(env.action_space, Box), "action space must be continuous"
        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size, self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size, self.action_space_size)
        else:
            self.critic_net = CriticNet(self.observation_space_size, self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size, self.action_space_size)
        self.is_grad_inverter = is_grad_inverter
        self.training_flag = training_flag
        self.replay_memory = deque()
        self.time_step = 0
        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)
        self.data_fetch = None

    def add_data_fetch(self, df):
        self.data_fetch = df

    def get_short_name(self):
        return 'DDPG'

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)
        if self.data_fetch:
            self.data_fetch.set_actors_action(result[0].tolist())
        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)
        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)
        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()
        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]), True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]), False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]), True)
        # done
        done = np.array([item['done'] for item in batch])
        return state, action, reward, state_2, done

    def train(self):
        if not self.training_flag:
            return
        # Sample a random minibatch of N transitions from R.
        state, action, reward, state_2, done = self.minibatches()
        actual_batch_size = len(state)
        target_action = self.actor_net.evaluate_target_actor(state)
        # Q'(s_i+1, a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)
        y = []
        for i in range(0, actual_batch_size):
            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] + type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t
        y = np.reshape(np.array(y), [len(y), 1])
        # Update critic by minimizing the loss.
        self.critic_net.train_critic(state, action, y)
        # Update actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(state)  # don't need the wolp action
        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)[0]  # /BATCH_SIZE
        # Train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)
        # Update target critic and actor networks.
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()

    def save_agent(self, force=False, comment="default"):
        path = "{}/weights/{}".format(self.get_dir(), comment)
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
            print("Saving agent in", path)
            self.actor_net.save_model(path + '/actor.ckpt')
            self.critic_net.save_model(path + '/critic.ckpt')
        else:
            if force:
                print("Overwrite old agent in", path)
                self.actor_net.save_model(path + '/actor.ckpt')
                self.critic_net.save_model(path + '/critic.ckpt')
            else:
                print("Save aborted. An agent is already saved in", path)

    def load_agent(self, agent_name=None, comment="default"):
        if agent_name is None:
            path = "{}/weights/{}".format(self.get_dir(), comment)
        else:
            path = "{}/{}/{}/weights/{}".format(self.result_dir, agent_name, self.env.spec.id, comment)
        if os.path.exists(path):
            print("Loading agent saved in", path)
            self.actor_net.load_model(path + '/actor.ckpt')
            self.critic_net.load_model(path + '/critic.ckpt')
        else:
            print("Agent not found in", path)

    def close_session(self):
        self.actor_net.close()
        self.critic_net.close()
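# ---------------------------------------------------------------------------
# Hypothetical round trip with the checkpointing methods above; the directory
# layout <dir>/weights/<comment>/{actor,critic}.ckpt comes from save_agent()
# and load_agent() themselves, everything else here is illustrative.
# ---------------------------------------------------------------------------
# agent = DDPGAgent(env, dir='results/ddpg', training_flag=True)
# ...train...
# agent.save_agent(comment="after_100_episodes")               # first save
# agent.save_agent(force=True, comment="after_100_episodes")   # overwrite an existing save
#
# eval_agent = DDPGAgent(env, dir='results/ddpg', training_flag=False)
# eval_agent.load_agent(comment="after_100_episodes")          # restores both networks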
class DDPGAgent(Agent):
    ''' stevenpjg's implementation of DDPG algorithm '''
    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self, env, is_batch_norm=False, is_grad_inverter=True):
        super().__init__(env)
        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size, self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size, self.action_space_size)
        else:
            self.critic_net = CriticNet(self.observation_space_size, self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size, self.action_space_size)
        self.is_grad_inverter = is_grad_inverter
        self.replay_memory = deque()
        self.time_step = 0
        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)

    # EREZ ADDED
    def save(self, path):
        # TODO -- robust handling of where to put things
        # Everything from the super class can be pickled easily.
        attrs = Container()
        saved_critic = self.critic_net.save(path)
        saved_actor = self.actor_net.save(path)
        i_vars = vars(self)
        keys = i_vars.keys()
        for key in keys:
            tmp = getattr(self, key)
            if not isinstance(tmp, (CriticNet, ActorNet, grad_inverter)):
                setattr(attrs, key, tmp)
        file = os.path.join(path, "agent_data.pkl")  # TODO -- come up with a better name
        with open(file, "wb") as f:
            pickle.dump(attrs, f, pickle.HIGHEST_PROTOCOL)

    # EREZ ADDED
    # def restore(self, file): -- right now it's just in the subclass
    def restore(self, path):
        print("restoring the agent")
        file = os.path.join(path, "agent_data.pkl")  # TODO -- come up with a better name
        with open(file, "rb") as f:
            dump = pickle.load(f)
        i_vars = vars(dump)
        keys = i_vars.keys()
        for key in keys:
            tmp = getattr(dump, key)
            setattr(self, key, tmp)
        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
        self.grad_inv = grad_inverter(action_bounds)
        # Now replace the networks.
        # IGNORE THE "IS BATCH NORM" CONDITION FOR NOW
        saved_critic_net = CriticNet(self.observation_space_size, self.action_space_size)
        saved_actor_net = ActorNet(self.observation_space_size, self.action_space_size)
        # Load in the saved graphs.
        critic_file = os.path.join(path, "critic_net.ckpt")
        saved_critic_net.restore(critic_file)
        actor_file = os.path.join(path, "actor_net.ckpt")
        saved_actor_net.restore(actor_file)
        self.critic_net = saved_critic_net
        self.actor_net = saved_actor_net

    def add_data_fetch(self, df):
        self.data_fetch = df

    def get_name(self):
        return 'DDPG' + super().get_name()

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)
        self.data_fetch.set_actors_action(result[0].tolist())
        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)
        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)
        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()
        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]), True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]), False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]), True)
        # done
        done = np.array([item['done'] for item in batch])
        return state, action, reward, state_2, done

    def train(self):
        # Sample a random minibatch of N transitions from R.
        state, action, reward, state_2, done = self.minibatches()
        actual_batch_size = len(state)
        target_action = self.actor_net.evaluate_target_actor(state)
        # Q'(s_i+1, a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)
        y = []
        for i in range(0, actual_batch_size):
            if done[i]:
                y.append(reward[i])
            else:
                y.append(reward[i] + type(self).GAMMA * q_t[i][0])  # q_t+1 instead of q_t
        y = np.reshape(np.array(y), [len(y), 1])
        # Update critic by minimizing the loss.
        self.critic_net.train_critic(state, action, y)
        # Update actor proportional to the gradients:
        # action_for_delQ = self.act(state)  # was self.evaluate_actor instead of self.act
        action_for_delQ = self.actor_net.evaluate_actor(state)  # don't need the wolp action
        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)  # /BATCH_SIZE
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)[0]  # /BATCH_SIZE
        # Train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters:
        self.actor_net.train_actor(state, del_Q_a)
        # Update target critic and actor networks.
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
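# ---------------------------------------------------------------------------
# Container above is used only as an attribute bag: save() copies the
# picklable agent attributes onto it with setattr() before pickling. Its
# definition is not included here; a minimal stand-in consistent with that
# usage (an assumption, not the original class) would be:
# ---------------------------------------------------------------------------
class Container:
    """Empty attribute bag for the picklable parts of the agent state."""
    pass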
class DDPG: """ Deep Deterministic Policy Gradient Algorithm""" def __init__(self,env, is_batch_norm): self.env = env self.num_states = env.observation_space.shape[0] self.num_actions = env.action_space.shape[0] if is_batch_norm: self.critic_net = CriticNet_bn(self.num_states, self.num_actions) self.actor_net = ActorNet_bn(self.num_states, self.num_actions) else: self.critic_net = CriticNet(self.num_states, self.num_actions) self.actor_net = ActorNet(self.num_states, self.num_actions) #Initialize Buffer Network: self.replay_memory = deque() #Intialize time step: self.time_step = 0 self.counter = 0 action_max = np.array(env.action_space.high).tolist() action_min = np.array(env.action_space.low).tolist() action_bounds = [action_max,action_min] self.grad_inv = grad_inverter(action_bounds) def evaluate_actor(self, state_t): return self.actor_net.evaluate_actor(state_t) def add_experience(self, observation_1, observation_2, action, reward, done): self.observation_1 = observation_1 self.observation_2 = observation_2 self.action = action self.reward = reward self.done = done self.replay_memory.append((self.observation_1, self.observation_2, self.action, self.reward,self.done)) self.time_step = self.time_step + 1 if(len(self.replay_memory)>REPLAY_MEMORY_SIZE): self.replay_memory.popleft() def minibatches(self): batch = random.sample(self.replay_memory, BATCH_SIZE) #state t self.state_t_batch = [item[0] for item in batch] self.state_t_batch = np.array(self.state_t_batch) #state t+1 self.state_t_1_batch = [item[1] for item in batch] self.state_t_1_batch = np.array( self.state_t_1_batch) self.action_batch = [item[2] for item in batch] self.action_batch = np.array(self.action_batch) self.action_batch = np.reshape(self.action_batch,[len(self.action_batch),self.num_actions]) self.reward_batch = [item[3] for item in batch] self.reward_batch = np.array(self.reward_batch) self.done_batch = [item[4] for item in batch] self.done_batch = np.array(self.done_batch) def train(self): #sample a random minibatch of N transitions from R self.minibatches() self.action_t_1_batch = self.actor_net.evaluate_target_actor(self.state_t_1_batch) #Q'(s_i+1,a_i+1) q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch,self.action_t_1_batch) self.y_i_batch=[] for i in range(0,BATCH_SIZE): if self.done_batch[i]: self.y_i_batch.append(self.reward_batch[i]) else: self.y_i_batch.append(self.reward_batch[i] + GAMMA*q_t_1[i][0]) self.y_i_batch=np.array(self.y_i_batch) self.y_i_batch = np.reshape(self.y_i_batch,[len(self.y_i_batch),1]) # Update critic by minimizing the loss self.critic_net.train_critic(self.state_t_batch, self.action_batch,self.y_i_batch) # Update actor proportional to the gradients: action_for_delQ = self.evaluate_actor(self.state_t_batch) if is_grad_inverter: self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch,action_for_delQ)#/BATCH_SIZE self.del_Q_a = self.grad_inv.invert(self.del_Q_a,action_for_delQ) else: self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch,action_for_delQ)[0]#/BATCH_SIZE # train actor network proportional to delQ/dela and del_Actor_model/del_actor_parameters: self.actor_net.train_actor(self.state_t_batch,self.del_Q_a) # Update target Critic and actor network self.critic_net.update_target_critic() self.actor_net.update_target_actor()
class RDPG: """Recurrent Policy Gradient Algorithm""" def __init__(self, env, N_STATES, N_ACTIONS, STEPS, BATCH_SIZE): self.N_STATES = N_STATES self.N_ACTIONS = N_ACTIONS self.STEPS = STEPS self.BATCH_SIZE = BATCH_SIZE #mini batch size self.critic_net = CriticNet(self.N_STATES, self.N_ACTIONS, self.STEPS, self.BATCH_SIZE) self.actor_net = ActorNet(self.N_STATES, self.N_ACTIONS, self.STEPS, self.BATCH_SIZE) self.R = [] def evaluate_actor(self, state, t): #converting state to a 3D tensor to feed into lstms if t == 0: self.state_matrix = np.zeros( [self.BATCH_SIZE, self.STEPS, self.N_STATES]) self.state_matrix[0, t, :] = state else: self.state_matrix[0, t, :] = state # print self.state_matrix # raw_input('Enter to continue') return self.actor_net.evaluate_actor(self.state_matrix) def add_to_replay(self, h_t, i): ##STORE THE SEQUENCE (o_1,a_1,r_1,....,o_t,a_t,r_t) in R self.h_t = h_t self.R.append(h_t) if (len(self.R) > BUFFER_SIZE): self.R.pop(0) def sample_mini_batches(self): self.indices = np.random.randint(1, len(self.R), size=(1, self.BATCH_SIZE)) self.R_mini_batch = [None] * self.BATCH_SIZE for i in range(0, len(self.indices[0, :])): self.R_mini_batch[i] = self.R[self.indices[0][i]] #reward_t (batchsize x timestep) self.r_n_tl = [None] * self.BATCH_SIZE for i in range(0, len(self.r_n_tl)): self.r_n_tl[i] = self.R_mini_batch[i][:, -1] self.r_n_t = np.zeros([self.BATCH_SIZE, self.STEPS]) for i in range(0, self.BATCH_SIZE): self.r_n_t[i, 0:len(self.r_n_tl[i])] = self.r_n_tl[i] #observation list (batchsize x timestep) self.o_n_tl = [None] * self.BATCH_SIZE for i in range(0, len(self.o_n_tl)): self.o_n_tl[i] = self.R_mini_batch[i][:, 0:self.N_STATES] self.o_n_t = np.zeros([self.BATCH_SIZE, self.STEPS, self.N_STATES]) for i in range(0, self.BATCH_SIZE): self.o_n_t[i, 0:len(self.o_n_tl[i]), :] = self.o_n_tl[i] #action list: #observation list (batchsize x timestep) self.a_n_tl = [None] * self.BATCH_SIZE for i in range(0, len(self.a_n_tl)): self.a_n_tl[i] = self.R_mini_batch[i][:, self.N_STATES:self.N_STATES + self.N_ACTIONS] self.a_n_t = np.zeros([self.BATCH_SIZE, self.STEPS, self.N_ACTIONS]) for i in range(0, self.BATCH_SIZE): self.a_n_t[i, 0:len(self.a_n_tl[i]), :] = self.a_n_tl[i] def train(self): self.sample_mini_batches() #Action at h_t+1: self.t_a_ht1 = self.actor_net.evaluate_target_actor(self.o_n_t) #State Action value at h_t+1: self.t_qht1 = self.critic_net.evaluate_target_critic( self.o_n_t, self.t_a_ht1) self.check = self.t_qht1 ##COMPUTE TARGET VALUES FOR EACH SAMPLE EPISODE (y_1,y_2,....y_t) USING THE RECURRENT TARGET NETWORKS self.y_n_t = [] self.r_n_t = np.reshape(self.r_n_t, [self.BATCH_SIZE, self.STEPS, 1]) for i in range(0, self.STEPS): if (i == self.STEPS - 1): self.y_n_t.append(self.r_n_t[:, i]) else: self.y_n_t.append(self.r_n_t[:, i, :] + GAMMA * self.t_qht1[:, i + 1, :]) self.y_n_t = np.vstack(self.y_n_t) self.y_n_t = self.y_n_t.T #(batchsize x timestep) self.y_n_t = np.reshape(self.y_n_t, [ self.BATCH_SIZE, self.STEPS, 1 ]) #reshape y_n_t to have shape (batchsize,timestep,no.dimensions) ##COMPUTE CRITIC UPDATE (USING BPTT) self.critic_net.train_critic(self.o_n_t, self.a_n_t, self.y_n_t) #action for computing critic gradient self.a_ht = self.actor_net.evaluate_actor_batch( self.o_n_t) #returns output as 3d array #critic gradient with respect to action delQ/dela self.del_Q_a = self.critic_net.compute_critic_gradient( self.o_n_t, self.a_ht) ##COMPUTE ACTOR UPDATE (USING BPTT) self.actor_net.train_actor(self.o_n_t, self.del_Q_a) ##Update the target networks 
self.critic_net.update_target_critic() self.actor_net.update_target_actor()
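# ---------------------------------------------------------------------------
# sample_mini_batches() above slices each stored history h_t column-wise:
# columns [0:N_STATES] are the observation, [N_STATES:N_STATES+N_ACTIONS] the
# action, and the last column the reward. The helper below is an illustrative
# sketch (not from the source) of how such a history matrix could be built
# before calling add_to_replay().
# ---------------------------------------------------------------------------
import numpy as np


def build_history(observations, actions, rewards):
    """Stack (o_t, a_t, r_t) rows into an array of shape
    (timesteps, N_STATES + N_ACTIONS + 1), matching the slicing used in
    RDPG.sample_mini_batches()."""
    rows = [np.concatenate([np.ravel(o), np.ravel(a), [r]])
            for o, a, r in zip(observations, actions, rewards)]
    return np.vstack(rows)

# h_t = build_history(obs_list, act_list, rew_list)
# rdpg.add_to_replay(h_t, episode_index)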