Code Example #1
File: meta_actor_rnn.py Project: zwfightzw/MLM
# Assumed imports for this fragment: `pt` is torch; meta_actor, Experience and
# memory are defined elsewhere in the project.
import numpy as np
import torch as pt
from torch.autograd import Variable
from torch.optim import Adam

# (config_network's constructor call is cut off above; only its final keyword
# argument remains in this excerpt.)
                                          output_size=TASK_CONFIG_DIM)
model = meta_actor(meta_actor_input_dim, ACTION_DIM)

# Move both networks to the GPU.
config_network.cuda()
model.cuda()

# One Adam optimizer per network.
optimizer_meta_actor = Adam(model.parameters(), lr=0.001)
optimizer_config_network = Adam(config_network.parameters(), lr=0.001)

for t in range(100000):

    # Select CUDA or CPU tensor types depending on availability.
    ByteTensor = pt.cuda.ByteTensor if use_cuda else pt.ByteTensor
    FloatTensor = pt.cuda.FloatTensor if use_cuda else pt.FloatTensor

    # Sample the end position of a window of length_lstm consecutive transitions.
    random_position = np.random.randint(low=length_lstm,
                                        high=min(
                                            len(memory),
                                            n_episode * n_agents * max_steps))
    memory_info = memory.get_item(random_position, length_lstm)
    batch = Experience(*zip(*memory_info))

    # Stack the sampled transitions into (length_lstm, n_agents, dim) tensors.
    state_batch = Variable(pt.stack(batch.states).type(FloatTensor))
    action_batch = Variable(pt.stack(batch.actions).type(FloatTensor))

    for i in range(n_agents):

        optimizer_meta_actor.zero_grad()
        # All but the last step of agent i's window: 22-dim states and 2-dim
        # actions (actions rescaled by 1/4).
        whole_state = state_batch[0:length_lstm - 1,
                                  i, :].view(length_lstm - 1, 22)
        whole_action = action_batch[0:length_lstm - 1, i, :].view(
            length_lstm - 1, 2) / 4
        # The final step of the window is kept separately.
        final_state = state_batch[length_lstm - 1, i, :]
        final_action = action_batch[length_lstm - 1, i, :]
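The fragment above samples a random window of length_lstm consecutive transitions from the replay buffer and unpacks it into stacked state and action tensors. As a purely illustrative sketch (a plain Python list stands in for the project's memory, and get_item is assumed to return the length_lstm transitions ending at the given position), the same window-sampling pattern looks like this:

import numpy as np

# Hypothetical flat buffer of (state, action) transitions, for illustration only.
buffer = [(np.random.rand(22), np.random.rand(2)) for _ in range(500)]
length_lstm = 10

# Draw the end index of a contiguous window, as in the loop above.
end = np.random.randint(low=length_lstm, high=len(buffer))
window = buffer[end - length_lstm:end]

# Unzip into stacked arrays, analogous to Experience(*zip(*memory_info)).
states, actions = (np.stack(x) for x in zip(*window))
print(states.shape, actions.shape)  # (10, 22) (10, 2)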
Code Example #2
class DeepQ_agent:
    """
    Represents the DQN agent.
    """
    def __init__(self, env, hidden_units=None, network_LR=0.01, batch_size=1024, update_every=5, gamma=0.95):
        """
        Creates a DQN agent.

        :param env: game environment.
        :type env: Snake_Env.
        :param hidden_units: number of neurons in each layer.
        :type hidden_units: tuple with dimension (1, 3).
        :param network_LR: learning rate of the action-value neural network.
        :type network_LR: float.
        :param batch_size: size of the minibatch taken from the replay buffer.
        :type batch_size: int.
        :param update_every: number of learning steps between updates of the target Q-network.
        :type update_every: int.
        :param gamma: discount factor.
        :type gamma: float.
        """
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma          
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)   
        self.ACTION_SIZE = env.ACTION_SPACE           
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every
       
        self.qnetwork_local = QNetwork(input_shape = self.env.STATE_SPACE,
                                        hidden_units = self.HIDDEN_UNITS,
                                        output_size = self.ACTION_SIZE,
                                        learning_rate = self.NETWORK_LR)
        
        self.qnetwork_target = QNetwork(input_shape = self.env.STATE_SPACE,
                                        hidden_units = self.HIDDEN_UNITS,
                                        output_size = self.ACTION_SIZE,
                                        learning_rate = self.NETWORK_LR)

        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE) 

        #Temp variable
        self.t = 0


    def learn(self):
        """
        Learn from memorized experience.
        """
        if len(self.memory) > self.BATCH_SIZE:
            states, actions, rewards, next_states, dones = self.memory.sample(self.env.STATE_SPACE)
            
            #Calculating action-values using local network
            target = self.qnetwork_local.predict(states, self.BATCH_SIZE)
            
            #Future action-values using target network
            target_val = self.qnetwork_target.predict(next_states, self.BATCH_SIZE)
            
            #Future action-values using local network
            target_next = self.qnetwork_local.predict(next_states, self.BATCH_SIZE)
        
            max_action_values = np.argmax(target_next, axis=1)   #Double DQN: select the best next action with the local network
            
            for i in range(self.BATCH_SIZE):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    target[i][actions[i]] = rewards[i] + self.GAMMA*target_val[i][max_action_values[i]]   #evaluate the selected action with the target network
            
            self.qnetwork_local.train(states, target, batch_size = self.BATCH_SIZE)

            if self.t == self.UPDATE_EVERY:
                self.update_target_weights()
                self.t = 0
            else:
                self.t += 1


    def act(self, state, epsilon=0.0):
        """
        Chooses an action using an epsilon-greedy policy.
        
        :param state: current state.
        :type state: NumPy array with dimension (18,).
        :param epsilon: epsilon used in epsilon-greedy policy.
        :type epsilon: float
        :return action: action chosen by the agent.
        :rtype: int
        """    
        state = state.reshape((1,)+state.shape)
        action_values = self.qnetwork_local.predict(state)    #returns a vector of size = self.ACTION_SIZE
        if random() > epsilon:
            action = np.argmax(action_values)                 #choose best action - Exploitation
        else:
            action = randint(0, self.ACTION_SIZE-1)           #choose random action - Exploration
        return action


    def add_experience(self, state, action, reward, next_state, done):
        """
        Add experience to agent's memory.
        """
        self.memory.add(state, action, reward, next_state, done)

    
    def update_target_weights(self):
        """
        Updates values of the Target network.
        """
        self.qnetwork_target.model.set_weights(self.qnetwork_local.model.get_weights())
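For context, here is a minimal, hypothetical driver loop showing how act, add_experience, and learn fit together; the environment's reset()/step() interface and the epsilon schedule are assumptions, not part of the original code.

# Hypothetical usage sketch; Snake_Env's reset()/step() signatures are assumed.
agent = DeepQ_agent(env, hidden_units=(128, 128, 128))
epsilon = 1.0

for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, epsilon)
        next_state, reward, done = env.step(action)
        agent.add_experience(state, action, reward, next_state, done)
        agent.learn()                        # trains once the buffer exceeds BATCH_SIZE
        state = next_state
    epsilon = max(0.01, epsilon * 0.995)     # decay exploration between episodes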
Code Example #3
class DeepQ_agent:
    def __init__(self,
                 env,
                 hidden_units=None,
                 network_LR=0.001,
                 batch_size=64,
                 update_every=4,
                 gamma=1.0):
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)  #this is pythonic

        self.nA = env.ACTION_SPACE  #number of actions agent can perform
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every

        #let's give it some brains
        self.qnetwork_local = QNetwork(input_shape=self.env.STATE_SPACE,
                                       hidden_units=self.HIDDEN_UNITS,
                                       output_size=self.nA,
                                       learning_rate=self.NETWORK_LR)
        self.qnetwork_local.model.summary()  #summary() already prints the architecture; no need to wrap it in print()

        #I think of the target network as the PC,
        # where the agent stores all the concrete and important stuff
        self.qnetwork_target = QNetwork(input_shape=self.env.STATE_SPACE,
                                        hidden_units=self.HIDDEN_UNITS,
                                        output_size=self.nA,
                                        learning_rate=self.NETWORK_LR)

        #and the memory of course
        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE)

        #handy temp variable
        self.t = 0

#----------------------Learn from experience-----------------------------------#

    def learn(self):
        '''
            Learn from a minibatch of memorized experience (Double DQN update).
        '''

        if len(self.memory) > self.BATCH_SIZE:
            states, actions, rewards, next_states, dones = self.memory.sample(
                self.env.STATE_SPACE)

            #calculating action-values using local network
            target = self.qnetwork_local.predict(states, self.BATCH_SIZE)

            #future action-values using target network
            target_val = self.qnetwork_target.predict(next_states,
                                                      self.BATCH_SIZE)

            #future action-values using local network
            target_next = self.qnetwork_local.predict(next_states,
                                                      self.BATCH_SIZE)

            #The main point of Double DQN: the action is selected with the local network
            #while its value is evaluated with the target network
            max_action_values = np.argmax(target_next,
                                          axis=1)  #action selection

            for i in range(self.BATCH_SIZE):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    target[i][
                        actions[i]] = rewards[i] + self.GAMMA * target_val[i][
                            max_action_values[i]]  #action evaluation

            self.qnetwork_local.train(states,
                                      target,
                                      batch_size=self.BATCH_SIZE)

            if self.t == self.UPDATE_EVERY:
                self.update_target_weights()
                self.t = 0
            else:
                self.t += 1

#-----------------------Time to act-----------------------------------------------#

    def act(self, state, epsilon=0):  #set to NO exploration by default
        state = state.reshape((1, ) + state.shape)
        action_values = self.qnetwork_local.predict(
            state)  #returns a vector of size = self.nA
        if random.random() > epsilon:
            action = np.argmax(
                action_values)  #choose best action - Exploitation
        else:
            action = random.randint(0, self.nA -
                                    1)  #choose random action - Exploration

        return action

#-----------------------------Add experience to agent's memory------------------------#

    def add_experience(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

#----------------------Updates values of Target network----------------------------#

    def update_target_weights(self):
        #hard update for now; a soft (Polyak) update would also work (see the sketch after this class)
        self.qnetwork_target.model.set_weights(
            self.qnetwork_local.model.get_weights())

#---------------------helpful save function-------------------------------------#

    def save(self, model_num, directory):
        self.qnetwork_local.model.save(
            f'{directory}/snake_dqn_{model_num}_{time.asctime()}.h5')
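The comment in update_target_weights notes that a soft update could replace the hard copy. A minimal sketch of that alternative, written as a standalone helper over Keras models (the tau value and function name are assumptions, not part of the original project):

def soft_update(target_model, local_model, tau=0.001):
    # Polyak averaging: target <- tau * local + (1 - tau) * target.
    mixed = [tau * lw + (1.0 - tau) * tw
             for lw, tw in zip(local_model.get_weights(),
                               target_model.get_weights())]
    target_model.set_weights(mixed)

# Inside the agent this would be called as, e.g.:
# soft_update(self.qnetwork_target.model, self.qnetwork_local.model)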