Example #1
class Agent():
	'''This agent interacts with the environment to learn a policy that yields the highest cumulative reward.
		The agent uses the Deep Deterministic Policy Gradient (DDPG) algorithm.'''

	def __init__(self, state_size, action_size, seed=0):
		'''Initialize the Agent.
		
		Parameters
		----------
		state_size : int
			The dimension of each state
		
		action_size : int
			The dimension of each action
		
		seed : int
			The random seed used to generate random numbers.
		'''
		self.state_size = state_size
		self.action_size = action_size
		random.seed(seed)

		#actor gives the best action for given state
		self.actor_local = Actor(state_size, action_size, seed).to(device)
		self.actor_target = Actor(state_size, action_size, seed).to(device)

		#evaluates the action
		self.critic_local = Critic(state_size, action_size, seed).to(device)
		self.critic_target = Critic(state_size, action_size, seed).to(device)

		self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LEARNING_RATE)
		self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LEARNING_RATE, weight_decay=WEIGHT_DECAY)

		#Replay Memory
		self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

		#Noise
		self.noise = OUNoise(action_size,seed)
		self.t_step = 0

	def step(self, state, action, reward, next_state, done):
		'''Records a step taken by the agent in the environment.

		Executes each time the agent takes a step in the environment.
		The observed (state, action, reward, next_state, done) tuple is saved in the replay buffer.
		Once enough experiences have been captured the model is trained.
		
		Parameters
		----------
		state : array_like
			The current state.
		
		action : array_like
			The action that was taken.

		reward : float
			The reward that was received.

		next_state : array_like
			The next state.

		done : boolean
			True if the episode is completed, else False
		'''
		self.memory.add(state, action, reward, next_state, done)
		self.t_step = (self.t_step+1)%UPDATE_EVERY
		if self.t_step == 0:
			if len(self.memory) > BATCH_SIZE:
				experiences = self.memory.sample()
				self.train_model_parameters(experiences)
	
	def get_action(self, state, epsilon=0, add_noise=True):
		'''Gets the action for the given state defined by the current policy.

		The method returns the action to take for the given state under the current policy.
		To explore the continuous action space, noise is added to the action.
		

		Parameters
		----------
		state : array_like
			The current state.

		epsilon : float
			The epsilon value used for epsilon-greedy action selection (unused here; exploration comes from the added noise).

		add_noise : boolean
			Add noise to the action to encourage exploration.

		Returns
		-------
		action : array-like
			The action to take. Each value is between -1 and 1.
		'''
		state = torch.from_numpy(state).float().unsqueeze(0).to(device)
		self.actor_local.eval()
		with torch.no_grad():
			action = self.actor_local(state).cpu().data.numpy()
		self.actor_local.train()
		if add_noise:
			action+=self.noise.sample()
		return np.clip(action,-1,1)

	def train_model_parameters(self, experiences):
		'''Update the model parameters using the given batch of experience tuples.

		The models are trained via the actor-critic paradigm.
		The next actions are obtained from the target actor.
		These are passed to the target critic to obtain the target Q-values of the next states.
		The target Q-values of the current states are calculated via the Bellman equation.
		The local critic's Q-value estimates are updated towards these targets.
		The local actor predicts the actions for the current states.
		The actor loss is the negative mean of the local critic's Q-value estimates for those predicted actions.

		Parameters
		----------
		experiences : Tuple[torch.Tensor]
			A tuple of (states, actions, rewards, next_states, dones) tensors.
		'''
		states, actions, rewards, next_states, dones = experiences
		
		#Update critic
		next_actions = self.actor_target(next_states)
		Q_next_states = self.critic_target(next_states,next_actions)
		Q_states = rewards + GAMMA*Q_next_states*(1-dones)
		Q_states_estimated = self.critic_local(states,actions)
		critic_loss = F.mse_loss(Q_states_estimated, Q_states)
		self.critic_optimizer.zero_grad()
		critic_loss.backward()
		self.critic_optimizer.step()
		
		#Update actor
		actions_pred = self.actor_local(states)
		actor_loss = -self.critic_local(states,actions_pred).mean()
		self.actor_optimizer.zero_grad()
		actor_loss.backward()
		self.actor_optimizer.step()	

		self._update_model_parameters(self.critic_local, self.critic_target)     
		self._update_model_parameters(self.actor_local, self.actor_target)     

	def _update_model_parameters(self,local_network, target_network):
		'''Copy the learned local network parameters to the target network.

		This method softly updates the target network with the learned local network parameters.
		The target parameters are moved a fraction TAU towards the learned local parameters.
		This is done to help reduce harmful correlations caused by a constantly moving target.
		'''
		for target_param, local_param in zip(target_network.parameters(), local_network.parameters()):
			target_param.data.copy_(TAU*local_param.data + (1-TAU) * target_param.data)
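
A minimal driver loop for the agent above might look like the following sketch. It assumes a Gym-style environment with a continuous action space and the classic 4-tuple step API, and that the hyperparameter constants the class references (BUFFER_SIZE, BATCH_SIZE, UPDATE_EVERY, GAMMA, TAU, the learning rates, device) as well as the Actor, Critic, OUNoise and ReplayBuffer helpers are defined alongside it; the environment name and episode counts are placeholders, not values from the original project.

import gym

# hypothetical continuous-control task; any Gym env with a Box action space works
env = gym.make("Pendulum-v1")
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              seed=0)

for episode in range(200):
    state = env.reset()
    episode_reward = 0.0
    for t in range(1000):
        # get_action adds a batch dimension internally, so squeeze it off before stepping
        action = agent.get_action(state, add_noise=True).squeeze(0)
        next_state, reward, done, info = env.step(action)
        # store the transition; the agent trains itself every UPDATE_EVERY steps
        agent.step(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
        if done:
            break
    print(f"episode {episode}: reward {episode_reward:.2f}")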
Example #2
        #train
        if replayMemory.size() % 128 == 0 or done:

            state_b, action_matrix_b, reward_b, done_b, next_state_b = replayMemory.miniAll()

            reward_b = reward_b[:, np.newaxis]

            c_pre = critic.predict(next_state_b)

            state_pre_value = reward_b + c_pre*0.6

            state_value = critic.predict(state_b)

            critic.train(state_b, state_pre_value)

            actor.train(state_b, state_value, state_pre_value, action_matrix_b)

            replayMemory.clear()
        ########################


        if done:

            summary_str = tf.Session().run(summary_ops, feed_dict={summary_vars[0]: episode_reward})
            writer.add_summary(summary_str, step)
            writer.flush()

            print("step = ", step, "episode_reward = ", episode_reward)

            state = env.reset()
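
The block above bootstraps the critic target as reward_b + 0.6 * critic.predict(next_state_b) without masking terminal transitions. A common variant zeroes the bootstrap term whenever done_b is set, which matches the Bellman target used in the PyTorch examples on this page; the NumPy sketch below is illustrative only (the values and the done-mask layout are assumptions, not the project's own code).

import numpy as np

# a toy batch of four one-step transitions, column vectors to match reward_b[:, np.newaxis]
reward_b = np.array([[1.0], [0.0], [0.5], [2.0]])
done_b = np.array([[0.0], [0.0], [1.0], [0.0]])   # 1.0 marks a terminal step
c_pre = np.array([[3.0], [2.0], [4.0], [1.0]])    # stands in for critic.predict(next_state_b)
gamma = 0.6                                       # the discount factor used above

# masked one-step TD target: no bootstrapping past a terminal state
state_pre_value = reward_b + gamma * c_pre * (1.0 - done_b)
print(state_pre_value)   # [[2.8], [1.2], [0.5], [2.6]]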
Example #3
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    
    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        
        logging.warning(action)
        return np.clip(action, 0.0000001, 7.0)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
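
Several of these examples construct an OUNoise helper that is never shown. A minimal sketch of an Ornstein-Uhlenbeck noise process, consistent with how it is used here (one sample() per action, reset() at the start of an episode) and with the hard-coded update in Example #5 below, might look like this; mu, theta and sigma are assumed defaults, not values taken from the original projects.

import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state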
Example #4
File: DQN.py Project: war3gu/gykRL
        episode_reward += reward
        ##############################train######################
        if replayMemory.size() >= 128:
            state_b, action_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniBatch(
                int(64))
            next_state_b_value = actor.predict(next_state_b)
            state_b_value = actor.predict(state_b)
            length = state_b.shape[0]

            for i in range(length):
                target_next = reward_b[i]
                if not done_b[i]:
                    action_values = next_state_b_value[i]
                    target_next = (reward_b[i] + 0.7 * np.amax(action_values))
                state_b_value[i][action_b[i]] = target_next
            actor.train(state_b, state_b_value)

        if done:
            summary_str = tf.Session().run(
                summary_ops, feed_dict={summary_vars[0]: episode_reward})
            writer.add_summary(summary_str, step)
            writer.flush()

            print("step = ", step, "episode_reward = ", episode_reward)

            state = env.reset()

            episode_reward = 0

            step += 1
Example #5
class Agent:
    def __init__(self, env, sess, LEARNING_RATE_ACTOR, LEARNING_RATE_CRITIC,
                 NET_SIZE, MEMORY_LEN, REWARD_DISCOUNT, BATCH_SIZE, TAU,
                 EXPLORATION_STEPS, VERBOSE, LOG_DIR_TF):
        self.env = env
        self.sess = sess
        self.observation_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.shape[0]
        self.REWARD_DISCOUNT = REWARD_DISCOUNT
        self.TAU = TAU
        self.BATCH_SIZE = BATCH_SIZE
        self.noise_state = np.zeros(self.action_space)
        self.EXPLORATION_STEPS = EXPLORATION_STEPS
        self.VERBOSE = VERBOSE
        self.LOG_DIR_TF = LOG_DIR_TF
        #check if action_space is symmetric
        if all(env.action_space.high == abs(env.action_space.low)):
            action_scale = env.action_space.high
        else:
            raise ActionSpaceNotSymmetricException
        self.actor = Actor(self.sess, self.observation_space,
                           self.action_space, LEARNING_RATE_ACTOR, NET_SIZE,
                           TAU, action_scale)
        self.critic = Critic(self.sess, self.observation_space,
                             self.action_space, LEARNING_RATE_CRITIC, NET_SIZE,
                             TAU)
        actor_network_variables = self.actor.network.get_variables()
        critic_q_net_variables = self.critic.q_net.get_variables()
        self.actor_target_update = self.actor.target_network.update_variables(
            actor_network_variables)
        self.critic_target_update = self.critic.target_q_net.update_variables(
            critic_q_net_variables)
        self.reward_pl = tf.placeholder(tf.float32, [None, 1],
                                        name='Reward_PL')
        self.done_pl = tf.placeholder(tf.bool, [None, 1], name='Done_PL')
        self.labels = tf.where(
            self.done_pl, self.reward_pl, self.reward_pl +
            tf.multiply(self.REWARD_DISCOUNT, self.critic.target_prediction))
        #self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE)
        self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE,
                                          self.observation_space,
                                          self.action_space)
        self.log_reward_pl = tf.placeholder(tf.float32, name='Reward_log_pl')
        self.reward_f = tf.add(0.0, self.log_reward_pl)
        tf.summary.scalar('reward', self.reward_f)
        init = tf.global_variables_initializer()
        self.sess.run(init)
        self.sess.run(self.actor.network.copy_to(self.actor.target_network))
        self.sess.run(self.critic.q_net.copy_to(self.critic.target_q_net))
        self.writer = tf.summary.FileWriter(self.LOG_DIR_TF, self.sess.graph)
        self.merged = tf.summary.merge_all()

    def select_action(self, observation, current_step):
        action = self.actor.predict(observation, self.actor.prediction)
        if current_step <= self.EXPLORATION_STEPS:
            noise = self.noise()
        else:
            noise = 0
        return action + noise

    def noise(self):
        x = self.noise_state
        dx = 0.15 * (0 - x) + 0.2 * np.random.randn(len(x))
        self.noise_state = x + dx
        return self.noise_state

    def calcError(self, observation, new_observation, reward, action):
        """
         Calculates the error that determines the usefulness of a memory.
         High errors are better for training
        Args:
         observation: the old state
         new_observation: the current state
         reward: the reward received
         action: the action that was taken
        Returns:
         error: the difference between prediction and label
        """
        prediction = self.critic.predict(observation, action,
                                         self.critic.prediction)
        label = reward + self.REWARD_DISCOUNT * self.critic.predict(
            new_observation, action, self.critic.target_prediction)
        error = abs(label - prediction)
        return error

    def summarize(self, episode, episode_reward, observation, new_observation,
                  reward, done):
        next_action = self.actor.predict(new_observation,
                                         self.actor.target_prediction)
        feed_dict = {
            self.critic.input_pl: new_observation,
            self.critic.actions_pl: next_action,
            self.reward_pl: [[reward]],
            self.done_pl: [[done]]
        }
        label = self.sess.run(self.labels, feed_dict=feed_dict)
        feed_dict[self.critic.labels_pl] = label
        #sometimes the reward is an array and sometimes a scalar
        if isinstance(episode_reward, np.ndarray):
            episode_reward = max(episode_reward)
        feed_dict[self.log_reward_pl] = episode_reward
        summary = self.sess.run(self.merged, feed_dict=feed_dict)
        self.writer.add_summary(summary, episode)

    def train_with_batch(self, current_step):
        """
         Train the actor and critic networks with a sample batch from the replay memory.
        Args:
         current_step: the current training step, used to decide whether verbose debug output is printed
        """
        observations, actions, rewards, new_observations, dones = self.replay_memory.sample(
        )
        #all of this requires ~3 seconds of computational time
        #improve the Q-Network
        next_actions = self.actor.predict(new_observations,
                                          self.actor.prediction)
        feed_dict = {
            self.critic.input_pl: new_observations,
            self.critic.actions_pl: next_actions,
            self.reward_pl: rewards,
            self.done_pl: dones
        }
        labels = self.sess.run(self.labels, feed_dict=feed_dict)
        self.critic.train(observations, actions, labels)
        actions = self.actor.predict(observations, self.actor.prediction)
        gradients = self.critic.get_gradients(observations, actions)
        #improve the policy with the calculated gradients
        self.actor.train(observations, gradients)
        #Update both target networks
        #requires ~1 second of time
        self.sess.run(self.actor_target_update)
        self.sess.run(self.critic_target_update)
        #Print debug information if verbose
        if current_step % 500 == 0 and self.VERBOSE:
            print("Observations: ", observations)
            print("Predicted Best-Actions: ", actions)
            print("Labels: ", labels)
            print("Gradients: ", gradients)
Example #6
class DDPG_Agent():
    def __init__(self, state_size, action_size, num_agents):
        """
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
        """
        random_seed = 1

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents

        # Replay memory
        self.memory = ReplayBuf(action_size, BUFFER_SIZE, BATCH_SIZE,
                                random_seed)

        # Noise process
        self.noise = Ornstein_Uhlenbeck_Noise(action_size, random_seed)

        # Critic Networks
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Actor Networks
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

    def step(self, states, actions, rewards, next_states, dones):
        """ add an experience in the reply buffer 
        then sample randomly from that buffer to learn (reason behind the random sampling is to break 
        the correlation between sequential experiences)
        """
        # Save experience
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        """Returns actions for given state """
        states = torch.from_numpy(states).float().to(device)

        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                # Populate list of actions one state at a time
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            # We add noise for exploration purposes
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ### Update critic
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Calculate Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(
            self.critic_local.parameters(),
            1)  # adds gradient clipping to stabilize learning
        self.critic_optimizer.step()

        ### Update actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, regular_model, target_model, tau):
        """
            regular_model: the most up-to-date model, as it is the one used for training
            target_model: the more stable model; the regular model's weights are softly copied into it
            tau (float): interpolation parameter
        """
        for target_param, regular_param in zip(target_model.parameters(),
                                               regular_model.parameters()):
            target_param.data.copy_(tau * regular_param.data +
                                    (1.0 - tau) * target_param.data)
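
The ReplayBuf / ReplayBuffer classes constructed by these agents are referenced but never shown. A minimal sketch compatible with the calls made above (add(state, action, reward, next_state, done), sample() returning a tuple of torch tensors, and len(memory)) might look like the following; the namedtuple layout, the device handling and the uniform sampling are assumptions rather than the original implementation.

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store a single transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random batch of transitions as torch tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)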
Example #7
class Agent:
    def __init__(self):
        self.env = snake_env()
        self.state_dim = (self.env.size, self.env.size)
        self.action_dim = self.env.action_space
        self.actor = Actor(self.state_dim, self.action_dim, args.actor_lr)
        self.critic = Critic(self.state_dim, args.critic_lr)
        self.gamma = args.gamma

        if args.load_weights:
            self.actor.model.load_weights(args.load_weights)

        if args.dist_move_reward:
            self.env.set_reward(move_reward='-dist')

        # initialize video system only
        self.env.reset()


#         self.env.render()

    def MC(self, rewards, dones, next_value):
        '''
        Monte Carlo Estimation
        '''
        rewards = rewards.reshape(-1)
        returns = np.append(np.zeros_like(rewards), next_value, axis=-1)
        for t in reversed(range(rewards.shape[0])):
            returns[t] = rewards[t] + self.gamma * returns[t + 1] * (1 -
                                                                     dones[t])

        return returns[:-1].reshape(-1, 1)

    def advantage(self, returns, baselines):
        return returns - baselines

    def list_to_batch(self, _list):
        '''
        convert a list of single batches into a batch of len(_list)
        '''
        batch = _list[0]
        for elem in _list[1:]:
            batch = np.append(batch, elem, axis=0)
        return batch

    def train(self, max_updates=100, batch_size=64):
        episode_reward_list = []
        episode_length_list = []
        snake_length_list = []
        actor_loss = 0
        critic_loss = 0
        for up in tqdm(range(max_updates)):
            state_list = []
            action_list = []
            reward_list = []
            done_list = []
            step_reward_list = []
            step_snake_length = []

            state = self.env.reset()

            for ba in range(batch_size):
                #                 self.env.render()

                # data collection
                probs = tf.nn.softmax(
                    self.actor.model(np.expand_dims(state, 0))[0])
                action = np.random.choice(self.action_dim, p=probs)

                next_state, reward, done, info = self.env.step(action)
                step_reward_list.append(reward)
                step_snake_length.append(info['length'])

                if done:
                    # the end of an episode
                    episode_length_list.append(len(step_reward_list))
                    episode_reward_list.append(
                        sum(step_reward_list) / len(step_reward_list))
                    snake_length_list.append(
                        sum(step_snake_length) / len(step_snake_length))

                    n_episode = len(episode_reward_list)
                    if n_episode % args.log_interval == 0:
                        print(
                            f'\nEpisode: {n_episode}, Avg Reward: {episode_reward_list[-1]}'
                        )

                    step_reward_list = []
                    step_snake_length = []  # also reset the per-episode snake-length stats
                    next_state = self.env.reset()

                    if max(episode_reward_list) == episode_reward_list[-1]:
                        self.actor.model.save_weights(args.save_weights)

                # make single batches
                state = np.expand_dims(state, 0)
                action = np.expand_dims(action, (0, 1))
                reward = np.expand_dims(reward, (0, 1))
                done = np.expand_dims(done, (0, 1))

                state_list.append(state)
                action_list.append(action)
                reward_list.append(reward)
                done_list.append(done)

                state = next_state

            # update the batch at once
            # convert list of batches into a batch of len(list)
            states = self.list_to_batch(state_list)
            actions = self.list_to_batch(action_list)
            rewards = self.list_to_batch(reward_list)
            dones = self.list_to_batch(done_list)

            next_value = self.critic.model(np.expand_dims(state, 0))[0]
            # using state, but actually it's next_state from the end of the loop above

            returns = self.MC(rewards, dones, next_value)

            advantages = self.advantage(returns,
                                        self.critic.model.predict(states))

            actor_loss = self.actor.train(states, actions, advantages)
            critic_loss = self.critic.train(states, returns)

        # save figure
        mean_n = 100
        n_episode = len(episode_reward_list)

        episode_reward_list = [
            sum(episode_reward_list[l:l + mean_n]) / mean_n
            for l in range(0, n_episode, mean_n)
        ]
        episode_length_list = [
            sum(episode_length_list[l:l + mean_n]) / mean_n
            for l in range(0, n_episode, mean_n)
        ]
        snake_length_list = [
            sum(snake_length_list[l:l + mean_n]) / mean_n
            for l in range(0, n_episode, mean_n)
        ]

        x = np.linspace(0, n_episode, len(episode_reward_list))

        plt.plot(x, episode_reward_list, label='Mean 100-Episode Reward')
        plt.plot(x, snake_length_list, label='Mean 100-Episode Snake Length')
        plt.plot(x,
                 episode_length_list,
                 label='Mean 100-Episode Episode Length')
        plt.legend()
        plt.xlabel('Episode')
        plt.title('A2C-snake_env')
        plt.savefig(args.save_figure)
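
The MC method above computes discounted returns by sweeping backwards through the collected batch and bootstrapping from the critic's value of the final state. A small standalone illustration of the same recursion (the reward values and gamma = 0.99 are made up for the example):

import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0, 2.0])   # three collected steps
dones = np.array([0.0, 0.0, 0.0])     # no terminal step in this slice
next_value = np.array([5.0])          # critic's value estimate for the state after the last step

# the same backward recursion used in Agent.MC
returns = np.append(np.zeros_like(rewards), next_value, axis=-1)
for t in reversed(range(rewards.shape[0])):
    returns[t] = rewards[t] + gamma * returns[t + 1] * (1 - dones[t])

print(returns[:-1])   # ≈ [7.8117, 6.8805, 6.95]: each entry is reward + gamma * (return of the next step)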
Example #8
        env.render()

        action, action_matrix = actor.predict(state)

        next_state, reward, done, info = env.step(action)

        replayMemory.add(state, action_matrix, reward, done, next_state)

        state = next_state

        episode_reward += reward

        if done:
            state_b, action_matrix_b, transform_reward_b, done_b, next_state_b = replayMemory.miniAllAfterTransform()

            actor.train(state_b, transform_reward_b, action_matrix_b)

            summary_str = tf.Session().run(summary_ops, feed_dict={summary_vars[0]: episode_reward})
            writer.add_summary(summary_str, step)
            writer.flush()

            print("step = ", step)


            state = env.reset()

            replayMemory.clear()

            episode_reward = 0

            step += 1
Example #9
class Agent:
    """
    Interacts with and learns from the environment.
    """
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """
        Initialize an Agent

        Params
        ======
            state_size (int): state dimension
            action_size (int): action dimension
            num_agents (int): simultaneous running agents
            random_seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(random_seed)

        # Actor Network and its target network
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network and its target network
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise object
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   EXPERIENCES_PER_SAMPLING, device,
                                   random_seed)

        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

    def step(self, state, action, reward, next_state, done):
        """
        Save experience in replay memory, and use prioritized sample from buffer to learn.
        """

        # Save memory
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn every UPDATE_NN_EVERY time steps.
        self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY
        self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY
        self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY

        if self.t_step_mem_par == 0:
            self.memory.update_parameters()
        if self.t_step_nn == 0:
            # Learn from memory if enough samples exist
            if self.memory.experience_count > EXPERIENCES_PER_SAMPLING:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

        if self.t_step_mem == 0:
            self.memory.update_memory_sampling()

    def act(self, states, add_noise=True):
        """
        Returns actions for given state as per current policy.
        """
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[i, :] = action

        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices = experiences

        # update Critic
        # Get next predicted state, actions, and Q values
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current state
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute Critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # Update priorities
        delta = abs(Q_targets - Q_expected).detach().cpu().numpy()  # move to CPU before converting to NumPy for the priority update
        self.memory.update_priorities(delta, indices)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """

        for target_model_param, local_model_param in zip(
                target_model.parameters(), local_model.parameters()):
            target_model_param.data.copy_(tau * local_model_param.data +
                                          (1. - tau) * target_model_param.data)
Example #10
                      for r in rewards]  # make dimension match target_q_values
    # print("Rewards: ", rewards)
    # print("Target q values: ", target_q_values)
    y_t = rewards_vector + GAMMA * target_q_values
    loss = critic.model.train_on_batch([states, actions], y_t)
    losses.append(loss)
    loss_writer.writerow({
        'episode': episode,
        'avg_reward': r_t,
        'critic_loss': loss
    })  # record losses

    # Update actor
    a_for_grad = actor.model.predict(states)
    grads = critic.gradients(states, a_for_grad)
    actor.train(states, grads)

    # Update target networks
    actor.update_actor_target()
    critic.update_critic_target()

    # Gradually decrease exploration
    epsilon *= EPSILON_DECAY

    # Print to terminal
    print("Episode: ", episode)
    print("Epsilon: ", epsilon)
    # print("S_t", s_t)
    print("Defender mu_sigma (a_t): ", a_t)
    # print("Defender locations list: ", def_coords_list)
    print("Defender average coords (row, col): ", def_avg_coords)
Example #11
class TD3:
    def __init__(self, n_features, action_bounds):
        self.n_features = n_features
        self.action_bounds = action_bounds

        self.eval_actor_net = Actor(n_features, action_bounds)
        self.load_weights(self.eval_actor_net)
        self.eval_actor_net.train()
        self.target_actor_net = copy.deepcopy(self.eval_actor_net)
        self.target_actor_net.eval()

        self.eval_critic_net1 = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net1)
        self.eval_critic_net1.train()

        self.eval_critic_net2 = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net2)
        self.eval_critic_net2.train()

        self.target_critic_net1 = copy.deepcopy(self.eval_critic_net1)
        self.target_critic_net1.eval()
        self.target_critic_net2 = copy.deepcopy(self.eval_critic_net2)
        self.target_critic_net2.eval()

        self.memory = Memory(Config.MEMORY_CAPACITY)
        self.batch_size = Config.BATCH_SIZE
        self.tau = Config.REPLACEMENT_SOFT_TAU

        # we need a good teacher, so the teacher should learn faster than the actor
        self.optimizer_actor = torch.optim.Adam(
            self.eval_actor_net.parameters(), Config.LR_ACTOR, (0.9, 0.99))
        # itertools.chain(self.encoder.parameters(), self.decoder.parameters())
        # self.optimizer_critic = \
        #     torch.optim.Adam([{'params': self.eval_critic_net1.parameters()},
        #                       {'params': self.eval_critic_net2.parameters()}], Config.LR_CRITIC, (0.9, 0.99))
        self.optimizer_critic1 = \
            torch.optim.Adam(self.eval_critic_net1.parameters(), Config.LR_CRITIC, (0.9, 0.99))
        self.optimizer_critic2 = \
            torch.optim.Adam(self.eval_critic_net2.parameters(), Config.LR_CRITIC, (0.9, 0.99))

        self.gamma = Config.REWARD_DECAY
        self.policy_noise_clip = Config.POLICY_NOISE_CLIP
        self.policy_delay = Config.DELAY_POLICY_UPDATE_ITER
        self.learn_iter = 0

    def load_weights(self, net):
        # parameter names from net.state_dict() look like 'layers.1.weight'
        for m in net.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 1)
                nn.init.constant_(m.bias, 0.1)

    def store_transition(self, s, a, r, s_):
        self.memory.store([s, a, r, s_])

    def chose_action(self, s):
        s = torch.Tensor(np.expand_dims(s, axis=0))
        action = self.eval_actor_net(s).detach().squeeze(dim=0)
        return action

    def learn(self):
        self.learn_iter += 1
        # for x in self.Actor_target.state_dict().keys():
        #     eval('self.Actor_target.' + x + '.data.mul_((1-TAU))')
        #     eval('self.Actor_target.' + x + '.data.add_(TAU*self.Actor_eval.' + x + '.data)')
        # for x in self.Critic_target.state_dict().keys():
        #     eval('self.Critic_target.' + x + '.data.mul_((1-TAU))')
        #     eval('self.Critic_target.' + x + '.data.add_(TAU*self.Critic_eval.' + x + '.data)')

        # for target_param, param in zip(net_target.parameters(), net.parameters()):
        #     target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
        # for k, v in self.eval_critic_net.state_dict().items():
        #     self.target_critic_net.state_dict()[k].copy_(self.tau * v + (1-self.tau) * self.target_critic_net.state_dict()[k])
        # for k, v in self.eval_actor_net.state_dict().items():
        #     self.target_actor_net.state_dict()[k].copy_(self.tau * v + (1-self.tau) * self.target_actor_net.state_dict()[k])

        batch_data = self.memory.sample(self.batch_size)
        s0, a0, r1, s1 = zip(*batch_data)
        s0 = torch.tensor(s0, dtype=torch.float)
        a0 = torch.tensor(a0, dtype=torch.float).view(self.batch_size,
                                                      len(self.action_bounds))
        r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1)
        s1 = torch.tensor(s1, dtype=torch.float)

        # Select action according to policy and add clipped noise

        # Input (s, a), output q
        q_s0_a0_1 = self.eval_critic_net1(s0, a0)
        q_s0_a0_2 = self.eval_critic_net2(s0, a0)
        # Input (s_, a_), output q_ for q_target
        # compute the next action a_ from the target actor
        noise = (torch.randn_like(a0) * self.policy_noise_clip * 2).clamp(
            -self.policy_noise_clip, self.policy_noise_clip)
        a1 = self.target_actor_net(s1).detach() + noise
        action_bound = self.action_bounds.expand_as(a1)
        a1[a1 < -action_bound] = -action_bound[a1 < -action_bound]
        a1[a1 > action_bound] = action_bound[a1 > action_bound]

        q_s1_a1_1 = self.target_critic_net1(s1, a1).detach()
        q_s1_a1_2 = self.target_critic_net2(s1, a1).detach()
        q_s1_a1 = torch.min(q_s1_a1_1, q_s1_a1_2)
        q_target = r1 + self.gamma * q_s1_a1

        loss_critic = nn.MSELoss()(q_s0_a0_1, q_target) + nn.MSELoss()(
            q_s0_a0_2, q_target)

        # critic update:
        # td_error = R + GAMMA * critic_target(s_, actor_target(s_)) - critic_eval(s, a); this updates critic_eval,
        # where a is the action stored in memory, so that the critic's Q estimate moves towards Q_target and the evaluation becomes more accurate
        # loss = (Q(st, at) - (rt + gamma * Q'(st+1, u'(st+1))))**2
        self.optimizer_critic1.zero_grad()
        self.optimizer_critic2.zero_grad()
        loss_critic.backward()
        self.optimizer_critic1.step()
        self.optimizer_critic2.step()
        loss_actor = 0
        # actor update
        # https://zhuanlan.zhihu.com/p/84321382
        # Delayed policy updates
        if self.learn_iter % self.policy_delay == 0:
            actor_a = self.eval_actor_net(s0)
            critic_q = self.eval_critic_net1(s0, actor_a)
            # loss = -q = -critic_eval(s, actor_eval(s)); this updates actor_eval.  actor_eval(s) = a, actor_eval(s_) = a_
            # if a is a correct action, its Q value should be closer to 0
            loss_actor = -torch.mean(critic_q)

            self.optimizer_actor.zero_grad()
            loss_actor.backward()
            self.optimizer_actor.step()
            # Update the frozen target models
            for param, target_param in zip(
                    self.eval_critic_net1.parameters(),
                    self.target_critic_net1.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
            for param, target_param in zip(
                    self.eval_critic_net2.parameters(),
                    self.target_critic_net2.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.eval_actor_net.parameters(),
                                           self.target_actor_net.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

        return loss_critic, loss_actor

    def draw_curve(self, loss):
        x = np.arange(1, len(loss) + 1)
        plt.title("cost curve")
        plt.xlabel("train step")
        plt.ylabel("cost")
        plt.plot(x, loss)
        plt.show()
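
A hedged sketch of how this TD3 class might be driven, assuming a Gym-style environment, a Config module providing the constants it reads, and that Actor/Critic accept the (n_features, action_bounds) signature used above; the environment name, episode structure and step counts are placeholders.

import gym
import torch

# hypothetical continuous-control task; Config is assumed to provide MEMORY_CAPACITY, BATCH_SIZE, etc.
env = gym.make("Pendulum-v1")
n_features = env.observation_space.shape[0]
action_bounds = torch.tensor(env.action_space.high, dtype=torch.float)

agent = TD3(n_features, action_bounds)
critic_losses = []
total_steps = 0

for episode in range(100):
    s = env.reset()
    for t in range(200):
        # chose_action returns a detached tensor; the class itself adds no exploration noise
        a = agent.chose_action(s).numpy()
        s_, r, done, info = env.step(a)
        agent.store_transition(s, a, r, s_)
        total_steps += 1
        if total_steps >= agent.batch_size:   # start learning once a full batch has been stored
            loss_critic, _ = agent.learn()
            critic_losses.append(float(loss_critic))
        s = s_
        if done:
            break

agent.draw_curve(critic_losses)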
Example #12
class Agent:
    def __init__(self, env, sess):
        # Environment
        self.n_state = env.observation_space.shape[0]
        self.n_action = env.action_space.shape[0]

        # Neural Networks
        self.sess = sess
        self.actor = Actor(self.sess, self.n_state, self.n_action)
        self.critic = Critic(self.sess, self.n_state, self.n_action)

        # Replay Buffer
        self.replay_buffer = ReplayBuffer(BUFFER_SIZE)
        # Ornstein-Uhlenbeck Noise
        self.exploration_noise = OUNoise(self.n_action)

    def noise_action(self, state):
        '''Get action with noise'''
        return self.action(state) + self.exploration_noise.noise()

    def action(self, state):
        '''Get action from online actor'''
        return self.actor.action(state)

    def train(self):
        '''Train Networks'''
        # Draw sample from Replay Buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([d[0] for d in minibatch])
        action_batch = np.asarray([d[1] for d in minibatch])
        reward_batch = np.asarray([d[2] for d in minibatch])
        next_state_batch = np.asarray([d[3] for d in minibatch])
        done_batch = np.asarray([d[4] for d in minibatch])

        # Train Critic
        next_action_batch = self.actor.target_actions(next_state_batch)
        target_q_value_batch = self.critic.target_q(next_state_batch,
                                                    next_action_batch)
        # q = r if done else r+gamma*target_q
        q_batch = reward_batch.reshape(
            (BATCH_SIZE, 1)) + (1. - done_batch.reshape(
                BATCH_SIZE, 1).astype(float)) * GAMMA * target_q_value_batch
        self.critic.train(q_batch, state_batch, action_batch)

        # Train Actor
        action_batch_grads = self.actor.actions(state_batch)
        q_grads_batch = self.critic.gradients(state_batch, action_batch_grads)
        self.actor.train(q_grads_batch, state_batch)

        # Slowly update Target Networks
        self.actor.update_target()
        self.critic.update_target()

    def perceive(self, state, action, reward, next_state, done):
        '''Add a transition to the replay buffer and train once there is a sufficient number of transitions'''
        # Add samples
        self.replay_buffer.add(state, action, reward, next_state, done)
        # Train if there are sufficient number of samples
        if self.replay_buffer.count() > REPLAY_START:
            self.train()
        # Reset the noise for next episode
        if done:
            self.exploration_noise.reset()
Example #13
class Actor_Critic:
    def __init__(self, n_features, action_bounds):
        self.n_features = n_features
        self.action_bounds = action_bounds

        self.eval_actor_net = Actor(n_features, action_bounds)
        self.load_weights(self.eval_actor_net)
        self.eval_actor_net.train()
        self.target_actor_net = Actor(n_features, action_bounds)
        self.target_actor_net.eval()
        self.eval_critic_net = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net)
        self.eval_critic_net.train()
        self.target_critic_net = Critic(n_features, action_bounds)
        self.target_critic_net.eval()

        self.memory = Memory(Config.MEMORY_CAPACITY)
        self.batch_size = Config.BATCH_SIZE
        self.tau = Config.REPLACEMENT_SOFT_TAU

        # we need a good teacher, so the teacher should learn faster than the actor
        self.optimizer_actor = torch.optim.Adam(self.eval_actor_net.parameters(), Config.LR_ACTOR, (0.9, 0.99))
        self.optimizer_critic = torch.optim.Adam(self.eval_critic_net.parameters(), Config.LR_CRITIC, (0.9, 0.99))
        self.gamma = Config.REWARD_DECAY

    def load_weights(self, net):
        # parameter names from net.state_dict() look like 'layers.1.weight'
        for m in net.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 1)
                nn.init.constant_(m.bias, 0.1)

    def store_transition(self, s, a, r, s_):
        self.memory.store([s, a, r, s_])

    def chose_action(self, s):
        s = torch.Tensor(np.expand_dims(s, axis=0))
        action = self.eval_actor_net(s).detach().squeeze(dim=0)
        return action

    def learn(self):
        # for x in self.Actor_target.state_dict().keys():
        #     eval('self.Actor_target.' + x + '.data.mul_((1-TAU))')
        #     eval('self.Actor_target.' + x + '.data.add_(TAU*self.Actor_eval.' + x + '.data)')
        # for x in self.Critic_target.state_dict().keys():
        #     eval('self.Critic_target.' + x + '.data.mul_((1-TAU))')
        #     eval('self.Critic_target.' + x + '.data.add_(TAU*self.Critic_eval.' + x + '.data)')

        # for target_param, param in zip(net_target.parameters(), net.parameters()):
        #     target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
        for k, v in self.eval_critic_net.state_dict().items():
            self.target_critic_net.state_dict()[k].copy_(self.tau * v + (1-self.tau) * self.target_critic_net.state_dict()[k])
        for k, v in self.eval_actor_net.state_dict().items():
            self.target_actor_net.state_dict()[k].copy_(self.tau * v + (1-self.tau) * self.target_actor_net.state_dict()[k])

        batch_data = self.memory.sample(self.batch_size)
        s0, a0, r1, s1 = zip(*batch_data)
        s0 = torch.tensor(s0, dtype=torch.float)
        a0 = torch.tensor(a0, dtype=torch.float).view(self.batch_size, len(self.action_bounds))
        r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1)
        s1 = torch.tensor(s1, dtype=torch.float)

        # Input (s, a), output q
        q_s0_a0 = self.eval_critic_net(s0, a0)
        # Input (s_, a_), output q_ for q_target
        # compute the next action a_ from the target actor
        a1 = self.target_actor_net(s1).detach()
        q_s1_a1 = self.target_critic_net(s1, a1).detach()
        q_target = r1 + self.gamma * q_s1_a1
        loss_critic = nn.MSELoss()(q_s0_a0, q_target)

        # critic update:
        # td_error = R + GAMMA * critic_target(s_, actor_target(s_)) - critic_eval(s, a); this updates critic_eval,
        # where a is the action stored in memory, so that the critic's Q estimate moves towards Q_target and the evaluation becomes more accurate
        # loss = (Q(st, at) - (rt + gamma * Q'(st+1, u'(st+1))))**2
        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()

        # actor update
        # https://zhuanlan.zhihu.com/p/84321382
        actor_a = self.eval_actor_net(s0)
        critic_q = self.eval_critic_net(s0, actor_a)
        # loss = -q = -critic_eval(s, actor_eval(s)); this updates actor_eval.  actor_eval(s) = a, actor_eval(s_) = a_
        # if a is a correct action, its Q value should be closer to 0
        loss_actor = -torch.mean(critic_q)

        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()
        return loss_critic, loss_actor

    def draw_curve(self, loss):
        x = np.arange(1, len(loss)+1)
        plt.title("cost curve")
        plt.xlabel("train step")
        plt.ylabel("cost")
        plt.plot(x, loss)
        plt.show()
Example #14
class DDPG_Agent():
    def __init__(self, state_size, action_size, num_agents):
        """
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
        """
        random_seed = 10
        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed
        random.seed(random_seed)
        self.num_agents = num_agents

        # Replay memory
        self.memory = ReplayBuf(action_size, BUFFER_SIZE, BATCH_SIZE,
                                self.random_seed)

        # Actor Networks
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Make sure the Actor Target Network has the same weight values as the Local Network
        for target, local in zip(self.actor_target.parameters(),
                                 self.actor_local.parameters()):
            target.data.copy_(local.data)

        # Critic Network (w/ Target Network)

        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)
        """
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
        """

        # Make sure the Critic Target Network has the same weight values as the Local Network
        for target, local in zip(self.critic_target.parameters(),
                                 self.critic_local.parameters()):
            target.data.copy_(local.data)

        self.noise = Ornstein_Uhlenbeck_Noise(action_size, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, noise=0.0):

        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if ADD_NOISE:
            action += self.noise.sample() * noise
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        ### Used only for DDPG (use madddpg.maddpg_learn() for MADDPG)
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #15
File: PPO.py Project: war3gu/gykRL
def train():

    env = gym.make('LunarLander-v2')

    state = env.reset()

    actor = Actor(env.action_space, env.observation_space)

    critic = Critic(env.action_space, env.observation_space)

    actor.load()
    critic.load()

    replayMemory = ReplayMemory()

    summary_ops, summary_vars = build_summaries()

    writer = tf.summary.FileWriter("./log", tf.Session().graph)

    episode_reward = 0

    step = 1

    while True:

        #env.render()

        state1 = state[np.newaxis, :]

        action, action_matrix, prob = actor.predict(state1)

        next_state, reward, done, info = env.step(action)

        replayMemory.add(state, action_matrix, reward, done, next_state, prob)

        state = next_state

        episode_reward += reward

        #train
        if replayMemory.size() % 128 == 0 or done:

            state_b, action_matrix_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniAll(
            )

            reward_b = reward_b[:, np.newaxis]

            c_pre = critic.predict(next_state_b)

            state_pre_value = reward_b + c_pre * 0.7

            state_value = critic.predict(state_b)

            count = 5000 // step

            if count > 500:
                count = 500

            if count < 1:
                count = 1

            count = 10  # overrides the schedule above with a fixed number of update epochs

            for _ in range(count):
                critic.train(state_b, state_pre_value)

            for _ in range(count):
                actor.train(state_b, state_value, state_pre_value,
                            action_matrix_b, prob_b)

            replayMemory.clear()
        ########################

        if done:

            summary_str = tf.Session().run(
                summary_ops, feed_dict={summary_vars[0]: episode_reward})
            writer.add_summary(summary_str, step)
            writer.flush()

            ##print("step = ", step, "episode_reward = ", episode_reward)

            state = env.reset()

            episode_reward = 0

            step += 1

            if step % 25 == 0:
                actor.save()
                critic.save()