Example 1
    def __init__(self,
                 input_dim,
                 num_actions,
                 network_params=None,
                 set_device=None,
                 gradient_clipping_norm=None,
                 reward_to_go=True,
                 learning_rate=0.01,
                 seed=1364):
        self.seed = seed
        # Training parameters
        self.gamma = 0.99
        self.total_steps_so_far = 0
        self.save_model_frequency = 100
        self.learning_rate = learning_rate
        self.latest_learning_rate = learning_rate
        self.gradient_clipping_norm = gradient_clipping_norm

        # Experience Replay Memory
        self.memory_size = 40000
        self.replay_memory = deque([], maxlen=self.memory_size)

        # ----------------------------------------
        # Make the algorithm outputs reproducible
        make_deterministic(seed)
        # ----------------------------------------

        # if gpu is to be used
        if set_device is None:
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
            # self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = torch.device(set_device)

        # Policy gradient parameters
        self.episode_return_batch = torch.Tensor()
        self.actions_batch = torch.Tensor()
        self.log_probs_batch = torch.Tensor()
        self.reward_to_go = reward_to_go
        self.batch_size = 4096  # Batch size for Policy Gradient should be large to reduce variance

        # (Explanation of the network_params in networks/network_builder.py)
        if network_params is None:
            network_params = {
                'input_dim': input_dim,
                'conv_layers': [(3, 16, 5, 2), (16, 32, 5, 2), (32, 32, 5, 2)],
                'dense_layers': [num_actions],
                'conv_bn': True,
                'activation': 'relu'
            }
        self.policy_net = CreateNet(network_params).to(self.device)

        # self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=self.learning_rate)
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.learning_rate,
                                    eps=1e-4)
Example 2
    def run_single_episode(self, env, episode=None):
        # Make each episode deterministic based on total_steps_so_far
        make_deterministic(self.total_steps_so_far, env.env)

        finished = False
        episode_rewards = []
        log_probs = []

        # Create the first state of the episode
        state_1 = env.get_state(episode_start=True)

        while not finished:
            action, log_prob = self.get_action_and_log_prob(state_1)
            # Take the selected action in the environment
            _, reward, finished, _ = env.env.step(action)

            episode_rewards.append(reward)
            log_probs.append(log_prob)

            # If not finished, get the next state from the environment
            if not finished:
                state_1 = env.get_state()

            # One more environment step has been taken
            self.total_steps_so_far += 1

            # If the agent has received a satisfactory episode reward, stop it.
            if sum(episode_rewards) >= env.score_required_to_win:
                finished = True

        ep_len = len(episode_rewards)

        # Computing episode return for all time points in the episode (G_t)
        # input: vector [x0, x1, x2], output: [x0 + discount * x1 + (discount ^ 2) * x2,
        #                                      x1 + discount * x2,
        #                                      x2]
        if self.reward_to_go:
            episode_return = torch.tensor([
                sum(episode_rewards[i:] * (self.gamma**np.arange(ep_len - i)))
                for i in range(ep_len)
            ])
        else:
            episode_return = torch.ones(ep_len) * sum(episode_rewards)

        self.episode_return_batch = torch.cat(
            [self.episode_return_batch, episode_return])
        self.log_probs_batch = torch.cat(
            [self.log_probs_batch, torch.cat(log_probs)])

        # Policy Network optimisation:
        # ----------------------------
        if len(self.episode_return_batch) >= self.batch_size:
            _ = self.learning_step()

        # Return the total rewards collected within this single episode run
        return episode_rewards
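A note on the reward-to-go branch above: it computes the discounted return
G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ... for every time step of the
episode. Below is a minimal standalone sketch of the same computation using a
single reverse pass instead of one sum per index (the function name is
illustrative, not part of the project):

import numpy as np

def discounted_returns(rewards, gamma=0.99):
    # G_t = r_t + gamma * G_{t+1}, accumulated from the last step backwards
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# Example: discounted_returns([1, 1, 1], gamma=0.5) -> [1.75, 1.5, 1.0]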
Example 3
File: dqn.py Project: monaj07/ml
    def run_single_episode(self, env, episode):
        # Make each episode deterministic based on total_steps_so_far
        make_deterministic(self.total_steps_so_far, env.env)

        finished = False
        episode_rewards = []
        episode_losses = []

        # Create the first state of the episode
        state_1 = env.get_state(episode_start=True)

        while not finished:
            action_1 = self.get_action(env, state_1)
            # Take the selected action in the environment
            s2, reward_1, finished, _ = env.env.step(action_1)

            # when episode is finished, state_2 does not matter,
            # and won't contribute to the optimisation
            # (because state_1 was the last state of the episode)
            state_2 = (0 * state_1) if finished else env.get_state()

            # Add the current transition (s, a, r, s', done) to the replay memory
            self.add_experience_to_replay_memory(state_1, action_1, reward_1,
                                                 state_2, finished)

            # Policy Network optimisation:
            # ----------------------------
            # If there are enough sample transitions inside the replay_memory,
            # then we can start training our policy network using them;
            # Otherwise we move on to the next state of the episode.
            if len(self.replay_memory) >= self.batch_size:
                # Take a random sample minibatch from the replay memory
                minibatch = self.sample_from_replay_memory(self.batch_size)
                # Compute the TD loss over the minibatch
                loss = self.learning_step(minibatch)
                # Track the value of the loss (for debugging purposes)
                episode_losses.append(loss.item())

            # Go to the next step of the episode
            state_1 = state_2
            # Record the reward collected at this step
            episode_rewards.append(reward_1)
            # One more environment step has been taken
            self.total_steps_so_far += 1

            # If the agent has received a satisfactory episode reward, stop it.
            if sum(episode_rewards) >= env.score_required_to_win:
                finished = True

        if (episode % self.target_network_update) == 0:
            # Update the target network with the latest policy network parameters
            self.target_net.load_state_dict(self.policy_net.state_dict())

        # Return the total rewards collected within this single episode run
        return episode_rewards
Example 4
    def __init__(self, epsilon_decay=0.005, seed=1364):

        # Epsilon parameters
        self.current_epsilon = 0.99
        self.epsilon_start = 0.99
        self.epsilon_end = 0.05
        self.epsilon_decay = epsilon_decay

        # ----------------------------------------
        # Make the algorithm outputs reproducible
        make_deterministic(seed)
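This explorer only stores the epsilon schedule parameters; the decay rule itself is
applied elsewhere in the project and is not shown here. A common schedule that fits
these parameters is an exponential interpolation from epsilon_start towards
epsilon_end, sketched below as an assumption rather than the project's exact rule:

import math

def epsilon_at(step, epsilon_start=0.99, epsilon_end=0.05, epsilon_decay=0.005):
    # Starts at epsilon_start and decays exponentially towards epsilon_end
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-epsilon_decay * step)

# epsilon_at(0) == 0.99; epsilon_at(1000) is already close to 0.05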
Example 5
    def __init__(self, seed=1364):

        # Define the environment
        self.env = gym.make('CartPole-v0').unwrapped
        # ----------------------------------------
        # Make the algorithm outputs reproducible
        make_deterministic(seed, self.env)
        # ----------------------------------------
        self.env.reset()

        # Get number of actions from gym action space
        self.num_actions = self.env.action_space.n
        # Get the observation space size
        self.input_dim = self.env.state.size
        self.score_required_to_win = 200
        self.average_score_required_to_win = self.env.spec.reward_threshold
Example 6
    def __init__(self, seed=1364):

        # Define the environment
        self.env = gym.make('CartPole-v0').unwrapped
        # ----------------------------------------
        # Make the algorithm outputs reproducible
        make_deterministic(seed, self.env)
        # ----------------------------------------
        self.env.reset()

        # Get number of actions from gym action space
        self.num_actions = self.env.action_space.n

        # Get the screen size so that we can initialise the Q-network layers correctly,
        # based on the shape returned by OpenAI Gym. Typical dimensions at this point are
        # close to 3x40x90, the result of a cropped and down-scaled render buffer in
        # get_screen(). The output of get_screen() is a torch frame of shape (B, C, H, W).
        _, _, screen_height, screen_width = self.get_screen().shape
        self.input_dim = (screen_height, screen_width)
        self.score_required_to_win = 200
        self.average_score_required_to_win = self.env.spec.reward_threshold
Example 7
    def run_single_episode(self,
                           env,
                           episode,
                           number_of_learning_iterations_in_one_step=1):
        # Make each episode deterministic based on total_steps_so_far
        make_deterministic(self.total_steps_so_far, env.env)

        finished = False
        episode_rewards = []
        episode_losses = []

        # Create the first state of the episode
        state_1 = env.get_state(episode_start=True)

        while not finished:
            env.env.render(mode='rgb_array')
            action_1 = self.get_action(env, state_1)
            # Take the selected action in the environment
            s2, reward_1, finished, _ = env.env.step(action_1)

            # when episode is finished, state_2 does not matter,
            # and won't contribute to the optimisation
            # (because state_1 was the last state of the episode)
            state_2 = (0 * state_1) if finished else env.get_state()

            # Add the current transition (s, a, r, s', done) to the replay memory
            self.add_experience_to_replay_memory(state_1, action_1, reward_1,
                                                 state_2, finished)

            # Policy Network optimisation:
            # ----------------------------
            # If there are enough sample transitions inside the replay_memory,
            # then we can start training our policy network using them;
            # Otherwise we move on to the next state of the episode.
            if len(self.replay_memory) >= self.batch_size:
                if self.total_steps_so_far % self.steps_between_learning_steps == 0:
                    for _ in range(number_of_learning_iterations_in_one_step):
                        # Take a random sample minibatch from the replay memory
                        minibatch = self.sample_from_replay_memory(
                            self.batch_size)

                        # Compute the TD loss over the minibatch
                        _, _ = self.learning_step(minibatch)

                        # Track the value of the loss (for debugging purposes)
                        # episode_losses.append(loss.item())

                        # Update the target networks (Polyak averaging;
                        # see the sketch after this example)
                        self.soft_update_target_networks()

            # Go to the next step of the episode
            state_1 = state_2
            # Record the reward collected at this step
            episode_rewards.append(reward_1)
            # One more environment step has been taken
            self.total_steps_so_far += 1

            # If the agent has received a satisfactory episode reward, stop it.
            if sum(episode_rewards) >= env.score_required_to_win:
                finished = True

            # If the episode takes longer than 'max_episode_length', terminate it.
            if len(episode_rewards) > self.max_episode_length:
                break

            # print(f"episode: {episode}, reward: {reward_1}, action_1: {action_1}")

        # Return the total rewards collected within this single episode run
        return episode_rewards
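After each learning step this agent calls soft_update_target_networks(), i.e. Polyak
averaging: every target parameter is moved a small step towards the corresponding
online parameter, with self.polyac controlling how much of the old target is kept.
A sketch of what such an update typically looks like for PyTorch modules (an
illustration of the technique, not necessarily the project's exact method):

import torch

def soft_update(target_net, source_net, polyac=0.99):
    # target <- polyac * target + (1 - polyac) * source
    with torch.no_grad():
        for target_param, source_param in zip(target_net.parameters(),
                                              source_net.parameters()):
            target_param.data.copy_(polyac * target_param.data +
                                    (1.0 - polyac) * source_param.data)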
Example 8
    def __init__(self,
                 input_dim,
                 action_dimension,
                 set_device=None,
                 gradient_clipping_norm=None,
                 learning_rate_actor=0.01,
                 learning_rate_critic=0.01,
                 actor_noise_scale=0.1,
                 steps_between_learning_steps=1,
                 max_episode_length=2000,
                 polyac=0.99,
                 seed=1364):
        self.seed = seed
        # Training parameters
        self.gamma = 0.99
        self.batch_size = 256
        self.target_network_update = 10
        self.total_steps_so_far = 0
        self.max_episode_length = max_episode_length
        self.save_model_frequency = 100
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.gradient_clipping_norm = gradient_clipping_norm
        self.polyac = polyac
        self.steps_between_learning_steps = steps_between_learning_steps

        # Explorer
        self.actor_noise_scale = actor_noise_scale
        self.noise = OU_Noise(action_dimension, seed, 0, 0.15, 0.25)
        self.noise.reset()

        # Experience Replay Memory
        self.memory_size = 1000000
        self.replay_memory = deque([], maxlen=self.memory_size)

        # ----------------------------------------
        # Make the algorithm outputs reproducible
        make_deterministic(seed)
        # ----------------------------------------

        # if gpu is to be used
        if set_device is None:
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
            # self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = torch.device(set_device)

        # network instantiation
        self.actor_net = ActorDDPG(input_dim, action_dimension).to(self.device)
        self.critic_net = CriticDDPG(input_dim,
                                     action_dimension).to(self.device)
        self.actor_net_target = ActorDDPG(input_dim,
                                          action_dimension).to(self.device)
        self.critic_net_target = CriticDDPG(input_dim,
                                            action_dimension).to(self.device)

        self.actor_net_target.load_state_dict(self.actor_net.state_dict())
        self.actor_net_target.eval()
        self.critic_net_target.load_state_dict(self.critic_net.state_dict())
        self.critic_net_target.eval()

        # self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=self.learning_rate)
        self.optimizer_actor = optim.Adam(self.actor_net.parameters(),
                                          lr=self.learning_rate_actor,
                                          eps=1e-4)
        self.optimizer_critic = optim.Adam(self.critic_net.parameters(),
                                           lr=self.learning_rate_critic,
                                           eps=1e-4)
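The OU_Noise(action_dimension, seed, 0, 0.15, 0.25) call above suggests an
Ornstein-Uhlenbeck process with mu = 0, theta = 0.15 and sigma = 0.25, which
produces temporally correlated exploration noise for the continuous DDPG actions.
A minimal sketch of such a process, assuming that argument order (the project's
own OU_Noise class may differ in its details):

import numpy as np

class OUNoiseSketch:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.25):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # Restart the process from its long-run mean
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the noise drifts back towards mu
        dx = self.theta * (self.mu - self.state) + \
             self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state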
Example 9
File: dqn.py Project: monaj07/ml
    def __init__(self,
                 input_dim,
                 num_actions,
                 network_params=None,
                 explorer=None,
                 set_device=None,
                 gradient_clipping_norm=None,
                 learning_rate=0.01,
                 double_dqn=False,
                 seed=1364):
        self.seed = seed
        # Training parameters
        self.gamma = 0.99
        self.batch_size = 256
        self.target_network_update = 10
        self.total_steps_so_far = 0
        self.save_model_frequency = 100
        self.learning_rate = learning_rate
        self.latest_learning_rate = learning_rate
        self.gradient_clipping_norm = gradient_clipping_norm

        # Explorer
        if explorer is None:
            self.explorer = ActionExplorer(epsilon_decay=0.005, seed=seed)
        else:
            self.explorer = explorer

        # Experience Replay Memory
        self.memory_size = 40000
        self.replay_memory = deque([], maxlen=self.memory_size)

        # ----------------------------------------
        # Make the algorithm outputs reproducible
        make_deterministic(seed)
        # ----------------------------------------

        # if gpu is to be used
        if set_device is None:
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
            # self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = torch.device(set_device)

        # Alternative DQN training (Double DQN)
        self.double_dqn = double_dqn

        # Q-network instantiation
        # (Explanation of the network_params in networks/network_builder.py)
        if network_params is None:
            network_params = {
                'input_dim': input_dim,
                'conv_layers': [(3, 16, 5, 2), (16, 32, 5, 2), (32, 32, 5, 2)],
                'dense_layers': [num_actions],
                'conv_bn': True,
                'activation': 'relu'
            }
        self.policy_net = CreateNet(network_params).to(self.device)
        self.target_net = CreateNet(network_params).to(self.device)

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        # self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=self.learning_rate)
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.learning_rate,
                                    eps=1e-4)
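The double_dqn flag only changes how the TD target is built inside learning_step:
standard DQN lets the target network both select and evaluate the next action, while
Double DQN selects the action with the policy network and evaluates it with the
target network, which reduces overestimation bias. A hedged sketch of that
difference (tensor names are illustrative; dones is assumed to be a float tensor of
0s and 1s):

import torch

def td_targets(policy_net, target_net, next_states, rewards, dones,
               gamma=0.99, double_dqn=False):
    with torch.no_grad():
        if double_dqn:
            # Select the next action with the policy network ...
            next_actions = policy_net(next_states).argmax(dim=1, keepdim=True)
            # ... but evaluate it with the target network
            next_q = target_net(next_states).gather(1, next_actions).squeeze(1)
        else:
            # Select and evaluate the next action with the target network
            next_q = target_net(next_states).max(dim=1).values
        return rewards + gamma * next_q * (1.0 - dones)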