Example #1
    def __init__(self,
                 n_states,
                 n_actions,
                 hidden_dim=90,
                 device="cpu",
                 critic_lr=5e-3,
                 actor_lr=5e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128):
        self.device = device
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.actor = Actor(n_states, n_actions, hidden_dim).to(device)
        self.target_critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.target_actor = Actor(n_states, n_actions, hidden_dim).to(device)

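        # Hard-copy the online network weights into the target networks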
        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.memory = ReplayBuffer(memory_capacity)
        self.batch_size = batch_size
        self.soft_tau = soft_tau
        self.gamma = gamma
Example #2
	def __init__(self, env):
		self.env  = env
		self.num_robots = env.num_robots

		self.learning_rate = 0.0001
		self.epsilon = .9
		self.epsilon_decay = .99995
		self.eps_counter = 0
		self.gamma = .90
		self.tau   = .01


		self.buffer_size = 1000000
		self.batch_size = 512

		self.hyper_parameters_lambda3 = 0.2
		self.hyper_parameters_eps = 0.2
		self.hyper_parameters_eps_d = 0.4

		self.demo_size = 1000
		self.time_str = time.strftime("%Y%m%d-%H%M%S")
		self.parent_dir = HOME + "/catkin_ws/src/Turtlebot3_Pheromone/src/DRLbasedController/weights"
		self.save_dir = HOME + "/catkin_ws/src/Turtlebot3_Pheromone/src/results/trained_weights/exp2/HLERnoisy/" 
		self.path = os.path.join(self.parent_dir, self.time_str)
		os.mkdir(self.path)

		# Replay memory (raw transitions)
		self.memory = deque(maxlen=1000000)
		# HER experience replay buffer
		self.replay_buffer = ExperienceReplayBuffer(total_timesteps=5000*256, type_buffer="HER")
		# File name
		self.file_name = "reward_{}_{}_{}".format(self.time_str, self.num_robots, self.replay_buffer.type_buffer)
		# Hidden Layer list
		self.hid_list = [512, 512, 512]
		# ===================================================================== #
		#                               Actor Model                             #
		# Chain rule: find the gradient of changing the actor network params   #
		# with respect to the critic's value prediction, i.e. de/dA.           #
		# Compute de/dA = de/dC * dC/dA, where e is error, C critic, A actor.  #
		# ===================================================================== #

		self.actor_model = Actor(self.env.observation_space.shape, self.env.action_space.shape, self.hid_list)
		self.target_actor_model = Actor(self.env.observation_space.shape, self.env.action_space.shape, self.hid_list)
		self.actor_optim = optim.Adam(self.actor_model.parameters(), lr=self.learning_rate)

		# ===================================================================== #
		#                              Critic Model                             #
		# ===================================================================== #

		self.critic_model = Critic(self.env.observation_space.shape, self.env.action_space.shape, 1, self.hid_list)
		self.target_critic_model = Critic(self.env.observation_space.shape, self.env.action_space.shape, 1, self.hid_list)
		self.critic_optim = optim.Adam(self.critic_model.parameters(), lr=self.learning_rate)
		

		hard_update(self.target_actor_model, self.actor_model)  # start the target with the same weights
		hard_update(self.target_critic_model, self.critic_model)

		self.cuda()
Example #3
    def __init__(self, env, GAMMA=0.9):
        self.env = env
        print('obs space shape: {}'.format(self.env.observation_space.shape))
        print('action space shape: {}'.format(self.env.action_space.shape))
        self.states_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        print('states dim: {}\t\t actions dim: {}'.format(
            self.states_dim, self.action_dim))
        self.actor = Actor(self.states_dim, self.action_dim, lr=0.0001)
        self.critic = Critic(self.states_dim, self.action_dim, lr=0.0001)
        self.GAMMA = GAMMA
        self.RANDOM_PROB = 0.025
        self.replay_buffer = ReplayBuffer(1280)
Example #4
    def __init__(self, hparams):
        '''
        Initializations
        '''
        super().__init__()
        self.hparams = hparams

        # Position of human
        source_position = torch.tensor([[self.hparams.environment.position.end.x],
                                        [self.hparams.environment.position.end.y],
                                        [self.hparams.environment.position.end.z]]).float()

        # Position of agent
        agent_position  = torch.tensor([[self.hparams.environment.position.start.x],
                                        [self.hparams.environment.position.start.y],
                                        [self.hparams.environment.position.start.z]]).float()


        # Initialize Replay buffer
        self.replay_buffer = ReplayBuffer(capacity = self.hparams.model.replay_buffer_size)


        # Initialize drone
        self.agent = Drone(start_position = agent_position,
                           goal_position = source_position,
                           velocity_factor = self.hparams.environment.agent.velocity_factor,
                           hparams = self.hparams,
                           buffer = self.replay_buffer)

        # Actor networks
        self.net = Actor(**self.hparams.model.actor)
        self.target_net = Actor(**self.hparams.model.actor)

        # Critic networks
        self.critic = Critic(**self.hparams.model.critic)
        self.target_critic = Critic(**self.hparams.model.critic)

        # Hard update
        self.target_net.load_state_dict(self.net.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        self.total_reward = -10000
        self.episode_steps = 0.0
        self.max_episode_steps = self.hparams.model.max_episode
        self.episode_reward = 0.0
        self.populate(self.hparams.model.replay_buffer_size)
Example #5
    def __init__(self, env, env_obs, gamma=0.99, tau=0.001, lr_actor=1e-3,
                 lr_critic=1e-3, weight_decay=0.1, batch_size=64, subpolicies=1,
                 action_shape=2, replay_buffer_size=5000, replay_buffer_type="rb",
                 noise=0.1, noise_decay=0.999, max_action=1, min_action=-1,
                 teacher=False, alpha=0.1, bc=None):

        self.env = env
        self.subpolicies = subpolicies
        self.total_obs = np.sum(env_obs)
        self.weight_decay = weight_decay
        self.env_obs = env_obs
        self.max_action = max_action
        self.min_action = min_action
        self.action_shape = action_shape
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.replay_buffer_type = replay_buffer_type
        self.replay_buffer_size = replay_buffer_size
        self.init_noise = noise
        self.noise = noise
        self.noise_decay = noise_decay
        self.teacher = teacher
        self.bc = bc
        self.alpha = alpha

        self.mul = 1 if self.teacher is False else 2

        self.actors = [[Actor(self.mul * env_obs[agent], action_shape) for i in range(self.subpolicies)] for agent in range(env.n)]
        self.actors_targets = [[Actor(self.mul * env_obs[agent], action_shape) for i in range(self.subpolicies)] for agent in range(env.n)]
        self.critics = [Critic(self.mul * self.total_obs + action_shape * len(env.agents)) for _ in env.agents]
        self.critics_targets = [Critic(self.mul * self.total_obs + action_shape * len(env.agents)) for _ in env.agents]

        self.actors_optimizers = [[torch.optim.RMSprop(self.actors[agent][i].parameters(), lr=lr_actor, weight_decay=weight_decay) for i in range(self.subpolicies)] for agent in range(len(env.agents))]
        self.critics_optimisers = [torch.optim.RMSprop(self.critics[agent].parameters(), lr=lr_critic ,weight_decay=weight_decay) for agent in range(len(env.agents))]

        if self.subpolicies > 1:
            if self.replay_buffer_type == "rb":
                self.replay_buffers = [[ReplayBuffer(self.replay_buffer_size) for _ in range(self.subpolicies)] for _ in range(env.n)]
            else:
                self.replay_buffers = [[PrioritizedReplayBuffer(self.replay_buffer_size) for _ in range(self.subpolicies)] for _ in range(env.n)]
        else:
            if self.replay_buffer_type == "rb":
                self.replay_buffers = ReplayBuffer(self.replay_buffer_size)
            else:
                # Single shared prioritized buffer when only one subpolicy is used
                self.replay_buffers = PrioritizedReplayBuffer(self.replay_buffer_size)
Example #6
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
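        # Ornstein-Uhlenbeck noise: theta pulls the noise back toward mu,
        # sigma scales the random perturbation added at each step.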
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # initialize state
        self.last_state = self.task.reset()
Example #7
class ActorCritic:
    def __init__(self, env):
        self.env = env
        self.num_robots = env.num_robots

        self.learning_rate = 0.0001
        self.epsilon = .9
        self.epsilon_decay = .99995
        self.eps_counter = 0
        self.gamma = .90
        self.tau = .01

        self.buffer_size = 1000000
        self.batch_size = 512

        self.hyper_parameters_lambda3 = 0.2
        self.hyper_parameters_eps = 0.2
        self.hyper_parameters_eps_d = 0.4

        self.demo_size = 1000
        self.time_str = time.strftime("%Y%m%d-%H%M%S")
        self.parent_dir = HOME + "/catkin_ws/src/Turtlebot3_Pheromone/src/DRLbasedController/weights"
        self.path = os.path.join(self.parent_dir, self.time_str)
        os.mkdir(self.path)

        # Replay memory (raw transitions)
        self.memory = deque(maxlen=1000000)
        # HER experience replay buffer
        self.replay_buffer = ExperienceReplayBuffer(total_timesteps=5000 * 256,
                                                    type_buffer="HER")
        # File name
        self.file_name = "reward_{}_{}_{}".format(
            self.time_str, self.num_robots, self.replay_buffer.type_buffer)
        # Hidden Layer list
        self.hid_list = [1024, 512, 512]
        # ===================================================================== #
        #                               Actor Model                             #
        # Chain rule: find the gradient of changing the actor network params   #
        # with respect to the critic's value prediction, i.e. de/dA.           #
        # Compute de/dA = de/dC * dC/dA, where e is error, C critic, A actor.  #
        # ===================================================================== #
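        # In this PyTorch version the chain rule above is handled by autograd:
        # the actor loss in _train_critic_actor is -Q(s, mu(s)), so backward()
        # computes dQ/dA * dA/d(theta_actor) automatically.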

        self.actor_model = Actor(self.env.observation_space.shape,
                                 self.env.action_space.shape, self.hid_list)
        self.target_actor_model = Actor(self.env.observation_space.shape,
                                        self.env.action_space.shape,
                                        self.hid_list)
        self.actor_optim = optim.Adam(self.actor_model.parameters(),
                                      lr=self.learning_rate)

        # ===================================================================== #
        #                              Critic Model                             #
        # ===================================================================== #

        self.critic_model = Critic(self.env.observation_space.shape,
                                   self.env.action_space.shape, 1,
                                   self.hid_list)
        self.target_critic_model = Critic(self.env.observation_space.shape,
                                          self.env.action_space.shape, 1,
                                          self.hid_list)
        self.critic_optim = optim.Adam(self.critic_model.parameters(),
                                       lr=self.learning_rate)

        # Start the target networks with the same weights as the online networks
        hard_update(self.target_actor_model, self.actor_model)
        hard_update(self.target_critic_model, self.critic_model)

        self.cuda()

    # ========================================================================= #
    #                               Model Training                              #
    # ========================================================================= #

    def remember(self, cur_state, action, reward, new_state, done):
        for i in range(self.num_robots):
            self.memory.append(
                [cur_state[i], action[i], reward[i], new_state[i], done[i]])

    def _train_critic_actor(self, samples):

        Loss = nn.MSELoss()

        # 1, sample
        cur_states, actions, rewards, new_states, dones, weights, batch_idxes = stack_samples(
            samples)  # unpack the PER sample; weights/batch_idxes come from the prioritized buffer
        target_actions = to_numpy(
            self.target_actor_model(to_tensor(new_states)))

        # Critic Update
        self.critic_model.zero_grad()
        Q_now = self.critic_model([cur_states, actions])
        next_Q = self.target_critic_model([new_states, target_actions])
        dones = dones.astype(bool)
        Q_target = to_tensor(rewards) + self.gamma * next_Q.reshape(
            next_Q.shape[0]) * to_tensor(1 - dones)

        td_errors = Q_target - Q_now.reshape(Q_now.shape[0])

        # Detach the TD target so gradients only flow through the online critic
        value_loss = Loss(Q_now.squeeze(), Q_target.detach())
        value_loss.backward()
        self.critic_optim.step()

        # Actor Update
        self.actor_model.zero_grad()
        policy_loss = -self.critic_model(
            [to_tensor(cur_states),
             self.actor_model(to_tensor(cur_states))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # NoisyNet noise reset
        self.actor_model.reset_noise()
        self.target_actor_model.reset_noise()

        return td_errors

    def read_Q_values(self, cur_states, actions):
        critic_values = self.critic_model.predict([cur_states, actions])
        return critic_values

    def train(self, t):
        batch_size = self.batch_size
        if len(self.replay_buffer.replay_buffer) < batch_size:  # wait until the PER buffer has enough samples
            return

        samples = self.replay_buffer.replay_buffer.sample(
            batch_size, beta=self.replay_buffer.beta_schedule.value(t))
        (obses_t, actions, rewards, obses_tp1, dones, weights,
         batch_idxes) = samples

        self.samples = samples
        td_errors = self._train_critic_actor(samples)

        # priority updates
        #new_priorities = np.abs(td_errors) + self.replay_buffer.prioritized_replay_eps
        #self.replay_buffer.replay_buffer.update_priorities(batch_idxes, new_priorities)

    # ========================================================================= #
    #                         Target Model Updating                             #
    # ========================================================================= #

    def _update_actor_target(self):
        soft_update(self.target_actor_model, self.actor_model, self.tau)

    def _update_critic_target(self):
        soft_update(self.target_critic_model, self.critic_model, self.tau)

    def update_target(self):
        self._update_actor_target()
        self._update_critic_target()

    # ========================================================================= #
    #                              Model Predictions                            #
    # ========================================================================= #

    def act(self, cur_state):
        # Returns the actor's predicted action plus the current exploration epsilon.
        if self.eps_counter >= self.num_robots:
            self.epsilon *= self.epsilon_decay
            self.eps_counter = 0
        else:
            self.eps_counter += 1
        eps = self.epsilon
        cur_state = np.array(cur_state).reshape(1, 8)
        action = to_numpy(self.actor_model(to_tensor(cur_state))).squeeze(0)
        action = action.reshape(1, 2)
        if np.random.random() < self.epsilon:
            # Exploration: perturb the deterministic action with uniform noise
            action[0][0] += (np.random.random() - 0.5) * 0.4
            action[0][1] += np.random.random() * 0.4
        return action, eps

    # ========================================================================= #
    #                              save weights                                 #
    # ========================================================================= #

    def save_weight(self, num_trials, trial_len):
        torch.save(
            self.actor_model.state_dict(), self.path + '/actormodel' + '-' +
            str(num_trials) + '-' + str(trial_len) + '.pkl')
        torch.save(
            self.critic_model.state_dict(), self.path + '/criticmodel' + '-' +
            str(num_trials) + '-' + str(trial_len) + '.pkl')
        #self.actor_model.save_weights(self.path + 'actormodel' + '-' +  str(num_trials) + '-' + str(trial_len) + '.h5', overwrite=True)
        #self.critic_model.save_weights(self.path + 'criticmodel' + '-' + str(num_trials) + '-' + str(trial_len) + '.h5', overwrite=True)#("criticmodel.h5", overwrite=True)

    # ========================================================================= #
    #                              load weights                                 #
    # ========================================================================= #

    def load_weights(self, output):
        # Both networks are restored from the same checkpoint path here;
        # adjust the file names if the actor and critic were saved separately.
        self.actor_model.load_state_dict(torch.load('{}.pkl'.format(output)))
        self.critic_model.load_state_dict(torch.load('{}.pkl'.format(output)))

    def play(self, cur_state):
        return to_numpy(self.actor_model(to_tensor(cur_state),
                                         volatile=True)).squeeze(0)

    def cuda(self):
        self.actor_model.cuda()
        self.target_actor_model.cuda()
        self.critic_model.cuda()
        self.target_critic_model.cuda()
Example #8
class AgentTrainer(pl.LightningModule):
    '''
    PyTorch Lightning trainer module for drone reinforcement learning
    '''

    def __init__(self, hparams):
        '''
        Initializations
        '''
        super().__init__()
        self.hparams = hparams

        # Position of human
        source_position = torch.tensor([[self.hparams.environment.position.end.x],
                                        [self.hparams.environment.position.end.y],
                                        [self.hparams.environment.position.end.z]]).float()

        # Position of agent
        agent_position  = torch.tensor([[self.hparams.environment.position.start.x],
                                        [self.hparams.environment.position.start.y],
                                        [self.hparams.environment.position.start.z]]).float()


        # Initialize Replay buffer
        self.replay_buffer = ReplayBuffer(capacity = self.hparams.model.replay_buffer_size)


        # Initialize drone
        self.agent = Drone(start_position = agent_position,
                           goal_position = source_position,
                           velocity_factor = self.hparams.environment.agent.velocity_factor,
                           hparams = self.hparams,
                           buffer = self.replay_buffer)

        # Actor networks
        self.net = Actor(**self.hparams.model.actor)
        self.target_net = Actor(**self.hparams.model.actor)

        # Critic networks
        self.critic = Critic(**self.hparams.model.critic)
        self.target_critic = Critic(**self.hparams.model.critic)

        # Hard update
        self.target_net.load_state_dict(self.net.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        self.total_reward = -10000
        self.episode_steps = 0.0
        self.max_episode_steps = self.hparams.model.max_episode
        self.episode_reward = 0.0
        self.populate(self.hparams.model.replay_buffer_size)


    def soft_update(self, target, source, tau):
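        # Polyak averaging: theta_target <- (1 - tau) * theta_target + tau * theta_source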
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - tau) + param.data * tau
            )

    def configure_optimizers(self):

        optimizer2 = getattr(torch.optim, self.hparams.optimizer.type)([{"params": self.net.parameters(), "lr": self.hparams.optimizer.args.lr}], **self.hparams.optimizer.args)
        optimizer = getattr(torch.optim, self.hparams.optimizer.type)(self.critic.parameters(), **self.hparams.optimizer.args, weight_decay=1e-3)

        scheduler2 = getattr(torch.optim.lr_scheduler, self.hparams.scheduler.type)(optimizer2, **self.hparams.scheduler.args)
        scheduler = getattr(torch.optim.lr_scheduler, self.hparams.scheduler.type)(optimizer, **self.hparams.scheduler.args)

        return [optimizer, optimizer2], [scheduler, scheduler2]

    def dqn_mse_loss(self, batch) -> dict:
        """
        Calculates the critic (MSE) and policy losses from a mini batch of replay data
        Args:
            batch: current mini batch of replay data
        Returns:
            dict with the critic loss ("loss") and the policy loss ("policy_loss")
        """
        states, actions, rewards, dones, next_states = batch
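        # Note: the TD target below is Q(s', mu(s')) * gamma + r, built with the
        # online actor/critic, and it is regressed against the target critic's
        # value of the stored (s, a) pair from this batch.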

        #print(states["image"].shape, rewards.shape)
        rewards_out = rewards[:, -1]
        print(actions.shape, rewards_out.shape, rewards.shape, "shapes")
        #print(rewards.shape, actions.shape, "reward, action")
        # print(states["image"].shape)
        # state_action_values = self.net(states["image"], states["signal"]).gather(1, actions.unsqueeze(-1)).squeeze(-1)
        action_value = self.net(next_states["image"])
        Q_value = self.critic(next_states["image"], action_value).squeeze(-1)

        # print(state_action_values)

        with torch.no_grad():


            #next_action_value = self.target_net(next_states["image"], next_states["signal"])
            #print(next_action_value.shape, "action")
            next_Q_value = self.target_critic(states["image"], actions.float()).squeeze(-1)
            # next_state_values[dones] = 0.0
            #print("Q value:", next_Q_value.shape)
            #next_action_value = next_action_value.detach()
            next_Q_value = next_Q_value.detach()

            #Q_value_actor = self.critic(next_states["image"], next_states["signal"], action_value).squeeze(-1)

        #print(next_Q_value.shape, rewards_out.shape)
        expected_state_action_values = Q_value * self.hparams.model.gamma + rewards_out
        #print(expected_state_action_values.shape, Q_value.shape)
        return {"loss": nn.MSELoss()(next_Q_value, expected_state_action_values), "policy_loss": - (Q_value).mean()}

    def populate(self, steps: int = 1000) -> None:
        '''
        Carries out several random steps through the environment to initially fill
        up the replay buffer with experiences
        '''

        for i in range(steps):
            print(i)
            self.agent.playStep(self.net, 1.0, self.get_device())

            if i % self.max_episode_steps == 0:
                self.agent.reset()

        self.agent.reset()

    def playTrajectory(self):
        '''
        Play the trajectory
        '''
        self.agent.reset()
        device = self.get_device()
        while (True):

            self.agent.playStep(self.net, 0, device)

    def training_step(self, batch, batch_idx, optimizer_idx):
        '''
        Training steps
        '''

        self.episode_steps = self.episode_steps + 1
        device = self.get_device()
        epsilon = max(self.hparams.model.min_epsilon, self.hparams.model.max_epsilon - (self.global_step + 1) / self.hparams.model.stop_decay)
        print("eps:", epsilon)

        # step through environment with agent
        reward, done = self.agent.playStep(self.target_net, epsilon, device)
        self.episode_reward += reward

        # calculates training loss
        loss = self.dqn_mse_loss(batch)
        #print(loss)
        self.log("train_loss", loss["loss"], on_epoch = True, prog_bar = True, on_step = True, logger = True)
        self.log("policy_loss", loss["policy_loss"], on_epoch = True, prog_bar = True, on_step = True, logger = True)

        if done:
            if self.episode_reward > self.total_reward:
                self.total_reward = self.episode_reward

            self.episode_reward = 0
            self.episode_steps = 0


        if optimizer_idx:
            loss_out = loss["policy_loss"]
        else:
            loss_out = loss["loss"]

        # Soft update of target network
        if self.global_step % self.hparams.model.sync_rate == 0:

            self.soft_update(self.target_net, self.net, self.hparams.model.tau)
            self.soft_update(self.target_critic, self.critic, self.hparams.model.tau)

            # self.target_net.load_state_dict(self.net.state_dict())
            # self.target_critic.load_state_dict(self.critic.state_dict())

        log = {
            'total_reward': torch.tensor(self.total_reward).to(device),
            'reward': torch.tensor(reward).to(device),
            'steps': torch.tensor(self.global_step).to(device)
        }
        for key in log:
            self.log(key, log[key], logger = True, prog_bar = True, on_step = True)

        if self.episode_steps > self.max_episode_steps:
            self.episode_steps = 0
            self.total_reward = self.episode_reward
            self.agent.reset()
        #print(loss_out)
        #return OrderedDict({'loss': loss, 'log': log, 'progress_bar': log})
        return loss_out


    def __dataloader(self) -> DataLoader:
        """
        Initialize the Replay Buffer dataset used for retrieving experiences
        """

        dataset = RLDataset(self.replay_buffer, self.hparams.model.sample_size)
        dataloader = DataLoader(
            dataset=dataset,
            **self.hparams.dataset.loader)

        return dataloader

    def train_dataloader(self) -> DataLoader:
        """
        Get train loader
        """

        return self.__dataloader()

    def get_device(self) -> str:
        """
        Retrieve device currently being used by minibatch
        """

        return self.device.index if self.on_gpu else 'cpu'

    def forward(self, x):

        return self.net(x)
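A minimal launch sketch for this LightningModule, assuming the hyper-parameters are loaded into a namespace-style object (for example with OmegaConf) whose layout matches the hparams.* accesses above; the config file name and trainer settings below are illustrative assumptions, not part of the example.

from omegaconf import OmegaConf
import pytorch_lightning as pl

hparams = OmegaConf.load("config.yaml")   # assumed config with model/optimizer/scheduler/dataset sections
model = AgentTrainer(hparams)
trainer = pl.Trainer(max_epochs=100)      # illustrative trainer settings
trainer.fit(model)                        # train_dataloader() streams batches from the replay buffer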
Example #9
LR_A = 0.001
LR_C = 0.01

env = gym.make('MountainCar-v0')
env = env.unwrapped

sess = tf.Session()

actor = Actor(sess,
              n_features=env.observation_space.shape[0],
              n_actions=env.action_space.n,
              learning_rate=LR_A)

critic = Critic(sess,
                n_features=env.observation_space.shape[0],
                learning_rate=LR_C)

sess.run(tf.global_variables_initializer())

for i_episode in range(1000):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        # if RENDER: env.render()
        env.render()

        a = actor.choose_action(s)

        s_, r, done, info = env.step(a)
Example #10
DISPLAY_REWARD_THRESHOLD = -90

RENDER = False  # rendering wastes time

env = gym.make('MountainCar-v0')
env.seed(1)  # reproducible, general Policy gradient has high variance
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

actor = Actor(epsilon=0)
critic = Critic()
Tmax = 1000
for i_episode in range(3000):

    observation = env.reset()
    action = actor.choose_action(observation)

    running_reward = 0
    critic.reset()
    count = 0
    while count < Tmax:
        count += 1
        if RENDER: env.render()

        observation_, reward, done, info = env.step(
            action)  # reward = -1 in all cases
Example #11
class DDPG:
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # initialize state
        self.last_state = self.task.reset()

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, mode="train"):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        if mode.lower() == "train":
            return list(action +
                        self.noise.sample())  # add some noise for exploration
        elif mode.lower() == "test":
            return list(action)
        else:
            raise AttributeError("Mode can be either train or test")

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
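A minimal interaction-loop sketch for this agent, assuming a task object with the attributes used above (state_size, action_size, action_low, action_high) and a step(action) method returning (next_state, reward, done); the task class itself is not shown in this example.

agent = DDPG(task)                       # `task` is assumed to be constructed elsewhere
for i_episode in range(1000):
    state = agent.reset_episode()        # resets the task and the OU noise process
    done = False
    while not done:
        action = agent.act(state, mode="train")
        next_state, reward, done = task.step(action)   # assumed task API
        agent.step(action, reward, next_state, done)
        state = next_state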
Example #12
class ActorCriticEnv(object):
    def __init__(self, env, GAMMA=0.9):
        self.env = env
        print('obs space shape: {}'.format(self.env.observation_space.shape))
        print('action space shape: {}'.format(self.env.action_space.shape))
        self.states_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        print('states dim: {}\t\t actions dim: {}'.format(
            self.states_dim, self.action_dim))
        self.actor = Actor(self.states_dim, self.action_dim, lr=0.0001)
        self.critic = Critic(self.states_dim, self.action_dim, lr=0.0001)
        self.GAMMA = GAMMA
        self.RANDOM_PROB = 0.025
        self.replay_buffer = ReplayBuffer(1280)

    def add_state_action_to_buffer(self, state, action, resulting_state, done):
        # Compute the reward up front so it is also defined on terminal transitions
        true_reward = get_reward(resulting_state)
        if done:
            predicted_q_val = np.asarray([[-25000]])
        else:
            best_new_action = self.actor.get_action(
                np.asarray([resulting_state]))
            predicted_next_q = self.critic.predict_q_val(
                np.asarray([resulting_state]), best_new_action)
        self.replay_buffer.add(state, action, true_reward, 0, resulting_state)
        # The 0 is for "t", which I don't understand the point of.
        return

    def train_from_state_action(self, state, action, resulting_state, done):
        if done:
            predicted_q_val = np.asarray([[-25000]])
        else:
            best_new_action = self.actor.get_action(
                np.asarray([resulting_state]))
            predicted_next_q = self.critic.predict_q_val(
                np.asarray([resulting_state]), best_new_action)

            true_reward = get_reward(resulting_state)

            predicted_q_val = true_reward + self.GAMMA * predicted_next_q

        wrapped_state = np.asarray([state])
        wrapped_action = np.asarray(action)
        # wrapped_q_goal = np.asarray([[predicted_q_val]])

        # print("STATE SHAPE: {}\t\tACTION SHAPE: {}\t\tREWARD SHAPE: {}".format(wrapped_state.shape, wrapped_action.shape, wrapped_true_reward.shape))

        inputs = [wrapped_state, wrapped_action, predicted_q_val]
        # print('created inputs. Calculating action grads.')
        action_grads = self.critic.get_action_grads(*inputs)
        # print('Optimizing critic q-val prediction.')
        self.critic.optimize_q_val(*inputs)
        # print('training actor from state and grads')
        self.actor.train_from_batch(wrapped_state, action_grads)
        # print('all done training')

    # def train_from_replay_buffer(self, batch_size=64):
    #   s_batch, a_batch, r_batch, t_batch, s2_batch = self.replay_buffer.sample_batch(batch_size)
    #   best_new_actions = self.actor.get_action(s2_batch)
    #   s2_predicted_q_vals = self.critic.predict_q_val(s2_batch, best_new_actions)

    def play_random_game(self, render=True):
        observation = env.reset()

        for t in range(1000):
            if render == True:
                env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print('Episode finished after {} timesteps'.format(t + 1))
                break

    def play_game_from_actor(self, render=True):
        observation = env.reset()
        for t in range(1000):
            if render == True:
                env.render()
            # action = env.action_space.sample()
            print(observation)
            action = self.actor.get_action(np.asarray([observation]))
            observation, reward, done, info = env.step(action)
            if done:
                print('Episode finished after {} timesteps'.format(t + 1))
                break

    def train_actor_critic_to_stay_still(self, render=True):
        # My reward after each one is the difference between where you are
        # and where you started.
        true_rewards = []
        observation = env.reset()
        for t in range(1000):
            if render == True:
                env.render()

            true_rewards.append(get_reward(observation))
            if random_with_prob(self.RANDOM_PROB):
                action = np.asarray([env.action_space.sample()])
            else:
                action = self.actor.get_action(np.asarray([observation]))
            new_observation, reward, done, info = env.step(action)

            self.train_from_state_action(observation, action, new_observation,
                                         done)
            observation = new_observation

            if done:
                print(
                    'Episode finished after {} timesteps. Average reward: {}'.
                    format(t + 1, np.mean(np.asarray(true_rewards))))
                break
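A hedged usage sketch for this class, assuming a continuous-action Gym environment is bound to the module-level env that the methods above reference, and that get_reward and random_with_prob are defined in the same module (neither is shown here).

import gym

env = gym.make('Pendulum-v1')            # illustrative continuous-action environment
agent = ActorCriticEnv(env, GAMMA=0.9)
for episode in range(100):
    agent.train_actor_critic_to_stay_still(render=False)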
Example #13
class DDPG:
    def __init__(self,
                 n_states,
                 n_actions,
                 hidden_dim=90,
                 device="cpu",
                 critic_lr=5e-3,
                 actor_lr=5e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128):
        self.device = device
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.actor = Actor(n_states, n_actions, hidden_dim).to(device)
        self.target_critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.target_actor = Actor(n_states, n_actions, hidden_dim).to(device)

        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.memory = ReplayBuffer(memory_capacity)
        self.batch_size = batch_size
        self.soft_tau = soft_tau
        self.gamma = gamma

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor(state)
        # detach() stops gradients from flowing back through the actor graph
        return action.detach().cpu().numpy()[0]

    def update(self):
        if len(self.memory) < self.batch_size:
            return
        state, action, reward, next_state, done = self.memory.sample(
            self.batch_size)
        # Convert everything to tensors on the target device
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
        # Note that the critic takes (s_t, a) as input

        actor_loss = self.critic(state, self.actor(state))
        actor_loss = -actor_loss.mean()

        next_action = self.target_actor(next_state)
        target_value = self.target_critic(next_state, next_action.detach())
        expected_value = reward + (1.0 - done) * self.gamma * target_value
        expected_value = torch.clamp(expected_value, -np.inf, np.inf)

        value = self.critic(state, action)
        critic_loss = nn.MSELoss()(value, expected_value.detach())

        # Optimize the actor and critic networks
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Soft-update the target networks (Polyak averaging)
        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)
        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)

    def save_model(self, path):
        torch.save(self.target_actor.state_dict(), path)

    def load_model(self, path):
        self.actor.load_state_dict(torch.load(path))

    def buffer_model_save(self, saved_dir):
        self.memory.save(saved_dir)
        torch.save(self.critic.state_dict(),
                   saved_dir + "/critic_checkpoint.pth")
        torch.save(self.actor.state_dict(),
                   saved_dir + "/actor_checkpoint.pth")
        torch.save(self.target_critic.state_dict(),
                   saved_dir + "/target_critic_checkpoint.pth")
        torch.save(self.target_actor.state_dict(),
                   saved_dir + "/target_actor_checkpoint.pth")

    def buffer_model_load(self, saved_dir):
        if not os.path.exists(saved_dir):  # check whether the save directory exists
            os.makedirs(saved_dir)
            return
        self.memory.load(saved_dir)

        self.critic.load_state_dict(
            torch.load(saved_dir + "/critic_checkpoint.pth"))
        self.actor.load_state_dict(
            torch.load(saved_dir + "/actor_checkpoint.pth"))
        self.target_critic.load_state_dict(
            torch.load(saved_dir + "/target_critic_checkpoint.pth"))
        self.target_actor.load_state_dict(
            torch.load(saved_dir + "/target_actor_checkpoint.pth"))

        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=self.critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)
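A minimal training-loop sketch for this DDPG class, assuming a continuous-action Gym environment and that the ReplayBuffer used above exposes a push(state, action, reward, next_state, done) method (the buffer class itself is not shown in this example).

import gym

env = gym.make('Pendulum-v1')                      # illustrative environment
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = DDPG(n_states, n_actions, device="cpu")

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.memory.push(state, action, reward, next_state, done)   # assumed buffer API
        agent.update()
        state = next_state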