Example #1
    def __init__(self, obs_dim, action_dim, id, load_path, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.id = id
        self.memory = ReplayMemory(args.replay_size)
        self.load_path = load_path

        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning_low

        use_cuda = torch.cuda.is_available()
        self.device = torch.device(args.GPU if use_cuda else "cpu")
        self.alpha = torch.tensor(args.alpha).to(self.device)

        self.critic = TwinnedQNetwork(
            obs_dim, action_dim,
            args.hidden_size).to(device=self.device).double()
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
        self.critic_target = TwinnedQNetwork(
            obs_dim, action_dim,
            args.hidden_size).to(device=self.device).double()

        hard_update(self.critic_target, self.critic)
        self.Q1_normer = PopArt(self.critic.Q1.last_fc)
        self.Q2_normer = PopArt(self.critic.Q2.last_fc)

        self.Q1_target_normer = PopArt(self.critic_target.Q1.last_fc)
        self.Q2_target_normer = PopArt(self.critic_target.Q2.last_fc)

        # Target Entropy = -dim(A) (e.g., -6 for HalfCheetah-v2), as given in the paper
        if self.automatic_entropy_tuning:
            self.alpha = torch.tensor(1.).to(self.device)
            # wrap the scalar action_dim so torch.prod yields -action_dim
            self.target_entropy = -torch.prod(
                torch.Tensor([action_dim]).to(self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device,
                                         dtype=torch.double)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(obs_dim, action_dim, args.hidden_size).to(
            self.device).double()
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
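
All of the snippets on this page rely on a hard_update helper to copy the online network's parameters into a structurally identical target network; the helper itself is not shown. A minimal sketch of the commonly used implementation, assuming the signature hard_update(target, source) seen above:

import torch


def hard_update(target, source):
    # Copy every parameter of `source` into the structurally identical `target`.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.copy_(s_param)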
Example #2
    def __init__(self, obs_dim, option_dim, load_path, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.memory = ReplayMemory(args.replay_size)
        self.load_path = load_path
        self.obs_dim = obs_dim
        self.Beta_add = args.Beta_add
        self.beta_weight = args.beta_weight
        self.update_num = 0

        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning_high

        use_cuda = torch.cuda.is_available()
        self.device = torch.device(args.GPU if use_cuda else "cpu")
        self.alpha = torch.tensor(args.alpha).to(self.device)

        self.critic = Q_discrete_Network(
            obs_dim, option_dim,
            args.hidden_size).to(device=self.device).double()
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
        self.critic_target = Q_discrete_Network(
            obs_dim, option_dim,
            args.hidden_size).to(device=self.device).double()

        hard_update(self.critic_target, self.critic)
        self.Q_normer = PopArt(self.critic.last_fc)
        self.Q_target_normer = PopArt(self.critic_target.last_fc)

        # Target Entropy = -dim(A) (e.g., -6 for HalfCheetah-v2), as given in the paper
        if self.automatic_entropy_tuning:
            # wrap the scalar option_dim so torch.prod yields -option_dim
            self.target_entropy = -torch.prod(
                torch.Tensor([option_dim]).to(self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device,
                                         dtype=torch.double)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.Beta = Beta_network(obs_dim, option_dim,
                                 args.hidden_size).to(self.device).double()
        self.Beta_optim = Adam(self.Beta.parameters(), lr=args.lr)
    def __init__(self,
                 env,
                 random_seed,
                 save_path,
                 q_net=QNet,
                 gamma=0.99,
                 batch_size=32,
                 initial_eps=1.0,
                 end_eps=0.1,
                 eps_plan=500000,
                 lr=0.00025,
                 learning_start=50000,
                 learning_freq=4,
                 frame_history_len=4,
                 target_update_freq=10000,
                 memory_size=1000000,
                 max_steps=10000000,
                 **kwargs):
        """
		DQN Agent

		paras:
			env: the gym environment
			seed: the random seed
			save_path: the path to save model parameters
			q_net: the Q learning network function
			gamma: the reward's decrease parameter
			initial_e: the initial prob to choose random action
			end_e: the end prob to choose random action
			lr: the optimizer's learning rate
			target_update_freq: the target netwok's update frequency
			test_freq: the test frequency
			learning_start: begin to learn after learning_start steps
			learning_freq: the training frequency 
			frame_history_len: how much frames should be feed to the model as one data
			memory_size: the maxmium size of replay buffer
		"""
        assert type(env.observation_space) == gym.spaces.Box
        assert type(env.action_space) == gym.spaces.Discrete

        # fix random seed
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)
        random.seed(random_seed)

        # set env
        self.env = env
        self.test_env = copy.deepcopy(env)
        # get observation dim
        if len(env.observation_space.shape) == 1:
            # running on a low-dimensional observation (RAM)
            self.observation_dim = env.observation_space.shape[0]
        else:
            img_h, img_w, img_c = env.observation_space.shape
            self.observation_dim = frame_history_len
        # get action dim
        self.action_dim = env.action_space.n

        # set Q network
        self.learning_Q = q_net(self.observation_dim, self.action_dim).cuda()
        self.target_Q = q_net(self.observation_dim, self.action_dim).cuda()
        # sync two networks' parameter
        hard_update(self.target_Q, self.learning_Q)

        # set replay buffer
        self.replay_buffer = ReplayBuffer.ReplayBuffer(memory_size,
                                                       frame_history_len)

        # define learning Q network's optimizer
        self.optimizer = torch.optim.RMSprop(self.learning_Q.parameters(),
                                             lr=lr,
                                             eps=0.01)
        # define loss function
        self.loss_func = nn.MSELoss()

        # initial other parameters
        self.gamma = gamma
        self.batch_size = batch_size
        self.initial_eps = initial_eps
        self.end_eps = end_eps
        self.eps_plan = eps_plan
        self.learning_start = learning_start
        self.learning_freq = learning_freq
        self.frame_history_len = frame_history_len
        self.max_steps = max_steps
        self.target_update_freq = target_update_freq
        self.steps = 0
        self.save_path = save_path

        # set the eps
        self.eps = self.initial_eps
    def train(self, is_render=False, path=None):
        last_observation = self.env.reset()
        last_observation = self.pre_process(last_observation)
        mean_episode_reward = -float('nan')
        best_mean_episode_reward = -float('inf')
        log = {'mean_episode_reward': [], 'best_mean_episode_reward': []}
        num_param_updates = 0
        episode_rewards = []
        one_episode_reward = []
        loss = []
        while self.steps < self.max_steps:
            # store the latest observation
            last_index = self.replay_buffer.store_frame(last_observation)

            recent_observation = self.replay_buffer.encoder_recent_observation()
            # choose a random action if learning has not started yet
            if self.steps < self.learning_start:
                action = random.randrange(self.action_dim)
            else:
                action = self.get_exploration_action(
                    recent_observation)[0].numpy()

            # make a step
            observation, reward, done, _ = self.env.step(action)
            observation = self.pre_process(observation)
            one_episode_reward.append(reward)
            if is_render:
                self.env.render()

            # clip rewards between -1 and 1
            reward = max(-1.0, min(reward, 1.0))
            # store other info in replay memory
            self.replay_buffer.store_effct(last_index, action, reward, done)
            # if done, restart env
            if done:
                observation = self.env.reset()
                observation = self.pre_process(observation)
                episode_rewards.append(np.sum(one_episode_reward))
                one_episode_reward = []
            last_observation = observation

            # perform experience replay and train the network
            if ((self.steps > self.learning_start)
                    and (self.steps % self.learning_freq == 0)
                    and self.replay_buffer.can_sample):
                # get batch from replay buffer
                obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = self.replay_buffer.sample_batch(
                    self.batch_size)
                # turn all data to tensor
                obs_batch = Variable(
                    torch.from_numpy(obs_batch).type(torch.float32) /
                    255.0).cuda()
                act_batch = Variable(torch.from_numpy(act_batch).long()).cuda()
                rew_batch = Variable(
                    torch.from_numpy(rew_batch).type(torch.float32)).cuda()
                next_obs_batch = Variable(
                    torch.from_numpy(next_obs_batch).type(torch.float32) /
                    255.).cuda()
                not_done_mask = Variable(
                    torch.from_numpy(1 - done_mask).type(
                        torch.float32)).cuda()

                # ================================ calculate bellman =========================================
                # get current Q value
                current_q_value = self.learning_Q(obs_batch).gather(
                    1, act_batch.unsqueeze(1))
                # compute next q value based on which action gives max Q values
                next_max_q = self.target_Q(next_obs_batch).max(1)[0]
                next_q_values = not_done_mask * next_max_q
                # compute the target of the current q values
                target_q_values = rew_batch + (self.gamma * next_q_values)
                # compute bellman error
                bellman_error = target_q_values.view(-1, 1) - current_q_value
                loss.append(bellman_error.detach().cpu().numpy())
                # clip bellman error between [-1, 1]
                clipped_bellman_error = bellman_error.clamp(-1, 1)
                # flip the sign so that backpropagating this as the gradient of
                # current_q_value performs gradient descent on the clipped error
                bellman_d = -1. * clipped_bellman_error

                # optimize
                self.optimizer.zero_grad()
                current_q_value.backward(bellman_d.data)
                self.optimizer.step()

                # update steps
                num_param_updates += 1
                # update network
                if num_param_updates % self.target_update_freq == 0:
                    hard_update(self.target_Q, self.learning_Q)

            if len(episode_rewards) > 0:
                mean_episode_reward = np.mean(episode_rewards[-100:])
            if len(episode_rewards) > 100:
                best_mean_episode_reward = max(best_mean_episode_reward,
                                               mean_episode_reward)

            log['mean_episode_reward'].append(mean_episode_reward)
            log['best_mean_episode_reward'].append(best_mean_episode_reward)

            if self.steps % 5000 == 0 and self.steps > self.learning_start:
                print("Steps: {}".format(self.steps))
                print("mean reward (lastest 100 episodes): {:.4f}".format(
                    mean_episode_reward))
                print("best mean reward: {:.4f}".format(
                    best_mean_episode_reward))
                print("episodes: {}".format(len(episode_rewards)))
                print("exploration: {:.4f}".format(self.eps))
                print("loss: {:.4f}".format(np.mean(loss)))
                sys.stdout.flush()

                loss = []

                with open(self.save_path + 'log.pkl', 'wb') as f:
                    pickle.dump(log, f)

                self.save_model('DQNtest', path=path)

            self.steps += 1
    def load_model(self, file_path):
        self.target_Q.load_state_dict(torch.load(file_path))
        hard_update(self.learning_Q, self.target_Q)
        print("The models' parameters have been loaded successfully!")
Example #6
    def train(self):
        #if self.replay_buffer.num_transition >= self.start_train_loop and self.replay_buffer.num_transition % 1000 == 999:
        if self.step_num - self.train_step >= self.start_train_loop:
            for tini_train in range(50):
                self.option_framework.update_parameters(
                    self.batch_size, self.writer, self.logging)
            for tini_train in range(200):
                for i in range(self.option_dim):
                    state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.skills[
                        i].memory.sample(batch_size=self.batch_size)
                    state_batch = torch.tensor(state_batch).double().to(
                        self.device)
                    next_state_batch = torch.tensor(
                        next_state_batch).double().to(self.device)
                    action_batch = torch.tensor(action_batch).double().to(
                        self.device)
                    reward_batch = torch.tensor(reward_batch).double().to(
                        self.device).unsqueeze(1)
                    mask_batch = torch.tensor(mask_batch).double().to(
                        self.device).unsqueeze(1)

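                    # intrinsic mutual-information bonuses for skill i (detached so no
                    # gradient flows back into the MI estimators through the reward)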
                    forward_MI_reward = self.compute_forward_MIreward(
                        state_batch, next_state_batch, i).view(-1, 1)
                    reverse_MI_reward = self.compute_reverse_MIreward(
                        state_batch, i, action_batch).view(-1, 1)

                    reward_batch = reward_batch + forward_MI_reward.detach(
                    ) + reverse_MI_reward.detach()

                    qf1_loss, qf2_loss, policy_loss, alpha_loss, alpha_tlogs = self.skills[
                        i].update_parameters(state_batch, action_batch,
                                             reward_batch, next_state_batch,
                                             mask_batch, self.train_step)
                    self.logging.info(
                        f'id: {i}, qf1_loss: {qf1_loss}, qf2_loss: {qf2_loss}, policy_loss: {policy_loss}, alpha: {alpha_tlogs}'
                    )

                    if self.writer and tini_train % 10 == 9:
                        self.writer.add_scalar(f'train_skills{i}/qf1_loss',
                                               qf1_loss,
                                               tini_train + self.train_step)
                        self.writer.add_scalar(f'train_skills{i}/qf2_loss',
                                               qf2_loss,
                                               tini_train + self.train_step)
                        self.writer.add_scalar(f'train_skills{i}/policy_loss',
                                               policy_loss,
                                               tini_train + self.train_step)
                        self.writer.add_scalar(f'train_skills{i}/alpha_loss',
                                               alpha_loss,
                                               tini_train + self.train_step)
                        self.writer.add_scalar(f'train_skills{i}/alpha',
                                               alpha_tlogs,
                                               tini_train + self.train_step)

            hard_update(self.forward_net_target, self.forward_net)
            self.train_step += 200
            if self.train_step % 50000 == 0:
                self.option_framework.save_model(self.train_step)
                for i in range(self.option_dim):
                    self.skills[i].save_model(self.train_step)
            print(
                "----------------------------------------------------------------------"
            )
            print(f"Skills Train step: {self.train_step}")
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            print(
                "----------------------------------------------------------------------"
            )
Example #7
    def __init__(
        self,
        args,
        env,
        obs_dim,
        action_dim,
        option_dim,
        device,
        csv_path,
        fig_path,
        load_path,
        logging,
        start_train_loop=5000,
        test_step_oneloop=10000,
        start_usenet_step=5000,
        buffer_capacity=1000000,
        length=1000000,
        writer=None,
    ):

        super().__init__()

        self.skills = [
            SAC_continuous(obs_dim, action_dim, i, load_path, args)
            for i in range(option_dim)
        ]
        self.option_framework = SAC_discrete(obs_dim, option_dim, load_path,
                                             args)
        self.forward_net = Plus_Net([256, 256],
                                    obs_dim,
                                    obs_dim,
                                    option_dim,
                                    layer_norm=False).to(device).double()
        self.forward_net_target = Plus_Net(
            [256, 256], obs_dim, obs_dim, option_dim,
            layer_norm=False).to(device).double()
        hard_update(self.forward_net_target, self.forward_net)

        self.forward_net_optimizer = torch.optim.Adam(
            self.forward_net.parameters(), lr=args.lr)
        self.mse_loss = nn.MSELoss(reduction='none')
        self.env = env
        self.logging = logging
        self.fixstart = args.fixstart

        self.device = device
        self.csv_path = csv_path
        self.fig_path = fig_path
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.option_dim = option_dim

        self.batch_size = args.batch_size
        self.length = length
        self.writer = writer

        self.test_step_oneloop = test_step_oneloop
        self.start_train_loop = start_train_loop
        self.train_step = 0
        self.test_step = 0
        self.start_usenet_step = start_usenet_step

        self.step_num = 0
        self.episode_num = 0
        self.pi_z = [0 for i in range(option_dim)]
        self.pi_cur_z = [0 for i in range(option_dim)]

        self.cmap = sns.color_palette("Set1", option_dim, 0.9)
        self.dense_reward_skills = args.dense_reward_skills
        self.dense_reward_options = args.dense_reward_options
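
Example #7 shows the construction half of the pattern used throughout this page: build an online network and a structurally identical target, call hard_update once so they start from the same weights, then re-sync on a fixed cadence during training (target_update_interval in Examples #1 and #2, target_update_freq in the DQN agent, and after each training loop for forward_net_target in Example #6). A hedged sketch of that cadence, with hypothetical names not taken from any of the repositories above and reusing the hard_update sketch from earlier:

def sync_targets_if_due(step, target_source_pairs, update_interval):
    # target_source_pairs: list of (target_net, source_net) module pairs.
    # Re-copy source parameters into each target every `update_interval` steps.
    if step % update_interval == 0:
        for target_net, source_net in target_source_pairs:
            hard_update(target_net, source_net)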