def __init__(self, obs_dim, action_dim, id, load_path, args):
    self.gamma = args.gamma
    self.tau = args.tau
    self.id = id
    self.memory = ReplayMemory(args.replay_size)
    self.load_path = load_path
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning_low

    use_cuda = torch.cuda.is_available()
    self.device = torch.device(args.GPU if use_cuda else "cpu")
    self.alpha = torch.tensor(args.alpha).to(self.device)

    self.critic = TwinnedQNetwork(
        obs_dim, action_dim, args.hidden_size).to(device=self.device).double()
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
    self.critic_target = TwinnedQNetwork(
        obs_dim, action_dim, args.hidden_size).to(device=self.device).double()
    hard_update(self.critic_target, self.critic)

    # PopArt normalizers wrap the last linear layer of each Q head.
    self.Q1_normer = PopArt(self.critic.Q1.last_fc)
    self.Q2_normer = PopArt(self.critic.Q2.last_fc)
    self.Q1_target_normer = PopArt(self.critic_target.Q1.last_fc)
    self.Q2_target_normer = PopArt(self.critic_target.Q2.last_fc)

    # Target entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as in the SAC paper.
    if self.automatic_entropy_tuning:
        self.alpha = torch.tensor(1.).to(self.device)
        # action_dim is an int, so wrap it in a list before torch.Tensor;
        # torch.Tensor(action_dim) would allocate an uninitialized tensor.
        self.target_entropy = -torch.prod(
            torch.Tensor([action_dim]).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True,
                                     device=self.device, dtype=torch.double)
        self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

    self.policy = GaussianPolicy(obs_dim, action_dim,
                                 args.hidden_size).to(self.device).double()
    self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
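# hard_update() is called throughout this section but defined elsewhere in the repo.
# Below is a minimal sketch of what it is assumed to do (copy every source parameter
# into the target network, as in standard SAC/DQN target-network syncing); the
# argument order is inferred from the call sites hard_update(target, source).
# soft_update is an assumed companion (self.tau is stored above), not taken verbatim
# from the original code.
def hard_update(target, source):
    """Copy source parameters into target in place."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)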
def __init__(self, obs_dim, option_dim, load_path, args):
    self.gamma = args.gamma
    self.tau = args.tau
    self.memory = ReplayMemory(args.replay_size)
    self.load_path = load_path
    self.obs_dim = obs_dim
    self.Beta_add = args.Beta_add
    self.beta_weight = args.beta_weight
    self.update_num = 0
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning_high

    use_cuda = torch.cuda.is_available()
    self.device = torch.device(args.GPU if use_cuda else "cpu")
    self.alpha = torch.tensor(args.alpha).to(self.device)

    self.critic = Q_discrete_Network(
        obs_dim, option_dim, args.hidden_size).to(device=self.device).double()
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
    self.critic_target = Q_discrete_Network(
        obs_dim, option_dim, args.hidden_size).to(device=self.device).double()
    hard_update(self.critic_target, self.critic)

    # PopArt normalizers wrap the last linear layer of the discrete Q network.
    self.Q_normer = PopArt(self.critic.last_fc)
    self.Q_target_normer = PopArt(self.critic_target.last_fc)

    # Target entropy = -dim(options), analogous to the -dim(A) heuristic in the SAC paper.
    if self.automatic_entropy_tuning:
        # option_dim is an int, so wrap it in a list before torch.Tensor;
        # torch.Tensor(option_dim) would allocate an uninitialized tensor.
        self.target_entropy = -torch.prod(
            torch.Tensor([option_dim]).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True,
                                     device=self.device, dtype=torch.double)
        self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

    self.Beta = Beta_network(obs_dim, option_dim,
                             args.hidden_size).to(self.device).double()
    self.Beta_optim = Adam(self.Beta.parameters(), lr=args.lr)
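# The log_alpha / alpha_optim members created in the two constructors above are
# typically consumed inside an update step that tunes the temperature toward
# self.target_entropy. The repo's actual update_parameters is not shown in this
# section; this is a minimal sketch of the standard SAC temperature update,
# assuming `log_pi` is the log-probability of the sampled action (or option)
# under the current policy. The method name is hypothetical.
def _alpha_update_sketch(self, log_pi):
    alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
    self.alpha_optim.zero_grad()
    alpha_loss.backward()
    self.alpha_optim.step()
    self.alpha = self.log_alpha.exp().detach()
    return alpha_loss.item()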
def __init__(self,
             env,
             random_seed,
             save_path,
             q_net=QNet,
             gamma=0.99,
             batch_size=32,
             initial_eps=1.0,
             end_eps=0.1,
             eps_plan=500000,
             lr=0.00025,
             learning_start=50000,
             learning_freq=4,
             frame_history_len=4,
             target_update_freq=10000,
             memory_size=1000000,
             max_steps=10000000,
             **kwargs):
    """DQN Agent.

    Args:
        env: the gym environment
        random_seed: the random seed
        save_path: the path to save model parameters
        q_net: the Q-learning network class
        gamma: the reward discount factor
        batch_size: the training batch size
        initial_eps: the initial probability of choosing a random action
        end_eps: the final probability of choosing a random action
        eps_plan: the number of steps over which epsilon is annealed
        lr: the optimizer's learning rate
        learning_start: begin to learn after learning_start steps
        learning_freq: the training frequency
        frame_history_len: how many frames are stacked and fed to the model as one input
        target_update_freq: the target network's update frequency
        memory_size: the maximum size of the replay buffer
        max_steps: the total number of environment steps to train for
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    # fix random seeds
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)
    random.seed(random_seed)

    # set env
    self.env = env
    self.test_env = copy.deepcopy(env)

    # get observation dim
    if len(env.observation_space.shape) == 1:
        # running on low-dimensional observations (RAM)
        self.observation_dim = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        self.observation_dim = frame_history_len

    # get action dim
    self.action_dim = env.action_space.n

    # set Q networks
    self.learning_Q = q_net(self.observation_dim, self.action_dim).cuda()
    self.target_Q = q_net(self.observation_dim, self.action_dim).cuda()
    # sync the two networks' parameters
    hard_update(self.target_Q, self.learning_Q)

    # set replay buffer
    self.replay_buffer = ReplayBuffer.ReplayBuffer(memory_size, frame_history_len)

    # define the learning Q network's optimizer
    self.optimizer = torch.optim.RMSprop(self.learning_Q.parameters(), lr=lr, eps=0.01)
    # define the loss function
    self.loss_func = nn.MSELoss()

    # initialize remaining parameters
    self.gamma = gamma
    self.batch_size = batch_size
    self.initial_eps = initial_eps
    self.end_eps = end_eps
    self.eps_plan = eps_plan
    self.learning_start = learning_start
    self.learning_freq = learning_freq
    self.frame_history_len = frame_history_len
    self.max_steps = max_steps
    self.target_update_freq = target_update_freq
    self.steps = 0
    self.save_path = save_path
    # set the exploration rate
    self.eps = self.initial_eps
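# self.eps is initialized above and logged in train(), but the schedule that decays
# it is not part of this section. A plausible linear annealing sketch, assuming
# epsilon is interpolated from initial_eps to end_eps over eps_plan steps; the
# helper name _update_eps is hypothetical, not from the original code.
def _update_eps(self):
    frac = min(1.0, self.steps / float(self.eps_plan))
    self.eps = self.initial_eps + frac * (self.end_eps - self.initial_eps)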
def train(self, is_render=False, path=None):
    last_observation = self.env.reset()
    last_observation = self.pre_process(last_observation)
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    log = {'mean_episode_reward': [], 'best_mean_episode_reward': []}
    num_param_updates = 0
    episode_rewards = []
    one_episode_reward = []
    loss = []
    while self.steps < self.max_steps:
        # store the latest observation
        last_index = self.replay_buffer.store_frame(last_observation)
        recent_observation = self.replay_buffer.encoder_recent_observation()
        # choose a random action if learning has not started yet
        if self.steps < self.learning_start:
            action = random.randrange(self.action_dim)
        else:
            action = self.get_exploration_action(recent_observation)[0].numpy()
        # take a step
        observation, reward, done, _ = self.env.step(action)
        observation = self.pre_process(observation)
        one_episode_reward.append(reward)
        if is_render:
            self.env.render()
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # store the other info in replay memory
        self.replay_buffer.store_effct(last_index, action, reward, done)
        # if done, restart the env
        if done:
            observation = self.env.reset()
            observation = self.pre_process(observation)
            episode_rewards.append(np.sum(one_episode_reward))
            one_episode_reward = []
        last_observation = observation

        # perform experience replay and train the network
        if ((self.steps > self.learning_start)
                and (self.steps % self.learning_freq == 0)
                and self.replay_buffer.can_sample):
            # get a batch from the replay buffer
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = \
                self.replay_buffer.sample_batch(self.batch_size)
            # turn all data into tensors
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(torch.float32) / 255.0).cuda()
            act_batch = Variable(torch.from_numpy(act_batch).long()).cuda()
            rew_batch = Variable(
                torch.from_numpy(rew_batch).type(torch.float32)).cuda()
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(torch.float32) / 255.).cuda()
            not_done_mask = Variable(
                torch.from_numpy(1 - done_mask).type(torch.float32)).cuda()

            # ================================ calculate Bellman target =================================
            # get the current Q value of the taken actions
            current_q_value = self.learning_Q(obs_batch).gather(
                1, act_batch.unsqueeze(1))
            # compute the next Q value based on which action gives the max Q value
            next_max_q = self.target_Q(next_obs_batch).max(1)[0]
            next_q_values = not_done_mask * next_max_q
            # compute the target for the current Q values
            target_q_values = rew_batch + (self.gamma * next_q_values)
            # compute the Bellman error
            bellman_error = target_q_values.view(-1, 1) - current_q_value
            loss.append(bellman_error.detach().cpu().numpy())
            # clip the Bellman error to [-1, 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # multiply by -1 to get d(loss)/d(Q)
            bellman_d = -1.0 * clipped_bellman_error

            # optimize
            self.optimizer.zero_grad()
            current_q_value.backward(bellman_d.data)
            self.optimizer.step()

            # update the parameter-update counter
            num_param_updates += 1
            # periodically sync the target network
            if num_param_updates % self.target_update_freq == 0:
                hard_update(self.target_Q, self.learning_Q)

        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        log['mean_episode_reward'].append(mean_episode_reward)
        log['best_mean_episode_reward'].append(best_mean_episode_reward)

        if self.steps % 5000 == 0 and self.steps > self.learning_start:
            print("Steps: {}".format(self.steps))
            print("mean reward (latest 100 episodes): {:.4f}".format(
                mean_episode_reward))
            print("best mean reward: {:.4f}".format(best_mean_episode_reward))
            print("episodes: {}".format(len(episode_rewards)))
            print("exploration: {:.4f}".format(self.eps))
            print("loss: {:.4f}".format(np.mean(loss)))
            sys.stdout.flush()
            loss = []
            with open(self.save_path + 'log.pkl', 'wb') as f:
                pickle.dump(log, f)
            self.save_model('DQNtest', path=path)
        self.steps += 1
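# train() calls self.get_exploration_action(...), which is defined elsewhere in the
# repo. Below is a minimal epsilon-greedy sketch that matches the call site
# `self.get_exploration_action(recent_observation)[0].numpy()`; the exact interface
# (returning a 1-element tensor) and the /255 image scaling are assumptions, not the
# original implementation.
def get_exploration_action(self, observation):
    if random.random() < self.eps:
        # explore: uniform random action
        return torch.tensor([random.randrange(self.action_dim)])
    # exploit: greedy action from the learning network
    obs = torch.from_numpy(observation).type(torch.float32).unsqueeze(0).cuda() / 255.0
    with torch.no_grad():
        q_values = self.learning_Q(obs)
    return q_values.argmax(dim=1).cpu()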
def load_model(self, file_path):
    self.target_Q.load_state_dict(torch.load(file_path))
    hard_update(self.learning_Q, self.target_Q)
    print("The models' parameters have been loaded successfully!")
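# train() calls self.save_model('DQNtest', path=path) and load_model() above expects
# a state_dict file. A minimal sketch of a matching save_model, assuming checkpoints
# are written under self.save_path unless an explicit path is given; the '.pth'
# naming convention is an assumption, not taken from the original repo.
def save_model(self, name, path=None):
    directory = path if path is not None else self.save_path
    torch.save(self.learning_Q.state_dict(), directory + name + '.pth')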
def train(self):
    # if self.replay_buffer.num_transition >= self.start_train_loop and self.replay_buffer.num_transition % 1000 == 999:
    if self.step_num - self.train_step >= self.start_train_loop:
        # update the high-level option framework
        for tini_train in range(50):
            self.option_framework.update_parameters(
                self.batch_size, self.writer, self.logging)
        # update every low-level skill policy
        for tini_train in range(200):
            for i in range(self.option_dim):
                state_batch, action_batch, reward_batch, next_state_batch, mask_batch = \
                    self.skills[i].memory.sample(batch_size=self.batch_size)
                state_batch = torch.tensor(state_batch).double().to(self.device)
                next_state_batch = torch.tensor(next_state_batch).double().to(self.device)
                action_batch = torch.tensor(action_batch).double().to(self.device)
                reward_batch = torch.tensor(reward_batch).double().to(self.device).unsqueeze(1)
                mask_batch = torch.tensor(mask_batch).double().to(self.device).unsqueeze(1)

                # augment the environment reward with the two mutual-information bonuses
                forward_MI_reward = self.compute_forward_MIreward(
                    state_batch, next_state_batch, i).view(-1, 1)
                reverse_MI_reward = self.compute_reverse_MIreward(
                    state_batch, i, action_batch).view(-1, 1)
                reward_batch = reward_batch + forward_MI_reward.detach() + reverse_MI_reward.detach()

                qf1_loss, qf2_loss, policy_loss, alpha_loss, alpha_tlogs = \
                    self.skills[i].update_parameters(state_batch, action_batch,
                                                     reward_batch, next_state_batch,
                                                     mask_batch, self.train_step)
                self.logging.info(
                    f'id: {i}, qf1_loss: {qf1_loss}, qf2_loss: {qf2_loss}, policy_loss: {policy_loss}, alpha: {alpha_tlogs}'
                )
                if self.writer and tini_train % 10 == 9:
                    self.writer.add_scalar(f'train_skills{i}/qf1_loss', qf1_loss,
                                           tini_train + self.train_step)
                    self.writer.add_scalar(f'train_skills{i}/qf2_loss', qf2_loss,
                                           tini_train + self.train_step)
                    self.writer.add_scalar(f'train_skills{i}/policy_loss', policy_loss,
                                           tini_train + self.train_step)
                    self.writer.add_scalar(f'train_skills{i}/alpha_loss', alpha_loss,
                                           tini_train + self.train_step)
                    self.writer.add_scalar(f'train_skills{i}/alpha', alpha_tlogs,
                                           tini_train + self.train_step)

        # sync the forward-model target network after the training loops
        hard_update(self.forward_net_target, self.forward_net)
        self.train_step += 200
        if self.train_step % 50000 == 0:
            self.option_framework.save_model(self.train_step)
            for i in range(self.option_dim):
                self.skills[i].save_model(self.train_step)
        print("----------------------------------------------------------------------")
        print(f"Skills Train step: {self.train_step}")
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print("----------------------------------------------------------------------")
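# The forward_net / forward_net_optimizer / mse_loss members created in the
# constructor that follows suggest a forward model p(s' | s, z) trained by
# regression; the actual training step and the compute_forward_MIreward /
# compute_reverse_MIreward methods are not shown in this section. Below is a
# hedged sketch of such a regression step, assuming Plus_Net takes
# (state, one_hot_option) and predicts the next state; the method name and the
# one-hot encoding are assumptions, not the repo's implementation.
def _train_forward_model_sketch(self, state_batch, next_state_batch, option_index):
    one_hot = torch.zeros(state_batch.shape[0], self.option_dim,
                          dtype=state_batch.dtype, device=self.device)
    one_hot[:, option_index] = 1.0
    pred_next = self.forward_net(state_batch, one_hot)      # assumed call signature
    loss = self.mse_loss(pred_next, next_state_batch).mean()
    self.forward_net_optimizer.zero_grad()
    loss.backward()
    self.forward_net_optimizer.step()
    return loss.item()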
def __init__(
        self,
        args,
        env,
        obs_dim,
        action_dim,
        option_dim,
        device,
        csv_path,
        fig_path,
        load_path,
        logging,
        start_train_loop=5000,
        test_step_oneloop=10000,
        start_usenet_step=5000,
        buffer_capacity=1000000,
        length=1000000,
        writer=None,
):
    super().__init__()
    self.skills = [
        SAC_continuous(obs_dim, action_dim, i, load_path, args)
        for i in range(option_dim)
    ]
    self.option_framework = SAC_discrete(obs_dim, option_dim, load_path, args)
    self.forward_net = Plus_Net([256, 256], obs_dim, obs_dim, option_dim,
                                layer_norm=False).to(device).double()
    self.forward_net_target = Plus_Net([256, 256], obs_dim, obs_dim, option_dim,
                                       layer_norm=False).to(device).double()
    hard_update(self.forward_net_target, self.forward_net)
    self.forward_net_optimizer = torch.optim.Adam(
        self.forward_net.parameters(), lr=args.lr)
    self.mse_loss = nn.MSELoss(reduction='none')
    self.env = env
    self.logging = logging
    self.fixstart = args.fixstart
    self.device = device
    self.csv_path = csv_path
    self.fig_path = fig_path
    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.option_dim = option_dim
    self.batch_size = args.batch_size
    self.length = length
    self.writer = writer
    self.test_step_oneloop = test_step_oneloop
    self.start_train_loop = start_train_loop
    self.train_step = 0
    self.test_step = 0
    self.start_usenet_step = start_usenet_step
    self.step_num = 0
    self.episode_num = 0
    self.pi_z = [0 for i in range(option_dim)]
    self.pi_cur_z = [0 for i in range(option_dim)]
    self.cmap = sns.color_palette("Set1", option_dim, 0.9)
    self.dense_reward_skills = args.dense_reward_skills
    self.dense_reward_options = args.dense_reward_options
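# self.cmap above is a per-option color table for later trajectory plotting. A small
# self-contained illustration of the same seaborn call (option_dim = 4 is an
# arbitrary example value, not taken from the repo's configuration):
if __name__ == "__main__":
    import seaborn as sns
    palette = sns.color_palette("Set1", 4, 0.9)  # list of 4 RGB tuples, desaturated to 90%
    print(palette[0])                            # color that would be used for option 0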