def run():
    # Set parameters of env
    LASER_HIST = 3
    NUM_ENV = 1  # the number of agents in the environment
    OBS_SIZE = 512  # number of laser beams
    action_bound = [[0, -1], [1, 1]]  # the limitation of velocity

    # Set env and agent policy
    env = StageWorld(OBS_SIZE, index=0, num_env=NUM_ENV)  # index is useful for parallel programming, 0 is for the first agent
    trained_model_file = os.path.dirname(__file__) + '/policy/stage2.pth'
    policy = CNNPolicy(frames=LASER_HIST, action_space=2)
    policy.cpu()  # policy.cuda() for gpu
    state_dict = torch.load(trained_model_file,
                            map_location=torch.device('cpu'))  # torch.load(trained_model_file) for gpu
    policy.load_state_dict(state_dict)

    rospy.init_node('rl_collision_avoidance_tb3', anonymous=False)
    print('==================================\nrl_collision_avoidance node started')

    nn_tb3 = NN_tb3(env,
                    policy,
                    action_bound,
                    OBS_SIZE,
                    index=0,
                    num_env=NUM_ENV)
    rospy.on_shutdown(nn_tb3.on_shutdown)
    rospy.spin()
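# A minimal entry point for the node above. This is a sketch, not part of the
# original listing; it assumes run() lives in the script that is executed directly.
if __name__ == '__main__':
    try:
        run()
    except rospy.ROSInterruptException:
        pass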
logger_cal.setLevel(logging.INFO)
cal_f_handler = logging.FileHandler(cal_file, mode='a')
cal_f_handler.setLevel(logging.INFO)
logger_cal.addHandler(cal_f_handler)
# logger.info('rosport: %d robotIndex: %d rank:%d' % (arg_list.rosports[rank], arg_list.robotIds[rank], rank))

reward = None
action_bound = [[0, -1], [1, 1]]

# torch.manual_seed(1)
# np.random.seed(1)
if rank == 0:
    policy_path = policydir
    # policy_path = 'policy'
    # policy = MLPPolicy(obs_size, act_size)
    policy = CNNPolicy(frames=LASER_HIST, action_space=2)
    policy.cuda()
    opt = Adam(policy.parameters(), lr=LEARNING_RATE)
    mse = nn.MSELoss()

    if not os.path.exists(policy_path):
        os.makedirs(policy_path)

    file = policy_path + '/Stage1_%d' % ID_EP
    if os.path.exists(file):
        logger.info('####################################')
        logger.info('############Loading Model###########')
        logger.info('####################################')
        state_dict = torch.load(file)
        policy.load_state_dict(state_dict)
    else:
class SAC(object):
    def __init__(self, num_inputs, action_space, param, action_bound):
        self.action_bound = action_bound
        self.alpha = param['alpha']
        self.gamma = param['gamma']
        self.tau = param['tau']
        self.target_update_interval = param['target_update_interval']
        self.automatic_entropy_tuning = param['automatic_entropy_tuning']
        self.lr = param['lr']

        self.device = torch.device("cuda" if param['cuda'] else "cpu")

        self.critic = QNetwork(num_inputs, action_space).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(num_inputs, action_space).to(self.device)
        hard_update(self.critic_target, self.critic)

        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper
        if self.automatic_entropy_tuning is True:
            self.target_entropy = -torch.prod(
                torch.Tensor([action_space]).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=param['lr'])

        self.policy = CNNPolicy(num_inputs, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=param['lr'])

    def select_action(self, state, evaluate=False):
        # state is a tuple of (stacked laser observations, goal list, speed)
        obs_stack, goal_list, speed = state
        obs_stack = torch.from_numpy(obs_stack).float().to(self.device)
        goal_list = torch.from_numpy(goal_list).float().to(self.device)
        speed = torch.from_numpy(speed).float().to(self.device)

        if evaluate is False:
            action, _, _ = self.policy.sample(obs_stack, goal_list, speed)
        else:
            _, _, action = self.policy.sample(obs_stack, goal_list, speed)

        a = action.data.cpu().numpy()
        scaled_action = np.clip(a,
                                a_min=self.action_bound[0],
                                a_max=self.action_bound[1])
        return scaled_action

    def update_parameters(self, memory, updates):
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = memory
        obss, goals, speeds = state_batch
        obss_, goals_, speeds_ = next_state_batch

        obss = torch.FloatTensor(obss).to(self.device)
        goals = torch.FloatTensor(goals).to(self.device)
        speeds = torch.FloatTensor(speeds).to(self.device)
        obss_ = torch.FloatTensor(obss_).to(self.device)
        goals_ = torch.FloatTensor(goals_).to(self.device)
        speeds_ = torch.FloatTensor(speeds_).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        done_batch = torch.FloatTensor(done_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                obss_, goals_, speeds_)
            qf1_next_target, qf2_next_target = self.critic_target(
                obss_, goals_, speeds_, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            # done_batch multiplies the bootstrapped term, so it is expected to act
            # as a non-terminal mask (0 at episode end, 1 otherwise)
            next_q_value = reward_batch + done_batch * self.gamma * min_qf_next_target

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(obss, goals, speeds, action_batch)
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        pi, log_pi, _ = self.policy.sample(obss, goals, speeds)
        qf1_pi, qf2_pi = self.critic(obss, goals, speeds, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        # Update both Q-functions with a single combined backward pass and one
        # optimizer step per update
        self.critic_optim.zero_grad()
        (qf1_loss + qf2_loss).backward()
        self.critic_optim.step()
        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), \
            alpha_loss.item(), alpha_tlogs.item()

    def get_policy_state_dict(self):
        return self.policy.state_dict()

    def load_policy_state_dict(self, statedict):
        self.policy.load_state_dict(statedict)

    def load_model(self, actor_path, critic_path):
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))

    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)
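# A usage sketch for the SAC agent above, not part of the original listing. The
# param keys mirror the ones read in __init__; the numeric values, num_inputs=3
# (stacked laser frames) and action_space=2 are illustrative assumptions that
# follow how CNNPolicy is constructed elsewhere in this code.
param = {
    'alpha': 0.2,
    'gamma': 0.99,
    'tau': 0.005,
    'target_update_interval': 1,
    'automatic_entropy_tuning': True,
    'lr': 3e-4,
    'cuda': torch.cuda.is_available(),
}
agent = SAC(num_inputs=3, action_space=2, param=param,
            action_bound=[[0, -1], [1, 1]])
# state is an (obs_stack, goal_list, speed) tuple of numpy arrays, as in select_action:
# action = agent.select_action(state)
# and a sampled batch from the replay memory drives the update:
# losses = agent.update_parameters(memory, updates)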
# Set parameters of env
LASER_HIST = 3
NUM_ENV = 10  # the number of agents in the environment
OBS_SIZE = 512  # number of laser beams
action_bound = [[0, -1], [1, 1]]  # the limitation of velocity

# Set env and agent policy
env = StageWorld(OBS_SIZE, index=0, num_env=NUM_ENV)  # index is useful for parallel programming, 0 is for the first agent
env.reset_world()
env.reset_pose()

trained_model_file = 'policy/stage2.pth'
policy = CNNPolicy(frames=LASER_HIST, action_space=2)
policy.cpu()  # policy.cuda() for gpu
state_dict = torch.load(trained_model_file,
                        map_location=torch.device('cpu'))  # torch.load(trained_model_file) for gpu
policy.load_state_dict(state_dict)

# Set fake obstacles by fake lidar info
obstacles_location = []
for i in range(NUM_ENV - 1):
    obstacles_location.append(np.zeros(512))
    obstacles_location[i][10 * i:10 * i + 10] = 0.1 * i + 0.05

# Configure agent as host
obs = env.get_laser_observation()
for i in range(NUM_ENV - 1):