from collections import deque
from copy import deepcopy

import numpy as np

# RunEnv and DDPG are provided elsewhere in the project.


def test(rank, args):
    env = RunEnv(True)
    env.seed(args.seed + rank)
    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    # The agent sees a stack of the last four observations.
    agent = DDPG(nb_states * 4, nb_actions, args)
    agent.load_weights("weights")
    agent.is_training = False
    agent.eval()

    done = True
    policy = lambda x: agent.select_action(x, decay_epsilon=False)
    last_reward = -10
    episode = 0
    observation = None
    observations = None
    episode_reward = 0.
    step = 0
    best_episode_reward = -10

    while True:
        # reset at the start of episode
        if observation is None:
            observation = deepcopy(env.reset())
            agent.reset(observation)
            observations = deque(
                [observation, observation, observation, observation], 4)

        episode_steps = 0
        episode_reward = 0.
        done = False
        while not done:
            action = policy(
                np.concatenate(list(observations)).ravel().tolist())
            observation, reward, done, info = env.step(action)
            if observation:
                observations.appendleft(observation)
            episode_reward += reward
            episode_steps += 1
            step += 1

        episode += 1
        observation = None
        observations = None
        best_episode_reward = max(episode_reward, best_episode_reward)
        print('#Ep{}: episode_reward:{:.3f} episode_steps:{} '.format(
            episode, episode_reward, episode_steps))
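# A minimal, self-contained sketch (not part of the project) of the frame
# stacking used in test() above: the last four observations live in a deque and
# their concatenation is fed to the policy, which is why the agent is built
# with nb_states * 4 inputs. The observation size here is made up.
from collections import deque

import numpy as np

nb_states = 41                       # hypothetical per-frame observation size
obs = np.zeros(nb_states)            # stand-in for env.reset()
frames = deque([obs] * 4, maxlen=4)  # newest frame sits at index 0

new_obs = np.ones(nb_states)         # stand-in for the next env.step() output
frames.appendleft(new_obs)           # oldest frame is dropped automatically

policy_input = np.concatenate(list(frames)).ravel()
assert policy_input.shape == (4 * nb_states,)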
class Policy_Domain:
    def __init__(self, observation_space, action_space):
        self.config = Config()
        self.agent_ddpg = DDPG(observation_space, action_space, self.config)
        if not (self.config.env == 'UAV'):
            self.agent_ddpg.load_weights(self.config.save_trained_models)
            self.agent_ddpg.eval()
        self.current_direct_wrong = 'north'
        self.min_distance_x = 50.0
        self.min_distance_y = 50.0

    def forward(self, state, time_step, args, reset_flag=False):
        if args.demo_type == 'uav':
            if args:
                coefs = args.variance * 2
                prior_decay = args.prior_decay
            else:
                coefs = [0.09, 0.09]
                prior_decay = 0.005
            time_step = torch.Tensor([time_step])[0]
            perspective = torch.atan(state[12] / state[13])
            first_perspective = torch.where(
                state[13] > 0,
                torch.where(state[12] > 0, perspective / np.pi * 180.0,
                            (perspective + 2 * np.pi) / np.pi * 180.0),
                (perspective + np.pi) / np.pi * 180.0)
            target = torch.atan(state[10] / state[11])
            position_target = torch.where(
                state[11] > 0,
                torch.where(state[10] > 0, target / np.pi * 180.0,
                            (target + 2 * np.pi) / np.pi * 180.0),
                (target + np.pi) / np.pi * 180.0)
            first_target = torch.remainder(first_perspective - position_target,
                                           360.0)
            average_direction = torch.where(
                torch.sign(180.0 - first_target) + 1.0 > 0,
                -first_target / 180.0, (360.0 - first_target) / 180.0)
            variance_direction = 0.0 * average_direction + coefs[0]  # 0.1
            turning_free = torch.where(
                torch.sign(4 - torch.argmin(state[0:9]).float()) + 1.0 > 0,
                45.0 + 0 * average_direction, -45.0 + 0 * average_direction)
            average_free = turning_free / 180.0
            variance_free = 0.0 * average_free + coefs[0]  # 0.1
            average_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                average_direction, average_free)
            variance_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                variance_direction, variance_free)
            speed = state[14]
            average_throttle = torch.clamp(2.5 - 50 * (speed / 2 + 0.5), -0.5,
                                           0.5)
            variance_throttle = 0.0 * average_throttle + coefs[1]
            decay = prior_decay * (time_step - 1) + 1
            covariance = torch.cat(
                (variance_steer.unsqueeze_(0), variance_throttle.unsqueeze_(0)),
                0) * decay
            average = torch.cat(
                (average_steer.unsqueeze_(0), average_throttle.unsqueeze_(0)),
                0)
        elif args.demo_type == 'uav_wrong':
            if reset_flag:
                self.current_direct_wrong = 'north'
                self.min_distance_x = 50.0
                self.min_distance_y = 50.0
            if args:
                coefs = args.variance * 2
                prior_decay = args.prior_decay
            else:
                coefs = [0.09, 0.09]
                prior_decay = 0.005
            time_step = torch.Tensor([time_step])[0]
            perspective = torch.atan(state[12] / state[13])
            first_perspective = torch.where(
                state[13] > 0,
                torch.where(state[12] > 0, perspective / np.pi * 180.0,
                            (perspective + 2 * np.pi) / np.pi * 180.0),
                (perspective + np.pi) / np.pi * 180.0)
            target = torch.atan(state[10] / state[11])
            position_target = torch.where(
                state[11] > 0,
                torch.where(state[10] > 0, target / np.pi * 180.0,
                            (target + 2 * np.pi) / np.pi * 180.0),
                (target + np.pi) / np.pi * 180.0)
            distance = (state[9] / 2 + 0.5) * \
                (torch.sqrt(torch.Tensor([2])[0]) * 3000)
            distance_y = torch.abs(distance * torch.sin(
                2 * position_target / 360 * torch.Tensor([np.pi])[0]))
            distance_x = torch.abs(distance * torch.cos(
                2 * position_target / 360 * torch.Tensor([np.pi])[0]))
            if distance_y > self.min_distance_y:
                self.current_direct_wrong = 'north'
            elif distance_x > self.min_distance_x:
                if self.current_direct_wrong == 'north':
                    self.min_distance_x -= 5
                    self.current_direct_wrong = 'east'
            else:
                if self.current_direct_wrong == 'east':
                    self.min_distance_y -= 5
                    self.current_direct_wrong = 'north'
            if self.current_direct_wrong == 'north':
                if position_target > 0 and position_target < 180:
                    position_target = 90
                else:
                    position_target = 270
            else:
                if position_target < 90 or position_target > 270:
                    position_target = 0
                else:
                    position_target = 180
            first_target = torch.remainder(first_perspective - position_target,
                                           360.0)
            average_direction = torch.where(
                torch.sign(180.0 - first_target) + 1.0 > 0,
                -first_target / 180.0, (360.0 - first_target) / 180.0)
            variance_direction = 0.0 * average_direction + coefs[0]  # 0.1
            turning_free = torch.where(
                torch.sign(4 - torch.argmin(state[0:9]).float()) + 1.0 > 0,
                45.0 + 0 * average_direction, -45.0 + 0 * average_direction)
            average_free = turning_free / 180.0
            variance_free = 0.0 * average_free + coefs[0]  # 0.1
            average_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                average_direction, average_free)
            variance_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                variance_direction, variance_free)
            speed = state[14]
            average_throttle = torch.clamp(2.5 - 50 * (speed / 2 + 0.5), -0.5,
                                           0.5)
            variance_throttle = 0.0 * average_throttle + coefs[1]
            decay = prior_decay * (time_step - 1) + 1
            covariance = torch.cat(
                (variance_steer.unsqueeze_(0), variance_throttle.unsqueeze_(0)),
                0) * decay
            average = torch.cat(
                (average_steer.unsqueeze_(0), average_throttle.unsqueeze_(0)),
                0)
        else:
            average = self.agent_ddpg.select_action(state)
            time_step = torch.Tensor([time_step])[0]
            decay = args.prior_decay * (time_step - 1) + 1
            covariance = torch.ones(average.shape) * 0.1 * decay
        return average, covariance

    def action_sample(self, state, time_step, args):
        average, covariance = self.forward(state, time_step, args)
        eps = torch.Tensor(np.random.normal(0, 1, average.shape))
        action = average + eps * covariance.sqrt()
        return action
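# Hedged, stand-alone sketch (not project code), assuming state[12]/state[13]
# are the sine and cosine of the heading: the nested torch.where in forward()
# converts that pair into a bearing in [0, 360) degrees. Away from the
# coordinate axes this matches atan2 followed by a modulo-360 wrap, which is an
# easier way to read the intent of that expression.
import numpy as np
import torch


def bearing_deg(sin_h: torch.Tensor, cos_h: torch.Tensor) -> torch.Tensor:
    """Bearing in [0, 360) degrees, mirroring the nested torch.where logic."""
    angle = torch.atan(sin_h / cos_h)
    return torch.where(
        cos_h > 0,
        torch.where(sin_h > 0, angle / np.pi * 180.0,
                    (angle + 2 * np.pi) / np.pi * 180.0),
        (angle + np.pi) / np.pi * 180.0)


theta = torch.tensor([0.3, 2.0, 4.0])            # sample headings in radians
sin_h, cos_h = torch.sin(theta), torch.cos(theta)
reference = torch.remainder(torch.rad2deg(torch.atan2(sin_h, cos_h)), 360.0)
assert torch.allclose(bearing_deg(sin_h, cos_h), reference, atol=1e-3)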
env_dir = base_dir + env_name + '/'
for optimizer in [args.optimizer]:  # ['RMSprop', 'SGLD_thermal_0.01', 'SGLD_thermal_0.001', 'SGLD_thermal_0.0001', 'SGLD_thermal_1e-05']
    for noise_type in [args.action_noise]:
        noise_dir = env_dir + optimizer + '/' + noise_type + '/nr_mdp_' + str(args.alpha) + '_1/'
        if os.path.exists(noise_dir):
            for subdir in sorted(os.listdir(noise_dir)):
                results = {}
                run_number = 0
                dir = noise_dir + subdir  # + '/' + str(run_number)
                print(dir)
                if os.path.exists(noise_dir + subdir) \
                        and not os.path.isfile(noise_dir + subdir + '/results_' + args.eval_type):
                    while os.path.exists(dir):
                        load_model(agent=agent, basedir=dir)
                        agent.eval()
                        if 'model' in args.eval_type:
                            if 'noise' in args.eval_type:
                                test_episodes = 10
                                for mass in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]:  # np.linspace(0.8, 1.2, 10)
                                    if mass not in results:
                                        results[mass] = {}
                                    for alpha in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]:  # np.linspace(0, 0.5, 10)
                                        if alpha not in results[mass]:
                                            results[mass][alpha] = []
                                        for _ in range(test_episodes):
                                            r = eval_model(env, alpha)
                                            results[mass][alpha].append(r)
                            else:
                                for mass in [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]:  # np.linspace(0.8, 1.2, 20)
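# Hedged sketch (an assumption, not project code): the evaluation loop above
# fills a nested dict results[mass][alpha] -> list of episode returns. One
# plausible way to summarize it afterwards is to average over episodes per
# (mass, alpha) cell.
import numpy as np


def summarize(results):
    """Return {mass: {alpha: mean_return}} from the nested results dict."""
    return {
        mass: {alpha: float(np.mean(returns))
               for alpha, returns in per_alpha.items()}
        for mass, per_alpha in results.items()
    }


example = {1.0: {0.0: [10.0, 12.0], 0.1: [8.0, 9.0]}}
print(summarize(example))  # {1.0: {0.0: 11.0, 0.1: 8.5}}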
                  noise_name=model_args['noise'],
                  buffer_capacity=model_args['buffer_capacity'],
                  batch_size=model_args['batch_size'],
                  gamma=model_args['gamma'],
                  tau=model_args['tau'],
                  episodes=model_args['episodes'],
                  learning_rate=model_args['learning_rate'],
                  episode_length=model_args['episode_length'],
                  actor_layers=model_args['actor_layers'],
                  critic_layers=model_args['critic_layers'],
                  norm=model_args['norm'],
                  log=model_args['log'],
                  log_name=model_args['log_name'],
                  render=model_args['render'],
                  save=model_args['save'],
                  save_path=model_args['save_path'])

if model_args['load'] is not None:
    model.load_model(model_args['load'])
if model_args['train']:
    model.train()
if model_args['eval']:
    r = model.eval(episodes=model_args['eval_episodes'],
                   episode_length=model_args['eval_ep_length'],
                   render=model_args['eval_render'])
    r_range = env.reward_range
    print("Evaluation: mean reward = " + str(r) + ", in " +
          str(model_args['eval_episodes']) + " episodes (length=" +
          str(model_args['eval_ep_length']) + ", reward-range=" +
          str(r_range) + ")")
env.close()
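# Hedged sketch (all values below are made up): the snippet above reads a flat
# model_args dict. A minimal stand-in covering the keys it uses might look like
# this; the real project presumably builds it from an argparse namespace or a
# config file.
model_args = {
    'noise': 'ou',                 # hypothetical noise name
    'buffer_capacity': 1_000_000,
    'batch_size': 64,
    'gamma': 0.99,
    'tau': 0.005,
    'episodes': 1000,
    'learning_rate': 1e-3,
    'episode_length': 1000,
    'actor_layers': [400, 300],
    'critic_layers': [400, 300],
    'norm': None,
    'log': False,
    'log_name': None,
    'render': False,
    'save': False,
    'save_path': './checkpoints',
    'load': None,
    'train': True,
    'eval': True,
    'eval_episodes': 10,
    'eval_ep_length': 1000,
    'eval_render': False,
}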
def test(rank, args, ns, best_result):
    env = RunEnv(False)
    env.seed(args.seed + rank)
    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    if args.use_more_states:
        agent = DDPG(nb_states * args.num_states, nb_actions, args)
    else:
        agent = DDPG(nb_states, nb_actions, args)
    if args.load_weights:
        agent.load_weights("weights")
    agent.is_training = False
    agent.eval()

    done = True
    policy = lambda x: agent.select_action(x, decay_epsilon=False)
    last_reward = -10
    episode = 0
    observation = None
    observations = None
    episode_reward = 0.
    step = 0
    best_episode_reward = -10

    while True:
        # reset at the start of episode
        if observation is None:
            observation = deepcopy(env.reset())
            agent.reset(observation)
            if args.use_more_states:
                observations = deque(
                    list(observation for i in range(2**args.num_states)),
                    2**args.num_states)
                # observations = deque(list(observation for i in range(args.num_states)), args.num_states)

        if best_result.value > best_episode_reward and step > args.warmup:
            best_model = deepcopy(ns.best_model)
            test_agent = deepcopy(agent)
            test_agent.load_state_dict(best_model.state_dict())
            if test_new_state_dict(test_agent,
                                   episode_reward,
                                   env,
                                   use_more_states=args.use_more_states,
                                   num_states=args.num_states):
                agent = test_agent
                agent.best_reward = best_model.best_reward
                prRed("updated test agent from ns {:.3f}".format(
                    best_model.best_reward))
                last_reward = best_result.value
                observation = None

        episode_steps = 0
        episode_reward = 0.

        # start episode
        if observation is None:
            observation = deepcopy(env.reset())
            agent.reset(observation)
            if args.use_more_states:
                observations = deque(
                    list(observation for i in range(2**args.num_states)),
                    2**args.num_states)
                # observations = deque(list(observation for i in range(args.num_states)), args.num_states)

        done = False
        while not done:
            if args.use_more_states:
                cur_observations = list()
                for i in range(args.num_states):
                    cur_observations.append(list(observations)[2**i - 1])
                action = policy(
                    np.concatenate(list(cur_observations)).ravel().tolist())
            else:
                action = policy(observation)
            observation, reward, done, info = env.step(action)
            if args.use_more_states and observation:
                observations.appendleft(observation)
            episode_reward += reward
            episode_steps += 1
            step += 1

        if episode % 50 == 0 and episode != 0:
            print("saving models")
            os.makedirs("weights", exist_ok=True)
            agent.save_model("weights")

        episode += 1
        observation = None
        observations = None
        current_best_result = best_result.value
        best_episode_reward = max(episode_reward, best_episode_reward)
        best_result.value = max(episode_reward, current_best_result - 0.05)
        print(
            '#Ep{}: episode_reward:{:.3f} episode_steps:{} br: {:.3f} -> {:.3f}'
            .format(episode, episode_reward, episode_steps,
                    current_best_result, best_result.value))
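# Hedged, stand-alone sketch (not project code): with use_more_states the loop
# above keeps the last 2**num_states observations and feeds the policy only the
# frames at indices 2**i - 1 (0, 1, 3, 7, ...), i.e. an exponentially spaced
# slice of the history rather than every frame.
from collections import deque

import numpy as np

num_states = 3
frames = deque(maxlen=2**num_states)
for t in range(2**num_states):           # newest frame ends up at index 0
    frames.appendleft(np.array([float(t)]))

picked = [list(frames)[2**i - 1] for i in range(num_states)]
print([float(f[0]) for f in picked])     # frames 0, 1 and 3 steps old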
class Policy_Domain:
    def __init__(self, observation_space, action_space):
        self.config = Config()
        self.agent_ddpg = DDPG(observation_space, action_space, self.config)
        if not (self.config.env == 'UAV'):
            self.agent_ddpg.load_weights(self.config.save_trained_models)
            self.agent_ddpg.eval()
        self.current_direct_wrong = 'north'
        self.min_distance_x = 50.0
        self.min_distance_y = 50.0

    def forward(self, state, time_step, args, reset_flag=False):
        if args.demo_type == 'uav':  # SenAvo
            if args.variance and args.prior_decay:
                coefs = [args.variance, args.variance]
                # prior_decay: factor on prior_sigma, introduced to weaken the
                # prior policy's influence over time
                prior_decay = args.prior_decay
            else:
                coefs = [0.09, 0.09]
                prior_decay = 0.005
            time_step = torch.Tensor([time_step])[0]
            perspective = torch.atan(state[12] / state[13])
            first_perspective = torch.where(
                state[13] > 0,  # state[13]: cos of the heading; the result is in degrees, not radians
                torch.where(state[12] > 0, perspective / np.pi * 180.0,
                            (perspective + 2 * np.pi) / np.pi * 180.0),
                (perspective + np.pi) / np.pi * 180.0)
            # angle of the line between the agent and the target
            target = torch.atan(state[10] / state[11])
            position_target = torch.where(
                state[11] > 0,
                torch.where(state[10] > 0, target / np.pi * 180.0,
                            (target + 2 * np.pi) / np.pi * 180.0),
                (target + np.pi) / np.pi * 180.0)
            # relative angle; torch.remainder(input, divisor) returns the
            # elementwise remainder, which has the same sign as the divisor
            first_target = torch.remainder(
                first_perspective - position_target, 360.0)
            # normalize the relative angle; torch.sign() returns 1.0 for
            # positive inputs and -1.0 for negative ones: angles below 180 are
            # divided by 180 directly, otherwise the complementary angle is used
            average_direction = torch.where(
                torch.sign(180.0 - first_target) + 1.0 > 0,
                -first_target / 180.0, (360.0 - first_target) / 180.0)
            variance_direction = 0.1 * average_direction + coefs[0]  # 0.1
            # torch.argmin returns the index of the smallest element; state[0:9]
            # holds distances along the basic directions. If the index of the
            # minimum distance indicates an imminent collision on the left,
            # take the former (+45 degrees), otherwise the latter
            turning_free = torch.where(
                torch.sign(4 - torch.argmin(state[0:9]).float()) + 1.0 > 0,
                45.0 + 0.1 * average_direction,
                -45.0 + 0.1 * average_direction)  # 0 0.1
            average_free = turning_free / 180.0  # direction to steer toward
            variance_free = 0.1 * average_free + coefs[0]  # 0.1
            # is the nearest obstacle farther than the collision distance?
            # If so, no turn is needed; otherwise adjust the heading
            average_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                average_direction, average_free)
            variance_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                variance_direction, variance_free)
            speed = state[14]
            average_throttle = torch.clamp(2.5 - 50 * (speed / 2 + 0.5), -0.5,
                                           0.5)
            variance_throttle = 0.1 * average_throttle + coefs[1]  # 0.1
            decay = prior_decay * (time_step - 1) + 1
            covariance = torch.cat(  # concatenate along dim 0
                (variance_steer.unsqueeze_(0), variance_throttle.unsqueeze_(0)),
                0) * decay  # Eq. (25)
            average = torch.cat(
                (average_steer.unsqueeze_(0), average_throttle.unsqueeze_(0)),
                0)
        elif args.demo_type == 'uav_wrong':  # Naive
            if reset_flag:
                self.current_direct_wrong = 'north'
                self.min_distance_x = 50.0
                self.min_distance_y = 50.0
            if args:
                coefs = args.variance * 2
                prior_decay = args.prior_decay
            else:
                coefs = [0.09, 0.09]
                prior_decay = 0.005
            time_step = torch.Tensor([time_step])[0]
            perspective = torch.atan(state[12] / state[13])
            first_perspective = torch.where(
                state[13] > 0,
                torch.where(state[12] > 0, perspective / np.pi * 180.0,
                            (perspective + 2 * np.pi) / np.pi * 180.0),
                (perspective + np.pi) / np.pi * 180.0)
            target = torch.atan(state[10] / state[11])
            position_target = torch.where(
                state[11] > 0,
                torch.where(state[10] > 0, target / np.pi * 180.0,
                            (target + 2 * np.pi) / np.pi * 180.0),
                (target + np.pi) / np.pi * 180.0)
            distance = (state[9] / 2 + 0.5) * \
                (torch.sqrt(torch.Tensor([2])[0]) * 3000)
            distance_y = torch.abs(distance * torch.sin(
                2 * position_target / 360 * torch.Tensor([np.pi])[0]))
            distance_x = torch.abs(distance * torch.cos(
                2 * position_target / 360 * torch.Tensor([np.pi])[0]))
            if distance_y > self.min_distance_y:
                self.current_direct_wrong = 'north'
            elif distance_x > self.min_distance_x:
                if self.current_direct_wrong == 'north':
                    self.min_distance_x -= 5
                    self.current_direct_wrong = 'east'
            else:
                if self.current_direct_wrong == 'east':
                    self.min_distance_y -= 5
                    self.current_direct_wrong = 'north'
            if self.current_direct_wrong == 'north':
                if position_target > 0 and position_target < 180:
                    position_target = 90
                else:
                    position_target = 270
            else:
                if position_target < 90 or position_target > 270:
                    position_target = 0
                else:
                    position_target = 180
            first_target = torch.remainder(first_perspective - position_target,
                                           360.0)
            average_direction = torch.where(
                torch.sign(180.0 - first_target) + 1.0 > 0,
                -first_target / 180.0, (360.0 - first_target) / 180.0)
            variance_direction = 0.0 * average_direction + coefs[0]  # 0.1
            turning_free = torch.where(
                torch.sign(4 - torch.argmin(state[0:9]).float()) + 1.0 > 0,
                45.0 + 0 * average_direction, -45.0 + 0 * average_direction)
            average_free = turning_free / 180.0
            variance_free = 0.0 * average_free + coefs[0]  # 0.1
            average_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                average_direction, average_free)
            variance_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                variance_direction, variance_free)
            speed = state[14]
            average_throttle = torch.clamp(2.5 - 50 * (speed / 2 + 0.5), -0.5,
                                           0.5)
            variance_throttle = 0.0 * average_throttle + coefs[1]
            decay = prior_decay * (time_step - 1) + 1
            covariance = torch.cat(
                (variance_steer.unsqueeze_(0), variance_throttle.unsqueeze_(0)),
                0) * decay
            average = torch.cat(
                (average_steer.unsqueeze_(0), average_throttle.unsqueeze_(0)),
                0)
        else:
            # no hand-crafted prior: fall back to the DDPG agent's action
            average = self.agent_ddpg.select_action(state)
            time_step = torch.Tensor([time_step])[0]
            decay = args.prior_decay * (time_step - 1) + 1
            covariance = torch.ones(average.shape) * 0.1 * decay
        return average, covariance

    def action_sample(self, state, time_step, args):
        average, covariance = self.forward(state, time_step, args)
        eps = torch.Tensor(np.random.normal(0, 1, average.shape))
        action = average + eps * covariance.sqrt()
        return action
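# Hedged, stand-alone sketch (not project code): action_sample above draws from
# a diagonal Gaussian N(average, diag(covariance)), where covariance is scaled
# by decay = prior_decay * (t - 1) + 1, so the prior's variance grows linearly
# with the time step (per the original comment, this is meant to reduce the
# prior policy's influence over time). The numbers below are illustrative.
import numpy as np
import torch


def sample_from_prior(average: torch.Tensor,
                      covariance: torch.Tensor) -> torch.Tensor:
    # reparameterized draw: mean + eps * sqrt(variance)
    eps = torch.Tensor(np.random.normal(0, 1, average.shape))
    return average + eps * covariance.sqrt()


average = torch.tensor([0.2, -0.1])    # e.g. (steer, throttle) means
base_cov = torch.tensor([0.09, 0.09])  # coefs[0], coefs[1] from above
prior_decay = 0.005
for t in [1, 100, 400]:
    decay = prior_decay * (t - 1) + 1
    print(t, sample_from_prior(average, base_cov * decay))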