# Imports these snippets rely on; RunEnv, DDPG, Config, prRed and load_model
# are provided by the surrounding project (an osim-rl style environment and a
# DDPG agent implementation).
import os
from collections import deque
from copy import deepcopy

import numpy as np
import torch


def test(rank, args):

    env = RunEnv(True)
    env.seed(args.seed + rank)
    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    agent = DDPG(nb_states * 4, nb_actions, args)
    agent.load_weights("weights")
    agent.is_training = False
    agent.eval()

    done = True
    policy = lambda x: agent.select_action(x, decay_epsilon=False)
    last_reward = -10
    episode = 0
    observation = None
    observations = None
    episode_reward = 0.
    step = 0
    best_episode_reward = -10
    while True:
        # reset at the start of episode
        if observation is None:
            observation = deepcopy(env.reset())
            agent.reset(observation)
            observations = deque(
                [observation, observation, observation, observation], 4)

        episode_steps = 0
        episode_reward = 0.

        done = False
        while not done:

            action = policy(
                np.concatenate(list(observations)).ravel().tolist())

            observation, reward, done, info = env.step(action)
            if observation:
                observations.appendleft(observation)

            episode_reward += reward
            episode_steps += 1
            step += 1

        episode += 1
        observation = None
        observations = None
        best_episode_reward = max(episode_reward, best_episode_reward)
        print('#Ep{}: episode_reward:{:.3f} episode_steps:{} '.format(
            episode, episode_reward, episode_steps))
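
# --- Illustration (standalone sketch, not taken from the example above) ------
# The loop above keeps the 4 most recent observations in a deque and feeds
# their concatenation to the policy, which is why the agent is constructed
# with nb_states * 4 inputs. Minimal version with an assumed frame size of 3:
from collections import deque
import numpy as np

_obs_dim = 3
_frames = deque([np.zeros(_obs_dim)] * 4, maxlen=4)
_frames.appendleft(np.ones(_obs_dim))                  # newest frame first
_stacked = np.concatenate(list(_frames)).ravel()       # shape: (4 * _obs_dim,)
assert _stacked.shape == (4 * _obs_dim,)
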
# Example 2
class Policy_Domain:
    def __init__(self, observation_space, action_space):
        self.config = Config()
        self.agent_ddpg = DDPG(observation_space, action_space, self.config)
        if not (self.config.env == 'UAV'):
            self.agent_ddpg.load_weights(self.config.save_trained_models)
            self.agent_ddpg.eval()

        self.current_direct_wrong = 'north'
        self.min_distance_x = 50.0
        self.min_distance_y = 50.0

    def forward(self, state, time_step, args, reset_flag=False):
        if args.demo_type == 'uav':
            if args:
                coefs = args.variance * 2
                prior_decay = args.prior_decay
            else:
                coefs = [0.09, 0.09]
                prior_decay = 0.005
            time_step = torch.Tensor([time_step])[0]
            perspective = torch.atan(state[12] / state[13])
            first_perspective = torch.where(
                state[13] > 0,
                torch.where(state[12] > 0, perspective / np.pi * 180.0,
                            (perspective + 2 * np.pi) / np.pi * 180.0),
                (perspective + np.pi) / np.pi * 180.0)

            target = torch.atan(state[10] / state[11])
            position_target = torch.where(
                state[11] > 0,
                torch.where(state[10] > 0, target / np.pi * 180.0,
                            (target + 2 * np.pi) / np.pi * 180.0),
                (target + np.pi) / np.pi * 180.0)

            first_target = torch.remainder(first_perspective - position_target,
                                           360.0)

            average_direction = torch.where(
                torch.sign(180.0 - first_target) + 1.0 > 0,
                -first_target / 180.0, (360.0 - first_target) / 180.0)
            variance_direction = 0.0 * average_direction + coefs[0]  # 0.1

            turning_free = torch.where(
                torch.sign(4 - torch.argmin(state[0:9]).float()) + 1.0 > 0,
                45.0 + 0 * average_direction, -45.0 + 0 * average_direction)
            average_free = turning_free / 180.0
            variance_free = 0.0 * average_free + coefs[0]  # 0.1

            average_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                average_direction, average_free)
            variance_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                variance_direction, variance_free)

            speed = state[14]
            average_throttle = torch.clamp(2.5 - 50 * (speed / 2 + 0.5), -0.5,
                                           0.5)

            variance_throttle = 0.0 * average_throttle + coefs[1]
            decay = prior_decay * (time_step - 1) + 1

            covariance = torch.cat(
                (variance_steer.unsqueeze_(0),
                 variance_throttle.unsqueeze_(0)), 0) * decay

            average = torch.cat(
                (average_steer.unsqueeze_(0), average_throttle.unsqueeze_(0)),
                0)

        elif args.demo_type == 'uav_wrong':
            if reset_flag:
                self.current_direct_wrong = 'north'
                self.min_distance_x = 50.0
                self.min_distance_y = 50.0

            if args:
                coefs = args.variance * 2
                prior_decay = args.prior_decay
            else:
                coefs = [0.09, 0.09]
                prior_decay = 0.005
            time_step = torch.Tensor([time_step])[0]
            perspective = torch.atan(state[12] / state[13])
            first_perspective = torch.where(
                state[13] > 0,
                torch.where(state[12] > 0, perspective / np.pi * 180.0,
                            (perspective + 2 * np.pi) / np.pi * 180.0),
                (perspective + np.pi) / np.pi * 180.0)

            target = torch.atan(state[10] / state[11])
            position_target = torch.where(
                state[11] > 0,
                torch.where(state[10] > 0, target / np.pi * 180.0,
                            (target + 2 * np.pi) / np.pi * 180.0),
                (target + np.pi) / np.pi * 180.0)

            distance = (state[9] / 2 +
                        0.5) * (torch.sqrt(torch.Tensor([2])[0]) * 3000)

            distance_y = torch.abs(distance * torch.sin(
                2 * position_target / 360 * torch.Tensor([np.pi])[0]))
            distance_x = torch.abs(distance * torch.cos(
                2 * position_target / 360 * torch.Tensor([np.pi])[0]))

            if distance_y > self.min_distance_y:
                self.current_direct_wrong = 'north'
            elif distance_x > self.min_distance_x:
                if self.current_direct_wrong == 'north':
                    self.min_distance_x -= 5
                self.current_direct_wrong = 'east'
            else:
                if self.current_direct_wrong == 'east':
                    self.min_distance_y -= 5
                self.current_direct_wrong = 'north'

            if self.current_direct_wrong == 'north':
                if position_target > 0 and position_target < 180:
                    position_target = 90
                else:
                    position_target = 270

            else:
                if position_target < 90 or position_target > 270:
                    position_target = 0
                else:
                    position_target = 180

            first_target = torch.remainder(first_perspective - position_target,
                                           360.0)

            average_direction = torch.where(
                torch.sign(180.0 - first_target) + 1.0 > 0,
                -first_target / 180.0, (360.0 - first_target) / 180.0)
            variance_direction = 0.0 * average_direction + coefs[0]  # 0.1

            turning_free = torch.where(
                torch.sign(4 - torch.argmin(state[0:9]).float()) + 1.0 > 0,
                45.0 + 0 * average_direction, -45.0 + 0 * average_direction)

            average_free = turning_free / 180.0
            variance_free = 0.0 * average_free + coefs[0]  # 0.1
            average_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                average_direction, average_free)
            variance_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                variance_direction, variance_free)

            speed = state[14]
            average_throttle = torch.clamp(2.5 - 50 * (speed / 2 + 0.5), -0.5,
                                           0.5)
            variance_throttle = 0.0 * average_throttle + coefs[1]

            decay = prior_decay * (time_step - 1) + 1

            covariance = torch.cat(
                (variance_steer.unsqueeze_(0),
                 variance_throttle.unsqueeze_(0)), 0) * decay
            average = torch.cat(
                (average_steer.unsqueeze_(0), average_throttle.unsqueeze_(0)),
                0)
        else:
            average = self.agent_ddpg.select_action(state)
            time_step = torch.Tensor([time_step])[0]
            decay = args.prior_decay * (time_step - 1) + 1
            covariance = torch.ones(average.shape) * 0.1 * decay

        return average, covariance

    def action_sample(self, state, time_step, args):
        average, covariance = self.forward(state, time_step, args)
        eps = torch.Tensor(np.random.normal(0, 1, average.shape))
        action = average + eps * covariance.sqrt()
        return action
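
# --- Illustration (hypothetical usage; names outside the class above are assumptions)
# action_sample() draws an action from a Gaussian whose mean and per-dimension
# variance come from forward(): action = average + sqrt(covariance) * N(0, 1).
# The variance is widened over time via decay = prior_decay * (time_step - 1) + 1,
# so the hand-crafted prior's influence fades as training progresses.
#
#     prior = Policy_Domain(observation_space, action_space)
#     mean, var = prior.forward(state, time_step=1, args=args)
#     action = prior.action_sample(state, time_step=1, args=args)
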
    env_dir = base_dir + env_name + '/'
    for optimizer in [args.optimizer]: #['RMSprop', 'SGLD_thermal_0.01', 'SGLD_thermal_0.001', 'SGLD_thermal_0.0001', 'SGLD_thermal_1e-05']:
        for noise_type in [args.action_noise]: 
            noise_dir = env_dir + optimizer + '/' + noise_type + '/nr_mdp_' + str(args.alpha) + '_1/'	
            if os.path.exists(noise_dir):
                for subdir in sorted(os.listdir(noise_dir)):
                    results = {}
                    
                    run_number = 0
                    dir = noise_dir + subdir #+ '/' + str(run_number)
                    print(dir)
                    if os.path.exists(noise_dir + subdir) \
                            and not os.path.isfile(noise_dir + subdir + '/results_' + args.eval_type):
                        while os.path.exists(dir):
                            load_model(agent=agent, basedir=dir)
                            agent.eval()

                            if 'model' in args.eval_type:
                                if 'noise' in args.eval_type:
                                    test_episodes = 10
                                    for mass in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]: #np.linspace(0.8, 1.2, 10):
                                        if mass not in results:
                                            results[mass] = {}
                                        for alpha in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]: #np.linspace(0, 0.5, 10):
                                            if alpha not in results[mass]:
                                                results[mass][alpha] = []
                                            for _ in range(test_episodes):
                                                r = eval_model(env, alpha)
                                                results[mass][alpha].append(r)
                                else:
                                    for mass in [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]: #np.linspace(0.8, 1.2, 20):
# Example 4
                 noise_name=model_args['noise'],
                 buffer_capacity=model_args['buffer_capacity'],
                 batch_size=model_args['batch_size'],
                 gamma=model_args['gamma'],
                 tau=model_args['tau'],
                 episodes=model_args['episodes'],
                 learning_rate=model_args['learning_rate'],
                 episode_length=model_args['episode_length'],
                 actor_layers=model_args['actor_layers'],
                 critic_layers=model_args['critic_layers'],
                 norm=model_args['norm'],
                 log=model_args['log'],
                 log_name=model_args['log_name'],
                 render=model_args['render'],
                 save=model_args['save'],
                 save_path=model_args['save_path'])
    if model_args['load'] is not None:
        model.load_model(model_args['load'])
    if model_args['train']:
        model.train()
    if model_args['eval']:
        r = model.eval(episodes=model_args['eval_episodes'],
                       episode_length=model_args['eval_ep_length'],
                       render=model_args['eval_render'])
        r_range = env.reward_range
        print("Evaluation: mean reward = " + str(r) + ", in " +
              str(model_args['eval_episodes']) + " episodes(length=" +
              str(model_args['eval_ep_length']) + ", reward-range=" +
              str(r_range) + ")")
    env.close()
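
# --- Illustration (keys taken from the call site above; all values are hypothetical)
# A model_args dictionary that this driver code expects might look like:
#
#     model_args = {
#         'noise': 'ou', 'buffer_capacity': 100000, 'batch_size': 64,
#         'gamma': 0.99, 'tau': 0.005, 'episodes': 1000,
#         'learning_rate': 1e-3, 'episode_length': 500,
#         'actor_layers': [400, 300], 'critic_layers': [400, 300],
#         'norm': 'none', 'log': True, 'log_name': 'run0',
#         'render': False, 'save': True, 'save_path': './checkpoints',
#         'load': None, 'train': True, 'eval': True,
#         'eval_episodes': 10, 'eval_ep_length': 500, 'eval_render': False,
#     }
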
# Example 5
def test(rank, args, ns, best_result):

    env = RunEnv(False)
    env.seed(args.seed + rank)
    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]
    if args.use_more_states:
        agent = DDPG(nb_states * args.num_states, nb_actions, args)
    else:
        agent = DDPG(nb_states, nb_actions, args)

    if args.load_weights:
        agent.load_weights("weights")
    agent.is_training = False
    agent.eval()

    done = True
    policy = lambda x: agent.select_action(x, decay_epsilon=False)
    last_reward = -10
    episode = 0
    observation = None
    observations = None
    episode_reward = 0.
    step = 0
    best_episode_reward = -10
    while True:
        # reset at the start of episode
        if observation is None:
            observation = deepcopy(env.reset())
            agent.reset(observation)
            if args.use_more_states:
                observations = deque(
                    list(observation for i in range(2**args.num_states)),
                    2**args.num_states)
                # observations = deque(list(observation for i in range(args.num_states)), args.num_states)

        if best_result.value > best_episode_reward and step > args.warmup:
            best_model = deepcopy(ns.best_model)
            test_agent = deepcopy(agent)
            test_agent.load_state_dict(best_model.state_dict())
            if test_new_state_dict(test_agent,
                                   episode_reward,
                                   env,
                                   use_more_states=args.use_more_states,
                                   num_states=args.num_states):
                agent = test_agent
                agent.best_reward = best_model.best_reward
                prRed("updated test agent from ns {:.3f}".format(
                    best_model.best_reward))
            last_reward = best_result.value
            observation = None

        episode_steps = 0
        episode_reward = 0.

        # start episode
        if observation is None:
            observation = deepcopy(env.reset())
            agent.reset(observation)
            if args.use_more_states:
                observations = deque(
                    list(observation for i in range(2**args.num_states)),
                    2**args.num_states)
                # observations = deque(list(observation for i in range(args.num_states)), args.num_states)

        done = False
        while not done:
            if args.use_more_states:
                cur_observations = list()
                for i in range(args.num_states):
                    cur_observations.append(list(observations)[2**i - 1])
                action = policy(
                    np.concatenate(list(cur_observations)).ravel().tolist())
            else:
                action = policy(observation)

            observation, reward, done, info = env.step(action)
            if args.use_more_states and observation:
                observations.appendleft(observation)

            episode_reward += reward
            episode_steps += 1
            step += 1

        if episode % 50 == 0 and episode != 0:
            print("saving models")
            os.makedirs("weights", exist_ok=True)
            agent.save_model("weights")
        episode += 1
        observation = None
        observations = None
        current_best_result = best_result.value
        best_episode_reward = max(episode_reward, best_episode_reward)
        best_result.value = max(episode_reward, current_best_result - 0.05)
        print(
            '#Ep{}: episode_reward:{:.3f} episode_steps:{} br: {:.3f} -> {:.3f}'
            .format(episode, episode_reward, episode_steps,
                    current_best_result, best_result.value))
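
# --- Illustration (hypothetical wiring, not taken from the example above) ----
# This worker expects a shared best_result value and a namespace ns holding
# ns.best_model. One way such shared state could be created with a
# multiprocessing Manager (the args object is assumed to exist):
#
#     import torch.multiprocessing as mp
#
#     manager = mp.Manager()
#     ns = manager.Namespace()                 # training workers set ns.best_model
#     best_result = manager.Value('d', -10.0)  # shared best episode reward
#     mp.Process(target=test, args=(0, args, ns, best_result)).start()
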
# Example 6
class Policy_Domain:
    def __init__(self, observation_space, action_space):
        self.config = Config()
        self.agent_ddpg = DDPG(observation_space, action_space, self.config)
        if not (self.config.env == 'UAV'):
            self.agent_ddpg.load_weights(self.config.save_trained_models)
            self.agent_ddpg.eval()

        self.current_direct_wrong = 'north'
        self.min_distance_x = 50.0
        self.min_distance_y = 50.0

    def forward(self, state, time_step, args, reset_flag=False):
        if args.demo_type == 'uav':  # SenAvo
            if args.variance and args.prior_decay:
                coefs = [args.variance, args.variance]
                prior_decay = args.prior_decay  # prior_decay: factor that widens prior_sigma over time so the prior policy's influence fades
            else:
                coefs = [0.09, 0.09]
                prior_decay = 0.005
            time_step = torch.Tensor([time_step])[0]
            perspective = torch.atan(state[12] / state[13])
            first_perspective = torch.where(
                state[13] > 0,  # cosine of the heading angle (result expressed in degrees, not radians)
                torch.where(state[12] > 0, perspective / np.pi * 180.0,
                            (perspective + 2 * np.pi) / np.pi * 180.0),
                (perspective + np.pi) / np.pi * 180.0)

            target = torch.atan(state[10] / state[11])  # angle of the line from the agent to the target
            position_target = torch.where(
                state[11] > 0,
                torch.where(state[10] > 0, target / np.pi * 180.0,
                            (target + 2 * np.pi) / np.pi * 180.0),
                (target + np.pi) / np.pi * 180.0)

            first_target = torch.remainder(  # angle between heading and target; torch.remainder(input, divisor) returns the element-wise remainder with the same sign as the divisor
                first_perspective - position_target, 360.0)

            average_direction = torch.where(  # normalize the angle (torch.sign() returns 1.0 for positive inputs, -1.0 for negative): angles up to 180 deg map to -first_target/180, larger ones to (360 - first_target)/180
                torch.sign(180.0 - first_target) + 1.0 > 0,
                -first_target / 180.0, (360.0 - first_target) / 180.0)
            variance_direction = 0.1 * average_direction + coefs[0]  # 0.1

            turning_free = torch.where(  # state[0:9] holds obstacle distances along the basic headings; argmin gives the index of the closest one. Indices 0-4 steer +45 deg, indices 5-8 steer -45 deg
                torch.sign(4 - torch.argmin(state[0:9]).float()) + 1.0 > 0,
                45.0 + 0.1 * average_direction,
                -45.0 + 0.1 * average_direction)  # 0 0.1
            average_free = turning_free / 180.0  # 调整的方向
            variance_free = 0.1 * average_free + coefs[0]  # 0.1

            average_steer = torch.where(  # if the closest obstacle is farther than the collision threshold, steer toward the target; otherwise use the avoidance turn
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                average_direction, average_free)
            variance_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                variance_direction, variance_free)

            speed = state[14]
            average_throttle = torch.clamp(2.5 - 50 * (speed / 2 + 0.5), -0.5,
                                           0.5)

            variance_throttle = 0.1 * average_throttle + coefs[1]  # 0.1
            decay = prior_decay * (time_step - 1) + 1

            covariance = torch.cat(  # concatenate along dimension 0
                (variance_steer.unsqueeze_(0),
                 variance_throttle.unsqueeze_(0)), 0) * decay  # Eq. (25)

            average = torch.cat(
                (average_steer.unsqueeze_(0), average_throttle.unsqueeze_(0)),
                0)

        elif args.demo_type == 'uav_wrong':  # Naive
            if reset_flag:
                self.current_direct_wrong = 'north'
                self.min_distance_x = 50.0
                self.min_distance_y = 50.0

            if args:
                coefs = args.variance * 2
                prior_decay = args.prior_decay
            else:
                coefs = [0.09, 0.09]
                prior_decay = 0.005
            time_step = torch.Tensor([time_step])[0]
            perspective = torch.atan(state[12] / state[13])
            first_perspective = torch.where(
                state[13] > 0,
                torch.where(state[12] > 0, perspective / np.pi * 180.0,
                            (perspective + 2 * np.pi) / np.pi * 180.0),
                (perspective + np.pi) / np.pi * 180.0)

            target = torch.atan(state[10] / state[11])
            position_target = torch.where(
                state[11] > 0,
                torch.where(state[10] > 0, target / np.pi * 180.0,
                            (target + 2 * np.pi) / np.pi * 180.0),
                (target + np.pi) / np.pi * 180.0)

            distance = (state[9] / 2 + 0.5) * \
                (torch.sqrt(torch.Tensor([2])[0]) * 3000)

            distance_y = torch.abs(distance * torch.sin(
                2 * position_target / 360 * torch.Tensor([np.pi])[0]))
            distance_x = torch.abs(distance * torch.cos(
                2 * position_target / 360 * torch.Tensor([np.pi])[0]))

            if distance_y > self.min_distance_y:
                self.current_direct_wrong = 'north'
            elif distance_x > self.min_distance_x:
                if self.current_direct_wrong == 'north':
                    self.min_distance_x -= 5
                self.current_direct_wrong = 'east'
            else:
                if self.current_direct_wrong == 'east':
                    self.min_distance_y -= 5
                self.current_direct_wrong = 'north'

            if self.current_direct_wrong == 'north':
                if position_target > 0 and position_target < 180:
                    position_target = 90
                else:
                    position_target = 270

            else:
                if position_target < 90 or position_target > 270:
                    position_target = 0
                else:
                    position_target = 180

            first_target = torch.remainder(first_perspective - position_target,
                                           360.0)

            average_direction = torch.where(
                torch.sign(180.0 - first_target) + 1.0 > 0,
                -first_target / 180.0, (360.0 - first_target) / 180.0)
            variance_direction = 0.0 * average_direction + coefs[0]  # 0.1

            turning_free = torch.where(
                torch.sign(4 - torch.argmin(state[0:9]).float()) + 1.0 > 0,
                45.0 + 0 * average_direction, -45.0 + 0 * average_direction)

            average_free = turning_free / 180.0
            variance_free = 0.0 * average_free + coefs[0]  # 0.1
            average_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                average_direction, average_free)
            variance_steer = torch.where(
                torch.sign(100 * torch.min(state[0:9]) - 15.0) + 1.0 > 0,
                variance_direction, variance_free)

            speed = state[14]
            average_throttle = torch.clamp(2.5 - 50 * (speed / 2 + 0.5), -0.5,
                                           0.5)
            variance_throttle = 0.0 * average_throttle + coefs[1]

            decay = prior_decay * (time_step - 1) + 1

            covariance = torch.cat(
                (variance_steer.unsqueeze_(0),
                 variance_throttle.unsqueeze_(0)), 0) * decay
            average = torch.cat(
                (average_steer.unsqueeze_(0), average_throttle.unsqueeze_(0)),
                0)
        else:
            average = self.agent_ddpg.select_action(state)  # no hand-crafted domain prior: fall back to the pretrained DDPG agent's action
            time_step = torch.Tensor([time_step])[0]
            decay = args.prior_decay * (time_step - 1) + 1
            covariance = torch.ones(average.shape) * 0.1 * decay

        return average, covariance

    def action_sample(self, state, time_step, args):
        average, covariance = self.forward(state, time_step, args)
        eps = torch.Tensor(np.random.normal(0, 1, average.shape))
        action = average + eps * covariance.sqrt()
        return action
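
# --- Illustration (standalone sketch, not taken from the example above) ------
# The nested torch.atan / torch.where branching above recovers a heading in
# [0, 360) degrees from a (sin, cos) pair such as (state[12], state[13]).
# The same scalar logic in plain NumPy:
import numpy as np


def _angle_deg(sin_c, cos_c):
    # arctan lands in (-pi/2, pi/2); the branches shift the result into the
    # correct quadrant before converting radians to degrees.
    a = np.arctan(sin_c / cos_c)
    if cos_c > 0:
        a = a if sin_c > 0 else a + 2 * np.pi
    else:
        a = a + np.pi
    return a / np.pi * 180.0


assert abs(_angle_deg(0.7071, 0.7071) - 45.0) < 1e-2
assert abs(_angle_deg(-0.7071, 0.7071) - 315.0) < 1e-2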