Code example #1
    def predict(self, state):
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError(
                'Epsilon attribute has to be set in training phase')

        if self.reach_destination(state):
            return ActionXY(
                0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)

        self.last_state = self.transform(state)
        #print(self.last_state)

        action, feats = self.model(self.last_state[0].unsqueeze(0),
                                   self.last_state[1].unsqueeze(0))

        action = action.squeeze()
        action = ActionXY(
            action[0].item(),
            action[1].item()) if self.kinematics == 'holonomic' else ActionRot(
                action[0].item(), action[1].item())

        self.last_state = self.last_state, feats
        return action
Code example #2
    def build_action_space(self, v_pref):
        """
        Action space consists of 25 uniformly sampled actions in permitted range and 25 randomly sampled actions.
        """
        holonomic = True if self.kinematics == 'holonomic' else False
        speeds = [(np.exp(
            (i + 1) / self.speed_samples) - 1) / (np.e - 1) * v_pref
                  for i in range(self.speed_samples)]
        if holonomic:
            rotations = np.linspace(0,
                                    2 * np.pi,
                                    self.rotation_samples,
                                    endpoint=False)
        else:
            rotations = np.linspace(-np.pi / 2, np.pi / 2,
                                    self.rotation_samples)

        action_space = [ActionXY(0, 0) if holonomic else ActionRot(0, 0)]
        for rotation, speed in itertools.product(rotations, speeds):
            if holonomic:
                action_space.append(
                    ActionXY(speed * np.cos(rotation),
                             speed * np.sin(rotation)))
            else:
                action_space.append(ActionRot(speed, rotation))

        self.speeds = speeds
        self.rotations = rotations
        self.action_space = action_space
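
Note: the exponential spacing above places the speed samples more densely at low speeds (the gaps widen toward v_pref) while the last sample is exactly v_pref. A minimal standalone sketch of just that formula, assuming speed_samples = 5 (an illustrative value, not taken from any of the projects on this page):

import numpy as np

v_pref = 1.0
speed_samples = 5  # assumed value for illustration
speeds = [(np.exp((i + 1) / speed_samples) - 1) / (np.e - 1) * v_pref
          for i in range(speed_samples)]
print([round(s, 3) for s in speeds])
# -> [0.129, 0.286, 0.478, 0.713, 1.0]: finer resolution at low speeds, exactly v_pref at the top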
Code example #3
File: TD3_rl.py Project: nubot-nudt/SG-DQN
    def predict(self, state):
        """
        A base class for all methods that takes pairwise joint state as input to value network.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length)
        """
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError('Epsilon attribute has to be set in training phase')
        if self.phase == 'train':
            self.last_state = self.transform(state)
        if self.reach_destination(state):
            return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        state_tensor = state.to_tensor(add_batch_size=True, device=self.device)
        if self.phase == 'train':
            action = (
                self.actor(state_tensor).squeeze().detach().numpy()
                + np.random.normal(0, self.max_action * self.expl_noise, size=self.action_dim)
            ).clip(-self.max_action, self.max_action)
            speed = action[0]
            theta = action[1]
            Action = ActionXY(speed * np.cos(theta), speed * np.sin(theta)) \
                if self.kinematics == 'holonomic' else ActionRot(speed, theta)
            return Action, torch.tensor(action).float()
        else:
            with torch.no_grad():
                action = self.actor(state_tensor).squeeze().numpy()
                speed = action[0]
                theta = action[1]
                Action = ActionXY(speed * np.cos(theta), speed * np.sin(theta)) \
                    if self.kinematics == 'holonomic' else ActionRot(speed, theta)
            return Action, torch.tensor(action).float()
Code example #4
    def build_action_space(self, v_pref):
        """
        Action space consists of 25 uniformly sampled actions in permitted range and 25 randomly sampled actions.
        """
        holonomic = True if self.kinematics == 'holonomic' else False
        # speeds = [(np.exp((i + 1) / self.speed_samples) - 1) / (np.e - 1) * v_pref for i in range(self.speed_samples)]
        speeds = [(i + 1) / self.speed_samples * v_pref for i in range(self.speed_samples)]
        if holonomic:
            rotations = np.linspace(0, 2 * np.pi, self.rotation_samples, endpoint=False)
        else:
            if self.rotation_constraint == np.pi:
                rotations = np.linspace(-self.rotation_constraint, self.rotation_constraint, self.rotation_samples, endpoint=False)
            else:
                rotations = np.linspace(-self.rotation_constraint, self.rotation_constraint, self.rotation_samples)

        action_space = [ActionXY(0, 0) if holonomic else ActionRot(0, 0)]
        self.action_group_index.append(0)
        for j, speed in enumerate(speeds):
            for i, rotation in enumerate(rotations):
                action_index = j * self.rotation_samples + i + 1
                self.action_group_index.append(action_index)
                if holonomic:
                    action_space.append(ActionXY(speed * np.cos(rotation), speed * np.sin(rotation)))
                else:
                    action_space.append(ActionRot(speed, rotation))
        self.speeds = speeds
        self.rotations = rotations
        self.action_space = action_space
Code example #5
File: visual_sim.py Project: hyzcn/VisualNav
    def _build_action_space(self):
        speeds = [(np.exp((i + 1) / self.speed_samples) - 1) / (np.e - 1) * self.max_speed for i in
                  range(self.speed_samples)]
        rotations = np.linspace(-np.pi / 4, np.pi / 4, self.rotation_samples)

        actions = [ActionRot(0, 0)]
        for rotation, speed in itertools.product(rotations, speeds):
            actions.append(ActionRot(speed, rotation))

        return actions
Code example #6
    def build_action_space(self, v_pref):
        """
        Action space consists of 25 uniformly sampled actions in permitted range and 25 randomly sampled actions.
        """
        holonomic = True if self.kinematics == "holonomic" else False
        speeds = [(np.exp(
            (i + 1) / self.speed_samples) - 1) / (np.e - 1) * v_pref
                  for i in range(self.speed_samples)]
        if holonomic:
            rotations = np.linspace(0,
                                    2 * np.pi,
                                    self.rotation_samples,
                                    endpoint=False)
        else:
            rotations = np.linspace(
                -self.rotation_constraint,
                self.rotation_constraint,
                self.rotation_samples,
            )

        action_space = [ActionXY(0, 0) if holonomic else ActionRot(0, 0)]
        for j, speed in enumerate(speeds):
            if j == 0:
                # index for action (0, 0)
                self.action_group_index.append(0)
            # only two groups in speeds
            if j < 3:
                speed_index = 0
            else:
                speed_index = 1

            for i, rotation in enumerate(rotations):
                rotation_index = i // 2

                action_index = (speed_index * self.sparse_rotation_samples +
                                rotation_index)
                self.action_group_index.append(action_index)

                if holonomic:
                    action_space.append(
                        ActionXY(speed * np.cos(rotation),
                                 speed * np.sin(rotation)))
                else:
                    action_space.append(ActionRot(speed, rotation))

        self.speeds = speeds
        self.rotations = rotations
        self.action_space = action_space
Code example #7
File: srnn.py Project: Shuijing725/CrowdNav_DSRNN
    def clip_action(self, raw_action, v_pref):
        """
        Input state is the joint state of robot concatenated by the observable state of other agents

        To predict the best action, agent samples actions and propagates one step to see how good the next state is
        thus the reward function is needed

        """
        # quantize the action
        holonomic = True if self.config.action_space.kinematics == 'holonomic' else False
        # clip the action
        if holonomic:
            act_norm = np.linalg.norm(raw_action)
            if act_norm > v_pref:
                raw_action[0] = raw_action[0] / act_norm * v_pref
                raw_action[1] = raw_action[1] / act_norm * v_pref
            return ActionXY(raw_action[0], raw_action[1])
        else:
            # for sim2real
            raw_action[0] = np.clip(raw_action[0], -0.1,
                                    0.1)  # action[0] is change of v
            # raw[0, 1] = np.clip(raw[0, 1], -0.25, 0.25) # action[1] is change of w
            # raw[0, 0] = np.clip(raw[0, 0], -state.self_state.v_pref, state.self_state.v_pref) # action[0] is v
            raw_action[1] = np.clip(raw_action[1], -0.1,
                                    0.1)  # action[1] is change of theta

            return ActionRot(raw_action[0], raw_action[1])
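
Note: in the holonomic branch above, the raw (vx, vy) output is rescaled so its norm never exceeds v_pref while the direction is preserved. That step in isolation, with hypothetical numbers:

import numpy as np

v_pref = 1.0
raw_action = np.array([1.2, 0.9])        # hypothetical network output, norm = 1.5
act_norm = np.linalg.norm(raw_action)
if act_norm > v_pref:
    raw_action = raw_action / act_norm * v_pref   # same heading, speed capped at v_pref
print(raw_action, np.linalg.norm(raw_action))     # norm is now 1.0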
Code example #8
    def act(self, obs):
        robot_state, humans_state, local_map = obs
        state = JointState(robot_state, humans_state)
        action = self.policy.predict(state, local_map, None)
        action = ActionRot(robot_state.v_pref * action.v,
                           action.r)  # de-normalize
        return action
Code example #9
    def predict(self, state):
        """
        Input state is the joint state of robot concatenated by the observable state of other agents

        To predict the best action, agent samples actions and propagates one step to see how good the next state is
        thus the reward function is needed

        """
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError(
                'Epsilon attribute has to be set in training phase')
        if self.reach_destination(state):
            return ActionXY(
                0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)

        host_agent = agent.Agent(state.self_state.px, state.self_state.py,
                                 state.self_state.gx, state.self_state.gy,
                                 state.self_state.radius,
                                 state.self_state.v_pref,
                                 state.self_state.theta, 0)
        host_agent.vel_global_frame = np.array(
            [state.self_state.vx, state.self_state.vy])

        other_agents = []
        for i, human_state in enumerate(state.human_states):
            if self.human_state_in_FOV(state.self_state, human_state):
                x = human_state.px
                y = human_state.py
                v_x = human_state.vx
                v_y = human_state.vy
                heading_angle = np.arctan2(v_y, v_x)
                pref_speed = np.linalg.norm(np.array([v_x, v_y]))
                goal_x = x + 5.0
                goal_y = y + 5.0
                other_agents.append(
                    agent.Agent(x, y, goal_x, goal_y, human_state.radius,
                                pref_speed, heading_angle, i + 1))
                other_agents[-1].vel_global_frame = np.array([v_x, v_y])
        obs = host_agent.observe(other_agents)[1:]
        obs = np.expand_dims(obs, axis=0)
        predictions = self.net.predict_p(obs, None)[0]
        raw_action = self.possible_actions.actions[np.argmax(predictions)]
        action = ActionRot(host_agent.pref_speed * raw_action[0],
                           util.wrap(raw_action[1]))
        return action
Code example #10
    def predict(self, state):
        """
        A base class for all methods that takes pairwise joint state as input to value network.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length)

        """
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError(
                'Epsilon attribute has to be set in training phase')

        if self.reach_destination(state):
            return ActionXY(
                0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(state.robot_state.v_pref)

        probability = np.random.random()
        if self.phase == 'train' and probability < self.epsilon:
            max_action = self.action_space[np.random.choice(
                len(self.action_space))]
        else:
            max_action = None
            max_value = float('-inf')
            max_traj = None

            if self.do_action_clip:
                state_tensor = state.to_tensor(add_batch_size=True,
                                               device=self.device)
                action_space_clipped = self.action_clip(
                    state_tensor, self.action_space, self.planning_width)
            else:
                action_space_clipped = self.action_space

            for action in action_space_clipped:
                state_tensor = state.to_tensor(add_batch_size=True,
                                               device=self.device)
                next_state = self.state_predictor(state_tensor, action)
                max_next_return, max_next_traj = self.V_planning(
                    next_state, self.planning_depth, self.planning_width)
                reward_est = self.estimate_reward(state, action)
                value = reward_est + self.get_normalized_gamma(
                ) * max_next_return
                if value > max_value:
                    max_value = value
                    max_action = action
                    max_traj = [(state_tensor, action, reward_est)
                                ] + max_next_traj
            if max_action is None:
                raise ValueError('Value network is not well trained.')

        if self.phase == 'train':
            self.last_state = self.transform(state)
        else:
            self.traj = max_traj

        return max_action
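
Note: the backup in the loop above scores each action as reward_est plus a discounted one-step return. get_normalized_gamma() is not shown in this excerpt; a plausible reading, consistent with the pow(gamma, time_step * v_pref) discount used in the other examples on this page (an assumption, not the project's confirmed implementation), is:

    def get_normalized_gamma(self):
        # Assumed: discount per simulation step, scaled by step length and preferred speed,
        # matching the value = reward + gamma ** (time_step * v_pref) * V(s') pattern elsewhere.
        return pow(self.gamma, self.time_step * self.v_pref)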
Code example #11
    def predict(self, state):
        """
        A base class for all methods that takes pairwise joint state as input to value network.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length)

        """
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError('Epsilon attribute has to be set in training phase')

        if self.reach_destination(state):
            return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(state.self_state.v_pref)

        occupancy_maps = None
        probability = np.random.random()
        if self.phase == 'train' and probability < self.epsilon:
            max_action = self.action_space[np.random.choice(len(self.action_space))]
        else:
            self.action_values = list()
            max_value = float('-inf')
            max_action = None
            for action in self.action_space:
                next_self_state = self.propagate(state.self_state, action)
                if self.query_env:
                    next_human_states, reward, done, info = self.env.onestep_lookahead(action)
                else:
                    next_human_states = [self.propagate(human_state, ActionXY(human_state.vx, human_state.vy))
                                       for human_state in state.human_states]
                    reward = self.compute_reward(next_self_state, next_human_states)
                batch_next_states = torch.cat([torch.Tensor([next_self_state + next_human_state]).to(self.device)
                                              for next_human_state in next_human_states], dim=0)
                #print('multi human_rl --- line 45  ')
                #print('prediction phase! ----> in RL ----')
                #print('rotation is callled ---------------------------------------------------------------------------------------------------------------------!')
                rotated_batch_input = self.rotate(batch_next_states).unsqueeze(0)
                if self.with_om:
                    if occupancy_maps is None:
                        occupancy_maps = self.build_occupancy_maps(next_human_states).unsqueeze(0)
                    rotated_batch_input = torch.cat([rotated_batch_input, occupancy_maps], dim=2)
                # VALUE UPDATE
                next_state_value = self.model(rotated_batch_input).data.item()
                value = reward + pow(self.gamma, self.time_step * state.self_state.v_pref) * next_state_value
                self.action_values.append(value)
                if value > max_value:
                    max_value = value
                    max_action = action
            if max_action is None:
                raise ValueError('Value network is not well trained. ')

        if self.phase == 'train':
            #print('---------multi_human_rl.py  line 64   ----->transformation done!')
            self.last_state = self.transform(state)

        return max_action
Code example #12
    def predict(self, state):
        """
        Input state is the joint state of robot concatenated by the observable state of other agents

        To predict the best action, agent samples actions and propagates one step to see how good the next state is
        thus the reward function is needed

        """
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError(
                'Epsilon attribute has to be set in training phase')

        if self.reach_destination(state):
            return ActionXY(
                0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(state.self_state.v_pref)

        probability = np.random.random()
        if self.phase == 'train' and probability < self.epsilon:
            max_action = self.action_space[np.random.choice(
                len(self.action_space))]
        else:
            self.action_values = list()
            max_min_value = float('-inf')
            max_action = None
            for action in self.action_space:
                next_self_state = self.propagate(state.self_state, action)
                ob, reward, done, info = self.env.onestep_lookahead(action)
                batch_next_states = torch.cat([
                    torch.Tensor([next_self_state + next_human_state]).to(
                        self.device) for next_human_state in ob
                ],
                                              dim=0)
                # VALUE UPDATE
                outputs = self.model(
                    self.rotate(batch_next_states))  # Successor feature
                outputs = (outputs * self.model.w_vec).sum(1)
                min_output, min_index = torch.min(outputs, 0)
                min_value = reward + pow(
                    self.gamma, self.time_step *
                    state.self_state.v_pref) * min_output.data.item()
                self.action_values.append(min_value)
                if min_value > max_min_value:
                    max_min_value = min_value
                    max_action = action

        if self.phase == 'train':
            self.last_state = self.transform(state)
        # print("max action: ", max_action)

        return max_action
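
Note: unlike the plain value backup, this example scores actions with successor features: the model outputs one feature vector per human, each is weighted by model.w_vec and summed, and the minimum over humans (a pessimistic, worst-case value) is discounted and added to the reward before taking the max over actions. A shape-level illustration with made-up sizes:

import torch

psi = torch.rand(5, 16)        # assumed: successor features for 5 humans, feature dim 16
w_vec = torch.rand(16)         # learned reward weights (model.w_vec above)
values = (psi * w_vec).sum(1)  # one scalar value per human
min_value, min_index = torch.min(values, 0)   # pessimistic value over humans
# the policy then maximizes: reward + gamma ** (time_step * v_pref) * min_value.item()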
Code example #13
File: cadrl.py Project: ChanganVR/MultiagentRGL
    def select_greedy_action(self, self_state):
        # find the greedy action given kinematic constraints and return the closest action in the action space
        direction = np.arctan2(self_state.gy - self_state.py,
                               self_state.gx - self_state.px)
        distance = np.linalg.norm(
            (self_state.gy - self_state.py, self_state.gx - self_state.px))
        if self.kinematics == 'holonomic':
            speed = min(distance / self.time_step, self_state.v_pref)
            vx = np.cos(direction) * speed
            vy = np.sin(direction) * speed

            min_diff = float('inf')
            closest_action = None
            for action in self.action_space:
                diff = np.linalg.norm(np.array(action) - np.array((vx, vy)))
                if diff < min_diff:
                    min_diff = diff
                    closest_action = action
        else:
            rotation = direction - self_state.theta
            # if goal is not in the field of view, always rotate first
            if rotation < self.rotations[0]:
                closest_action = ActionRot(self.speeds[0], self.rotations[0])
            elif rotation > self.rotations[-1]:
                closest_action = ActionRot(self.speeds[0], self.rotations[-1])
            else:
                speed = min(distance / self.time_step, self_state.v_pref)

                min_diff = float('inf')
                closest_action = None
                for action in self.action_space:
                    diff = np.linalg.norm(
                        np.array((np.cos(action.r) * action.v,
                                  np.sin(action.r) * action.v)) -
                        np.array((np.cos(rotation) * speed,
                                  np.sin(rotation) * speed)))
                    if diff < min_diff:
                        min_diff = diff
                        closest_action = action

        return closest_action
Code example #14
File: transformations_test.py Project: EcustBoy/SCR
    def test_non_holonomic_left_movement(self):
        radius = 1
        time_step = .1
        state = FullState(0, 0, 0, 0, radius, 0, 0, 0, 0)
        r = np.pi
        action = ActionRot(1., r)
        next_state = propagate(state,
                               action,
                               time_step=time_step,
                               kinematics='non_holonomic')
        expected_state = FullState(time_step * np.cos(r), time_step * np.sin(r), \
                                         np.cos(r), np.sin(r), radius, 0, 0, 0, r)

        self.assertEqual(expected_state, next_state)
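
Note: the expected state implies non-holonomic dynamics in which the rotation is applied first and the agent then moves for one time step along the new heading. A minimal propagate() consistent with that expectation (a sketch under this assumption; the project's actual implementation may differ):

from collections import namedtuple

import numpy as np

# Stand-ins for the project's classes, with the same field order as used in the test.
FullState = namedtuple('FullState', 'px py vx vy radius gx gy v_pref theta')
ActionRot = namedtuple('ActionRot', 'v r')

def propagate(state, action, time_step, kinematics='non_holonomic'):
    # Sketch of the non-holonomic case only: rotate, then integrate along the new heading.
    theta = state.theta + action.r
    vx = action.v * np.cos(theta)
    vy = action.v * np.sin(theta)
    return FullState(state.px + vx * time_step, state.py + vy * time_step,
                     vx, vy, state.radius, state.gx, state.gy, state.v_pref, theta)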
Code example #15
    def action_clip(self, state, action_space, width, depth=1):
        values = []
        actions = []
        if self.kinematics == "holonomic":
            actions.append(ActionXY(0, 0))
        else:
            actions.append(ActionRot(0, 0))
        # actions.append(ActionXY(0, 0))
        next_robot_states = None
        next_human_states = None
        pre_next_state = self.state_predictor(state, actions)
        for action in action_space:
            # actions = []
            # actions.append(action)
            next_robot_state = self.compute_next_robot_state(state[0], action)
            next_human_state = pre_next_state[1]
            if next_robot_states is None and next_human_states is None:
                next_robot_states = next_robot_state
                next_human_states = next_human_state
            else:
                next_robot_states = torch.cat((next_robot_states, next_robot_state), dim=0)
                next_human_states = torch.cat((next_human_states, next_human_state), dim=0)
            next_state_tensor = (next_robot_state, next_human_state)
            next_state = tensor_to_joint_state(next_state_tensor)
            reward_est, _ = self.reward_estimator.estimate_reward_on_predictor(state, next_state)
            values.append(reward_est)
        next_return = self.value_estimator((next_robot_states, next_human_states)).squeeze()
        next_return = np.array(next_return.data.detach())
        values = np.array(values) + self.get_normalized_gamma() * next_return
        values = values.tolist()

        if self.sparse_search:
            # self.sparse_speed_samples = 2
            # search in a sparse grained action space
            added_groups = set()
            max_indices = np.argsort(np.array(values))[::-1]
            clipped_action_space = []
            for index in max_indices:
                if self.action_group_index[index] not in added_groups:
                    clipped_action_space.append(action_space[index])
                    added_groups.add(self.action_group_index[index])
                    if len(clipped_action_space) == width:
                        break
        else:
            max_indexes = np.argpartition(np.array(values), -width)[-width:]
            clipped_action_space = [action_space[i] for i in max_indexes]

        # print(clipped_action_space)
        return clipped_action_space
Code example #16
    def reset(self):
        """ IANEnv destroys and re-creates its iarlenv at every reset, so apply our changes here """
        self.env.reset()

        # we raytrace at a higher resolution, then downsample back to the original soadrl resolution
        # this avoids missing small obstacles due to the small soadrl resolution
        self.env.iarlenv.rlenv.virtual_peppers[
            0].kLidarMergedMaxAngle = self.angular_map_max_angle
        self.env.iarlenv.rlenv.virtual_peppers[
            0].kLidarMergedMinAngle = self.angular_map_min_angle
        self.env.iarlenv.rlenv.virtual_peppers[0].kLidarAngleIncrement = \
            self.angular_map_angle_increment / self.lidar_upsampling
        self.env.iarlenv.rlenv.kMergedScanSize = self.angular_map_dim * self.lidar_upsampling

        self.episode_statistics = self.env.episode_statistics

        obs, _, _, _ = self.step(ActionRot(0., 0.))
        return obs
Code example #17
File: tree_searchrl.py Project: nubot-nudt/SG-DQN
    def predict(self, state):
        """
        A base class for all methods that takes pairwise joint state as input to value network.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length)

        """
        self.count = self.count + 1
        # if self.count == 34:
        #     print('debug')
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError(
                'Epsilon attribute has to be set in training phase')
        # self.v_pref = state.robot_state.v_pref
        if self.reach_destination(state):
            return ActionXY(
                0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(self.v_pref)
        max_action = None
        origin_max_value = float('-inf')
        state_tensor = state.to_tensor(add_batch_size=True, device=self.device)
        probability = np.random.random()
        if self.phase == 'train' and probability < self.epsilon and self.use_noisy_net is False:
            max_action_index = np.random.choice(len(self.action_space))
            max_action = self.action_space[max_action_index]
            self.last_state = self.transform(state)
            return max_action, max_action_index
        else:
            max_value, max_action_index, max_traj = self.V_planning(
                state_tensor, self.planning_depth, self.planning_width)
            if max_value[0] > origin_max_value:
                max_action = self.action_space[max_action_index[0]]
            if max_action is None:
                raise ValueError('Value network is not well trained.')

        if self.phase == 'train':
            self.last_state = self.transform(state)
        else:
            self.last_state = self.transform(state)
            self.traj = max_traj[0]
        return max_action, int(max_action_index[0])
Code example #18
    def step(self, action, update=True):
        """
        Compute actions for all agents, detect collision, update environment and return (ob, reward, done, info)
        """
        action = self.robot.policy.clip_action(action, self.robot.v_pref)

        if self.robot.kinematics == 'unicycle':
            self.desiredVelocity[0] = np.clip(self.desiredVelocity[0]+action.v,-self.robot.v_pref,self.robot.v_pref)
            action=ActionRot(self.desiredVelocity[0], action.r)


        human_actions = self.get_human_actions()

        # compute reward and episode info
        reward, done, episode_info = self.calc_reward(action)


        # apply action and update all agents
        self.robot.step(action)
        for i, human_action in enumerate(human_actions):
            self.humans[i].step(human_action)
        self.global_time += self.time_step # max episode length=time_limit/time_step


        # compute the observation
        ob = self.generate_ob(reset=False)

        info={'info':episode_info}


        # Update all humans' goals randomly midway through episode
        if self.random_goal_changing:
            if self.global_time % 5 == 0:
                self.update_human_goals_randomly()
        
        # Update a specific human's goal once its reached its original goal
        if self.end_goal_changing:
            for human in self.humans:
                if norm((human.gx - human.px, human.gy - human.py)) < human.radius:
                    self.update_human_goal(human)


        return ob, reward, done, info
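
Note: for the unicycle kinematics above, the policy's action.v is interpreted as a change in forward speed: it is accumulated into desiredVelocity[0] and clipped to [-v_pref, v_pref] before being repacked into an ActionRot. That update in isolation, with hypothetical numbers:

import numpy as np

v_pref = 0.5
desired_v = 0.4                 # previously integrated speed
delta_v = 0.25                  # change of v output by the policy (already clipped)
desired_v = np.clip(desired_v + delta_v, -v_pref, v_pref)
print(desired_v)                # 0.5, capped at v_pref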
Code example #19
    def V_planning(self, state, depth, width):
        """ Plans n steps into future. Computes the value for the current state as well as the trajectories
        defined as a list of (state, action, reward) triples

        """

        current_state_value = self.value_estimator(state)
        if depth == 1:
            return current_state_value, [(state, None, None)]

        if self.do_action_clip:
            action_space_clipped = self.action_clip(state, self.action_space, width)
        else:
            action_space_clipped = self.action_space

        returns = []
        trajs = []
        actions =[]
        if self.kinematics == "holonomic":
            actions.append(ActionXY(0, 0))
        else:
            actions.append(ActionRot(0, 0))
        # actions.append(ActionXY(0, 0))
        pre_next_state = self.state_predictor(state, actions)
        for action in action_space_clipped:
            next_robot_state = self.compute_next_robot_state(state[0], action)
            next_state_est = next_robot_state, pre_next_state[1]
            # reward_est = self.estimate_reward(state, action)
            reward_est, _ = self.reward_estimator.estimate_reward_on_predictor(tensor_to_joint_state(state),
                                                                               tensor_to_joint_state(next_state_est))
            next_value, next_traj = self.V_planning(next_state_est, depth - 1, self.planning_width)
            return_value = current_state_value / depth + (depth - 1) / depth * (self.get_normalized_gamma() * next_value + reward_est)
            returns.append(return_value)
            trajs.append([(state, action, reward_est)] + next_traj)
        max_index = np.argmax(returns)
        max_return = returns[max_index]
        max_traj = trajs[max_index]
        return max_return, max_traj
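
Note: the recursion above blends the learned value of the current state with the rolled-out return, roughly

    R_d(s) = V(s) / d + (d - 1) / d * (r(s, a) + gamma_norm * R_{d-1}(s')),   with R_1(s) = V(s)

where gamma_norm is the normalized discount; a deeper search therefore puts more weight on the simulated rollout and less on the direct value estimate.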
Code example #20
    def predict(self, state):
        """
        A base class for all methods that takes pairwise joint state as input to value network.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length)

        """
        if self.phase is None or self.device is None:
            raise AttributeError("Phase, device attributes have to be set!")
        if self.phase == "train" and self.epsilon is None:
            raise AttributeError("Epsilon attribute has to be set in training phase")

        if self.reach_destination(state):
            return ActionXY(0, 0) if self.kinematics == "holonomic" else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(state.robot_state.v_pref)
        if not state.human_states:
            assert self.phase != "train"
            if hasattr(self, "attention_weights"):
                self.attention_weights = list()
            return self.select_greedy_action(state.robot_state)

        occupancy_maps = None
        probability = np.random.random()
        if self.phase == "train" and probability < self.epsilon:
            max_action = self.action_space[np.random.choice(len(self.action_space))]
        else:
            self.action_values = list()
            max_value = float("-inf")
            max_action = None
            for action in self.action_space:
                next_robot_state = self.propagate(state.robot_state, action)
                if self.query_env:
                    next_human_states, reward, done, info = self.env.onestep_lookahead(
                        action
                    )
                else:
                    next_human_states = [
                        self.propagate(
                            human_state, ActionXY(human_state.vx, human_state.vy)
                        )
                        for human_state in state.human_states
                    ]
                    reward = self.compute_reward(next_robot_state, next_human_states)
                batch_next_states = torch.cat(
                    [
                        torch.Tensor([next_robot_state + next_human_state]).to(
                            self.device
                        )
                        for next_human_state in next_human_states
                    ],
                    dim=0,
                )
                rotated_batch_input = self.rotate(batch_next_states).unsqueeze(0)
                if self.with_om:
                    if occupancy_maps is None:
                        occupancy_maps = self.build_occupancy_maps(
                            next_human_states
                        ).unsqueeze(0)
                    rotated_batch_input = torch.cat(
                        [rotated_batch_input, occupancy_maps], dim=2
                    )
                # VALUE UPDATE
                next_state_value = self.model(rotated_batch_input).data.item()
                value = (
                    reward
                    + pow(self.gamma, self.time_step * state.robot_state.v_pref)
                    * next_state_value
                )
                self.action_values.append(value)
                if value > max_value:
                    max_value = value
                    max_action = action
                    if hasattr(self, "attention_weights"):
                        self.attention_weights = self.model.attention_weights
            if max_action is None:
                raise ValueError("Value network is not well trained. ")

        if self.phase == "train":
            self.last_state = self.transform(state)

        return max_action
Code example #21
File: tlsgan.py Project: romi514/TLSGAN
    def predict(self, states):
        """
        A base class for all methods that takes pairwise joint state as input to value network.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length)

        states is a list of joint states

        """
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError(
                'Epsilon attribute has to be set in training phase')

        if self.reach_destination(states[-1]):
            return ActionXY(
                0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(states[-1].self_state.v_pref)

        if self.policy_learning:

            trajs, rel_trajs, self_info = stateToTrajTensors([states])
            next_position = self.model(
                (trajs, rel_trajs, self_info)).data.squeeze(0)
            action = ActionXY(next_position[0], next_position[1])
            return action

        else:
            occupancy_maps = None
            probability = np.random.random()
            if self.phase == 'train' and probability < self.epsilon:
                max_action = self.action_space[np.random.choice(
                    len(self.action_space))]
            else:
                self.action_values = list()
                max_value = float('-inf')
                max_action = None
                for action in self.action_space:
                    next_self_state = self.propagate(
                        states[-1].self_state, action
                    )  # 'px', 'py', 'vx', 'vy', 'radius', 'gx', 'gy', 'v_pref', 'theta'
                    if self.query_env:
                        next_human_states, reward, done, info = self.env.onestep_lookahead(
                            action)
                    else:
                        next_human_states = [
                            self.propagate(
                                human_state,
                                ActionXY(human_state.vx, human_state.vy))
                            for human_state in states[-1].human_states
                        ]  # 'px1', 'py1', 'vx1', 'vy1', 'radius1'
                        reward = self.compute_reward(next_self_state,
                                                     next_human_states)

                    # VALUE UPDATE
                    trajs, rel_trajs, self_info = stateToTrajTensors([
                        states +
                        [JointState(next_self_state, next_human_states)]
                    ])
                    next_state_value = self.model(
                        (trajs, rel_trajs, self_info)).data.item()
                    value = reward + pow(
                        self.gamma, self.time_step *
                        states[-1].self_state.v_pref) * next_state_value
                    self.action_values.append(value)
                    if value > max_value:
                        max_value = value
                        max_action = action
                if max_action is None:
                    raise ValueError('Value network is not well trained. ')

        if self.phase == 'train':
            self.last_state = states

        return max_action
Code example #22
    def predict(self, state, joint_xy_history, occ_history):
        """
        A base class for all methods that takes pairwise joint state as input to value network.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length)

        """
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError(
                'Epsilon attribute has to be set in training phase')

        if self.reach_destination(state):
            return ActionXY(
                0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(state.self_state.v_pref)

        occupancy_maps = None
        probability = np.random.random()
        if self.phase == 'train' and probability < self.epsilon:
            max_action = self.action_space[np.random.choice(
                len(self.action_space))]
        else:
            self.action_values = list()
            max_value = float('-inf')
            max_action = None
            for action in self.action_space:
                next_self_state = self.propagate(state.self_state, action)
                if self.query_env:  # usually true
                    #print("query env Tru")
                    next_human_states, reward, done, info = self.env.onestep_lookahead(
                        action)
                else:
                    print("not!!!!! query env")
                    next_human_states = [
                        self.propagate(
                            human_state,
                            ActionXY(human_state.vx, human_state.vy))
                        for human_state in state.human_states
                    ]
                    reward = self.compute_reward(next_self_state,
                                                 next_human_states)
                batch_next_states = torch.cat([
                    torch.Tensor([next_self_state + next_human_state]).to(
                        self.device) for next_human_state in next_human_states
                ],
                                              dim=0)  # ans =[n-human,14]
                rotated_batch_input = self.rotate(batch_next_states).unsqueeze(
                    0)
                if self.with_om:
                    if occupancy_maps is None:
                        occupancy_maps = self.build_occupancy_maps(
                            next_human_states).unsqueeze(0)
                    rotated_batch_input = torch.cat(
                        [rotated_batch_input,
                         occupancy_maps.to(self.device)],
                        dim=2)

                # VALUE UPDATE
                if len(joint_xy_history.size()) == 3:  # we must add the next-step information here
                    joint_xy_history = joint_xy_history.unsqueeze(
                        0)  #[1 hist-len n-human+1 2(xy)]
                    occ_history = occ_history.unsqueeze(0)  #[1 hist-1 32]

                human_xy_next = batch_next_states[:, 9:11]
                robot_xy_next = batch_next_states[0:1, 0:2]
                joint_xy_next = torch.cat([robot_xy_next, human_xy_next],
                                          dim=-2)  #[ n-human+1, 2]

                joint_xy_current = joint_xy_history[:, -1, :, :].squeeze(
                    0)  #ans = [n-human+1 ,2(xy)]

                last_occ = self.pool(None, joint_xy_current,
                                     joint_xy_next).unsqueeze(0)  #[1, 1 ,32]

                joint_xy_history_plus = torch.cat(
                    [
                        joint_xy_history[:, 1:, :, :],
                        joint_xy_next.unsqueeze(0).unsqueeze(0)
                    ],
                    dim=-3)  #[batch ,hist-len ,n-human+1,2(xy)]
                occ_history_plus = torch.cat([occ_history[:, 1:, :], last_occ],
                                             dim=-2)
                representation_env_plus = self.traj_model(
                    joint_xy_history_plus, occ_history_plus
                )[0].data  # joint_xy=[batch, hist_len, n-human+1,2(xy)]  // occ_history=[100,hist_len-1,32] //ans=[100 128]

                next_state_value = self.model(
                    rotated_batch_input, representation_env_plus).data.item()

                value = reward + pow(
                    self.gamma, self.time_step *
                    state.self_state.v_pref) * next_state_value
                self.action_values.append(value)
                if value > max_value:
                    max_value = value
                    max_action = action
            if max_action is None:
                raise ValueError('Value network is not well trained. ')

        if self.phase == 'train':
            self.last_state = self.transform(state)
            self.last_state_unrotated = torch.cat(
                [
                    torch.Tensor([state.self_state + human_state]).to(
                        self.device
                    )  # added by saleh : because now we are in the RL phase and we need to push unrotated_state in the memory
                    for human_state in state.human_states
                ],
                dim=0)

        return max_action
Code example #23
    def predict(self, state):
        """
        A base class for all methods that takes pairwise joint state as input to value network.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length)

        """
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError('Epsilon attribute has to be set in training phase')

        if self.reach_destination(state):
            return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(state.robot_state.v_pref)

        probability = np.random.random()
        if self.phase == 'train' and probability < self.epsilon:
            max_action_index = np.random.choice(len(self.action_space))
            max_action = self.action_space[max_action_index]
        else:
            max_action = None
            max_value = float('-inf')
            max_traj = None

            if self.do_action_clip:
                state_tensor = state.to_tensor(add_batch_size=True, device=self.device)
                action_space_clipped = self.action_clip(state_tensor, self.action_space, self.planning_width)
            else:
                action_space_clipped = self.action_space
            state_tensor = state.to_tensor(add_batch_size=True, device=self.device)
            actions = []
            if self.kinematics == "holonomic":
                actions.append(ActionXY(0, 0))
            else:
                actions.append(ActionRot(0, 0))
            # actions.append(ActionXY(0, 0))
            pre_next_state = self.state_predictor(state_tensor, actions)
            next_robot_states = None
            next_human_states = None
            next_value = []
            rewards = []
            for action in action_space_clipped:
                next_robot_state = self.compute_next_robot_state(state_tensor[0], action)
                next_human_state = pre_next_state[1]
                if next_robot_states is None and next_human_states is None:
                    next_robot_states = next_robot_state
                    next_human_states = next_human_state
                else:
                    next_robot_states = torch.cat((next_robot_states, next_robot_state), dim=0)
                    next_human_states = torch.cat((next_human_states, next_human_state), dim=0)
                next_state = tensor_to_joint_state((next_robot_state, next_human_state))
                reward_est, _ = self.reward_estimator.estimate_reward_on_predictor(state, next_state)
                max_next_return, max_next_traj = self.V_planning((next_robot_state, next_human_state), self.planning_depth, self.planning_width)
                value = reward_est + self.get_normalized_gamma() * max_next_return
                if value > max_value:
                    max_value = value
                    max_action = action
                    max_traj = [(state_tensor, action, reward_est)] + max_next_traj
                # reward_est = self.estimate_reward(state, action)
                # rewards.append(reward_est)
                # next_state = self.state_predictor(state_tensor, action)
            # rewards_tensor = torch.tensor(rewards).to(self.device)
            # next_state_batch = (next_robot_states, next_human_states)
            # next_value = self.value_estimator(next_state_batch).squeeze(1)
            # value = rewards_tensor + next_value * self.get_normalized_gamma()
            # max_action_index = value.argmax()
            # best_value = value[max_action_index]
            # if best_value > max_value:
            #     max_action = action_space_clipped[max_action_index]
            #
            #     next_state = tensor_to_joint_state((next_robot_states[max_action_index], next_human_states[max_action_index]))
            #     max_next_traj = [(next_state.to_tensor(), None, None)]
            #     # max_next_return, max_next_traj = self.V_planning(next_state, self.planning_depth, self.planning_width)
            #     # reward_est = self.estimate_reward(state, action)
            #     # value = reward_est + self.get_normalized_gamma() * max_next_return
            #     # if value > max_value:
            #     #     max_value = value
            #     #     max_action = action
            #     max_traj = [(state_tensor, max_action, rewards[max_action_index])] + max_next_traj
            if max_action is None:
                raise ValueError('Value network is not well trained.')

        if self.phase == 'train':
            self.last_state = self.transform(state)
        else:
            self.traj = max_traj
        for action_index in range(len(self.action_space)):
            action = self.action_space[action_index]
            if action is max_action:
                max_action_index = action_index
                break
        return max_action, int(max_action_index)
Code example #24
    def run_k_episodes(self,
                       k,
                       phase,
                       update_memory=False,
                       imitation_learning=False,
                       episode=None,
                       traj_head=None,
                       print_failure=False,
                       noise_explore=0.0,
                       progressbar=False,
                       output_info=False,
                       notebook=False,
                       print_info=False,
                       return_states=False,
                       suffix=None):
        self.robot.policy.set_phase(phase)
        success_times = []
        collision_times = []
        timeout_times = []
        success = 0
        collision = 0
        timeout = 0
        too_close = 0
        min_dist = []
        cumulative_rewards = []
        collision_cases = []
        timeout_cases = []

        states_s = []

        if imitation_learning: human_obsv = []

        iter = range(k)
        if progressbar:
            if notebook:
                iter = tqdm_notebook(iter, leave=False)
            else:
                iter = tqdm(iter, leave=False)
        traj_acc_sum = 0
        for i in iter:
            ob = self.env.reset(phase)
            done = False
            states = []
            actions = []
            rewards = []
            scene = []

            while not done:

                # infer action from policy
                if 'TRAJ' in self.robot.policy.name:
                    action = self.robot.act(ob)
                elif self.num_frames == 1:
                    action = self.robot.act(ob)
                else:
                    last_ob = latest_frame(ob, self.num_frames)
                    action = self.robot.act(last_ob)
                actions.append(action)

                # state append
                if imitation_learning:
                    # if 'RNN' in self.target_policy.name:
                    #     # TODO: enumerate and test differenct combinations of policies
                    #     joint_state = JointState(self.robot.get_full_state(), ob)
                    # elif 'SAIL' in self.target_policy.name:
                    #     joint_state = JointState(self.robot.get_full_state(), ob)
                    # else:
                    #     joint_state = self.robot.policy.last_state

                    joint_state = JointState(self.robot.get_full_state(), ob)
                    last_state = ExplorerDs.transform_mult(joint_state)
                    states.append(last_state)
                    scene.append(obs_to_frame(ob))
                else:
                    states.append(self.robot.policy.last_state)

                # env step
                if imitation_learning and noise_explore > 0.0:
                    noise_magnitude = noise_explore * 2.0
                    action = ActionXY(
                        action[0] +
                        torch.rand(1).sub(0.5) * noise_magnitude, action[1] +
                        torch.rand(1).sub(0.5) * noise_magnitude
                    ) if self.robot.policy.kinematics == 'holonomic' else ActionRot(
                        action[0] +
                        torch.rand(1).sub(0.5) * noise_magnitude, action[1] +
                        torch.rand(1).sub(0.5) * noise_magnitude)

                ob, reward, done, info = self.env.step(action)
                rewards.append(reward)

                if isinstance(info, Danger):
                    too_close += 1
                    min_dist.append(info.min_dist)

            if imitation_learning: human_obsv.append(torch.FloatTensor(scene))

            if isinstance(info, ReachGoal):
                success += 1
                success_times.append(self.env.global_time)
            elif isinstance(info, Collision):
                collision += 1
                collision_cases.append(i)
                collision_times.append(self.env.global_time)
            elif isinstance(info, Timeout):
                timeout += 1
                timeout_cases.append(i)
                timeout_times.append(self.env.time_limit)
            else:
                raise ValueError('Invalid end signal from environment')
            traj_accuracy = 0
            if traj_head is not None:
                for i, ([rob, hum], human_feats) in enumerate(states):
                    #next_human_pos = torch.cat([states[max(i, len(states)-1)][0][1] for i in range(len(states))], dim=1)
                    next_human_pos = []
                    for j in range(4):
                        n_p = states[min(i + j + 1,
                                         len(states) - 1)][0][1][-1, :, :2]
                        #print(states[min(i+j+1, len(states)-1)][0][1])
                        next_human_pos.append(n_p)
                    with torch.no_grad():
                        next_human_pos = torch.cat(next_human_pos, dim=1)
                        next_human_pos_pred = traj_head(human_feats)

                        # print(hum[-1])
                        # print(next_human_pos)
                        # print(next_human_pos_pred)
                        # print('\n')
                        accuracy = ((next_human_pos -
                                     next_human_pos_pred)**2).mean().item()
                        traj_accuracy += accuracy
                    #print(traj_accuracy)
            traj_acc_sum += traj_accuracy

            if update_memory:
                self.update_memory(states, actions, rewards,
                                   imitation_learning)

            states_s.append(states)

            # episode result
            cumulative_rewards.append(
                sum([
                    pow(self.gamma,
                        t * self.robot.time_step * self.robot.v_pref) * reward
                    for t, reward in enumerate(rewards)
                ]))
            # if i % 100 == 0 and i > 0:
            #     logging.info("{:<5} Explore #{:d} / {:d} episodes".format(phase.upper(), i, k))
        traj_acc_sum = traj_acc_sum / sum(len(s) for s in states_s)
        # episodes statistics
        success_rate = success / k
        collision_rate = collision / k
        timeout_rate = timeout / k
        assert success + collision + timeout == k
        avg_nav_time = sum(success_times) / len(
            success_times) if success_times else self.env.time_limit
        if not print_info:
            extra_info = '' if episode is None else 'in episode {} '.format(
                episode)
            logging.info(
                '{:<5} {} success: {:.2f}, collision: {:.2f}, nav time: {:.2f}, reward: {:.4f} +- {:.4f}'
                .format(phase.upper(), extra_info, success_rate,
                        collision_rate, avg_nav_time,
                        statistics.mean(cumulative_rewards),
                        (statistics.stdev(cumulative_rewards)
                         if len(cumulative_rewards) > 1 else 0.0)))
        info = {
            'success': success_rate,
            'collision': collision_rate,
            'nav time': avg_nav_time,
            'reward': statistics.mean(cumulative_rewards)
        }
        if print_info:
            info = ('success: {:.4f}, collision: {:.4f}, nav time: {:.2f}, '
                    'reward: {:.4f}, timeout: {:.4f}').format(
                        success_rate, collision_rate, avg_nav_time,
                        statistics.mean(cumulative_rewards), timeout_rate)
            if traj_head is not None:
                info += ', traj accuracy: {:.4f}'.format(traj_acc_sum)
            if suffix is not None:
                info = suffix + info
            print(info)
        # if phase in ['val', 'test']:
        #     total_time = sum(success_times + collision_times + timeout_times) * self.robot.time_step
        #     logging.info('Frequency of being in danger: %.2f', too_close / total_time)

        if print_failure:
            logging.info('Collision cases: ' +
                         ' '.join([str(x) for x in collision_cases]))
            logging.info('Timeout cases: ' +
                         ' '.join([str(x) for x in timeout_cases]))

        info_dic = {
            'success': success_rate,
            'collision': collision_rate,
            'nav time': avg_nav_time,
            'reward': statistics.mean(cumulative_rewards),
            'traj accuracy': traj_acc_sum
        }

        if output_info:
            if return_states:
                return info_dic, states_s
            else:
                return info_dic

        if imitation_learning:
            return human_obsv
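
The cumulative_rewards bookkeeping in the episode loop above discounts each step reward by gamma raised to t * time_step * v_pref, so the discount tracks how far the robot could have travelled rather than the bare step index. A minimal standalone sketch of that computation (the default gamma, time_step and v_pref values here are illustrative, not taken from any configuration):

def discounted_return(rewards, gamma=0.9, time_step=0.25, v_pref=1.0):
    """Sum of step rewards, each discounted by gamma ** (t * time_step * v_pref)."""
    return sum(pow(gamma, t * time_step * v_pref) * r
               for t, r in enumerate(rewards))

# example: two small step penalties followed by a goal reward
print(discounted_return([-0.01, -0.01, 1.0]))
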
Code example #25
0
    def run_k_episodes(self,
                       k,
                       phase,
                       update_memory=False,
                       imitation_learning=False,
                       episode=None,
                       print_failure=False,
                       noise_explore=0.0,
                       progressbar=True,
                       notebook=False):
        self.robot.policy.set_phase(phase)
        success_times = []
        collision_times = []
        timeout_times = []
        success = 0
        collision = 0
        timeout = 0
        too_close = 0
        min_dist = []
        cumulative_rewards = []
        collision_cases = []
        timeout_cases = []

        if imitation_learning: human_obsv = []

        episode_iter = range(k)  # avoid shadowing the built-in iter()
        if progressbar:
            if notebook:
                episode_iter = tqdm_notebook(episode_iter, leave=False)
            else:
                episode_iter = tqdm(episode_iter, leave=False)

        for i in episode_iter:
            ob = self.env.reset(phase)
            done = False
            states = []
            actions = []
            rewards = []
            scene = []

            while not done:

                # infer action from policy
                if 'RNN' in self.robot.policy.name or self.num_frames == 1:
                    action = self.robot.act(ob)
                else:
                    last_ob = latest_frame(ob, self.num_frames)
                    action = self.robot.act(last_ob)
                actions.append(action)
                # state append
                if imitation_learning:
                    if 'RNN' in self.target_policy.name:
                        # TODO: enumerate and test different combinations of policies
                        joint_state = JointState(self.robot.get_full_state(),
                                                 ob)
                    elif 'SAIL' in self.target_policy.name or 'SARL' in self.target_policy.name:
                        joint_state = JointState(self.robot.get_full_state(),
                                                 ob)
                    else:
                        joint_state = self.robot.policy.last_state
                    last_state = self.target_policy.transform(joint_state)
                    states.append(last_state)
                    scene.append(obs_to_frame(ob))
                else:
                    states.append(self.robot.policy.last_state)

                # env step
                if imitation_learning and noise_explore > 0.0:
                    # perturb the expert action with uniform noise in
                    # [-noise_explore, +noise_explore] on each component
                    noise_magnitude = noise_explore * 2.0
                    noise_0 = torch.rand(1).sub(0.5) * noise_magnitude
                    noise_1 = torch.rand(1).sub(0.5) * noise_magnitude
                    if self.robot.policy.kinematics == 'holonomic':
                        action = ActionXY(action[0] + noise_0, action[1] + noise_1)
                    else:
                        action = ActionRot(action[0] + noise_0, action[1] + noise_1)

                ob, reward, done, info = self.env.step(action)
                rewards.append(reward)

                if isinstance(info, Danger):
                    too_close += 1
                    min_dist.append(info.min_dist)

            if imitation_learning: human_obsv.append(torch.FloatTensor(scene))

            if isinstance(info, ReachGoal):
                success += 1
                success_times.append(self.env.global_time)
            elif isinstance(info, Collision):
                collision += 1
                collision_cases.append(i)
                collision_times.append(self.env.global_time)
            elif isinstance(info, Timeout):
                timeout += 1
                timeout_cases.append(i)
                timeout_times.append(self.env.time_limit)
            else:
                raise ValueError('Invalid end signal from environment')

            if update_memory:
                self.update_memory(states, actions, rewards,
                                   imitation_learning)

            # episode result
            cumulative_rewards.append(
                sum([
                    pow(self.gamma,
                        t * self.robot.time_step * self.robot.v_pref) * reward
                    for t, reward in enumerate(rewards)
                ]))
            # if i % 100 == 0 and i > 0:
            #     logging.info("{:<5} Explore #{:d} / {:d} episodes".format(phase.upper(), i, k))

        # episodes statistics
        success_rate = success / k
        collision_rate = collision / k
        assert success + collision + timeout == k
        avg_nav_time = sum(success_times) / len(
            success_times) if success_times else self.env.time_limit

        extra_info = '' if episode is None else 'in episode {} '.format(episode)
        msg = ('{:<5} {} success: {:.2f}, collision: {:.2f}, nav time: {:.2f}, '
               'reward: {:.4f} +- {:.4f}').format(
                   phase.upper(), extra_info, success_rate, collision_rate,
                   avg_nav_time, statistics.mean(cumulative_rewards),
                   (statistics.stdev(cumulative_rewards)
                    if len(cumulative_rewards) > 1 else 0.0))
        logging.info(msg)
        print(msg)

        if phase in ['val', 'test']:
            total_time = sum(success_times + collision_times +
                             timeout_times) * self.robot.time_step
            logging.info('Frequency of being in danger: %.2f',
                         too_close / total_time)

        if print_failure:
            logging.info('Collision cases: ' +
                         ' '.join([str(x) for x in collision_cases]))
            logging.info('Timeout cases: ' +
                         ' '.join([str(x) for x in timeout_cases]))

        if imitation_learning:
            return human_obsv
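
During imitation learning the expert action above is perturbed with uniform noise in [-noise_explore, +noise_explore] on each component before it is applied to the environment. A self-contained sketch of that perturbation; the ActionXY namedtuple below is a stand-in for the project's own action type:

import collections

import torch

# Stand-in for the project's holonomic action type (assumption: the real one
# is defined elsewhere in the repository).
ActionXY = collections.namedtuple('ActionXY', ['vx', 'vy'])


def perturb_action(action, noise_explore=0.1):
    """Add uniform noise in [-noise_explore, +noise_explore] to each component."""
    noise_magnitude = noise_explore * 2.0  # width of the uniform interval
    return ActionXY(
        action.vx + (torch.rand(1).item() - 0.5) * noise_magnitude,
        action.vy + (torch.rand(1).item() - 0.5) * noise_magnitude)


print(perturb_action(ActionXY(0.5, 0.0)))
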
Code example #26
0
    def predict(self, state):
        """
        A base class for all methods that takes pairwise joint state as input to value network.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length)
        """

        if hasattr(self.model, 'training'):
            self.model.eval()
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError(
                'Epsilon attribute has to be set in training phase')

        if self.reach_destination(state):
            return ActionXY(
                0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(state.self_state.v_pref)

        occupancy_maps = None
        probability = np.random.random()
        if self.phase == 'train' and probability < self.epsilon:
            max_action = self.action_space[np.random.choice(
                len(self.action_space))]
            # random placeholder activations for the exploratory action
            max_activations = torch.randn(10, 2)
        else:
            self.action_values = list()
            max_value = float('-inf')
            max_action = None
            max_activations = None

            action_states = [
                self.compute_action_vals_activs(action, state)
                for action in self.action_space
            ]

            action_order = [a for a, _, _ in action_states]
            next_states = [s for _, s, _ in action_states]
            rewards = [r for _, _, r in action_states]

            # evaluate all propagated states in one batched forward pass
            model_input = torch.cat(next_states, dim=0)
            values = self.model(model_input)

            for i, action in enumerate(action_order):
                value = values[i].item()
                reward = rewards[i]

                # one-step lookahead: immediate reward plus discounted next-state value
                action_value = reward + pow(
                    self.gamma,
                    self.time_step * state.self_state.v_pref) * value
                self.action_values.append(action_value)

                if action_value > max_value:
                    max_value = action_value
                    max_action = action

        if self.phase == 'train':
            self.last_state = self.transform(state)
        self.last_activation = max_activations
        return max_action
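
Code example #26 propagates the state once per candidate action, evaluates all propagated states in a single batched forward pass, and scores each action as the immediate reward plus the discounted value of the resulting state. A minimal sketch of that scoring step, with a toy linear network standing in for the value model; gamma, time_step and v_pref defaults are illustrative:

import torch

def score_actions(next_states, rewards, value_net,
                  gamma=0.9, time_step=0.25, v_pref=1.0):
    """Index of the action maximising reward + gamma**(dt * v_pref) * V(next state)."""
    # one batched forward pass over all propagated states
    values = value_net(torch.cat(next_states, dim=0)).squeeze(-1)
    discount = pow(gamma, time_step * v_pref)
    scores = [r + discount * v.item() for r, v in zip(rewards, values)]
    return max(range(len(scores)), key=scores.__getitem__)

# toy usage: three candidate actions, 4-dimensional states, a linear value net
value_net = torch.nn.Linear(4, 1)
next_states = [torch.randn(1, 4) for _ in range(3)]
print(score_actions(next_states, [0.0, -0.1, 0.05], value_net))
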
Code example #27
0
File: multi_human_rl.py Project: SJWang2015/AEMCARL
    def predict(self, state):
        """
        A base class for all methods that takes pairwise joint state as input to value network.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length)

        """
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError(
                'Epsilon attribute has to be set in training phase')

        if self.reach_destination(state):
            return ActionXY(
                0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(state.self_state.v_pref)

        occupancy_maps = None
        probability = np.random.random()
        if self.phase == 'train' and probability < self.epsilon:
            max_action = self.action_space[np.random.choice(
                len(self.action_space))]
        else:
            self.action_values = list()
            max_value = float('-inf')
            max_action = None
            need_build_map = True
            single_step_cnt = 0
            for action in self.action_space:
                next_self_state = self.propagate(state.self_state, action)

                if self.query_env:
                    next_human_states, reward, done, info = self.env.onestep_lookahead(
                        action, need_build_map)
                    need_build_map = False
                else:
                    next_human_states = [
                        self.propagate(
                            human_state,
                            ActionXY(human_state.vx, human_state.vy))
                        for human_state in state.human_states
                    ]
                    self.map.build_map_cpu(next_human_states)
                    collision_probability = self.map.compute_occupied_probability(
                        next_self_state)
                    reward = self.compute_reward(next_self_state,
                                                 next_human_states,
                                                 collision_probability)

                batch_next_states = torch.cat([
                    torch.Tensor([next_self_state + next_human_state]).to(
                        self.device) for next_human_state in next_human_states
                ],
                                              dim=0)
                rotated_batch_input = self.rotate(batch_next_states).unsqueeze(
                    0)
                if self.with_om:
                    if occupancy_maps is None:
                        occupancy_maps = self.build_occupancy_maps(
                            next_human_states).unsqueeze(0)
                    rotated_batch_input = torch.cat(
                        [rotated_batch_input, occupancy_maps], dim=2)
                # VALUE UPDATE

                value, score = self.model(rotated_batch_input)
                next_state_value = value.data.item()

                value = reward + pow(
                    self.gamma, self.time_step *
                    state.self_state.v_pref) * next_state_value

                self.action_values.append(value)
                # only save the step count of the chosen action
                if value > max_value:
                    max_value = value
                    max_action = action
                    if self.use_rate_statistic:
                        single_step_cnt = self.model.get_step_cnt()

            if max_action is None:
                raise ValueError('Value network is not well trained. ')
            if self.use_rate_statistic:
                if single_step_cnt < len(self.statistic_info):
                    self.statistic_info[single_step_cnt] += 1
                else:
                    print("step count too large!!")

        if self.phase == 'train':
            self.last_state = self.transform(state)
        return max_action
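
Both predict variants above share the same epsilon-greedy skeleton: during training a random action is taken with probability epsilon, otherwise every candidate is scored and the argmax is returned. A compact sketch of that pattern; score_fn is a hypothetical stand-in for the one-step lookahead evaluation:

import numpy as np

def epsilon_greedy(action_space, score_fn, phase='train', epsilon=0.1):
    """Random action with probability epsilon during training, otherwise the argmax."""
    if phase == 'train' and np.random.random() < epsilon:
        return action_space[np.random.choice(len(action_space))]
    return max(action_space, key=score_fn)

# toy usage: actions are speeds and the score prefers slower motion
print(epsilon_greedy([0.0, 0.5, 1.0], score_fn=lambda a: -a, phase='test'))
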
Code example #28
0
File: tree_searchrl.py Project: nubot-nudt/SG-DQN
    def V_planning(self, state, depth, width):
        """ Plans n steps into future based on state action value function. Computes the value for the current state as well as the trajectories
        defined as a list of (state, action, reward) triples
        """
        robot_state_batch = state[0]
        human_state_batch = state[1]
        if human_state_batch is None:
            if depth == 0:
                q_value = torch.Tensor(self.value_estimator(state))
                max_action_value, max_action_indexes = torch.max(q_value,
                                                                 dim=1)
                trajs = []
                for i in range(robot_state_batch.shape[0]):
                    cur_state = (robot_state_batch[i, :, :].unsqueeze(0), None)
                    trajs.append([(cur_state, None, None)])
                return max_action_value, max_action_indexes, trajs
            else:
                q_value = torch.Tensor(self.value_estimator(state))
                max_action_value, max_action_indexes = torch.topk(q_value,
                                                                  width,
                                                                  dim=1)
            # no humans in the state: only the robot needs to be propagated
            next_robot_state_batch = None
            next_human_state_batch = None
            # per-action reward estimates; every entry is overwritten below
            reward_est = torch.zeros(state[0].shape[0], width)

            for i in range(robot_state_batch.shape[0]):
                cur_state = (robot_state_batch[i, :, :].unsqueeze(0), None)
                next_human_state = None
                for j in range(width):
                    cur_action = self.action_space[max_action_indexes[i][j]]
                    next_robot_state = self.compute_next_robot_state(
                        cur_state[0], cur_action)
                    if next_robot_state_batch is None:
                        next_robot_state_batch = next_robot_state
                    else:
                        next_robot_state_batch = torch.cat(
                            (next_robot_state_batch, next_robot_state), dim=0)
                    reward_est[i][
                        j], _ = self.reward_estimator.estimate_reward_on_predictor(
                            tensor_to_joint_state(cur_state),
                            tensor_to_joint_state(
                                (next_robot_state, next_human_state)))

            next_state_batch = (next_robot_state_batch, next_human_state_batch)
            if self.planning_depth - depth >= 2 and self.planning_depth > 2:
                cur_width = 1
            else:
                cur_width = int(self.planning_width / 2)
            next_values, next_action_indexes, next_trajs = self.V_planning(
                next_state_batch, depth - 1, cur_width)
            next_values = next_values.view(state[0].shape[0], width)
            returns = (reward_est + self.get_normalized_gamma() * next_values +
                       max_action_value) / 2

            max_action_return, max_action_index = torch.max(returns, dim=1)
            trajs = []
            max_returns = []
            max_actions = []
            for i in range(robot_state_batch.shape[0]):
                cur_state = (robot_state_batch[i, :, :].unsqueeze(0), None)
                action_id = max_action_index[i]
                trajs_id = i * width + action_id
                action = max_action_indexes[i][action_id]
                next_traj = next_trajs[trajs_id]
                trajs.append([(cur_state, action, reward_est)] + next_traj)
                max_returns.append(max_action_return[i].data)
                max_actions.append(action)
            max_returns = torch.tensor(max_returns)
            return max_returns, max_actions, trajs
        else:
            if depth == 0:
                q_value = torch.Tensor(self.value_estimator(state))
                max_action_value, max_action_indexes = torch.max(q_value,
                                                                 dim=1)
                trajs = []
                for i in range(robot_state_batch.shape[0]):
                    cur_state = (robot_state_batch[i, :, :].unsqueeze(0),
                                 human_state_batch[i, :, :].unsqueeze(0))
                    trajs.append([(cur_state, None, None)])
                return max_action_value, max_action_indexes, trajs
            else:
                q_value = torch.Tensor(self.value_estimator(state))
                max_action_value, max_action_indexes = torch.topk(q_value,
                                                                  width,
                                                                  dim=1)
            action_stay = []
            for i in range(robot_state_batch.shape[0]):
                if self.kinematics == "holonomic":
                    action_stay.append(ActionXY(0, 0))
                else:
                    action_stay.append(ActionRot(0, 0))
            _, pre_next_state = self.state_predictor(state, action_stay)
            next_robot_state_batch = None
            next_human_state_batch = None
            # per-action reward estimates; every entry is overwritten below
            reward_est = torch.zeros(state[0].shape[0], width)

            for i in range(robot_state_batch.shape[0]):
                cur_state = (robot_state_batch[i, :, :].unsqueeze(0),
                             human_state_batch[i, :, :].unsqueeze(0))
                next_human_state = pre_next_state[i, :, :].unsqueeze(0)
                for j in range(width):
                    cur_action = self.action_space[max_action_indexes[i][j]]
                    next_robot_state = self.compute_next_robot_state(
                        cur_state[0], cur_action)
                    if next_robot_state_batch is None:
                        next_robot_state_batch = next_robot_state
                        next_human_state_batch = next_human_state
                    else:
                        next_robot_state_batch = torch.cat(
                            (next_robot_state_batch, next_robot_state), dim=0)
                        next_human_state_batch = torch.cat(
                            (next_human_state_batch, next_human_state), dim=0)
                    reward_est[i][
                        j], _ = self.reward_estimator.estimate_reward_on_predictor(
                            tensor_to_joint_state(cur_state),
                            tensor_to_joint_state(
                                (next_robot_state, next_human_state)))
            next_state_batch = (next_robot_state_batch, next_human_state_batch)
            if self.planning_depth - depth >= 2 and self.planning_depth > 2:
                cur_width = 1
            else:
                cur_width = int(self.planning_width / 2)
            next_values, next_action_indexes, next_trajs = self.V_planning(
                next_state_batch, depth - 1, cur_width)
            next_values = next_values.view(state[0].shape[0], width)
            returns = (reward_est + self.get_normalized_gamma() * next_values +
                       max_action_value) / 2

            max_action_return, max_action_index = torch.max(returns, dim=1)
            trajs = []
            max_returns = []
            max_actions = []
            for i in range(robot_state_batch.shape[0]):
                cur_state = (robot_state_batch[i, :, :].unsqueeze(0),
                             human_state_batch[i, :, :].unsqueeze(0))
                action_id = max_action_index[i]
                trajs_id = i * width + action_id
                action = max_action_indexes[i][action_id]
                next_traj = next_trajs[trajs_id]
                trajs.append([(cur_state, action, reward_est)] + next_traj)
                max_returns.append(max_action_return[i].data)
                max_actions.append(action)
            max_returns = torch.tensor(max_returns)
            return max_returns, max_actions, trajs
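
V_planning expands the top-width actions at each level, recurses with a reduced depth and width, and blends the estimated one-step return with the raw Q-value of the chosen action, i.e. (reward + gamma * next_value + q_value) / 2. A toy scalar version of that recursion, stripped of the tensor bookkeeping; q_fn, step_fn and reward_fn are hypothetical stand-ins for the value network, the state predictor and the reward estimator:

def v_planning(state, depth, width, q_fn, step_fn, reward_fn, gamma=0.9):
    """Depth- and width-limited lookahead over a discrete action space (toy version)."""
    q_values = q_fn(state)                       # one score per discrete action
    if depth == 0:
        return max(q_values)
    # keep only the top-`width` actions, then recurse with a narrower search
    top = sorted(range(len(q_values)), key=q_values.__getitem__, reverse=True)[:width]
    best = float('-inf')
    for a in top:
        next_state = step_fn(state, a)
        next_value = v_planning(next_state, depth - 1, max(1, width // 2),
                                q_fn, step_fn, reward_fn, gamma)
        # blend the one-step bootstrap with the raw Q estimate, as in the code above
        best = max(best, (reward_fn(state, a) + gamma * next_value + q_values[a]) / 2)
    return best

# tiny usage with two actions on an integer "state"
print(v_planning(0, depth=2, width=2,
                 q_fn=lambda s: [0.1 * s, 0.2],
                 step_fn=lambda s, a: s + a,
                 reward_fn=lambda s, a: -0.01))
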
Code example #29
0
    def predict(self, state):
        """
        Takes the pairwise joint state as input to the value network and outputs an action.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length).
        If with_costmap is True, dangerous actions proposed by the value network are screened out to avoid static obstacles on the map.
        """
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError(
                'Epsilon attribute has to be set in training phase')

        if self.reach_destination(state):
            return ActionXY(
                0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(state.self_state.v_pref)

        occupancy_maps = None
        probability = np.random.random()
        if self.phase == 'train' and probability < self.epsilon:
            max_action = self.action_space[np.random.choice(
                len(self.action_space))]
        else:
            self.action_values = list()
            max_value = float('-inf')
            max_action = None
            for action in self.action_space:
                next_self_state = self.propagate(state.self_state, action)
                next_self_state_further = self.propagate_more(
                    state.self_state, action)

                # abort actions which will probably cause collision with static obstacles in the costmap
                if self.with_costmap is True:
                    cost = self.compute_cost(next_self_state_further)
                    if cost > 0:
                        print("********** Abort action:", action,
                              " with cost:", cost,
                              " that will hit the obstacles.")
                        continue

                if self.query_env:
                    next_human_states, reward, done, info = self.env.onestep_lookahead(
                        action)
                else:
                    next_human_states = [
                        self.propagate(
                            human_state,
                            ActionXY(human_state.vx, human_state.vy))
                        for human_state in state.human_states
                    ]
                    reward = self.compute_reward(next_self_state,
                                                 next_human_states)
                batch_next_states = torch.cat([
                    torch.Tensor([next_self_state + next_human_state]).to(
                        self.device) for next_human_state in next_human_states
                ],
                                              dim=0)
                rotated_batch_input = self.rotate(batch_next_states).unsqueeze(
                    0)

                if self.with_om:
                    if occupancy_maps is None:
                        occupancy_maps = self.build_occupancy_maps(
                            next_human_states).unsqueeze(0)
                    rotated_batch_input = torch.cat(
                        [rotated_batch_input, occupancy_maps], dim=2)
                # VALUE UPDATE
                next_state_value = self.model(rotated_batch_input).data.item()
                value = reward + pow(
                    self.gamma, self.time_step *
                    state.self_state.v_pref) * next_state_value
                self.action_values.append(value)
                if value > max_value:
                    max_value = value
                    max_action = action
                    # print("********** choose action:", action)
                    # print("********** cost:", cost)

            if max_action is None:
                # if the robot is trapped, choose the turning action to escape
                max_action = ActionRot(0, 0.78)
                print("The robot is trapped. Rotate in place to escape......")

        if self.phase == 'train':
            self.last_state = self.transform(state)

        return max_action
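
The costmap variant screens each candidate action by propagating the robot a bit further along it and discarding the action if the resulting pose has non-zero cost; if every action is discarded, the robot falls back to rotating in place. A minimal sketch of that screening rule, with a hypothetical cost_at lookup standing in for the costmap query:

def screen_actions(actions, propagate_fn, cost_at, fallback):
    """Keep actions whose propagated pose has zero cost; fall back if none survive."""
    safe = [a for a in actions if cost_at(propagate_fn(a)) <= 0]
    return safe if safe else [fallback]

# toy usage: 1-D displacements with obstacles beyond x = 1.0
print(screen_actions([0.2, 0.6, 1.5],
                     propagate_fn=lambda a: a,            # next pose = displacement
                     cost_at=lambda x: 1.0 if x > 1.0 else 0.0,
                     fallback=0.0))                        # stand-in for rotate-in-place
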
Code example #30
0
    def step(self, action, update=True):
        """
        Compute actions for all agents, detect collision, update environment and return (ob, reward, done, info)
        """
        action = self.robot.policy.clip_action(action, self.robot.v_pref)

        if self.robot.kinematics == 'unicycle':
            self.desiredVelocity[0] = np.clip(
                self.desiredVelocity[0] + action.v, -self.robot.v_pref,
                self.robot.v_pref)
            action = ActionRot(self.desiredVelocity[0], action.r)

        human_actions = []  # a list of all humans' actions
        for i, human in enumerate(self.humans):
            # observation for humans is always coordinates
            ob = []
            for other_human in self.humans:
                if other_human != human:
                    # Chance for one human to be blind to some other humans
                    if self.random_unobservability and i == 0:
                        if (np.random.random() <= self.unobservable_chance
                                or not self.detect_visible(human, other_human)):
                            ob.append(self.dummy_human.get_observable_state())
                        else:
                            ob.append(other_human.get_observable_state())
                    # Else detectable humans are always observable to each other
                    elif self.detect_visible(human, other_human):
                        ob.append(other_human.get_observable_state())
                    else:
                        ob.append(self.dummy_human.get_observable_state())

            if self.robot.visible:
                # Chance for one human to be blind to robot
                if self.random_unobservability and i == 0:
                    if (np.random.random() <= self.unobservable_chance
                            or not self.detect_visible(human, self.robot)):
                        ob += [self.dummy_robot.get_observable_state()]
                    else:
                        ob += [self.robot.get_observable_state()]
                # Else human will always see visible robots
                elif self.detect_visible(human, self.robot):
                    ob += [self.robot.get_observable_state()]
                else:
                    ob += [self.dummy_robot.get_observable_state()]

            human_actions.append(human.act(ob))

        # compute reward and episode info
        reward, done, episode_info = self.calc_reward(action)

        # apply action and update all agents
        self.robot.step(action)
        for i, human_action in enumerate(human_actions):
            self.humans[i].step(human_action)
        self.global_time += self.time_step  # max episode length=time_limit/time_step

        # compute the observation
        ob = self.generate_ob(reset=False)

        info = {'info': episode_info}

        # Update all humans' goals randomly midway through episode
        if self.random_goal_changing:
            if self.global_time % 5 == 0:
                self.update_human_goals_randomly()

        # Update a specific human's goal once it has reached its original goal
        if self.end_goal_changing:
            for human in self.humans:
                if norm(
                    (human.gx - human.px, human.gy - human.py)) < human.radius:
                    self.update_human_goal(human)

        return ob, reward, done, info
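
In step(), a human observes another agent's true state only if detect_visible succeeds, and the first human may additionally be blinded at random; otherwise a dummy state is substituted so the observation keeps a fixed length. A self-contained sketch of that substitution rule (detect_visible and the state values below are stand-ins, not the environment's own types):

import numpy as np

def observed_state(observer, other, dummy, detect_visible,
                   may_be_blind=False, unobservable_chance=0.0):
    """Return the other agent's observable state if visible, else a dummy placeholder."""
    if may_be_blind and np.random.random() <= unobservable_chance:
        return dummy
    return other if detect_visible(observer, other) else dummy

# toy usage: scalar positions, visibility within a 5 m range, dummy encoded as None
def visible(a, b):
    return abs(a - b) < 5.0

print(observed_state(0.0, 3.0, None, visible))   # -> 3.0
print(observed_state(0.0, 9.0, None, visible))   # -> None
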