Example #1
    def possible_actions(self):
        s = self.state
        # Return the ids of the possible actions.
        # Find the clear blocks (nothing on top of them).
        empty_blocks = [b for b in range(self.blocks) if self.clear(b, s)]
        # The condition excludes picking up a block that already sits on the
        # table only to put it back on the table.
        actions = [
            [a, b] for a in empty_blocks for b in empty_blocks
            if not self.destination_is_table(a, b) or not self.on_table(a, s)
        ]
        return np.array(
            [vec2id(x, [self.blocks, self.blocks]) for x in actions])
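Every example here leans on vec2id, rlpy's helper for flattening a discrete
vector into a single integer id. For reference, a minimal sketch of the
mixed-radix encoding it plausibly implements (the digit order here is an
assumption; rlpy's actual convention may differ):

def vec2id_sketch(x, limits):
    # Map a vector x with x[i] in [0, limits[i]) to a unique integer,
    # treating x as the digits of a mixed-radix number.
    out = 0
    for xi, limit in zip(x, limits):
        out = out * limit + xi
    return out

# e.g. with limits [6, 6], the pair [2, 5] maps to 2 * 6 + 5 = 17
assert vec2id_sketch([2, 5], [6, 6]) == 17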
Example #2
    def _hash_state(self, s):
        """
        Returns a unique id for a given state.
        Essentially, enumerate all possible states and return the ID associated
        with *s*.

        Under the hood: first discretize the continuous dimensions into bins
        as necessary, then map the binned state to an integer.
        """
        ds = self.bin_state(s)
        return vec2id(ds, self.bins_per_dim)
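For intuition, a hedged sketch of the two-step hash: discretize each
continuous dimension into a bin index, then collapse the bin vector with a
mixed-radix encoding such as vec2id_sketch above. The bounds and bin counts
below are made up for illustration:

def bin_state_sketch(s, lows, highs, bins_per_dim):
    # Clip each dimension to its range, then map it to a bin index.
    ds = []
    for si, lo, hi, n in zip(s, lows, highs, bins_per_dim):
        frac = (min(max(si, lo), hi) - lo) / (hi - lo)
        ds.append(min(int(frac * n), n - 1))
    return ds

ds = bin_state_sketch([0.3, -0.7], lows=[-1, -1], highs=[1, 1],
                      bins_per_dim=[4, 4])
state_id = vec2id_sketch(ds, [4, 4])  # unique id for the binned state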
Example #3
    def bellman_backup(self, s, a, ns_samples, policy=None):
        """Applied Bellman Backup to state-action pair s,a
        i.e. Q(s,a) = E[r + discount_factor * V(s')]
        If policy is given then Q(s,a) =  E[r + discount_factor * Q(s',pi(s')]

        Args:
            s (ndarray):        The current state
            a (int):            The action taken in state s
            ns_samples(int):    Number of next state samples to use.
            policy (Policy):    Policy object to use for sampling actions.
        """
        Q = self.representation.q_look_ahead(s, a, ns_samples, policy)
        s_index = vec2id(
            self.representation.bin_state(s), self.representation.bins_per_dim
        )
        self.representation.weight[a, s_index] = Q
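The expectation in the docstring is typically estimated by sampling next
states. A minimal sketch of what q_look_ahead plausibly computes, ignoring
the optional policy argument for brevity and taking a hypothetical one-step
generative model as a callable (the real rlpy method delegates sampling to
the domain):

def q_look_ahead_sketch(model, V, s, a, ns_samples, discount):
    # Monte-Carlo estimate of E[r + discount * V(s')], where
    # model(s, a) -> (r, s', terminal) is an assumed one-step sampler.
    total = 0.0
    for _ in range(ns_samples):
        r, ns, terminal = model(s, a)
        total += r if terminal else r + discount * V(ns)
    return total / ns_samples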
Example #4
File: pst.py Project: kngwyu/rlpy3
    def vecList2idHelper(self, x, actionIDs, ind, curActionList, maxValue,
                         limits):
        """
        Helper method for vecList2id().

        :returns: a list of unique ids, one for each combination drawn from this list of lists.

        See vecList2id()

        """
        # x[ind] is one of the lists, e.g. [0, 2] or [1, 2]
        for curAction in x[ind]:
            partialActionAssignment = curActionList[:]
            partialActionAssignment.append(curAction)
            # If we have reached the final list, the assignment is complete
            if ind == len(x) - 1:
                # e.g. assignment [0, 1, 0, 2] with limits [3, 3, 3, 3]
                actionIDs.append(vec2id(partialActionAssignment, limits))
            else:
                self.vecList2idHelper(x, actionIDs, ind + 1,
                                      partialActionAssignment, maxValue,
                                      limits)
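The recursion enumerates the cross product of the per-slot action lists. An
equivalent non-recursive formulation (using vec2id_sketch from Example #1 in
place of vec2id) makes that explicit:

from itertools import product

def vec_list_to_ids(x, limits):
    # One id per combination drawn from the lists in x; for example,
    # x = [[0, 2], [1, 2]] yields ids for [0, 1], [0, 2], [2, 1], [2, 2].
    return [vec2id_sketch(list(combo), limits) for combo in product(*x)]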
Example #5
    def pi2(self, s, terminal, p_actions):
        domain = self.representation.domain
        if class_name(domain) not in self.SUPPORTED_DOMAINS:
            raise ValueError("ERROR: There is no fixed policy defined for %s" %
                             class_name(domain))

        if class_name(domain) == "GridWorld":
            # Actions are Up, Down, Left, Right
            if self.policy_name not in self.GRID_WORLD_POLICY_NAMES:
                raise ValueError(
                    "Error: There is no GridWorld policy with name %s" %
                    self.policy_name)

            if self.policy_name == "cw_circle":
                # Cycle through the actions, starting with 0, so the agent
                # moves in a clockwise loop
                if not hasattr(self, "curAction"):
                    # it doesn't exist yet, so initialize it (the loop
                    # below advances it if needed)
                    self.curAction = 0
                while self.curAction not in domain.possible_actions(s):
                    # The numeric action order (up, down, left, right) does
                    # not match the clockwise cycle, so step through the
                    # cycle explicitly.
                    if self.curAction == 0:  # up
                        self.curAction = 3
                    elif self.curAction == 3:  # right
                        self.curAction = 1
                    elif self.curAction == 1:  # down
                        self.curAction = 2
                    elif self.curAction == 2:  # left
                        self.curAction = 0
                    else:
                        raise ValueError(
                            "Something terrible happened..."
                            "got an invalid action on GridWorld Fixed Policy")
            elif self.policy_name == "ccw_circle":
                # Cycle through the actions, starting with 1, so the agent
                # moves in a counter-clockwise loop
                if not hasattr(self, "curAction"):
                    # it doesn't exist yet, so initialize it
                    self.curAction = 1
                while self.curAction not in domain.possible_actions(s):
                    # Same as above, but stepping through the cycle
                    # counter-clockwise: right -> up -> left -> down.
                    if self.curAction == 3:  # right
                        self.curAction = 0
                    elif self.curAction == 0:  # up
                        self.curAction = 2
                    elif self.curAction == 2:  # left
                        self.curAction = 1
                    elif self.curAction == 1:  # down
                        self.curAction = 3
                    else:
                        raise ValueError(
                            "Something terrible happened..."
                            "got an invalid action on GridWorld Fixed Policy")
            else:
                raise ValueError(
                    "Error: No policy defined with name {}, but listed"
                    " in GRID_WORLD_POLICY_NAMES".format(self.policy_name))
            return self.curAction

        if class_name(domain) == "InfCartPoleBalance":
            # Fixed policy: push the pendulum against the direction of
            # thetadot.
            theta, thetadot = s
            if thetadot > 0:
                return 2
            else:
                return 0
        if class_name(domain) == "BlocksWorld":
            # Fixed near-optimal policy for BlocksWorld:
            # identify the top of the correctly built tower and stack the
            # next block onto it, occasionally taking a random action
            # instead (with the probability used below).

            # Random Action with some probability
            # TODO fix is_terminal use here
            if self.random_state.rand() < 0.3 or domain.is_terminal():
                return self.random_state.choice(domain.possible_actions(s))

            # Non-random policy:
            # next_block is the block that should be stacked on top of the tower;
            # wrong_block is the highest block stacked on top of next_block;
            # wrong_tower_block is the highest block stacked on top of the tower.

            # Number of blocks assumed to be stacked correctly so far.
            correct_tower_size = 0
            while True:
                # Check the next block
                block = correct_tower_size
                if (block == 0 and domain.on_table(block, s)) or domain.on(
                        block, block - 1, s):
                    # This block is in the right position; check the next one
                    correct_tower_size += 1
                else:
                    # The block is in the wrong place.
                    # 1. If the top of the tower is not clear, take its top
                    #    block and put it on the table.
                    # 2. If this misplaced block is not clear, take the top
                    #    block of its stack and put it on the table.
                    # 3. Otherwise move this block onto the tower.

                    ###################
                    # 1
                    ###################
                    # If the first block is misplaced, the tower top is the
                    # table, which is clear by definition.
                    if block != 0:
                        ideal_tower_top = block - 1
                        tower_top = domain.towerTop(ideal_tower_top, s)
                        if tower_top != ideal_tower_top:
                            # A wrong block sits there, so put it on the
                            # table first
                            return (
                                # put the top of the tower on the table since
                                # it is not correct
                                domain.getActionPutAonTable(tower_top))
                    ###################
                    # 2
                    ###################
                    block_top = domain.towerTop(block, s)
                    if block_top != block:
                        # The block to be moved is not clear; clear it first
                        return domain.getActionPutAonTable(block_top)
                    ###################
                    # 3
                    ###################
                    if block == 0:
                        return domain.getActionPutAonTable(block)
                    else:
                        return domain.getActionPutAonB(block, block - 1)
        if class_name(domain) == "IntruderMonitoring":
            # Each UAV assign themselves to a target
            # Each UAV finds the closest danger zone to its target and go towards there.
            # If UAVs_num > Target, the rest will hold position
            # Move all agents based on the taken action
            agents = np.array(s[:domain.NUMBER_OF_AGENTS * 2].reshape(-1, 2))
            targets = np.array(s[domain.NUMBER_OF_AGENTS * 2:].reshape(-1, 2))
            zones = domain.danger_zone_locations
            # Default action is hold
            actions = np.ones(len(agents), dtype=int) * 4
            planned_agents_num = min(len(agents), len(targets))
            for i in range(planned_agents_num):
                # Find the closest zone (Manhattan distance) to the
                # corresponding target
                target = targets[i, :]
                distances = np.sum(
                    np.abs(np.tile(target, (len(zones), 1)) - zones), axis=1)
                z_row, z_col = zones[np.argmin(distances), :]
                # find the valid action
                a_row, a_col = agents[i, :]
                a = 4  # hold as a default action
                if a_row > z_row:
                    a = 0  # up
                if a_row < z_row:
                    a = 1  # down
                if a_col > z_col:
                    a = 2  # left
                if a_col < z_col:
                    a = 3  # right
                actions[i] = a
            #                print "Agent=", agents[i,:]
            #                print "Target", target
            #                print "Zone", zones[argmin(distances),:]
            #                print "Action", a
            #                print '============'
            return vec2id(actions, np.ones(len(agents), dtype=int) * 5)
        if class_name(domain) == "SystemAdministrator":
            # Select a broken computer and reset it
            brokenComputers = np.where(s == 0)[0]
            if len(brokenComputers):
                return self.random_state.choice(brokenComputers)
            else:
                return domain.computers_num
        if class_name(domain) == "MountainCar":
            # Accelerate in the direction of the current velocity to pump
            # energy into the system
            # WORK IN PROGRESS
            x, xdot = s
            if xdot > 0:
                return 2
            else:
                return 0
        if class_name(domain) == "PST":
            # One UAV stays at the comms area and the other n-1 stay at the
            # target area; whenever a UAV's fuel is too low to reach the
            # base, it moves back.
            s = domain.state2Struct(s)
            uavs = domain.NUM_UAV
            return vec2id(np.zeros(uavs), np.ones(uavs) * 3)
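Both the IntruderMonitoring and PST branches return a joint action encoded
with vec2id, using one limit entry per agent (5 moves per UAV, or 3 PST
actions per UAV). A hedged sketch of the inverse, matching the encoding
assumed in vec2id_sketch above:

def id2vec_sketch(action_id, limits):
    # Recover the per-agent actions from a joint-action id.
    out = []
    for limit in reversed(limits):
        out.append(action_id % limit)
        action_id //= limit
    return list(reversed(out))

assert id2vec_sketch(vec2id_sketch([1, 4], [5, 5]), [5, 5]) == [1, 4]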
Example #6
    def getActionPutAonB(self, A, B):
        # Encode "put block A on block B" as the pair [A, B].
        return vec2id(np.array([A, B]), [self.blocks, self.blocks])
Example #7
    def getActionPutAonTable(self, A):
        # Encode "put block A on the table" as the degenerate pair [A, A].
        return vec2id(np.array([A, A]), [self.blocks, self.blocks])
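Note that both action constructors share one code space of size blocks ** 2:
the diagonal pairs [A, A] are reserved to mean "put A on the table", since a
block is never stacked on itself. This is also why possible_actions in
Example #1 filters out pairs where A already sits on the table and the
destination is the table. With blocks = 6 (assumed for illustration):

# vec2id([2, 5], [6, 6]) -> id for "put block 2 on block 5"
# vec2id([3, 3], [6, 6]) -> id for "put block 3 on the table"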
Example #8
def test_transitions():
    """
    Ensure that actions result in expected state transition behavior.
    Test:
        1) Actuator and sensor failure, associated lack of reward
        2) Refuel
        3) Repair
        4) Presence of reward iff one UAV is at COMMS *and* another is at SURVEIL
        5) UAV Crash because of lack of fuel

    """
    NUM_UAV = 2
    nPosActions = 3  # = UAVAction.SIZE
    actionLimits = nPosActions * np.ones(NUM_UAV, dtype="int")
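    # With 3 actions per UAV, vec2id(actionVec, actionLimits) packs the two
    # per-UAV actions into a single joint-action id in [0, 9); domain.step
    # consumes that scalar id.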

    # Test p=1 sensor failure when not at base
    domain = PST(NUM_UAV=NUM_UAV)
    _ = domain.s0()

    domain.P_ACT_FAIL = 0.0
    domain.P_SENSOR_FAIL = 1.0

    locs = np.array([UAVLocation.COMMS, UAVLocation.COMMS])
    fuel = np.array([10, 10])
    act = np.array([ActuatorState.RUNNING, ActuatorState.RUNNING])
    sens = np.array([SensorState.RUNNING, SensorState.RUNNING])
    actionVec = np.array([UAVAction.LOITER, UAVAction.LOITER])
    a = vec2id(actionVec, actionLimits)
    domain.state = domain.properties2StateVec(locs, fuel, act, sens)
    r, ns, t, possA = domain.step(a)
    # Assert that the only changes were the fuel reduction and the sensor
    # failure
    assert np.array_equiv(
        ns, domain.properties2StateVec(locs, fuel - 1, act, np.array([0, 0])))

    # Test movement (location change)
    actionVec = np.array([UAVAction.ADVANCE, UAVAction.ADVANCE])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs + 1, fuel - 2, act, np.array([0, 0])))

    # Test p=1 actuator failure when not at base
    domain.FUEL_BURN_REWARD_COEFF = 0.0
    domain.MOVE_REWARD_COEFF = 0.0
    domain.P_ACT_FAIL = 1.0
    actionVec = np.array([UAVAction.RETREAT, UAVAction.LOITER])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs + [0, 1], fuel - 3, np.array([0, 0]),
                                   np.array([0, 0])),
    )

    # Test that no reward was received since the sensor is broken
    assert r == 0

    # Test Refuel
    # After the action below, the UAVs will be at locs + [-1, 1], i.e. REFUEL
    # and SURVEIL respectively, with 4 fuel units consumed. A UAV must LOITER
    # at REFUEL to actually refill, though
    actionVec = np.array([UAVAction.RETREAT, UAVAction.RETREAT])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    locs = np.array([UAVLocation.REFUEL, UAVLocation.COMMS])
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs, fuel - 4, np.array([0, 0]),
                                   np.array([0, 0])),
    )
    # Refuel occurs after loitering
    actionVec = np.array([UAVAction.LOITER, UAVAction.RETREAT])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    fuel = np.array([10, 5])
    locs = np.array([UAVLocation.REFUEL, UAVLocation.REFUEL])
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs, fuel, np.array([0, 0]),
                                   np.array([0, 0])))

    # Test repair [note UAV 2 was never refueled since it never loitered]
    actionVec = np.array([UAVAction.RETREAT, UAVAction.RETREAT])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs - 1, fuel - 1, np.array([0, 0]),
                                   np.array([0, 0])),
    )

    # Repair only occurs after loiter [no fuel burned for BASE/REFUEL loiter]
    actionVec = np.array([UAVAction.LOITER, UAVAction.LOITER])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs - 1, fuel - 1, np.array([1, 1]),
                                   np.array([1, 1])),
    )

    # Test comms but no surveillance
    domain.P_ACT_FAIL = 0.0
    domain.P_SENSOR_FAIL = 0.0
    actionVec = np.array([UAVAction.ADVANCE, UAVAction.ADVANCE])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs, fuel - 2, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    actionVec = np.array([UAVAction.ADVANCE, UAVAction.ADVANCE])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs + 1, fuel - 3, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    assert r == 0  # no reward because only have comms, no surveil

    # Add 2 units of extra fuel to each UAV, then move
    domain.state = domain.properties2StateVec(locs + 1, fuel - 1,
                                              np.array([1, 1]),
                                              np.array([1, 1]))

    # Test surveillance but no comms
    actionVec = np.array([UAVAction.ADVANCE, UAVAction.ADVANCE])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs + 2, fuel - 2, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    assert r == 0  # no reward because have only surveil, no comms

    # Test comms and surveillance
    actionVec = np.array([UAVAction.RETREAT, UAVAction.LOITER])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    locs = np.array([UAVLocation.COMMS, UAVLocation.SURVEIL])
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs, fuel - 3, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    assert r == 0
    # Reward is based on "s", not "ns"; pick up the reward here
    actionVec = np.array([UAVAction.LOITER, UAVAction.LOITER])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    locs = np.array([UAVLocation.COMMS, UAVLocation.SURVEIL])
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs, fuel - 4, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    assert r == domain.SURVEIL_REWARD

    # Test crash
    # Since the reward is based on "s", not "ns", we also pick up the reward
    # from the previous step
    actionVec = np.array([UAVAction.RETREAT, UAVAction.RETREAT])
    a = vec2id(actionVec, actionLimits)
    r, ns, is_terminated, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs - 1, fuel - 5, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    assert is_terminated
    assert r == domain.CRASH_REWARD + domain.SURVEIL_REWARD