def possible_actions(self):
    """Return the ids of all actions applicable in the current state.

    An action is a (source, destination) pair of blocks; putting a block
    "on itself" encodes moving it to the table.
    """
    s = self.state
    # Only blocks with nothing stacked on top of them can be moved (or
    # serve as a destination).
    movable = [blk for blk in range(self.blocks) if self.clear(blk, s)]
    pairs = []
    for src in movable:
        for dst in movable:
            # Putting a block on the table is pointless when it already
            # rests on the table, so that pair is excluded.
            if self.destination_is_table(src, dst) and self.on_table(src, s):
                continue
            pairs.append([src, dst])
    return np.array(
        [vec2id(pair, [self.blocks, self.blocks]) for pair in pairs])
def _hash_state(self, s):
    """Return a unique integer id for state *s*.

    Conceptually enumerates all possible states and returns the id
    associated with *s*: continuous dimensions are first discretized
    into bins, then the bin vector is encoded as a single integer.
    """
    binned = self.bin_state(s)
    return vec2id(binned, self.bins_per_dim)
def bellman_backup(self, s, a, ns_samples, policy=None):
    """Apply a Bellman backup to the state-action pair (s, a).

    Computes Q(s, a) = E[r + discount_factor * V(s')]; if *policy* is
    given, Q(s, a) = E[r + discount_factor * Q(s', pi(s'))], and stores
    the result in the tabular weight entry for (a, s).

    Args:
        s (ndarray): The current state.
        a (int): The action taken in state s.
        ns_samples (int): Number of next state samples to use.
        policy (Policy): Policy object to use for sampling actions.
    """
    rep = self.representation
    backed_up_q = rep.q_look_ahead(s, a, ns_samples, policy)
    # Index of the discretized state in the tabular weight matrix.
    state_id = vec2id(rep.bin_state(s), rep.bins_per_dim)
    rep.weight[a, state_id] = backed_up_q
def vecList2idHelper(self, x, actionIDs, ind, curActionList, maxValue, limits):
    """Helper method for vecList2id().

    Recursively extends *curActionList* with every choice from
    ``x[ind]``; once all positions are assigned, the full assignment is
    encoded via :func:`vec2id` and appended to *actionIDs*.

    :returns: a list of unique id's based on possible permutations of
        this list of lists (accumulated in *actionIDs*). See vecList2id().
    """
    # True when ind addresses the final sub-list, e.g. [0, 2] or [1, 2].
    at_last_position = ind == len(x) - 1
    for candidate in x[ind]:
        extended = curActionList + [candidate]
        if at_last_position:
            # Assignment complete, e.g. [0, 1, 0, 2] with limits [3, 3, 3, 3].
            actionIDs.append(vec2id(extended, limits))
        else:
            self.vecList2idHelper(x, actionIDs, ind + 1, extended,
                                  maxValue, limits)
def pi2(self, s, terminal, p_actions):
    """Return a fixed (hand-coded) action for state *s*.

    Dispatches on the class name of the representation's domain; each
    supported domain has its own hard-wired policy.

    Args:
        s (ndarray): The current state.
        terminal (bool): Whether *s* is terminal (unused by the visible
            branches).
        p_actions: Possible actions (unused; actions are re-derived from
            the domain where needed).

    Raises:
        ValueError: If the domain or the GridWorld policy name is
            unsupported, or the cycling action state becomes invalid.
    """
    domain = self.representation.domain
    if class_name(domain) not in self.SUPPORTED_DOMAINS:
        raise ValueError("ERROR: There is no fixed policy defined for %s"
                         % class_name(domain))
    if class_name(domain) == "GridWorld":
        # Actions are Up, Down, Left, Right
        if self.policy_name not in self.GRID_WORLD_POLICY_NAMES:
            raise ValueError(
                "Error: There is no GridWorld policy with name %s"
                % self.policy_name)
        if self.policy_name == "cw_circle":
            # Cycle through actions, starting with 0, causing agent to go
            # in loop
            if not hasattr(self, "curAction"):
                # it doesn't exist yet, so initialize it [immediately
                # incremented]
                self.curAction = 0
            while not (self.curAction in domain.possible_actions(s)):
                # We can't do something simple because of the order
                # in which actions are defined.
                # must do switch statement
                if self.curAction == 0:  # up
                    self.curAction = 3
                elif self.curAction == 3:  # right
                    self.curAction = 1
                elif self.curAction == 1:  # down
                    self.curAction = 2
                elif self.curAction == 2:  # left
                    self.curAction = 0
                else:
                    raise ValueError(
                        "Something terrible happened..."
                        "got an invalid action on GridWorld Fixed Policy")
        elif self.policy_name == "ccw_circle":
            # Cycle through actions, starting with 0, causing agent to go
            # in loop
            if not hasattr(self, "curAction"):
                # it doesn't exist yet, so initialize it
                self.curAction = 1
            while not (self.curAction in domain.possible_actions(s)):
                # We can't do something simple
                # because of the order in which actions are defined
                if self.curAction == 3:  # right
                    self.curAction = 0
                elif self.curAction == 0:  # up
                    self.curAction = 2
                elif self.curAction == 2:  # left
                    self.curAction = 1
                elif self.curAction == 1:  # down
                    self.curAction = 3
                else:
                    raise ValueError(
                        "Something terrible happened..."
                        "got an invalid action on GridWorld Fixed Policy")
        else:
            raise ValueError(
                "Error: No policy defined with name {}, but listed"
                " in GRID_WORLD_POLICY_NAMES".format(self.policy_name))
        return self.curAction
    if class_name(domain) == "InfCartPoleBalance":
        # Fixed policy rotate the pendulum in the opposite direction of the
        # thetadot
        theta, thetadot = s
        if thetadot > 0:
            return 2
        else:
            return 0
    if class_name(domain) == "BlocksWorld":
        # Fixed policy rotate the blocksworld = Optimal Policy
        # (Always pick the next piece of the tower and move it to the tower
        # Policy: Identify the top of the tower.
        # move the next piece on the tower with 95% chance 5% take a random
        # action
        # Random Action with some probability
        # TODO fix is_terminal use here
        if self.random_state.rand() < 0.3 or domain.is_terminal():
            return self.random_state.choice(domain.possible_actions(s))
        # non-Random Policy
        # next_block is the block that should be stacked on the top of the tower
        # wrong_block is the highest block stacked on the top of the next_block
        # Wrong_tower_block is the highest stacked on the top of the tower
        # Length of the tower assumed to be built correctly.
        correct_tower_size = 0
        while True:
            # Check the next block
            block = correct_tower_size
            if (block == 0 and domain.on_table(block, s)) or domain.on(
                    block, block - 1, s):
                # This block is on the right position, check the next block
                correct_tower_size += 1
            else:
                # The block is on the wrong place.
                # 1. Check if the tower is empty => If not take one block
                #    from the tower and put it on the table
                # 2. check to see if this wrong block is empty => If not
                #    put one block from its stack and put on the table
                # 3. Otherwise move this block on the tower
                ###################
                # 1
                ###################
                # If the first block is in the wrong place, then the tower
                # top which is table is empty by definition
                if block != 0:
                    ideal_tower_top = block - 1
                    tower_top = domain.towerTop(ideal_tower_top, s)
                    if tower_top != ideal_tower_top:
                        # There is a wrong block there hence we should put
                        # it on the table first
                        return (
                            # put the top of the tower on the table since
                            # it is not correct
                            domain.getActionPutAonTable(tower_top))
                ###################
                # 2
                ###################
                block_top = domain.towerTop(block, s)
                if block_top != block:
                    # The target block to be stacked is not empty
                    return domain.getActionPutAonTable(block_top)
                ###################
                # 3
                ###################
                if block == 0:
                    return domain.getActionPutAonTable(block)
                else:
                    return domain.getActionPutAonB(block, block - 1)
    if class_name(domain) == "IntruderMonitoring":
        # Each UAV assign themselves to a target.
        # Each UAV finds the closest danger zone to its target and goes
        # towards there.
        # If UAVs_num > Target, the rest will hold position.
        # Move all agents based on the taken action.
        agents = np.array(s[:domain.NUMBER_OF_AGENTS * 2].reshape(-1, 2))
        targets = np.array(s[domain.NUMBER_OF_AGENTS * 2:].reshape(-1, 2))
        zones = domain.danger_zone_locations
        # Default action is hold
        actions = np.ones(len(agents), dtype=np.integer) * 4
        planned_agents_num = min(len(agents), len(targets))
        for i in range(planned_agents_num):
            # Find closest zone (manhattan) to the corresponding target
            target = targets[i, :]
            distances = np.sum(
                np.abs(np.tile(target, (len(zones), 1)) - zones), axis=1)
            z_row, z_col = zones[np.argmin(distances), :]
            # find the valid action
            a_row, a_col = agents[i, :]
            a = 4  # hold as a default action
            if a_row > z_row:
                a = 0  # up
            if a_row < z_row:
                a = 1  # down
            if a_col > z_col:
                a = 2  # left
            if a_col < z_col:
                a = 3  # right
            actions[i] = a
        return vec2id(actions, np.ones(len(agents), dtype=np.integer) * 5)
    if class_name(domain) == "SystemAdministrator":
        # Select a broken computer and reset it
        brokenComputers = np.where(s == 0)[0]
        if len(brokenComputers):
            return self.random_state.choice(brokenComputers)
        else:
            return domain.computers_num
    if class_name(domain) == "MountainCar":
        # Accelerate in the direction of the valley
        # WORK IN PROGRESS
        x, xdot = s
        if xdot > 0:
            return 2
        else:
            return 0
    if class_name(domain) == "PST":
        # One stays at comm, n-1 stay at target area. Whenever fuel is
        # lower than reaching the base they move back.
        # NOTE(review): the two print(s) calls below look like leftover
        # debug output — confirm and remove.
        print(s)
        s = domain.state2Struct(s)
        uavs = domain.NUM_UAV
        print(s)
        return vec2id(np.zeros(uavs), np.ones(uavs) * 3)
def getActionPutAonB(self, A, B):
    """Return the action id encoding "move block A onto block B"."""
    limits = [self.blocks, self.blocks]
    return vec2id(np.array([A, B]), limits)
def getActionPutAonTable(self, A):
    """Return the action id encoding "put block A on the table".

    The pair (A, A) — a block moved "onto itself" — is the encoding
    used for a table move.
    """
    limits = [self.blocks, self.blocks]
    return vec2id(np.array([A, A]), limits)
def test_transitions():
    """
    Ensure that actions result in expected state transition behavior.
    Test:
        1) Actuator and sensor failure, associated lack of reward
        2) Refuel
        3) Repair
        4) Presence of reward iff a UAV is in COMMS *and* SURVEIL
        5) UAV Crash because of lack of fuel

    The steps below are strictly sequential: each assertion depends on
    the domain state produced by the previous step.
    """
    NUM_UAV = 2
    nPosActions = 3  # = UAVAction.SIZE
    actionLimits = nPosActions * np.ones(NUM_UAV, dtype="int")
    # Test p=1 actuator failure when not at base
    domain = PST(NUM_UAV=NUM_UAV)
    _ = domain.s0()
    domain.P_ACT_FAIL = 0.0
    domain.P_SENSOR_FAIL = 1.0
    locs = np.array([UAVLocation.COMMS, UAVLocation.COMMS])
    fuel = np.array([10, 10])
    act = np.array([ActuatorState.RUNNING, ActuatorState.RUNNING])
    sens = np.array([SensorState.RUNNING, SensorState.RUNNING])
    actionVec = np.array([UAVAction.LOITER, UAVAction.LOITER])
    a = vec2id(actionVec, actionLimits)
    domain.state = domain.properties2StateVec(locs, fuel, act, sens)
    r, ns, t, possA = domain.step(a)
    # Assert that only change was reduction in fuel and failure of sensor
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs, fuel - 1, act, np.array([0, 0])))
    # Test location change movement
    actionVec = np.array([UAVAction.ADVANCE, UAVAction.ADVANCE])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs + 1, fuel - 2, act,
                                   np.array([0, 0])))
    # Test p=1 sensor failure when not at base
    domain.FUEL_BURN_REWARD_COEFF = 0.0
    domain.MOVE_REWARD_COEFF = 0.0
    domain.P_ACT_FAIL = 1.0
    actionVec = np.array([UAVAction.RETREAT, UAVAction.LOITER])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs + [0, 1], fuel - 3,
                                   np.array([0, 0]), np.array([0, 0])),
    )
    # Test that no reward was received since the sensor is broken
    assert r == 0
    # Test Refuel
    # After action below will be in locs + [-1,1], or REFUEL and SURVEIL
    # respectively, with 4 fuel units consumed. Must LOITER to refill fuel
    # though
    actionVec = np.array([UAVAction.RETREAT, UAVAction.RETREAT])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    locs = np.array([UAVLocation.REFUEL, UAVLocation.COMMS])
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs, fuel - 4, np.array([0, 0]),
                                   np.array([0, 0])),
    )
    # Refuel occurs after loitering
    actionVec = np.array([UAVAction.LOITER, UAVAction.RETREAT])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    fuel = np.array([10, 5])
    locs = np.array([UAVLocation.REFUEL, UAVLocation.REFUEL])
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs, fuel, np.array([0, 0]),
                                   np.array([0, 0])))
    # Test repair [note uav2 was never refueled since never loitered]
    actionVec = np.array([UAVAction.RETREAT, UAVAction.RETREAT])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs - 1, fuel - 1, np.array([0, 0]),
                                   np.array([0, 0])),
    )
    # Repair only occurs after loiter [no fuel burned for BASE/REFUEL loiter
    actionVec = np.array([UAVAction.LOITER, UAVAction.LOITER])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs - 1, fuel - 1, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    # Test comms but no surveillance
    domain.P_ACT_FAIL = 0.0
    domain.P_SENSOR_FAIL = 0.0
    actionVec = np.array([UAVAction.ADVANCE, UAVAction.ADVANCE])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs, fuel - 2, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    actionVec = np.array([UAVAction.ADVANCE, UAVAction.ADVANCE])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs + 1, fuel - 3, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    assert r == 0  # no reward because only have comms, no surveil
    # add 2 units of extra fuel to each and move
    domain.state = domain.properties2StateVec(locs + 1, fuel - 1,
                                              np.array([1, 1]),
                                              np.array([1, 1]))
    # Test surveillance but no comms
    actionVec = np.array([UAVAction.ADVANCE, UAVAction.ADVANCE])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs + 2, fuel - 2, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    assert r == 0  # no reward because have only surveil, no comms
    # Test comms and surveillance
    actionVec = np.array([UAVAction.RETREAT, UAVAction.LOITER])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    locs = np.array([UAVLocation.COMMS, UAVLocation.SURVEIL])
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs, fuel - 3, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    assert r == 0  # reward based on "s", not "ns", pickup reward here
    actionVec = np.array([UAVAction.LOITER, UAVAction.LOITER])
    a = vec2id(actionVec, actionLimits)
    r, ns, t, possA = domain.step(a)
    locs = np.array([UAVLocation.COMMS, UAVLocation.SURVEIL])
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs, fuel - 4, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    assert r == domain.SURVEIL_REWARD
    # Test crash
    # Since reward based on "s" not "ns", also pickup reward from prev step
    actionVec = np.array([UAVAction.RETREAT, UAVAction.RETREAT])
    a = vec2id(actionVec, actionLimits)
    r, ns, is_terminated, possA = domain.step(a)
    assert np.array_equiv(
        ns,
        domain.properties2StateVec(locs - 1, fuel - 5, np.array([1, 1]),
                                   np.array([1, 1])),
    )
    assert is_terminated
    assert r == domain.CRASH_REWARD + domain.SURVEIL_REWARD