def evaluate(self, total_steps, episode_number, visualize=0):
    """
    Evaluate the current agent within an experiment.

    :param total_steps: (int) number of steps used in learning so far
    :param episode_number: (int) number of episodes used in learning so far
    """
    # TODO resolve this hack
    if className(self.agent) == 'PolicyEvaluation':
        # Policy Evaluation case
        self.result = self.agent.STATS
        return

    random_state = np.random.get_state()
    elapsedTime = deltaT(self.start_time)
    performance_return = 0.
    performance_steps = 0.
    performance_term = 0.
    performance_discounted_return = 0.
    for j in range(self.checks_per_policy):
        p_ret, p_step, p_term, p_dret = self.performanceRun(
            total_steps, visualize=visualize > j)
        performance_return += p_ret
        performance_steps += p_step
        performance_term += p_term
        performance_discounted_return += p_dret
    performance_return /= self.checks_per_policy
    performance_steps /= self.checks_per_policy
    performance_term /= self.checks_per_policy
    performance_discounted_return /= self.checks_per_policy
    self.result["learning_steps"].append(total_steps)
    self.result["return"].append(performance_return)
    self.result["learning_time"].append(elapsedTime)
    self.result["num_features"].append(self.agent.representation.features_num)
    self.result["steps"].append(performance_steps)
    self.result["terminated"].append(performance_term)
    self.result["learning_episode"].append(episode_number)
    self.result["discounted_return"].append(performance_discounted_return)
    # Reset the start time so that performance runs do not count
    # toward the learning time.
    self.start_time = clock() - elapsedTime
    if total_steps > 0:
        remaining = hhmmss(
            elapsedTime * (self.max_steps - total_steps) / total_steps)
    else:
        remaining = "?"
    self.logger.info(
        self.performance_log_template.format(
            total_steps=total_steps,
            elapsed=hhmmss(elapsedTime),
            remaining=remaining,
            totreturn=performance_return,
            steps=performance_steps,
            num_feat=self.agent.representation.features_num))
    np.random.set_state(random_state)
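# A minimal, self-contained sketch of the RNG bookkeeping evaluate() relies
# on: freeze the global NumPy random state, average several evaluation
# rollouts, then restore the state so the learning run's random stream is
# unaffected. `run_episode` and `evaluate_policy` are illustrative stand-ins
# for performanceRun() and the method above, not part of the codebase.
import numpy as np

def evaluate_policy(run_episode, checks_per_policy=10):
    saved_state = np.random.get_state()  # freeze the learning run's RNG
    mean_return = sum(run_episode() for _ in range(checks_per_policy))
    mean_return /= checks_per_policy
    np.random.set_state(saved_state)     # restore it for learning
    return mean_return

# Toy rollout whose return is a noisy constant, standing in for a real run.
print(evaluate_policy(lambda: 1.0 + 0.1 * np.random.randn()))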
def IsTabularRepresentation(self):
    '''
    Check whether the representation is Tabular, as Policy Iteration
    and Value Iteration only work with a Tabular representation.
    '''
    return className(self.representation) == 'Tabular'
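# The check above compares class names as strings. A small sketch of the
# assumed behavior of className() (returning obj.__class__.__name__); note
# that a subclass of Tabular would fail such a name comparison.
def className(obj):
    # Assumption: className returns the instance's class name.
    return obj.__class__.__name__

class Tabular(object):
    pass

print(className(Tabular()) == 'Tabular')  # True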
def __init__(self, job_id, representation, domain, planning_time=np.inf,
             convergence_threshold=.005, ns_samples=100, project_path='.',
             log_interval=500, show=False, epsilon=.1):
    super(TrajectoryBasedValueIteration, self).__init__(
        job_id, representation, domain, planning_time,
        convergence_threshold, ns_samples, project_path,
        log_interval, show)
    self.epsilon = epsilon
    # A Tabular representation admits a full backup, so use a step size of 1.
    if className(representation) == 'Tabular':
        self.alpha = 1
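# Why alpha = 1 for a Tabular representation: each state owns its own table
# entry, so a full Bellman backup can simply overwrite the old estimate;
# with function approximation a smaller step size is needed. A toy sketch
# (all names below are illustrative, not part of this class):
import numpy as np

def bellman_backup(V, P, R, gamma, s, alpha=1.0):
    # Greedy one-step lookahead target for state s.
    target = max(R[s, a] + gamma * P[s, a] @ V for a in range(R.shape[1]))
    V[s] += alpha * (target - V[s])  # alpha = 1  =>  V[s] = target

# Two-state, two-action toy MDP: action 0 -> state 0 (reward 0),
# action 1 -> state 1 (reward 1).
P = np.zeros((2, 2, 2)); P[:, 0, 0] = 1.; P[:, 1, 1] = 1.
R = np.array([[0., 1.], [0., 1.]])
V = np.zeros(2)
for _ in range(100):
    for s in (0, 1):
        bellman_backup(V, P, R, 0.9, s)
print(V)  # converges to [10., 10.]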
def pi2(self, s, terminal, p_actions):
    domain = self.representation.domain
    if className(domain) not in self.supportedDomains:
        print("ERROR: There is no fixed policy defined for %s"
              % className(domain))
        return None

    if className(domain) == 'GridWorld':
        # Actions are Up, Down, Left, Right.
        if self.policyName not in self.gridWorldPolicyNames:
            print("Error: There is no GridWorld policy with name %s"
                  % self.policyName)
            return None
        if self.policyName == 'cw_circle':
            # Cycle through the actions so the agent moves in a
            # clockwise loop.
            if not hasattr(self, "curAction"):
                # Not initialized yet (immediately incremented below).
                self.curAction = 0
            while self.curAction not in domain.possibleActions(s):
                # The cycle order differs from the action encoding,
                # so step through the actions explicitly.
                if self.curAction == 0:    # up
                    self.curAction = 3
                elif self.curAction == 3:  # right
                    self.curAction = 1
                elif self.curAction == 1:  # down
                    self.curAction = 2
                elif self.curAction == 2:  # left
                    self.curAction = 0
                else:
                    print('Something terrible happened... got an invalid '
                          'action on GridWorld Fixed Policy')
        elif self.policyName == 'ccw_circle':
            # Same idea, but cycle counter-clockwise.
            if not hasattr(self, "curAction"):
                self.curAction = 1
            while self.curAction not in domain.possibleActions(s):
                if self.curAction == 3:    # right
                    self.curAction = 0
                elif self.curAction == 0:  # up
                    self.curAction = 2
                elif self.curAction == 2:  # left
                    self.curAction = 1
                elif self.curAction == 1:  # down
                    self.curAction = 3
                else:
                    print('Something terrible happened... got an invalid '
                          'action on GridWorld Fixed Policy')
        else:
            print("Error: No policy defined with name %s, but listed in "
                  "gridWorldPolicyNames" % self.policyName)
            print("You need to create a switch statement for the policy "
                  "name above, or remove it from gridWorldPolicyNames")
            return None
        return self.curAction

    if className(domain) == 'InfCartPoleBalance':
        # Fixed policy: push in the direction opposite to thetadot.
        theta, thetadot = s
        return 2 if thetadot > 0 else 0

    if className(domain) == 'BlocksWorld':
        # Near-optimal fixed policy: identify the top of the tower and
        # stack the next piece onto it, but take a random action with
        # probability 0.3.
        # TODO fix isTerminal use here
        if self.random_state.rand() < .3 or domain.isTerminal():
            return randSet(domain.possibleActions(s))
        # Non-random policy. Scan the tower from the bottom:
        # correct_tower_size is the length of the correctly built prefix.
        blocks = domain.blocks
        correct_tower_size = 0
        while True:
            # Check the next block.
            block = correct_tower_size
            if (block == 0 and domain.on_table(block, s)) \
                    or domain.on(block, block - 1, s):
                # This block is in the right position; check the next one.
                correct_tower_size += 1
            else:
                # The block is in the wrong place.
                # 1. If the top of the tower is not clear, move one block
                #    from the tower onto the table.
                # 2. If the misplaced block is not clear, move one block
                #    from its stack onto the table.
                # 3. Otherwise move this block onto the tower.
                # 1: if the first block is misplaced, the tower top (the
                # table) is clear by definition.
                if block != 0:
                    ideal_tower_top = block - 1
                    tower_top = domain.towerTop(ideal_tower_top, s)
                    if tower_top != ideal_tower_top:
                        # A wrong block sits on the tower, so put it on
                        # the table first.
                        return domain.getActionPutAonTable(tower_top)
                # 2
                block_top = domain.towerTop(block, s)
                if block_top != block:
                    # The target block to be stacked is not clear.
                    return domain.getActionPutAonTable(block_top)
                # 3
                if block == 0:
                    return domain.getActionPutAonTable(block)
                else:
                    return domain.getActionPutAonB(block, block - 1)

    if className(domain) == 'IntruderMonitoring':
        # Each UAV assigns itself to a target, finds the danger zone
        # closest to that target, and moves toward it. If there are more
        # UAVs than targets, the rest hold position.
        agents = np.array(s[:domain.NUMBER_OF_AGENTS * 2].reshape(-1, 2))
        targets = np.array(s[domain.NUMBER_OF_AGENTS * 2:].reshape(-1, 2))
        zones = domain.danger_zone_locations
        # Default action is hold.
        actions = np.ones(len(agents), dtype=int) * 4
        planned_agents_num = min(len(agents), len(targets))
        for i in range(planned_agents_num):
            # Find the closest zone (Manhattan distance) to the
            # corresponding target.
            target = targets[i, :]
            distances = np.sum(
                np.abs(np.tile(target, (len(zones), 1)) - zones), axis=1)
            z_row, z_col = zones[np.argmin(distances), :]
            # Find the valid action.
            a_row, a_col = agents[i, :]
            a = 4  # hold as the default action
            if a_row > z_row:
                a = 0  # up
            if a_row < z_row:
                a = 1  # down
            if a_col > z_col:
                a = 2  # left
            if a_col < z_col:
                a = 3  # right
            actions[i] = a
        return vec2id(actions, np.ones(len(agents), dtype=int) * 5)

    if className(domain) == 'SystemAdministrator':
        # Select a broken computer and reset it.
        brokenComputers = np.where(s == 0)[0]
        if len(brokenComputers):
            return randSet(brokenComputers)
        else:
            return domain.computers_num

    if className(domain) == 'MountainCar':
        # Accelerate in the direction of the current velocity.
        # WORK IN PROGRESS
        x, xdot = s
        return 2 if xdot > 0 else 0

    if className(domain) == 'PST':
        # One UAV stays at comm and the other n-1 stay at the target
        # area; whenever fuel is too low to reach the base, move back.
        s = domain.state2Struct(s)
        uavs = domain.NUM_UAV
        return vec2id(np.zeros(uavs), np.ones(uavs) * 3)
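# Standalone sketch of the IntruderMonitoring heuristic above: find the
# danger zone closest (Manhattan distance) to a target, then step toward it
# using the same action encoding (0 up, 1 down, 2 left, 3 right, 4 hold) and
# the same precedence (column moves override row moves). The helper name is
# illustrative.
import numpy as np

def step_toward_closest_zone(agent, target, zones):
    distances = np.abs(zones - target).sum(axis=1)  # Manhattan distances
    z_row, z_col = zones[np.argmin(distances)]
    a_row, a_col = agent
    a = 4  # hold by default
    if a_row > z_row:
        a = 0  # up
    if a_row < z_row:
        a = 1  # down
    if a_col > z_col:
        a = 2  # left
    if a_col < z_col:
        a = 3  # right
    return a

print(step_toward_closest_zone(
    np.array([0, 0]), np.array([3, 3]),
    np.array([[2, 2], [5, 0]])))  # 3: zone (2, 2) is closest; move right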
def printAll(self):
    """ Prints all class information to the console. """
    print(className(self))
    print('=======================================')
    for prop, value in vars(self).items():
        print(prop, ": ", value)
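# Quick usage sketch for printAll(), with a throwaway class and a stub for
# className() (assumed to return the class name of the instance):
def className(obj):
    return obj.__class__.__name__

class Dummy(object):
    def __init__(self):
        self.alpha = 0.1
        self.steps = 100
    printAll = printAll  # reuse the method defined above

Dummy().printAll()
# Dummy
# =======================================
# alpha :  0.1
# steps :  100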