def env_step(self, action):
    self.steps += 1

    # Action is one of N,S,W,E
    action = action.charArray[0]
    self.step_out('ACTION:', action)

    if action not in self.valid_actions:
        print 'WARNING: Invalid action %s' % (action)
        obs = Observation()
        obs.intArray = self.world.agent_state
        return Reward_observation_terminal(0, obs, False)

    # The actions might result in movement in a direction other than the one
    # intended with a probability of (1 - action_prob)
    if self.enable_stochastic_actions:
        dice = random.random()
        if dice > self.action_prob:
            # Randomness! Choose uniformly among the other actions
            other_actions = list(set(self.valid_actions.keys()) - set([action]))
            action = random.choice(other_actions)

    # Move the agent
    self.step_out('RESULT ACTION:', action)
    self.move_agent(self.valid_actions[action])

    # Apply wind from the new state
    if self.enable_wind:
        pstate = self.world[self.world.agent_state[0]][self.world.agent_state[1]]
        if pstate.wind:
            p, wind_dir = pstate.wind
            dice = random.random()
            if dice <= p:
                # Fudge & crackers! Our agent gets caught by the wind!
                self.step_out('WIND IN %s!' % (wind_dir))
                self.move_agent(wind_dir)

    agent_state = self.world.reduce_pos(self.world.agent_state)
    pstate = self.world[agent_state[0]][agent_state[1]]

    # Return observation
    obs = Observation()
    obs.intArray = self.world.agent_state
    self.step_out(self.world)
    self.step_out("REWARD:", pstate.reward)

    terminal = pstate.terminal
    if self.steps > self.step_limit:
        self.debug("STEP LIMIT REACHED!")
        terminal = True

    return Reward_observation_terminal(pstate.reward, obs, terminal)
def test_agent_step(): print "Testing." color_range = 128 size_of_observation = 128+210*160 print "Setting up agent." agent = setup() color = 1 observation = Observation() observation.intArray = np.ones(size_of_observation, dtype=np.uint8) observation.intArray *= color agent.agent_start(observation) agent.agent_train(False) for i in range(2, 256): print "Round %d" % i reward = float(i) color = i observation = Observation() observation.intArray = np.ones(size_of_observation, dtype=np.uint8) observation.intArray *= color agent.agent_step(reward, observation) agent.agent_train(False) reward = float(i) color = i observation = Observation() observation.intArray = np.ones(size_of_observation, dtype=np.uint8) observation.intArray *= color agent.agent_step(reward, observation) agent.agent_train(True)
class test_empty_environment(Environment):
    whichEpisode = 0
    emptyObservation = Observation()
    nonEmptyObservation = Observation(2, 4, 5)

    def env_init(self):
        self.nonEmptyObservation.intArray = [0, 1]
        self.nonEmptyObservation.doubleArray = [0.0 / 4.0, 1.0 / 4.0, 2.0 / 4.0, 3.0 / 4.0]
        self.nonEmptyObservation.charArray = ['a', 'b', 'c', 'd', 'e']
        return ""

    def env_start(self):
        self.whichEpisode = self.whichEpisode + 1
        if self.whichEpisode % 2 == 0:
            return self.emptyObservation
        else:
            return self.nonEmptyObservation

    def env_step(self, action):
        ro = Reward_observation_terminal()
        if self.whichEpisode % 2 == 0:
            ro.o = self.emptyObservation
        else:
            ro.o = self.nonEmptyObservation
        return ro

    def env_cleanup(self):
        pass

    def env_message(self, inMessage):
        return None
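# A minimal sketch of how a test environment like the one above would be
# launched under the standard RL-Glue Python codec; the __main__ guard is an
# assumption added for illustration, not part of the original test file.
if __name__ == "__main__":
    from rlglue.environment import EnvironmentLoader as EnvironmentLoader
    # Blocks and serves env_init/env_start/env_step calls from the rl_glue core
    EnvironmentLoader.loadEnvironment(test_empty_environment())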
def agent_init(self, taskSpecString):
    self.numActions = 4
    self.numStates = 144
    self.qfunction = [self.numActions * [0.0] for i in range(self.numStates)]
    self.lastAction = Action()
    self.lastObs = Observation()
def makeObservation(self):
    returnObs = Observation()
    returnObs.doubleArray = self.pos.tolist()
    if self.fuel_loc is not None:
        returnObs.doubleArray += [self.fuel]
    returnObs.intArray = [self.pass_loc, self.pass_dest]
    return returnObs
class test_1_environment(Environment):
    stepCount = 0
    o = Observation()

    def env_init(self):
        return "sample task spec"

    def env_start(self):
        self.stepCount = 0
        self.o.intArray = [1]
        self.o.doubleArray = [0.0 / 2.0, 1.0 / 2.0]
        self.o.charArray = ['a', 'b', 'c']
        return self.o

    def env_step(self, action):
        ro = Reward_observation_terminal()
        terminal = False
        if self.stepCount < 5:
            self.o.doubleArray = []
            self.o.charArray = []
            self.o.intArray = [self.stepCount]
            self.stepCount = self.stepCount + 1
            if self.stepCount == 5:
                terminal = True
            ro.r = 1.0
        else:
            self.o.doubleArray = [0.0078125, -0.0078125, 0.0,
                                  0.0078125e150, -0.0078125e150]
            self.o.charArray = ['g', 'F', '?', ' ', '&']
            self.o.intArray = [173, -173, 2147483647, 0, -2147483648]
            ro.r = -2.0
        ro.o = self.o
        ro.terminal = terminal
        return ro

    def env_cleanup(self):
        pass

    def env_message(self, inMessage):
        timesToPrint = self.stepCount % 3
        outMessage = inMessage + b"|"
        for i in range(0, timesToPrint):
            outMessage = outMessage + bytes("%d" % (self.stepCount), encoding='ascii')
            outMessage = outMessage + b"."
        outMessage = outMessage + b"|" + inMessage
        return outMessage
def env_start(self):
    self.currentState = 10
    returnObs = Observation()
    returnObs.intArray = [self.currentState]
    return returnObs
def env_start(self):
    self.reset()
    returnObs = Observation()
    returnObs.doubleArray = [self.cart_location, self.cart_velocity] + \
        self.pole_angle.tolist() + self.pole_velocity.tolist()
    return returnObs
def env_start(self):
    if self.fixedStartState:
        stateValid = self.setAgentState(self.startRow, self.startCol)
        if not stateValid:
            print "The fixed start state was NOT valid: " + str(
                int(self.startRow)) + "," + str(int(self.startCol))
            self.setRandomState()
    else:
        self.setRandomState()

    returnObs = Observation()
    returnObs.intArray = [self.calculateFlatState()]
    # Up, Right, Down, Option1, Option2
    returnObs.charArray = ["T", "T", "T", "T"]
    # Now add characters based on options present
    if len(self.optionsArray[self.startRow][self.startCol]) != 0:
        for i in range(len(self.optionsArray[self.startRow][self.startCol])):
            returnObs.charArray[
                3 + self.optionsArray[self.startRow][self.startCol][i]] = "T"
    return returnObs
def env_step(self, thisAction):
    log = logging.getLogger('pyrl.environments.gridworld.env_step')
    episodeOver = 0
    intAction = thisAction.intArray[0]
    log.debug("Action to take: %d", intAction)

    theReward = self.takeAction(intAction)
    if self.isAtGoal():
        log.info("Episode completed!!")
        episodeOver = 1
    if self.reward_noise > 0:
        theReward += numpy.random.normal(scale=self.reward_noise)

    theObs = Observation()
    theObs.doubleArray = self.getState()

    returnRO = Reward_observation_terminal()
    returnRO.r = theReward
    returnRO.o = theObs
    returnRO.terminal = episodeOver
    log.info("(Action - State - Reward): (%d - %s - %f)",
             intAction, pformat(theObs), theReward)
    return returnRO
def agent_init(self, taskSpec): """Initialize the RL agent. Args: taskSpec: The RLGlue task specification string. """ # (Re)initialize parameters (incase they have been changed during a trial self.init_parameters() # Parse the task specification and set up the weights and such TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpec) if self.agent_supported(TaskSpec): self.numStates = len(TaskSpec.getDoubleObservations()) self.discStates = numpy.array(TaskSpec.getIntObservations()) self.numDiscStates = int( reduce(lambda a, b: a * (b[1] - b[0] + 1), self.discStates, 1.0)) self.numActions = TaskSpec.getIntActions()[0][1] + 1 self.model.model_init(self.numDiscStates, TaskSpec.getDoubleObservations(), \ self.numActions, TaskSpec.getRewardRange()[0]) self.planner.planner_init(self.numDiscStates, TaskSpec.getDoubleObservations(), \ self.numActions, TaskSpec.getRewardRange()[0]) else: print "Task Spec could not be parsed: " + taskSpecString self.lastAction = Action() self.lastObservation = Observation()
def env_start(self): """ Start the game! """ # Set up start states self.world.add_starts(*self.start_states) # Set up terminal states self.world.add_terminals(*self.terminal_states.keys()) for (row, col), reward in self.terminal_states.items(): self.world[row][col].reward = reward # Initialize state of the agent to one of start_states r = random.randrange(len(self.start_states)) self.world.agent_state = list(self.start_states[r]) # Initialize step counter self.steps = 0 self.step_out('START WORLD:') self.step_out(self.world) # Pass agent state over to the agent obs = Observation() obs.intArray = self.world.agent_state return obs
def env_start(self):
    self.setStartState()
    returnObs = Observation()
    returnObs.intArray = [self.calculateFlatState(self.agentRow, self.agentCol)]
    return returnObs
def agent_init(self, taskSpecString):
    TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpecString)
    if TaskSpec.valid:
        assert len(TaskSpec.getIntObservations()) == 1, \
            "expecting 1-dimensional discrete observations"
        assert len(TaskSpec.getDoubleObservations()) == 0, \
            "expecting no continuous observations"
        assert not TaskSpec.isSpecial(TaskSpec.getIntObservations()[0][0]), \
            "expecting min observation to be a number not a special value"
        assert not TaskSpec.isSpecial(TaskSpec.getIntObservations()[0][1]), \
            "expecting max observation to be a number not a special value"
        self.numStates = TaskSpec.getIntObservations()[0][1] + 1

        assert len(TaskSpec.getIntActions()) == 1, \
            "expecting 1-dimensional discrete actions"
        assert len(TaskSpec.getDoubleActions()) == 0, \
            "expecting no continuous actions"
        assert not TaskSpec.isSpecial(TaskSpec.getIntActions()[0][0]), \
            "expecting min action to be a number not a special value"
        assert not TaskSpec.isSpecial(TaskSpec.getIntActions()[0][1]), \
            "expecting max action to be a number not a special value"
        self.numActions = TaskSpec.getIntActions()[0][1] + 1

        self.value_function = numpy.zeros([self.numStates, self.numActions])
    else:
        print "Task Spec could not be parsed: " + taskSpecString

    self.lastAction = Action()
    self.lastObservation = Observation()
def env_step(self, thisAction):
    episodeOver = 0
    theReward = 0

    if thisAction.intArray[0] == 0:
        self.currentState = self.currentState - 1
    if thisAction.intArray[0] == 1:
        self.currentState = self.currentState + 1

    if self.currentState <= 0:
        self.currentState = 0
        theReward = -1
        episodeOver = 1
    if self.currentState >= 20:
        self.currentState = 20
        theReward = 1
        episodeOver = 1

    theObs = Observation()
    theObs.intArray = [self.currentState]

    returnRO = Reward_observation_terminal()
    returnRO.r = theReward
    returnRO.o = theObs
    returnRO.terminal = episodeOver
    return returnRO
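# A small experiment sketch for driving an episodic environment like the
# random walk above, assuming the standard RL-Glue Python codec experiment
# API (RL_init/RL_episode/RL_return/RL_num_steps/RL_cleanup). The function
# name and the 100-step episode cap are illustrative choices, not from the
# original code.
import rlglue.RLGlue as RLGlue

def run_random_walk_experiment(num_episodes=10):
    RLGlue.RL_init()
    for episode in range(num_episodes):
        RLGlue.RL_episode(100)  # run one episode, capped at 100 steps
        print "Episode %d: return %f in %d steps" % (
            episode, RLGlue.RL_return(), RLGlue.RL_num_steps())
    RLGlue.RL_cleanup()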
def agent_init(self, taskSpecString): print "Agent Up" # print taskSpecString TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpecString) if TaskSpec.valid: print len( TaskSpec.getDoubleActions()), ": ", TaskSpec.getDoubleActions( ), '\n', len(TaskSpec.getDoubleObservations() ), ": ", TaskSpec.getDoubleObservations() assert len(TaskSpec.getIntObservations() ) == 0, "expecting no discrete observations" assert len(TaskSpec.getDoubleObservations( )) == 12, "expecting 12-dimensional continuous observations" assert len( TaskSpec.getIntActions()) == 0, "expecting no discrete actions" assert len(TaskSpec.getDoubleActions() ) == 4, "expecting 4-dimensional continuous actions" self.obs_specs = TaskSpec.getDoubleObservations() self.actions_specs = TaskSpec.getDoubleActions() # print "Observations: ",self.obs_specs # print "actions_specs:", self.actions_specs else: print "Task Spec could not be parsed: " + taskSpecString self.lastAction = Action() self.lastObservation = Observation()
def agent_init(self, spec):
    taskSpec = TaskSpecVRLGLUE3.TaskSpecParser(spec)
    if taskSpec.valid:
        self.num_actions = taskSpec.getIntActions()[0][1] + 1
    else:
        raise Exception("Invalid task spec")

    self.last_observation = Observation()

    self.batch_size = 32      # batch size for SGD
    self.ep_start = 1         # initial value of epsilon in epsilon-greedy exploration
    self.ep = self.ep_start   # exploration probability
    self.ep_end = 0.1         # final value of epsilon in epsilon-greedy exploration
    self.ep_endt = 1000000    # number of frames over which epsilon is linearly annealed
    self.episode_qvals = []
    self.all_qvals = []
    self.learn_start = 0      # number of steps after which learning starts
    self.is_testing = False
    self.replay_memory = 1000000
    self.phi_length = 4       # number of most recent frames for input to Q-function
    self.reset_after = 10000  # replace Q_hat with Q after this many steps
    self.step_counter = 0
    self.episode_counter = 0
    self.total_reward = 0
    self.qvals = []

    self.train_table = TransitionTable(self.phi_length, self.replay_memory,
                                       RESIZED_WIDTH, RESIZED_HEIGHT)
    self.test_table = TransitionTable(self.phi_length, self.phi_length,
                                      RESIZED_WIDTH, RESIZED_HEIGHT)

    if self.network_file is None:
        self.network = DeepQLearner(RESIZED_WIDTH, RESIZED_HEIGHT,
                                    self.num_actions, self.phi_length,
                                    self.batch_size)
    else:
        self.network = cPickle.load(open(self.network_file, 'rb'))
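# A sketch (not from the original agent) of the linear epsilon-greedy
# annealing schedule implied by the hyperparameters set above: epsilon
# decays from ep_start to ep_end over ep_endt steps once learning begins.
# The method name annealed_epsilon is hypothetical.
def annealed_epsilon(self):
    # Steps taken since learning started; no annealing before learn_start
    steps_since_learn = max(0, self.step_counter - self.learn_start)
    return max(self.ep_end,
               self.ep_start - (self.ep_start - self.ep_end)
               * steps_since_learn / float(self.ep_endt))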
def agent_init(self, taskSpecString):
    self.numActions = 4
    self.numStates = 144
    self.qfunction = [self.numActions * [0.0] for i in range(self.numStates)]

    # x coordinate
    self.phi1 = np.array([i for i in range(12)])
    # y coordinate
    self.phi2 = np.array([i for i in range(12)])

    # Randomly initialized weights for the x, y, and joint x-y features,
    # one weight per action
    self.thetax = np.array([[random.random(), random.random(),
                             random.random(), random.random()]
                            for i in range(12)])
    self.thetay = np.array([[random.random(), random.random(),
                             random.random(), random.random()]
                            for i in range(12)])
    self.thetaxy = np.array([[[random.random(), random.random(),
                               random.random(), random.random()]
                              for i in range(12)]
                             for j in range(12)])

    self.lastAction = Action()
    self.lastObs = Observation()
def env_step(self, thisAction):
    # Make sure the action is valid
    assert len(thisAction.intArray) == 1, "Expected 1 integer action."
    assert thisAction.intArray[0] >= 0, "Expected action to be in [0,3]"
    assert thisAction.intArray[0] < 4, "Expected action to be in [0,3]"

    self.updatePosition(thisAction.intArray[0])
    lastActionValue = thisAction.intArray[0]

    theObs = Observation()
    theObs.intArray = [self.calculateFlatState()]
    theObs.charArray = ["T", "T", "T", "T"]
    if len(self.optionsArray[self.agentRow][self.agentCol]) != 0:
        for i in range(len(self.optionsArray[self.agentRow][self.agentCol])):
            theObs.charArray[
                2 + self.optionsArray[self.agentRow][self.agentCol][i]] = "T"

    returnRO = Reward_observation_terminal()
    returnRO.r = self.calculateReward(lastActionValue)
    returnRO.o = theObs
    returnRO.terminal = self.checkCurrentTerminal()
    return returnRO
def agent_init(self, taskSpec): """Initialize the RL agent. Args: taskSpec: The RLGlue task specification string. """ # (Re)initialize parameters (incase they have been changed during a trial self.init_parameters() # Parse the task specification and set up the weights and such TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpec) if not self.agent_supported(TaskSpec): print "Task Spec could not be parsed: " + taskSpecString sys.exit(1) self.numStates = len(TaskSpec.getDoubleObservations()) self.discStates = numpy.array(TaskSpec.getIntObservations()) self.numDiscStates = int( reduce(lambda a, b: a * (b[1] - b[0] + 1), self.discStates, 1.0)) self.numActions = TaskSpec.getIntActions()[0][1] + 1 if self.numStates == 0: # Only discrete states self.numStates = 1 if self.fa_name != "trivial": print "Selected basis requires at least one continuous feature. Using trivial basis." self.fa_name = "trivial" # Set up the function approximation if self.fa_name == 'fourier': self.basis = fourier.FourierBasis(self.numStates, TaskSpec.getDoubleObservations(), order=self.params.setdefault( 'fourier_order', 3)) elif self.fa_name == 'rbf': num_functions = self.numStates if self.params.setdefault( 'rbf_number', 0) == 0 else self.params['rbf_number'] self.basis = rbf.RBFBasis(self.numStates, TaskSpec.getDoubleObservations(), num_functions=num_functions, beta=self.params.setdefault( 'rbf_beta', 0.9)) elif self.fa_name == 'tile': self.basis = tilecode.TileCodingBasis( self.numStates, TaskSpec.getDoubleObservations(), num_tiles=self.params.setdefault('tile_number', 100), num_weights=self.params.setdefault('tile_weights', 2048)) else: self.basis = trivial.TrivialBasis(self.numStates, TaskSpec.getDoubleObservations()) self.weights = numpy.zeros( (self.numDiscStates, self.basis.getNumBasisFunctions(), self.numActions)) self.traces = numpy.zeros(self.weights.shape) self.init_stepsize(self.weights.shape, self.params) self.lastAction = Action() self.lastObservation = Observation()
def env_start(self):
    log = logging.getLogger('pyrl.environments.gridworld.env_start')
    self.reset()
    log.info("Environment started")
    returnObs = Observation()
    returnObs.doubleArray = self.getState()
    log.debug("Observation to return: %s", pformat(returnObs))
    return returnObs
def env_start(self):
    self.seed()
    self.reset()
    returnObs = Observation()
    returnObs.intArray = [self.s]
    return returnObs
def agent_init(self, taskSpec): """ Initializes agent. taskSpec: string Currently unused. Required by RL-Glue agent interface. """ self.lastAction = Action() self.lastObservation = Observation()
def agent_init(self, taskSpecString):
    TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpecString)
    if TaskSpec.valid:
        assert len(TaskSpec.getIntObservations()) == 1, \
            "expecting 1-dimensional discrete observations"
        assert len(TaskSpec.getDoubleObservations()) == 0, \
            "expecting no continuous observations"
        assert not TaskSpec.isSpecial(TaskSpec.getIntObservations()[0][0]), \
            "expecting min observation to be a number not a special value"
        assert not TaskSpec.isSpecial(TaskSpec.getIntObservations()[0][1]), \
            "expecting max observation to be a number not a special value"
        self.numStates = TaskSpec.getIntObservations()[0][1] + 1

        assert len(TaskSpec.getIntActions()) == 1, \
            "expecting 1-dimensional discrete actions"
        assert len(TaskSpec.getDoubleActions()) == 0, \
            "expecting no continuous actions"
        assert not TaskSpec.isSpecial(TaskSpec.getIntActions()[0][0]), \
            "expecting min action to be a number not a special value"
        assert not TaskSpec.isSpecial(TaskSpec.getIntActions()[0][1]), \
            "expecting max action to be a number not a special value"
        self.numActions = TaskSpec.getIntActions()[0][1] + 1

        self.episode = 0
    else:
        print "Task Spec could not be parsed: " + taskSpecString

    chimatfile = open('chi_mat.dat', 'r')
    unpickler = pickle.Unpickler(chimatfile)
    self.chi_mat = np.mat(unpickler.load())

    # 0,1,2,3 - primitive actions, 4... - options
    self.value_function = [(self.chi_mat.shape[1] + self.numActions) * [0.0]
                           for i in range(self.numStates)]

    # Abstract state to which each state belongs, and the list of states
    # contained in each abstract state
    self.absStateMembership = []
    self.statesInAbsState = [[] for i in xrange(self.chi_mat.shape[1])]
    for (row_i, row) in enumerate(self.chi_mat):
        self.absStateMembership.append(row.argmax())
        self.statesInAbsState[row.argmax()].append(row_i)

    # This is just to get a mapping from the row indices of chi_mat to the
    # flat state values returned by the environment
    validstatefile = open('valid_states.dat', 'r')
    unpickler = pickle.Unpickler(validstatefile)
    self.valid_states = unpickler.load()

    self.lastAction = Action()
    self.lastObservation = Observation()

    tmatrixfile = open('tmatrixperfect.dat', 'r')
    unpickler = pickle.Unpickler(tmatrixfile)
    self.t_mat = np.mat(unpickler.load())

    pmatrixfile = open('pmatrixperfect.dat', 'r')
    self.p_mat = pickle.load(pmatrixfile)

    self.connect_mat = self.chi_mat.T * self.t_mat * self.chi_mat
def env_start(self):
    State = random.randint(0, 3)
    returnObs = Observation()
    # zero for all the 4 starting states
    self.presentCol = 0
    self.presentRow = self.Start_states[State][0]
    returnObs.intArray = [self.rolloutstate()]
    return returnObs
def env_start(self): """ Get the state of the environment and return it. """ self.state = [0 for i in range(9)] #self.env_play() obs = Observation() obs.intArray = self.state return obs
def agent_init(self, taskSpec): """Initialize the RL agent. Args: taskSpec: The RLGlue task specification string. """ # (Re)initialize parameters (incase they have been changed during a trial log = logging.getLogger('pyrl.agents.sarsa_lambda.agent_init') self.init_parameters() # Parse the task specification and set up the weights and such TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpec) if not self.agent_supported(TaskSpec): print "Task Spec could not be parsed: " + taskSpec sys.exit(1) self.numStates = len(TaskSpec.getDoubleObservations()) log.info("Ranges: %s", TaskSpec.getDoubleObservations()) self.discStates = numpy.array(TaskSpec.getIntObservations()) self.numDiscStates = int( reduce(lambda a, b: a * (b[1] - b[0] + 1), self.discStates, 1.0)) self.numActions = TaskSpec.getIntActions()[0][1] + 1 # print "TSactions ", TaskSpec.getIntActions(), "TSObservation ", TaskSpec.getIntObservations() if self.numStates == 0: # Only discrete states self.numStates = 1 if self.fa_name != "trivial": print "Selected basis requires at least one continuous feature. Using trivial basis." self.fa_name = "trivial" # Set up the function approximation if self.fa_name == 'fourier': self.basis = fourier.FourierBasis(self.numStates, TaskSpec.getDoubleObservations(), order=self.params.setdefault( 'fourier_order', 3)) else: self.basis = trivial.TrivialBasis(self.numStates, TaskSpec.getDoubleObservations()) log.debug("Num disc states: %d", self.numDiscStates) numStates = self.basis.getNumBasisFunctions() log.debug("Num states: %d", numStates) log.debug("Num actions: %d", self.numActions) self.weights = numpy.zeros( (self.numDiscStates, numStates, self.numActions)) self.traces = numpy.zeros(self.weights.shape) self.init_stepsize(self.weights.shape, self.params) # print "Weights:", self.weights self.lastAction = Action() self.lastObservation = Observation() log.debug("Sarsa Lambda agent after initialization: %s", pformat(self.__dict__))
def getObservation(self):
    returnObs = Observation()
    features = [1.]
    if self.original_features:
        features += mdptetris.features_original()
    if self.dellacherie_features:
        features += mdptetris.features_dellacherie()
    returnObs.intArray = [mdptetris.current_piece()]
    returnObs.doubleArray = features
    return returnObs
def env_start(self): """ Instantiate a new :class:`PinballModel` environment :returns: The initial state :rtype: :class:`Observation` """ self.pinball = PinballModel(self.configuration) obs = Observation() obs.doubleArray = self.pinball.get_state() return obs
def env_start(self):
    k = random.randint(0, 1)
    State = random.randint(self.states[k][0], self.states[k][1])
    returnObs = Observation()
    self.presentCol = random.randint(0, 999)
    self.presentRow = State
    returnObs.intArray = [self.rolloutstate()]
    return returnObs