class test_empty_agent(Agent):
    whichEpisode = 0
    emptyAction = Action(0, 0, 0)
    nonEmptyAction = Action(7, 3, 1)

    def agent_init(self, taskSpec):
        self.whichEpisode = 0
        self.nonEmptyAction.intArray = (0, 1, 2, 3, 4, 5, 6)
        self.nonEmptyAction.doubleArray = (0.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0)
        self.nonEmptyAction.charArray = ('a')

    def agent_start(self, observation):
        self.whichEpisode = self.whichEpisode + 1
        if self.whichEpisode % 2 == 0:
            return self.emptyAction
        else:
            return self.nonEmptyAction

    def agent_step(self, reward, observation):
        if self.whichEpisode % 2 == 0:
            return self.emptyAction
        else:
            return self.nonEmptyAction

    def agent_end(self, reward):
        pass

    def agent_cleanup(self):
        pass

    def agent_message(self, inMessage):
        return ""
def agent_step(self, reward, observation):
    """
    This method is called each time step.

    Arguments:
       reward      - Real valued reward.
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    # Generate random action
    this_int_action = self.randGenerator.randint(0, self.num_actions - 1)
    return_action = Action()
    return_action.intArray = [this_int_action]

    if self.show_ale:
        self._show_ale_color()
        #self._show_ale_gray()

    if self.saving:
        if self.int_states:
            self.states.append(self.last_observation.intArray)
        else:
            self.states.append(self.last_observation.doubleArray)
        self.actions.append(self.last_action.intArray[0])
        self.rewards.append(reward)
        self.absorbs.append(False)

    self.last_action = copy.deepcopy(return_action)
    self.last_observation = copy.deepcopy(observation)

    return return_action
def agent_start(self, observation):
    """Start an episode for the RL agent.

    Args:
        observation: The first observation of the episode. Should be an RLGlue Observation object.

    Returns:
        The first action the RL agent chooses to take, represented as an RLGlue Action object.
    """
    log = logging.getLogger('pyrl.agents.sarsa_lambda.agent_start')
    theState = numpy.array(list(observation.doubleArray))
    thisIntAction = self.getAction(theState, self.getDiscState(observation.intArray))
    returnAction = Action()
    returnAction.intArray = [thisIntAction]

    # Clear traces
    self.traces.fill(0.0)

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)

    log.debug("Action: %d", thisIntAction)
    log.debug("Start State: %s", theState)
    log.debug("Traces: %s", self.traces)
    return returnAction
def agent_start(self, observation):
    """
    This method is called once at the beginning of each episode.
    No reward is provided, because reward is only available after
    an action has been taken.

    Arguments:
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    self.step_counter = 0
    self.batch_counter = 0

    # We report the mean loss for every epoch.
    self.loss_averages = []

    self.start_time = time.time()
    this_int_action = self.randGenerator.randint(0, self.num_actions - 1)
    return_action = Action()
    return_action.intArray = [this_int_action]

    self.last_action = copy.deepcopy(return_action)

    self.last_img = np.array(self._resize_observation(observation.intArray))
    self.last_img = self.last_img.reshape(CROPPED_WIDTH, CROPPED_HEIGHT).T

    return return_action
def agent_start(self, observation):
    """
    This method is called once at the beginning of each episode.
    No reward is provided, because reward is only available after
    an action has been taken.

    Arguments:
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    self.step_counter = 0
    self.batch_counter = 0

    # We report the mean loss for every epoch.
    self.loss_averages = []

    self.start_time = time.time()

    # this_int_action = self.randGenerator.randint(0, self.num_actions-1)
    observation_matrix = np.asmatrix(observation.doubleArray, dtype='float32')
    actions = self.action_network.fprop(observation_matrix)
    return_action = Action()
    return_action.doubleArray = actions

    self.last_action = copy.deepcopy(actions)
    self.last_observation = observation.doubleArray

    return return_action
def agent_init(self, taskSpecString):
    TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpecString)
    if TaskSpec.valid:
        assert len(TaskSpec.getIntObservations()) == 1, "expecting 1-dimensional discrete observations"
        assert len(TaskSpec.getDoubleObservations()) == 0, "expecting no continuous observations"
        assert not TaskSpec.isSpecial(TaskSpec.getIntObservations()[0][0]), \
            "expecting min observation to be a number not a special value"
        assert not TaskSpec.isSpecial(TaskSpec.getIntObservations()[0][1]), \
            "expecting max observation to be a number not a special value"
        self.numStates = TaskSpec.getIntObservations()[0][1] + 1

        assert len(TaskSpec.getIntActions()) == 1, "expecting 1-dimensional discrete actions"
        assert len(TaskSpec.getDoubleActions()) == 0, "expecting no continuous actions"
        assert not TaskSpec.isSpecial(TaskSpec.getIntActions()[0][0]), \
            "expecting min action to be a number not a special value"
        assert not TaskSpec.isSpecial(TaskSpec.getIntActions()[0][1]), \
            "expecting max action to be a number not a special value"
        self.numActions = TaskSpec.getIntActions()[0][1] + 1

        self.value_function = numpy.zeros([self.numStates, self.numActions])
    else:
        print "Task Spec could not be parsed: " + taskSpecString

    self.lastAction = Action()
    self.lastObservation = Observation()
def agent_init(self, taskSpecString):
    print "Agent Up"
    # print taskSpecString
    TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpecString)
    if TaskSpec.valid:
        print len(TaskSpec.getDoubleActions()), ": ", TaskSpec.getDoubleActions(), \
            '\n', len(TaskSpec.getDoubleObservations()), ": ", TaskSpec.getDoubleObservations()
        assert len(TaskSpec.getIntObservations()) == 0, "expecting no discrete observations"
        assert len(TaskSpec.getDoubleObservations()) == 12, "expecting 12-dimensional continuous observations"
        assert len(TaskSpec.getIntActions()) == 0, "expecting no discrete actions"
        assert len(TaskSpec.getDoubleActions()) == 4, "expecting 4-dimensional continuous actions"

        self.obs_specs = TaskSpec.getDoubleObservations()
        self.actions_specs = TaskSpec.getDoubleActions()
        # print "Observations: ", self.obs_specs
        # print "actions_specs:", self.actions_specs
    else:
        print "Task Spec could not be parsed: " + taskSpecString

    self.lastAction = Action()
    self.lastObservation = Observation()
def agent_step(self, reward, observation):
    self.step_counter += 1
    self.total_reward += reward
    cur_img = self.resize_image(observation.intArray)

    if self.is_testing:
        int_action = self.choose_action(self.test_table, cur_img,
                                        np.clip(reward, -1, 1),
                                        testing_ep=0.05)
    else:
        if self.step_counter % self.reset_after == 0:
            self.network.reset_q_hat()

        int_action = self.choose_action(self.train_table, cur_img,
                                        np.clip(reward, -1, 1),
                                        testing_ep=None)

        if self.train_table.num_entries > max(self.learn_start, self.batch_size):
            states, actions, rewards, next_states, terminals = \
                self.train_table.get_minibatch(self.batch_size)
            loss, qvals = self.network.train(states, actions, rewards,
                                             next_states, terminals)
            self.losses.append(loss)
            self.qvals.append(np.mean(qvals))
            self.batch_counter += 1

    return_action = Action()
    return_action.intArray = [int_action]

    self.last_action = int_action
    self.last_img = cur_img

    return return_action
def agent_start(self, observation):
    if self.debug_flag:
        print('agent start')

    # Increment the step counter
    self.step_counter += 1

    # Don't we need to clear the state at the start of an episode?
    # self.state = np.zeros((1, self.n_frames, self.bdim)).astype(np.float32)
    self.state = np.zeros((1, 2, self.n_rows, self.n_cols)).astype(np.float32)

    # kmori: Update the state using our own observation.
    # Partly follows the sample code; the rest was built differently.
    self.update_state(observation)
    self.update_targetQ()

    if self.debug_flag:
        print('Choosing the move to play.')
    # Choose the move to play.
    int_action = self.select_int_action()
    action = Action()
    action.intArray = [int_action]

    if self.debug_flag:
        print('Updating eps.')
    # Update eps, the probability of playing a random move.
    self.update_eps()

    # Save state (the board position) and action (where the mark was placed).
    self.last_state2 = copy.deepcopy(self.last_state)    # state two moves ago
    self.last_action2 = copy.deepcopy(self.last_action)  # action two moves ago
    self.last_state = copy.deepcopy(self.state)
    self.last_action = copy.deepcopy(int_action)

    return action
def agent_step(self, reward, observation):
    """Take one step in an episode for the agent, as the result of taking the last action.

    Args:
        reward: The reward received for taking the last action from the previous state.
        observation: The next observation of the episode, which is the consequence of taking the previous action.

    Returns:
        The next action the RL agent chooses to take, represented as an RLGlue Action object.
    """
    newState = numpy.array(list(observation.doubleArray))
    lastState = numpy.array(list(self.lastObservation.doubleArray))
    lastAction = self.lastAction.intArray[0]

    newDiscState = self.getDiscState(observation.intArray)
    lastDiscState = self.getDiscState(self.lastObservation.intArray)

    newIntAction = self.getAction(newState, newDiscState)

    phi_t = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
    phi_tp = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
    phi_t[lastDiscState, :] = self.basis.computeFeatures(lastState)
    phi_tp[newDiscState, :] = self.basis.computeFeatures(newState)

    self.step_count += 1
    self.update(phi_t, phi_tp, reward,
                self.getCompatibleFeatures(phi_t, lastAction, reward, phi_tp, newIntAction))

    returnAction = Action()
    returnAction.intArray = [newIntAction]

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_step(self, Reward, Obs):
    new_state = Obs.intArray[0]
    last_state = self.lastObs.intArray[0]
    last_action = self.lastaction.intArray[0]

    # Q-learning update: Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    Q_sa = self.qfunction[last_state][last_action]
    Q_saprime = self.maxim(new_state)
    Q_new = Q_sa + self.learningrate * (Reward + self.gamma * Q_saprime - Q_sa)

    # if not self.pause:
    self.qfunction[last_state][last_action] = Q_new

    # Action to be taken
    new_action = self.epsilon_greedy(new_state)

    returnaction = Action()
    returnaction.intArray = [new_action]

    self.lastaction = copy.deepcopy(returnaction)
    self.lastObs = copy.deepcopy(Obs)

    return returnaction
def agent_init(self, taskSpecString):
    self.numActions = 4
    self.numStates = 144
    self.qfunction = [self.numActions * [0.0] for i in range(self.numStates)]

    self.lastAction = Action()
    self.lastObs = Observation()
def agent_step(self, Reward, Obs):
    new_state = Obs.intArray[0]
    last_state = self.lastObs.intArray[0]
    last_action = self.lastaction.intArray[0]

    new_action = self.epsilon_greedy(new_state)

    # SARSA(lambda) update with accumulating eligibility traces
    Q_sa = self.qfunction[last_state][last_action]
    Q_saprime = self.qfunction[new_state][new_action]
    delta = Reward + self.gamma * Q_saprime - Q_sa

    self.efunction[last_state][last_action] = self.efunction[last_state][last_action] + 1

    self.qfunction = np.array(self.qfunction)
    self.efunction = np.array(self.efunction)
    self.qfunction = self.qfunction + self.learningrate * delta * self.efunction
    self.efunction = self.gamma * self.lamda * self.efunction

    returnaction = Action()
    returnaction.intArray = [new_action]

    self.lastaction = copy.deepcopy(returnaction)
    self.lastObs = copy.deepcopy(Obs)

    return returnaction
def agent_init(self, taskSpec):
    # Task spec check
    TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpec)
    if TaskSpec.valid:
        assert len(TaskSpec.getDoubleObservations()) > 0, "expecting at least one continuous observation"
        self.state_range = np.asarray(TaskSpec.getDoubleObservations())

        # Check action form, and then set number of actions
        assert len(TaskSpec.getIntActions()) == 0, "expecting no discrete actions"
        assert len(TaskSpec.getDoubleActions()) == 2, "expecting 2-dimensional continuous actions"
    else:
        print "Task Spec could not be parsed"

    self.lbounds = []
    self.ubounds = []
    for r in self.state_range:
        self.lbounds.append(r[0])
        self.ubounds.append(r[1])
    self.lbounds = np.array(self.lbounds)
    self.ubounds = np.array(self.ubounds)

    # Some initializations for rlglue
    self.lastAction = Action()

    self.time = 0
    self.epsilon = 1.0  # Initial exploration rate

    # Pick a DQN from DQN_class
    self.DQN = DQN_class()
def agent_init(self, taskSpecString):
    self.numActions = 4
    self.numStates = 144
    self.qfunction = [self.numActions * [0.0] for i in range(self.numStates)]

    # x coordinate
    self.phi1 = np.array([i for i in range(12)])
    # y coordinate
    self.phi2 = np.array([i for i in range(12)])

    # self.theta = np.array([ for i in range(4)])
    self.thetax = np.array([[random.random(), random.random(),
                             random.random(), random.random()]
                            for i in range(12)])
    self.thetay = np.array([[random.random(), random.random(),
                             random.random(), random.random()]
                            for i in range(12)])
    self.thetaxy = np.array([[[random.random(), random.random(),
                               random.random(), random.random()]
                              for i in range(12)]
                             for j in range(12)])

    self.lastAction = Action()
    self.lastObs = Observation()
def agent_step(self, reward, observation):
    # Preprocess
    tmp = np.bitwise_and(
        np.asarray(observation.intArray[128:]).reshape([210, 160]),
        0b0001111)  # Get intensity from the observation
    obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 - 8, :]  # Scaling
    obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum of two frames

    # Compose state: 4-step sequential observation
    self.state = np.asanyarray(
        [self.state[1], self.state[2], self.state[3], obs_processed],
        dtype=np.uint8)
    state_ = cuda.to_gpu(
        np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

    # Exploration decays along the time sequence
    if self.policyFrozen is False:  # Learning ON/OFF
        if self.DQN.initial_exploration < self.time:
            self.epsilon -= 1.0 / 10**6
            if self.epsilon < 0.1:
                self.epsilon = 0.1
            eps = self.epsilon
        else:  # Initial exploration phase
            print "Initial Exploration : %d/%d steps" % (self.time, self.DQN.initial_exploration)
            eps = 1.0
    else:  # Evaluation
        print "Policy is Frozen"
        eps = 0.05

    # Generate an action from e-greedy action selection
    returnAction = Action()
    action, Q_now = self.DQN.e_greedy(state_, eps)
    returnAction.intArray = [action]

    # Learning phase
    if self.policyFrozen is False:  # Learning ON/OFF
        self.DQN.stockExperience(self.time, self.last_state,
                                 self.lastAction.intArray[0], reward,
                                 self.state, False)
        self.DQN.experienceReplay(self.time)

    # Simple text based visualization
    print ' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (
        self.time, self.DQN.action_to_index(action), np.sign(reward), eps,
        np.max(Q_now.get()))

    # Updates for next step
    self.last_observation = obs_array

    if self.policyFrozen is False:
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.time += 1

    return returnAction
def agent_step(self, reward, obs):
    """ This function is called by the environment while the episode lasts.

    If learning is not frozen, the option-value function Q(s, o) is updated
    using intra-option learning.

    :param reward: The reward obtained as a result of the last transition.
    :param obs: An observation from the environment
    :rtype obs: :class:`rlglue.types.Observation`
    :returns: The primitive action to execute in the environment according to the
              behavior policy.
    :rtype: a primitive action under the form of a :class:`rlglue.types.Action`
    """
    observation = np.array(obs.doubleArray)
    current_features = self.basis.computeFeatures(observation)

    if not self.finished_learning:
        self.intraoption_update(reward, current_features, observation)

    self.last_observation = observation
    self.last_features = current_features
    self.last_action = self.mu(observation, current_features).pi(observation)

    action = Action()
    action.intArray = [self.last_action]

    return action
def agent_start(self, observation):
    """
    This method is called once at the beginning of each episode.
    No reward is provided, because reward is only available after
    an action has been taken.

    Arguments:
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    # this_int_action = self.randGenerator.randint(0, self.num_actions-1)
    observation_matrix = np.asmatrix(observation.doubleArray, dtype='float32')
    actions = self.action_network.predict(observation_matrix)
    return_action = Action()
    return_action.doubleArray = actions

    self.last_action = copy.deepcopy(actions)
    self.last_state = np.asmatrix(observation.doubleArray, dtype=floatX)

    return return_action
def agent_start(self, observation):
    # Get intensity from current observation array
    tmp = np.bitwise_and(
        np.asarray(observation.intArray[128:]).reshape([210, 160]),
        0b0001111)  # Get intensity from the observation
    obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 - 8, :]  # Scaling

    # Initialize state
    self.state = np.zeros((4, 84, 84), dtype=np.uint8)
    self.state[0] = obs_array
    state_ = cuda.to_gpu(
        np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

    # Generate an action e-greedy
    returnAction = Action()
    action, Q_now = self.DQN.e_greedy(state_, self.epsilon)
    returnAction.intArray = [action]

    # Update for next step
    self.lastAction = copy.deepcopy(returnAction)
    self.last_state = self.state.copy()
    self.last_observation = obs_array

    return returnAction
def agent_step(self, reward, observation):
    action = None
    self.window.erase()
    self.window.addstr('STATE: %s\n' % (observation.intArray))
    self.window.addstr('REWARD: %s\n' % (reward))
    self.window.addstr('HIT UP, DOWN, LEFT or RIGHT to move...\n')
    self.window.refresh()

    try:
        c = self.window.getch()
        if c == curses.KEY_UP:
            action = 'N'
        elif c == curses.KEY_DOWN:
            action = 'S'
        elif c == curses.KEY_LEFT:
            action = 'W'
        elif c == curses.KEY_RIGHT:
            action = 'E'
        self.window.refresh()
    except KeyboardInterrupt:
        RLGlue.RL_cleanup()

    a = Action()
    if action:
        a.charArray = [action]

    return a
def do_step(self, state, reward=None):
    """
    Runs the actual learning algorithm.
    In a separate function so it can be called both on start and on step.
    """
    # self.debug('do_step(', state, ',', reward, ')')

    # if not state in self.Q:
    #     # State not yet visited, initialize randomly
    #     self.Q[state] = self.random_actions()

    # Run the Q update if this isn't the first step
    action = None
    if reward is not None:
        action = self.update_Q(self.last_state, self.last_action, reward, state)

    # Action object
    a_obj = Action()

    if action is None:
        # Query the policy to find the best action
        action = self.policy(state)

    a_obj.charArray = list(action)

    # Save the current state-action pair for the next step's Q update.
    self.last_state = state
    self.last_action = action

    # And we're done
    return a_obj
def _select_action(self, phi=None):
    """
    Utility function for selecting an action.

    phi: ndarray
        Memory from which action should be selected.
    """
    if self.action_count % self.k == 0:
        if (np.random.rand() > self.epsilon) and phi is not None:
            # Get action from Q-function
            phi = np.array(phi)[:, :, :, None]
            action_int = self.action_func(phi)[0]
        else:
            # Get random action
            action_int = np.random.randint(0, len(self.action_map))

        self.action_log[action_int] += 1
        self.cmd = [0] * len(self.action_map)
        self.cmd[action_int] = 1

        # Map cmd to ALE action
        # 18 is the number of commands ALE accepts
        action = Action()
        action.intArray = [self.action_map[action_int]]
        self.action = action
def agent_step(self, reward, observation):
    observed_screen = self.preprocess_screen(observation)
    self.state = np.roll(self.state, 1, axis=0)
    self.state[0] = observed_screen

    ########################### DEBUG ###############################
    # if self.total_time_step % 500 == 0 and self.total_time_step != 0:
    #     self.dump_state()

    self.learn(reward)

    return_action = Action()
    q_max = None
    q_min = None
    if self.time_step % config.rl_action_repeat == 0:
        action, q_max, q_min = self.dqn.eps_greedy(
            self.reshape_state_to_conv_input(self.state), self.exploration_rate)
    else:
        action = self.last_action.intArray[0]
    return_action.intArray = [action]

    self.dump_result(reward, q_max, q_min)

    if self.policy_frozen is False:
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state
        self.time_step += 1
        self.total_time_step += 1

    return return_action
def agent_step(self, reward, observation):
    newState = observation.intArray[0]
    lastState = self.lastObservation.intArray[0]
    lastAction = self.lastAction.intArray[0]

    Q_sa = self.value_function[lastState][lastAction]
    Q_sprime_aprime = -500000
    for a in range(self.numberOfActions):
        if self.value_function[newState][a] > Q_sprime_aprime:
            Q_sprime_aprime = self.value_function[newState][a]

    # Updating Q function
    new_Q_sa = Q_sa + self.sarsa_stepsize * (
        reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)

    newIntAction = self.egreedy(newState)

    if not self.policyFrozen:
        self.value_function[lastState][lastAction] = new_Q_sa

    returnAction = Action()
    returnAction.intArray = [newIntAction]

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)

    return returnAction
def agent_init(self, taskSpec):
    """Initialize the RL agent.

    Args:
        taskSpec: The RLGlue task specification string.
    """
    # (Re)initialize parameters (in case they have been changed during a trial)
    self.init_parameters()
    # Parse the task specification and set up the weights and such
    TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpec)
    if self.agent_supported(TaskSpec):
        self.numStates = len(TaskSpec.getDoubleObservations())
        self.discStates = numpy.array(TaskSpec.getIntObservations())
        self.numDiscStates = int(
            reduce(lambda a, b: a * (b[1] - b[0] + 1), self.discStates, 1.0))
        self.numActions = TaskSpec.getIntActions()[0][1] + 1

        self.model.model_init(self.numDiscStates, TaskSpec.getDoubleObservations(),
                              self.numActions, TaskSpec.getRewardRange()[0])
        self.planner.planner_init(self.numDiscStates, TaskSpec.getDoubleObservations(),
                                  self.numActions, TaskSpec.getRewardRange()[0])
    else:
        print "Task Spec could not be parsed: " + taskSpec

    self.lastAction = Action()
    self.lastObservation = Observation()
def agent_step(self, reward, observation):
    """Take one step in an episode for the agent, as the result of taking the last action.

    Args:
        reward: The reward received for taking the last action from the previous state.
        observation: The next observation of the episode, which is the consequence of taking the previous action.

    Returns:
        The next action the RL agent chooses to take, represented as an RLGlue Action object.
    """
    newState = numpy.array(list(observation.doubleArray))
    lastState = numpy.array(list(self.lastObservation.doubleArray))
    lastAction = self.lastAction.intArray[0]

    newDiscState = self.getDiscState(observation.intArray)
    lastDiscState = self.getDiscState(self.lastObservation.intArray)

    phi_t = numpy.zeros((self.numStates + 1,))
    phi_tp = numpy.zeros((self.numStates + 1,))
    phi_t[0] = lastDiscState
    phi_t[1:] = lastState
    phi_tp[0] = newDiscState
    phi_tp[1:] = newState

    # print ','.join(map(str, lastState))
    self.planner.updateExperience(phi_t, lastAction, phi_tp, reward)

    newIntAction = self.getAction(newState, newDiscState)
    returnAction = Action()
    returnAction.intArray = [newIntAction]

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_step(self, reward, observation):
    # Increment the step counter
    self.step_counter += 1

    self.update_state(observation)
    self.update_targetQ()

    # Choose the move to play. A return value of -1 means pass.
    int_action = self.select_int_action()
    action = Action()
    action.intArray = [int_action]
    self.reward = reward

    # Update eps
    self.update_eps()

    # Store the transition (state, action, reward, outcome)
    self.store_transition(terminal=False)

    if not self.frozen:
        # Run a learning step
        if self.step_counter > self.learn_start:
            self.replay_experience()

    self.last_state = copy.deepcopy(self.state)
    self.last_action = copy.deepcopy(int_action)

    # Return the chosen position to the environment
    return action
def agent_step(self, reward, observation):
    """Take one step in an episode for the agent, as the result of taking the last action.

    Args:
        reward: The reward received for taking the last action from the previous state.
        observation: The next observation of the episode, which is the consequence of taking the previous action.

    Returns:
        The next action the RL agent chooses to take, represented as an RLGlue Action object.
    """
    newState = numpy.array(list(observation.doubleArray))
    lastState = numpy.array(list(self.lastObservation.doubleArray))
    lastAction = self.lastAction.intArray[0]

    newDiscState = self.getDiscState(observation.intArray)
    lastDiscState = self.getDiscState(self.lastObservation.intArray)

    # Update eligibility traces
    phi_t = numpy.zeros(self.traces.shape)
    phi_t[lastDiscState, :, lastAction] = self.basis.computeFeatures(lastState)

    self.update_traces(phi_t, None)
    self.update(phi_t, newState, newDiscState, reward)

    # QLearning can choose its action after the update
    newIntAction = self.getAction(newState, newDiscState)

    returnAction = Action()
    returnAction.intArray = [newIntAction]

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_init(self, taskSpec):
    """Initialize the RL agent.

    Args:
        taskSpec: The RLGlue task specification string.
    """
    # (Re)initialize parameters (in case they have been changed during a trial)
    self.init_parameters()
    # Parse the task specification and set up the weights and such
    TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpec)
    if not self.agent_supported(TaskSpec):
        print "Task Spec could not be parsed: " + taskSpec
        sys.exit(1)

    self.numStates = len(TaskSpec.getDoubleObservations())
    self.discStates = numpy.array(TaskSpec.getIntObservations())
    self.numDiscStates = int(
        reduce(lambda a, b: a * (b[1] - b[0] + 1), self.discStates, 1.0))
    self.numActions = TaskSpec.getIntActions()[0][1] + 1

    if self.numStates == 0:
        # Only discrete states
        self.numStates = 1
        if self.fa_name != "trivial":
            print "Selected basis requires at least one continuous feature. Using trivial basis."
            self.fa_name = "trivial"

    # Set up the function approximation
    if self.fa_name == 'fourier':
        self.basis = fourier.FourierBasis(
            self.numStates, TaskSpec.getDoubleObservations(),
            order=self.params.setdefault('fourier_order', 3))
    elif self.fa_name == 'rbf':
        num_functions = self.numStates if self.params.setdefault('rbf_number', 0) == 0 \
            else self.params['rbf_number']
        self.basis = rbf.RBFBasis(
            self.numStates, TaskSpec.getDoubleObservations(),
            num_functions=num_functions,
            beta=self.params.setdefault('rbf_beta', 0.9))
    elif self.fa_name == 'tile':
        self.basis = tilecode.TileCodingBasis(
            self.numStates, TaskSpec.getDoubleObservations(),
            num_tiles=self.params.setdefault('tile_number', 100),
            num_weights=self.params.setdefault('tile_weights', 2048))
    else:
        self.basis = trivial.TrivialBasis(self.numStates, TaskSpec.getDoubleObservations())

    self.weights = numpy.zeros(
        (self.numDiscStates, self.basis.getNumBasisFunctions(), self.numActions))
    self.traces = numpy.zeros(self.weights.shape)

    self.init_stepsize(self.weights.shape, self.params)

    self.lastAction = Action()
    self.lastObservation = Observation()
def agent_step(self, reward, observation):
    self.stepCount = self.stepCount + 1

    action = Action()
    action.intArray = observation.intArray
    action.doubleArray = observation.doubleArray
    action.charArray = observation.charArray

    return action