def agent_step(self, reward, observation):
    """Apply one SARSA-style update and choose the next action.

    When ``reward`` equals ``self.Bad_Action_Penalty`` the previous action
    is removed from the allowed-action list of the previous state (together
    with its Q entry), a replacement action is drawn for that same state and
    no learning update is performed.  Otherwise the Q-value of the previous
    (state, action) pair is moved toward
    ``reward + sarsa_gamma * Q(s', a')`` unless ``self.policyFrozen`` is set.

    Args:
        reward: Reward received for the previous action.
        observation: New observation; its ``intArray`` is used as the state.

    Returns:
        An ``Action`` whose ``intArray`` holds the chosen action.
    """
    lastState = self.lastObservation.intArray
    lastAction = self.lastAction.intArray
    lastStateId = SamplingUtility.getStateId(lastState)
    lastActionIdx = self.all_allowed_actions[lastStateId].index(tuple(lastAction))

    if reward == self.Bad_Action_Penalty:
        # Pop from both lists so actions and Q-values stay index-aligned.
        self.all_allowed_actions[lastStateId].pop(lastActionIdx)
        self.Q_value_function[lastStateId].pop(lastActionIdx)
        # lastObservation is left untouched: the retry is picked for the
        # same state the rejected action was chosen in.
        newAction = self.egreedy(lastState)
        returnAction = Action()
        returnAction.intArray = newAction
        self.lastAction = copy.deepcopy(returnAction)
        return returnAction

    newState = observation.intArray
    newAction = self.egreedy(newState)
    if isinstance(newAction, tuple):
        newAction = list(newAction)

    # Resolve the table coordinates once; .index() is a linear scan.
    newStateId = SamplingUtility.getStateId(newState)
    newActionIdx = self.all_allowed_actions[newStateId].index(tuple(newAction))

    Q_sa = self.Q_value_function[lastStateId][lastActionIdx]
    Q_sprime_aprime = self.Q_value_function[newStateId][newActionIdx]
    new_Q_sa = Q_sa + self.sarsa_stepsize * (
        reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)

    if not self.policyFrozen:
        self.Q_value_function[lastStateId][lastActionIdx] = new_Q_sa

    returnAction = Action()
    returnAction.intArray = newAction
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_step(self, reward, observation):
    '''
    Called by rl-glue on every step; behaviour depends on ``self.agenda``.

    In EAGLE mode the observation is merged into the pending partial nodes,
    the resulting successor states are enqueued, and one node is taken from
    ``self.heapQueue`` and expanded.  The returned action is either a query
    for more cells (char 'q'), a dummy action after the goal was found
    (char '.'), or a failure marker (char 'x') when the queue is empty.

    In AGENT mode the next entry of ``self.pathToGoal`` is sent as the
    action's single char.

    ``reward`` is not used.  If ``self.agenda`` matches neither mode the
    function falls through and returns None.
    '''
    action = Action()
    # EAGLE mode: search phase
    if self.agenda == self.EAGLE:
        # combine observation with partial nodes and enqueue
        self.successorStates = self.updateWorkingNodeSet(self.partialStateNodes, observation)
        self.enqueue(self.successorStates)
        if self.heapQueue.empty():
            # In case of Iterative Deepening, check whether the number of
            # iterations has been exceeded
            if self.strategyIndex==2 and self.iteration <= self.MAX_DEPTH: # if ID
                # restart the search from a fresh initial node with an
                # empty LIFO queue and a cleared visited table
                self.iteration+=1
                action.charArray.append('q')
                action.intArray = [1, 0, 0]
                initPartialNode = Node()
                self.partialStateNodes = [initPartialNode]
                self.heapQueue = Queue.LifoQueue()
                self.visited.fill(False)
                return action
            # if not ID: nothing left to expand, report failure
            print 'fail'
            print 'ellapsed time:', time.time()-self.startTime,'s'
            action.intArray = []
            action.charArray.append('x')
            action.intArray = []
            return action
        # Get first element from the queue (entries are indexable; the node
        # is stored at position 1)
        first=self.heapQueue.get()[1]
        self.numExpandedNodes+=1
        # for debug
        self.depthq.append(first.depth)
        self.setVisited(first)
        # if reaching goal just send dummy action and change the mode
        if self.goal(first):
            self.pathToGoal = self.createPathToGoal(first)
            self.agenda = self.AGENT
            print self.pathToGoal,'number of steps',len(self.pathToGoal)
            print 'number of expanded nodes:',self.numExpandedNodes
            print 'ellapsed time:', time.time()-self.startTime,'s'
            #print max(self.depthq)
            action.charArray.append('.')
            action.intArray = []
            return action
        # goal not reached: remember the successors of this node and ask
        # for the cells needed to complete them
        self.partialStateNodes = self.getSuccessorStates(first)
        return self.getCellsNeededForDiscovery(first)
    # Agent mode: just send the stored actions one by one
    if self.agenda == self.AGENT:
        # NOTE(review): no bounds check here; stepping past the end of
        # pathToGoal would raise IndexError
        self.pathToGoalIndex = self.pathToGoalIndex + 1
        action.charArray.append(self.pathToGoal[self.pathToGoalIndex])
        action.intArray = []
        return action
def agent_step(self,Reward,Obs):
    """Tabular TD update with eligibility traces, then return the next action.

    The next action is chosen with ``self.epsilon_greedy`` before the update.
    The trace of the previous (state, action) pair is incremented by one,
    every Q entry is moved by ``learningrate * delta * trace`` and all
    traces are then scaled by ``gamma * lamda``.

    Args:
        Reward: Reward received for the previous action.
        Obs: New observation; ``Obs.intArray[0]`` is the state index.

    Returns:
        An ``Action`` whose ``intArray`` holds the chosen action index.
    """
    s_next = Obs.intArray[0]
    s_prev = self.lastObs.intArray[0]
    a_prev = self.lastaction.intArray[0]
    a_next = self.epsilon_greedy(s_next)

    # TD error for the transition (s_prev, a_prev) -> (s_next, a_next)
    q_prev = self.qfunction[s_prev][a_prev]
    q_next = self.qfunction[s_next][a_next]
    delta = Reward + self.gamma*q_next - q_prev

    # accumulating trace for the pair that was just left
    self.efunction[s_prev][a_prev] += 1

    # both tables are handled as arrays so the update is elementwise
    traces = np.array(self.efunction)
    self.qfunction = np.array(self.qfunction) + self.learningrate*delta*traces
    self.efunction = self.gamma*self.lamda*traces

    chosen = Action()
    chosen.intArray = [a_next]
    self.lastaction = copy.deepcopy(chosen)
    self.lastObs = copy.deepcopy(Obs)
    return chosen
def agent_step(self,reward, observation):
    """Record the last transition with the planner and pick the next action.

    Args:
        reward: Reward received for the previous action.
        observation: New observation (``doubleArray`` holds the continuous
            part, ``intArray`` is passed to ``getDiscState``).

    Returns:
        An RLGlue ``Action`` whose ``intArray`` holds the chosen action.
    """
    def combined(disc_state, cont_state):
        # slot 0 carries the discrete state id, the rest the continuous part
        vec = numpy.zeros((self.numStates+1,))
        vec[0] = disc_state
        vec[1:] = cont_state
        return vec

    cont_next = numpy.array(list(observation.doubleArray))
    cont_prev = numpy.array(list(self.lastObservation.doubleArray))
    prev_action = self.lastAction.intArray[0]
    disc_next = self.getDiscState(observation.intArray)
    disc_prev = self.getDiscState(self.lastObservation.intArray)

    phi_prev = combined(disc_prev, cont_prev)
    phi_next = combined(disc_next, cont_next)
    self.planner.updateExperience(phi_prev, prev_action, phi_next, reward)

    # the action is selected only after the planner has seen the transition
    chosen = Action()
    chosen.intArray = [self.getAction(cont_next, disc_next)]

    self.lastAction = copy.deepcopy(chosen)
    self.lastObservation = copy.deepcopy(observation)
    return chosen
def agent_step(self, reward, obs):
    """
    Handle one environment step.

    Unless ``self.finished_learning`` is set, ``self.intraoption_update`` is
    called with the reward, the features of the new observation and the
    observation itself.  The observation and its features are then stored,
    and the next primitive action is obtained from
    ``self.mu(observation, features).pi(observation)``.

    :param reward: The reward obtained as a result of the last transition.
    :param obs: An observation from the environment (its ``doubleArray``
        is used).
    :returns: An ``Action`` whose ``intArray`` holds the primitive action.
    """
    obs_vec = np.array(obs.doubleArray)
    feats = self.basis.computeFeatures(obs_vec)

    learning = not self.finished_learning
    if learning:
        self.intraoption_update(reward, feats, obs_vec)

    # stored before self.mu is consulted, as it may rely on them
    self.last_observation, self.last_features = obs_vec, feats

    selected = self.mu(obs_vec, feats)
    self.last_action = selected.pi(obs_vec)

    result = Action()
    result.intArray = [self.last_action]
    return result
def agent_step(self, reward, observation):
    """
    Choose a uniformly random action and optionally log the last transition.

    When ``self.saving`` is set, the previous observation, the previous
    action, the reward and a ``False`` absorbing flag are appended to the
    corresponding lists.

    Arguments:
       reward      - Real valued reward.
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    # randint bounds are inclusive, hence num_actions-1
    picked = self.randGenerator.randint(0,self.num_actions-1)
    return_action = Action()
    return_action.intArray = [picked]

    if self.show_ale:
        self._show_ale_color()

    if self.saving:
        previous = self.last_observation
        # int_states selects which observation array is recorded
        recorded = previous.intArray if self.int_states else previous.doubleArray
        self.states.append(recorded)
        self.actions.append(self.last_action.intArray[0])
        self.rewards.append(reward)
        self.absorbs.append(False)

    self.last_action = copy.deepcopy(return_action)
    self.last_observation = copy.deepcopy(observation)
    return return_action
def agent_step(self,Reward,Obs):
    """Tabular Q-learning update followed by epsilon-greedy action selection.

    Args:
        Reward: Reward received for the previous action.
        Obs: New observation; ``Obs.intArray[0]`` is the state index.

    Returns:
        An ``Action`` whose ``intArray`` holds the chosen action index.
    """
    s_next = Obs.intArray[0]
    s_prev = self.lastObs.intArray[0]
    a_prev = self.lastaction.intArray[0]

    q_old = self.qfunction[s_prev][a_prev]
    # bootstrap from self.maxim(s_next) rather than from the action taken
    target = Reward + self.gamma*self.maxim(s_next)
    self.qfunction[s_prev][a_prev] = q_old + self.learningrate*(target - q_old)

    # the action is drawn after the table has been updated
    chosen = Action()
    chosen.intArray = [self.epsilon_greedy(s_next)]

    self.lastaction = copy.deepcopy(chosen)
    self.lastObs = copy.deepcopy(Obs)
    return chosen
def agent_step(self, reward, observation):
    """Update traces and weights for the last transition, then act.

    Args:
        reward: Reward received for the previous action.
        observation: New observation (``doubleArray`` holds the continuous
            part, ``intArray`` is passed to ``getDiscState``).

    Returns:
        An RLGlue ``Action`` whose ``intArray`` holds the chosen action.
    """
    cont_next = numpy.array(list(observation.doubleArray))
    cont_prev = numpy.array(list(self.lastObservation.doubleArray))
    prev_action = self.lastAction.intArray[0]
    disc_next = self.getDiscState(observation.intArray)
    disc_prev = self.getDiscState(self.lastObservation.intArray)

    # feature tensor shaped like the traces; only the slice belonging to
    # the previous (discrete state, action) pair is non-zero
    features = numpy.zeros(self.traces.shape)
    features[disc_prev, :, prev_action] = self.basis.computeFeatures(cont_prev)

    self.update_traces(features, None)
    self.update(features, cont_next, disc_next, reward)

    # action selection happens after the update
    chosen = Action()
    chosen.intArray = [self.getAction(cont_next, disc_next)]

    self.lastAction = copy.deepcopy(chosen)
    self.lastObservation = copy.deepcopy(observation)
    return chosen
def agent_start(self, observation):
    """
    Begin an episode: reset the per-episode bookkeeping and act randomly.

    Arguments:
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    self.step_counter = self.batch_counter = 0
    # collects losses so a mean can be reported later
    self.loss_averages = []
    self.start_time = time.time()

    first_action = Action()
    # randint bounds are inclusive, hence num_actions-1
    first_action.intArray = [self.randGenerator.randint(0, self.num_actions-1)]

    self.last_action = copy.deepcopy(first_action)
    self.last_img = self._resize_observation(observation.intArray)
    return first_action
def agent_step(self, reward, observation):
    """Advance one step: refresh state, choose a move, store and learn.

    Args:
        reward: Reward for the previous move; stored on ``self.reward``.
        observation: New observation, passed to ``self.update_state``.

    Returns:
        An ``Action`` whose ``intArray`` holds the selected move.
    """
    # increase the step count by one
    self.step_counter += 1

    self.update_state(observation)
    self.update_targetQ()

    # decide our own move (a return value of -1 means pass)
    move = self.select_int_action()
    result = Action()
    result.intArray = [move]
    self.reward = reward

    # update eps
    self.update_eps()

    # save the data (state, action, reward, result)
    self.store_transition(terminal=False)

    # run learning once enough steps have been taken
    if not self.frozen and self.step_counter > self.learn_start:
        self.replay_experience()

    # NOTE(review): source indentation was lost; these two assignments are
    # assumed to run regardless of self.frozen - confirm.
    self.last_state = copy.deepcopy(self.state)
    self.last_action = copy.deepcopy(move)

    # hand the chosen position back to the caller
    return result
def agent_step(self, reward, observation):
    """Choose an action for the new frame and, when training, run a batch.

    Args:
        reward: Reward for the previous action; added to
            ``self.total_reward`` and passed on clipped to [-1, 1].
        observation: New observation; ``intArray`` is resized to an image.

    Returns:
        An ``Action`` whose ``intArray`` holds the chosen action.
    """
    self.step_counter += 1
    self.total_reward += reward
    frame = self.resize_image(observation.intArray)
    clipped = np.clip(reward, -1, 1)

    if self.is_testing:
        chosen = self.choose_action(self.test_table, frame, clipped,
                                    testing_ep=0.05)
    else:
        if self.step_counter % self.reset_after == 0:
            self.network.reset_q_hat()

        chosen = self.choose_action(self.train_table, frame, clipped,
                                    testing_ep=None)

        # train only once the table holds enough entries
        enough = max(self.learn_start, self.batch_size)
        if self.train_table.num_entries > enough:
            batch = self.train_table.get_minibatch(self.batch_size)
            states, actions, rewards, next_states, terminals = batch
            loss, qvals = self.network.train(states, actions, rewards,
                                             next_states, terminals)
            self.losses.append(loss)
            self.qvals.append(np.mean(qvals))
            # NOTE(review): source indentation was lost; the counter is
            # assumed to count trained batches only - confirm.
            self.batch_counter += 1

    result = Action()
    result.intArray = [chosen]
    self.last_action = chosen
    self.last_img = frame
    return result
def agent_start(self, observation):
    '''
    Initialise the search for a new episode and return the first query.

    Moves on to the next strategy, resets the partial node list, the queue,
    the visited table and the performance counters, and switches the agent
    into EAGLE mode.  ``observation`` is not used.
    '''
    # increment strategy (must happen before the queue is rebuilt)
    self.strategyIndex += 1

    # start from a single default-constructed partial node
    self.partialStateNodes = [Node()]

    # initialise a new queue according to the strategy
    self.newQueu()

    # set the agenda and reset the pointer into the action path
    self.agenda = self.EAGLE
    self.pathToGoalIndex = -1

    self.visited.fill(False)

    # performance measurements
    self.depthq = []
    self.numExpandedNodes = 0
    self.startTime = time.time()

    # first action: query 0,0
    first_query = Action()
    first_query.charArray.append('q')
    first_query.intArray = [1, 0, 0]
    return first_query
def _select_action(self, phi=None):
    """
    Utility function for selecting an action.

    Does nothing unless ``self.action_count`` is a multiple of ``self.k``.
    Otherwise an action index is chosen (greedy from ``self.action_func``
    with probability ``1 - epsilon`` when ``phi`` is available, uniformly
    random otherwise), logged in ``self.action_log``, written one-hot into
    ``self.cmd`` and stored as an ``Action`` on ``self.action``.

    phi: ndarray (or sequence of frames), optional
        Memory from which action should be selected.  ``None`` or an empty
        value forces a random action.

    Returns None; the result is left on ``self.action``.
    """
    if self.action_count % self.k != 0:
        return

    # The random draw always happens first, whatever phi is.
    greedy = np.random.rand() > self.epsilon
    # Explicit check: the truth value of a multi-element ndarray is
    # ambiguous and raises ValueError, so `and phi` cannot be used.
    has_memory = phi is not None and len(phi) > 0

    if greedy and has_memory:
        # Get action from Q-function; a trailing axis is appended first
        phi = np.array(phi)[:, :, :, None]
        action_int = self.action_func(phi)[0]
    else:
        # Get random action
        action_int = np.random.randint(0, len(self.action_map))

    self.action_log[action_int] += 1

    # one-hot command vector over the available actions
    self.cmd = [0]*len(self.action_map)
    self.cmd[action_int] = 1

    # Map the chosen index to the action code stored in action_map
    action = Action()
    action.intArray = [self.action_map[action_int]]
    self.action = action
def agent_start(self, observation):
    """Start an episode: reset the state tensor and choose the first move.

    Args:
        observation: First observation, passed to ``self.update_state``.

    Returns:
        An ``Action`` whose ``intArray`` holds the selected move.
    """
    verbose = self.debug_flag
    if verbose:
        print('agent start')

    # increase the step count by one
    self.step_counter += 1

    # the state is cleared at the start of every episode
    self.state = np.zeros((1, 2, self.n_rows, self.n_cols), dtype=np.float32)

    # update the state from the project-specific observation
    self.update_state(observation)
    self.update_targetQ()

    if verbose:
        print('自分が打つ手を決定する。')
    # decide our own move
    move = self.select_int_action()
    result = Action()
    result.intArray = [move]

    if verbose:
        print('eps を更新する。')
    # update eps (the probability of placing a piece at random)
    self.update_eps()

    # keep the board state and the move: first shift the previous pair
    # into the "two moves ago" slots, then store the current pair
    self.last_state2 = copy.deepcopy(self.last_state)
    self.last_action2 = copy.deepcopy(self.last_action)
    self.last_state = copy.deepcopy(self.state)
    self.last_action = copy.deepcopy(move)

    return result
def agent_step(self,reward, observation):
    """Choose the next action, then update from the last transition.

    Args:
        reward: Reward received for the previous action.
        observation: New observation (``doubleArray`` holds the continuous
            part, ``intArray`` is passed to ``getDiscState``).

    Returns:
        An RLGlue ``Action`` whose ``intArray`` holds the chosen action.
    """
    cont_next = numpy.array(list(observation.doubleArray))
    cont_prev = numpy.array(list(self.lastObservation.doubleArray))
    prev_action = self.lastAction.intArray[0]
    disc_next = self.getDiscState(observation.intArray)
    disc_prev = self.getDiscState(self.lastObservation.intArray)

    # the next action is needed by the compatible-features call below,
    # so it is chosen before the update
    next_action = self.getAction(cont_next, disc_next)

    # feature matrices shaped like the first two axes of the weights;
    # only the row of the active discrete state is filled in
    feature_shape = (self.weights.shape[0], self.weights.shape[1])
    phi_prev = numpy.zeros(feature_shape)
    phi_next = numpy.zeros(feature_shape)
    phi_prev[disc_prev, :] = self.basis.computeFeatures(cont_prev)
    phi_next[disc_next, :] = self.basis.computeFeatures(cont_next)

    self.step_count += 1
    compatible = self.getCompatibleFeatures(phi_prev, prev_action, reward,
                                            phi_next, next_action)
    self.update(phi_prev, phi_next, reward, compatible)

    chosen = Action()
    chosen.intArray = [next_action]
    self.lastAction = copy.deepcopy(chosen)
    self.lastObservation = copy.deepcopy(observation)
    return chosen
def agent_step(self, reward, observation):
    """Select the next action and learn from the transition just observed.

    Args:
        reward: Reward received for the previous action.
        observation: New observation (``doubleArray`` holds the continuous
            part, ``intArray`` is passed to ``getDiscState``).

    Returns:
        An RLGlue ``Action`` whose ``intArray`` holds the chosen action.
    """
    def row_features(disc_state, cont_state):
        # zero matrix over the first two weight axes with one active row
        phi = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
        phi[disc_state, :] = self.basis.computeFeatures(cont_state)
        return phi

    state_new = numpy.array(list(observation.doubleArray))
    state_old = numpy.array(list(self.lastObservation.doubleArray))
    action_old = self.lastAction.intArray[0]
    disc_new = self.getDiscState(observation.intArray)
    disc_old = self.getDiscState(self.lastObservation.intArray)

    # chosen up front: the update below takes the new action as input
    action_new = self.getAction(state_new, disc_new)

    phi_t = row_features(disc_old, state_old)
    phi_tp = row_features(disc_new, state_new)

    self.step_count += 1
    self.update(
        phi_t, phi_tp, reward,
        self.getCompatibleFeatures(phi_t, action_old, reward, phi_tp,
                                   action_new))

    result = Action()
    result.intArray = [action_new]
    self.lastAction = copy.deepcopy(result)
    self.lastObservation = copy.deepcopy(observation)
    return result
def agent_start(self, observation):
    """Return a random action with one integer per action dimension.

    Component ``i`` is drawn with ``self.rng.randint`` between
    ``self.action_bounds[i][0]`` and ``self.action_bounds[i][1]``.
    ``observation`` is not used.
    """
    return_action = Action()
    return_action.intArray = [
        self.rng.randint(self.action_bounds[dim][0], self.action_bounds[dim][1])
        for dim in range(self.action_size)
    ]
    return return_action
def agent_step(self, reward, observation):
    """Feed the last transition to the planner and choose the next action.

    Args:
        reward: Reward received for the previous action.
        observation: New observation (``doubleArray`` holds the continuous
            part, ``intArray`` is passed to ``getDiscState``).

    Returns:
        An RLGlue ``Action`` whose ``intArray`` holds the chosen action.
    """
    state_new = numpy.array(list(observation.doubleArray))
    state_old = numpy.array(list(self.lastObservation.doubleArray))
    action_old = self.lastAction.intArray[0]
    disc_new = self.getDiscState(observation.intArray)
    disc_old = self.getDiscState(self.lastObservation.intArray)

    # planner vectors: element 0 is the discrete state, the remaining
    # numStates elements are the continuous state
    width = self.numStates + 1
    phi_old, phi_new = numpy.zeros((width, )), numpy.zeros((width, ))
    phi_old[0], phi_new[0] = disc_old, disc_new
    phi_old[1:] = state_old
    phi_new[1:] = state_new

    self.planner.updateExperience(phi_old, action_old, phi_new, reward)

    # selection follows the experience update
    action_new = self.getAction(state_new, disc_new)
    result = Action()
    result.intArray = [action_new]

    self.lastAction = copy.deepcopy(result)
    self.lastObservation = copy.deepcopy(observation)
    return result
def agent_step(self, reward, observation):
    """Pick an action for the new frame; while training also fit a batch.

    Args:
        reward: Reward for the previous action; accumulated in
            ``self.total_reward`` and forwarded clipped to [-1, 1].
        observation: New observation; ``intArray`` is resized to an image.

    Returns:
        An ``Action`` whose ``intArray`` holds the chosen action.
    """
    self.step_counter += 1
    self.total_reward += reward
    cur_img = self.resize_image(observation.intArray)

    if not self.is_testing:
        # periodic reset of the network's q_hat
        if self.step_counter % self.reset_after == 0:
            self.network.reset_q_hat()

        int_action = self.choose_action(
            self.train_table, cur_img, np.clip(reward, -1, 1), testing_ep=None)

        threshold = max(self.learn_start, self.batch_size)
        if self.train_table.num_entries > threshold:
            (states, actions, rewards,
             next_states, terminals) = self.train_table.get_minibatch(self.batch_size)
            loss, qvals = self.network.train(
                states, actions, rewards, next_states, terminals)
            self.losses.append(loss)
            self.qvals.append(np.mean(qvals))
            # NOTE(review): source indentation was lost; the counter is
            # assumed to count trained batches only - confirm.
            self.batch_counter += 1
    else:
        int_action = self.choose_action(
            self.test_table, cur_img, np.clip(reward, -1, 1), testing_ep=0.05)

    return_action = Action()
    return_action.intArray = [int_action]
    self.last_action, self.last_img = int_action, cur_img
    return return_action
def agent_step(self, reward, observation):
    """
    Act randomly and, if saving is enabled, record the previous transition.

    Arguments:
       reward      - Real valued reward.
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    # Generate random action (randint is inclusive at both ends)
    return_action = Action()
    return_action.intArray = [self.randGenerator.randint(0, self.num_actions - 1)]

    if self.show_ale:
        self._show_ale_color()

    if self.saving:
        # which observation array is stored depends on int_states
        if self.int_states:
            stored_state = self.last_observation.intArray
        else:
            stored_state = self.last_observation.doubleArray
        self.states.append(stored_state)
        self.actions.append(self.last_action.intArray[0])
        self.rewards.append(reward)
        # mid-episode steps are recorded as non-absorbing
        self.absorbs.append(False)

    self.last_action = copy.deepcopy(return_action)
    self.last_observation = copy.deepcopy(observation)
    return return_action
def agent_start(self, observation):
    """Start an episode: choose the first action and clear the traces.

    Args:
        observation: The first observation of the episode (``doubleArray``
            holds the continuous part, ``intArray`` is passed to
            ``getDiscState``).

    Returns:
        An RLGlue ``Action`` whose ``intArray`` holds the chosen action.
    """
    log = logging.getLogger('pyrl.agents.sarsa_lambda.agent_start')

    start_state = numpy.array(list(observation.doubleArray))
    disc_state = self.getDiscState(observation.intArray)
    first_action = self.getAction(start_state, disc_state)

    result = Action()
    result.intArray = [first_action]

    # Clear traces: nothing carries over from the previous episode
    self.traces.fill(0.0)

    self.lastAction = copy.deepcopy(result)
    self.lastObservation = copy.deepcopy(observation)

    log.debug("Action: %d", first_action)
    log.debug("Start State: %s", start_state)
    log.debug("Traces: %s", self.traces)
    return result
def agent_step(self, reward, observation):
    """Learn from the last transition, then choose the next action.

    Args:
        reward: Reward received for the previous action.
        observation: New observation (``doubleArray`` holds the continuous
            part, ``intArray`` is passed to ``getDiscState``).

    Returns:
        An RLGlue ``Action`` whose ``intArray`` holds the chosen action.
    """
    previous = self.last_observation
    state_now = numpy.array(list(observation.doubleArray))
    state_before = numpy.array(list(previous.doubleArray))
    action_before = self.last_action.intArray[0]
    disc_now = self.getDiscState(observation.intArray)
    disc_before = self.getDiscState(previous.intArray)

    # Eligibility-trace input: zero everywhere except the slice of the
    # previous (discrete state, action) pair
    phi_t = numpy.zeros(self.traces.shape)
    phi_t[disc_before, :, action_before] = self.basis.computeFeatures(state_before)
    self.update_traces(phi_t, None)
    self.update(phi_t, state_now, disc_now, reward)

    # the action is picked only after the update
    return_action = Action()
    return_action.intArray = [self.getAction(state_now, disc_now)]

    self.last_action = copy.deepcopy(return_action)
    self.last_observation = copy.deepcopy(observation)
    return return_action
def create_action(self, action):
    """Wrap a learner action in an ``Action`` object.

    A scalar is first promoted to a one-element array.  The leading
    ``self.learner.dim_action()`` entries are cast to ``int`` and stored
    as the single element of ``intArray``.
    """
    values = np.array([action]) if np.isscalar(action) else action
    n_dims = self.learner.dim_action()

    wrapped = Action()
    # NOTE(review): intArray ends up as a one-element list holding an
    # array, not a flat list of ints - confirm this is what callers expect.
    wrapped.intArray = [values[:n_dims].astype(int)]
    return wrapped
def agent_start(self, observation):
    """
    Begin an episode: reset counters, act randomly, store the first image.

    Arguments:
       observation - An observation of type rlglue.types.Observation

    Returns:
       An action of type rlglue.types.Action
    """
    self.step_counter = self.batch_counter = 0
    # losses are collected here so a mean can be reported later
    self.loss_averages = []
    self.start_time = time.time()

    return_action = Action()
    # randint bounds are inclusive, hence num_actions-1
    return_action.intArray = [self.randGenerator.randint(0, self.num_actions-1)]
    self.last_action = copy.deepcopy(return_action)

    # the resized observation is reshaped to (width, height) and transposed
    resized = np.array(self._resize_observation(observation.intArray))
    self.last_img = resized
    self.last_img = resized.reshape(CROPPED_WIDTH, CROPPED_HEIGHT).T
    return return_action
def agent_start(self, observation):
    """Start an episode: build the initial 4-frame state and act e-greedily.

    Args:
        observation: First observation; ``intArray[128:]`` is treated as a
            210x160 screen.

    Returns:
        An ``Action`` whose ``intArray`` holds the chosen action.
    """
    # Screen pixels start at offset 128; keep the low four bits of each
    screen = np.asarray(observation.intArray[128:]).reshape([210, 160])
    intensity = np.bitwise_and(screen, 0b0001111)

    # Scale to 110x84, then cut an 84-row window ending 8 rows from the bottom
    scaled = spm.imresize(intensity, (110, 84))
    obs_array = scaled[110 - 84 - 8:110 - 8, :]

    # Initialise the state: first slot is the current frame, the rest zero
    self.state = np.zeros((4, 84, 84), dtype=np.uint8)
    self.state[0] = obs_array
    batch = np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)
    state_ = cuda.to_gpu(batch)

    # Generate an action with e-greedy selection
    returnAction = Action()
    action, Q_now = self.DQN.e_greedy(state_, self.epsilon)
    returnAction.intArray = [action]

    # Remember everything the next step needs
    self.lastAction = copy.deepcopy(returnAction)
    self.last_state = self.state.copy()
    self.last_observation = obs_array
    return returnAction
def agent_step(self, reward, observation):
    """Process one frame: update the state, act e-greedily and learn.

    Args:
        reward: Reward received for the previous action.
        observation: New observation; ``intArray[128:]`` is treated as a
            210x160 screen.

    Returns:
        An ``Action`` whose ``intArray`` holds the chosen action.
    """
    learning = self.policyFrozen is False

    # Preprocess: keep the low four bits of each pixel, scale to 110x84
    # and cut an 84-row window ending 8 rows from the bottom
    screen = np.asarray(observation.intArray[128:]).reshape([210, 160])
    intensity = np.bitwise_and(screen, 0b0001111)
    obs_array = (spm.imresize(intensity, (110, 84)))[110 - 84 - 8:110 - 8, :]
    # Take the elementwise maximum of the current and the previous frame
    obs_processed = np.maximum(obs_array, self.last_observation)

    # Compose the state from 4 sequential observations: drop the oldest
    frames = [self.state[1], self.state[2], self.state[3], obs_processed]
    self.state = np.asanyarray(frames, dtype=np.uint8)
    state_ = cuda.to_gpu(
        np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

    # Exploration rate for this step
    if not learning:
        # Evaluation
        print("Policy is Frozen")
        eps = 0.05
    elif self.DQN.initial_exploration < self.time:
        # Linear decay with a floor at 0.1
        self.epsilon -= 1.0 / 10**6
        if self.epsilon < 0.1:
            self.epsilon = 0.1
        eps = self.epsilon
    else:
        # Initial exploration phase
        print("Initial Exploration : %d/%d steps" % (
            self.time, self.DQN.initial_exploration))
        eps = 1.0

    # Generate an action from e-greedy action selection
    returnAction = Action()
    action, Q_now = self.DQN.e_greedy(state_, eps)
    returnAction.intArray = [action]

    # Learning phase
    if learning:
        self.DQN.stockExperience(self.time, self.last_state,
                                 self.lastAction.intArray[0], reward,
                                 self.state, False)
        self.DQN.experienceReplay(self.time)

    # Simple text based visualization
    print(' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (
        self.time, self.DQN.action_to_index(action), np.sign(reward), eps,
        np.max(Q_now.get())))

    # Updates for next step
    self.last_observation = obs_array

    # action, state and clock only advance while learning
    if learning:
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.time += 1

    return returnAction
def agent_step(self, reward, observation):
    """Tabular Q-learning update followed by e-greedy action selection.

    The target bootstraps from the largest value among the first
    ``self.numberOfActions`` entries of the new state's row.  The table is
    written only when ``self.policyFrozen`` is false.

    Args:
        reward: Reward received for the previous action.
        observation: New observation; ``intArray[0]`` is the state index.

    Returns:
        An ``Action`` whose ``intArray`` holds the chosen action index.

    Raises:
        ValueError: if ``self.numberOfActions`` is zero (no value to
            maximise over).
    """
    newState = observation.intArray[0]
    lastState = self.lastObservation.intArray[0]
    lastAction = self.lastAction.intArray[0]

    Q_sa = self.value_function[lastState][lastAction]
    # True maximum over the actions; a fixed sentinel start value would be
    # wrong whenever every Q-value lies below it.
    Q_sprime_aprime = max(self.value_function[newState][a]
                          for a in range(self.numberOfActions))

    # updating Q function
    new_Q_sa = Q_sa + self.sarsa_stepsize * (
        reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)

    # The action is chosen before the table is written, so selection is
    # based on the values as they were prior to this update.
    newIntAction = self.egreedy(newState)

    if not self.policyFrozen:
        self.value_function[lastState][lastAction] = new_Q_sa

    returnAction = Action()
    returnAction.intArray = [newIntAction]
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_step(self, reward, obs):
    """
    Process one step of the episode.

    While ``self.finished_learning`` is false, ``self.intraoption_update``
    is invoked with the reward, the new features and the new observation.
    Afterwards the observation and features are remembered and the next
    primitive action is taken from
    ``self.mu(observation, features).pi(observation)``.

    :param reward: The reward obtained as a result of the last transition.
    :param obs: An observation from the environment (its ``doubleArray``
        is used).
    :returns: An ``Action`` whose ``intArray`` holds the primitive action.
    """
    observation = np.array(obs.doubleArray)
    current_features = self.basis.computeFeatures(observation)

    if self.finished_learning:
        pass  # learning is switched off: skip the update
    else:
        self.intraoption_update(reward, current_features, observation)

    # remembered before the behaviour policy is queried
    self.last_observation = observation
    self.last_features = current_features

    policy = self.mu(observation, current_features)
    primitive = policy.pi(observation)
    self.last_action = primitive

    action = Action()
    action.intArray = [primitive]
    return action
def agent_step(self, reward, observation):
    """Push the new screen into the state, learn, and choose an action.

    A fresh e-greedy action is requested only every
    ``config.rl_action_repeat`` time steps; in between the previous action
    is repeated.

    Args:
        reward: Reward received for the previous action.
        observation: New observation, passed to ``self.preprocess_screen``.

    Returns:
        An ``Action`` whose ``intArray`` holds the chosen action.
    """
    # shift the stored frames by one along axis 0 and put the new one first
    screen = self.preprocess_screen(observation)
    self.state = np.roll(self.state, 1, axis=0)
    self.state[0] = screen

    self.learn(reward)

    # q_max / q_min stay None on steps that repeat the previous action
    q_max = q_min = None
    repeat_step = self.time_step % config.rl_action_repeat != 0
    if repeat_step:
        action = self.last_action.intArray[0]
    else:
        net_input = self.reshape_state_to_conv_input(self.state)
        action, q_max, q_min = self.dqn.eps_greedy(net_input, self.exploration_rate)

    return_action = Action()
    return_action.intArray = [action]

    self.dump_result(reward, q_max, q_min)

    # NOTE(review): source indentation was lost; all four updates are
    # assumed to happen only while the policy is not frozen - confirm.
    if self.policy_frozen is False:
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state
        self.time_step += 1
        self.total_time_step += 1

    return return_action
def agent_step(self, reward, observation):
    """Count the step and echo the observation back as the action.

    The returned ``Action`` refers to the same ``intArray``,
    ``doubleArray`` and ``charArray`` objects as the observation (no copy
    is made).  ``reward`` is not used.
    """
    self.stepCount += 1

    echo = Action()
    for field in ('intArray', 'doubleArray', 'charArray'):
        setattr(echo, field, getattr(observation, field))
    return echo
def agent_start(self, observation):
    """Reset the step counter and echo the observation back as the action.

    The returned ``Action`` refers to the same ``intArray``,
    ``doubleArray`` and ``charArray`` objects as the observation (no copy
    is made).
    """
    self.stepCount = 0

    echo = Action()
    for field in ('intArray', 'doubleArray', 'charArray'):
        setattr(echo, field, getattr(observation, field))
    return echo
def agent_step(self,reward, observation):
    """Increment ``stepCount`` and return the observation's arrays as action.

    The three arrays are shared with the observation, not copied.
    ``reward`` is not used.
    """
    self.stepCount += 1

    mirrored = Action()
    mirrored.intArray, mirrored.doubleArray, mirrored.charArray = (
        observation.intArray,
        observation.doubleArray,
        observation.charArray,
    )
    return mirrored
def agent_start(self,observation):
    """Zero ``stepCount`` and return the observation's arrays as the action.

    The three arrays are shared with the observation, not copied.
    """
    self.stepCount = 0

    mirrored = Action()
    mirrored.intArray, mirrored.doubleArray, mirrored.charArray = (
        observation.intArray,
        observation.doubleArray,
        observation.charArray,
    )
    return mirrored
def _getAction(self):
    """
    Ask the wrapped agent for an action and convert it for RLGlue.

    The value returned by ``self.agent.getAction()`` is turned into a list
    with ``tolist()`` and stored in ``doubleArray``; ``intArray`` is left
    empty.
    """
    values = self.agent.getAction().tolist()

    glue_action = RLGlueAction()
    glue_action.doubleArray = values
    glue_action.intArray = []
    return glue_action
def agent_step(self, reward, observation):
    """Process one frame: update the state, act e-greedily, learn, log.

    When the target-model update condition holds, the target model is
    refreshed and the weights and biases of layers l1-l3 are saved under
    ``params/``.

    Args:
        reward: Reward received for the previous action.
        observation: New observation; ``intArray[128:]`` is treated as a
            210x160 screen.

    Returns:
        An ``Action`` whose ``intArray`` holds the chosen action.
    """
    learning = self.policyFrozen is False

    # Preprocess: keep the low four bits of each pixel, scale to 110x84
    # and cut an 84-row window ending 8 rows from the bottom
    screen = np.asarray(observation.intArray[128:]).reshape([210, 160])
    intensity = np.bitwise_and(screen, 0b0001111)
    obs_array = (spm.imresize(intensity, (110, 84)))[110-84-8:110-8, :]
    # Take the elementwise maximum of the current and the previous frame
    obs_processed = np.maximum(obs_array, self.last_observation)

    # Compose the state from 4 sequential observations: drop the oldest
    frames = [self.state[1], self.state[2], self.state[3], obs_processed]
    self.state = np.asanyarray(frames, dtype=np.uint8)
    state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

    # Exploration rate for this step
    if not learning:
        # Evaluation
        print("Policy is Frozen")
        eps = 0.05
    elif self.DQN.initial_exploration < self.time:
        # Linear decay with a floor at 0.1
        self.epsilon -= 1.0/10**6
        if self.epsilon < 0.1:
            self.epsilon = 0.1
        eps = self.epsilon
    else:
        # Initial exploration phase
        print("Initial Exploration : %d/%d steps" % (self.time, self.DQN.initial_exploration))
        eps = 1.0

    # Generate an action by e-greedy action selection
    returnAction = Action()
    action, Q_now = self.DQN.e_greedy(state_, eps)
    returnAction.intArray = [action]

    # Learning phase
    if learning:
        self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.state, False)
        self.DQN.experienceReplay(self.time)

    # Target model update, checked whether or not the policy is frozen
    past_warmup = self.DQN.initial_exploration < self.time
    if past_warmup and np.mod(self.time, self.DQN.target_model_update_freq) == 0:
        print("########### MODEL UPDATED ######################")
        self.DQN.target_model_update()
        model = self.DQN.CNN_model
        snapshots = (
            ('params/l1_W.npy', model.l1.W),
            ('params/l1_b.npy', model.l1.b),
            ('params/l2_W.npy', model.l2.W),
            ('params/l2_b.npy', model.l2.b),
            ('params/l3_W.npy', model.l3.W),
            ('params/l3_b.npy', model.l3.b),
        )
        for path, param in snapshots:
            np.save(path, param.get())

    # Simple text based visualization
    print(' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now.get())))

    # Updates for next step
    self.last_observation = obs_array

    # action, state and clock only advance while learning
    if learning:
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.time += 1

    return returnAction
def agent_start(self, observation):
    """Start an episode with a random binary action.

    The chosen action and the first observation are deep-copied onto
    ``self.lastAction`` and ``self.lastObservation`` so they are
    remembered after this call returns.

    Args:
        observation: First observation of the episode.

    Returns:
        An ``Action`` whose ``intArray`` holds 0 or 1.
    """
    # randint(0, 1) is inclusive at both ends: the result is 0 or 1
    thisIntAction = self.randGenerator.randint(0, 1)
    returnAction = Action()
    returnAction.intArray = [thisIntAction]

    # Store on the instance; assigning to bare local names would discard
    # the copies as soon as the method returns.
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)

    return returnAction
def getCellsNeededForDiscovery(self, node):
    '''
    Build a query action ('q') for the cell the given node would move into.

    The target cell is element 1 of the result of ``self.newPosition`` for
    the node's position and orientation; its two coordinates follow a
    leading 1 in ``intArray``.
    '''
    state = node.state
    target = self.newPosition(state.position, state.orintation)[1]

    query = Action()
    query.intArray = [1, target[0], target[1]]
    query.charArray.append('q')
    return query
def _getAction(self):
    """
    Return an RLGlue action built from the wrapped agent's action.

    ``self.agent.getAction()`` is converted with ``tolist()`` and placed in
    ``doubleArray``; ``intArray`` is set to an empty list.
    """
    wrapped = RLGlueAction()
    raw = self.agent.getAction()
    wrapped.doubleArray = raw.tolist()
    wrapped.intArray = []
    return wrapped
def agent_step(self, reward, observation):
    """Handle one environment step: choose an action, learn unless frozen, log.

    Builds the 4-frame state from the new observation, asks ``self.DQN``
    for an epsilon-greedy action and, while ``self.policyFrozen`` is False,
    stores the transition, runs experience replay and advances
    ``self.time``. A status line is printed and also written through
    ``logger``.

    Args:
        reward: Reward for the previous action; stored with the transition
            and reported through ``np.sign``.
        observation: Object whose ``intArray`` is read from index 128 on
            and reshaped to 210x160.

    Returns:
        An ``Action`` whose ``intArray`` is ``[action]``.
    """
    # Declared global but not referenced anywhere else in this method.
    global save_flg2
    # Preprocess
    tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111)  # Keep the low 4 bits of every pixel (intensity)
    # Resize to 110x84, then keep rows 18..101 so the frame is 84 rows tall.
    # NOTE(review): spm is presumably scipy.misc -- confirm against the imports.
    obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :]  # Scaling
    # Overlay the new frame on the previous one
    obs_processed = np.maximum(obs_array, self.last_observation)  # Element-wise maximum of the two frames

    # Compose State : 4-step sequential observation
    self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_processed], dtype=np.uint8)  # Drop index 0 and shift the window by one frame
    state_ = np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)

    # Exploration decays along the time sequence
    if self.policyFrozen is False:  # Learning ON/OFF
        if self.DQN.initial_exploration < self.time:
            # Linear decay of 1e-6 per step, floored at 0.1
            self.epsilon -= 1.0/10**6
            if self.epsilon < 0.1:
                self.epsilon = 0.1
            eps = self.epsilon
        else:  # Initial Exploration Phase: act fully at random
            print "Initial Exploration : %d/%d steps" % (self.time, self.DQN.initial_exploration)
            eps = 1.0
    else:  # Evaluation
        print "Policy is Frozen"
        eps = 0.05

    # Generate an Action by e-greedy action selection
    returnAction = Action()
    action, Q_now = self.DQN.e_greedy(state_, eps)
    returnAction.intArray = [action]

    # Learning Phase
    # loss_val is assigned here but not read again within this method.
    loss_val = 0
    if self.policyFrozen is False:  # Learning ON/OFF
        self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.state, False)
        loss_val = self.DQN.experienceReplay(self.time)

    # Target model update, every target_model_update_freq steps once initial exploration is over
    if self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0:
        print "########### MODEL UPDATED ######################"
        self.DQN.target_model_update()

    # Simple text based visualization
    print ' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now))
    # Same fields as a comma-separated record, prefixed with a timestamp
    logger.info("{},{},{},{},{},{}".format(dt.now().strftime("%Y-%m-%d_%H:%M:%S"), \
                self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now)))

    # Updates for next step (the un-merged frame is kept for the next maximum)
    self.last_observation = obs_array

    # Bookkeeping for the next transition only moves forward while learning
    if self.policyFrozen is False:
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.time += 1

    return returnAction
def agent_start(self,observation):
    """Start an episode by choosing a primitive action or launching an option.

    ``self.egreedy`` returns an index. Values below ``self.numActions`` are
    primitive actions; larger values select a target abstract state
    (``index - numActions``) and the first action of that option is picked
    by the preference loop below.

    Args:
        observation: First observation; ``intArray[0]`` is looked up in
            ``self.valid_states`` to get the row index of the state.

    Returns:
        An ``Action`` whose ``intArray`` holds the single chosen action.
    """
    self.optionCurrentlyOn = False
    theState=observation.intArray[0]
    s = self.valid_states.index(theState) # row index
    # Choose either a primitive action or option
    a = self.egreedy(s)
    if a<self.numActions: # Primitive action
        thisIntAction = a
        self.optionCurrentlyOn = False
        print 'Primitive action chosen'
    else: # Composing an option from S_i to S_j
        # Reset the per-option bookkeeping
        self.optionCurrentlyOn = True
        self.currentOptionTime = 0
        self.curentOptionStartState = s
        self.currentOptionReward = 0.0
        # 1. Find the abstract state you belong to & going to
        self.option_S_i = self.absStateMembership[s] # initiation step
        self.option_S_j = a-self.numActions # actually, we will have to choose S_j based on SMDP
        # 2. Choose action based on membership ascent
        # Falls back to action 1 when no action has a strictly positive preference.
        thisIntAction=1
        maxVal = 0
        # The loop reuses the name `a`; option_S_j was already derived from the earlier value.
        for a in xrange(4):
            print 'Action: ',a,' ',max(self.normalizationC*(np.sum(np.dot(np.array(self.p_mat[s][a]),np.array(self.chi_mat.T[self.option_S_j].T))) - self.chi_mat[s,self.option_S_j]),0)
            # Same expression as printed above: scaled difference, clipped at zero
            action_pref = max(self.normalizationC*(np.sum(np.dot(np.array(self.p_mat[s][a]),np.array(self.chi_mat.T[self.option_S_j].T))) - self.chi_mat[s,self.option_S_j]),0)
            if action_pref > maxVal:
                thisIntAction = a
                maxVal = action_pref
        print 'Option chosen'
        # NOTE(review): nesting of the two lines around this note is reconstructed -- confirm they belong to the option branch.
        self.currentOptionTime += 1
    print 'Action chosen: ',thisIntAction
    returnAction=Action()
    returnAction.intArray=[thisIntAction]
    # Keep independent copies for the next step
    self.lastAction=copy.deepcopy(returnAction)
    self.lastObservation=copy.deepcopy(observation)
    self.episode += 1
    return returnAction
def agent_start(self, observation):
    """Begin an episode with a random action (0 or 1) and record it.

    Args:
        observation: First observation of the episode; a deep copy is kept
            as ``self.lastObservation``.

    Returns:
        An ``Action`` whose ``intArray`` holds the single drawn index.
    """
    # Generate random action, 0 or 1
    thisIntAction = self.randGenerator.randint(0, 1)
    returnAction = Action()
    returnAction.intArray = [thisIntAction]

    # Store the copies as instance attributes. Binding them to plain
    # locals (as before) discarded them as soon as the method returned.
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)

    return returnAction
def agent_start(self, observation):
    """Pick the first action of an episode with ``self.egreedy``.

    The state is ``observation.intArray[0]``. Deep copies of the returned
    action and of the observation are kept as ``self.lastAction`` and
    ``self.lastObservation``.
    """
    chosen = self.egreedy(observation.intArray[0])

    returnAction = Action()
    returnAction.intArray = [chosen]

    self.lastAction, self.lastObservation = (
        copy.deepcopy(returnAction),
        copy.deepcopy(observation),
    )
    return returnAction
def agent_step(self, reward, observation):
    """Answer a step with a random action (0 or 1) and record it.

    Args:
        reward: Reward for the previous action; not used by this agent.
        observation: Current observation; a deep copy is kept as
            ``self.last_observation``.

    Returns:
        An ``Action`` whose ``intArray`` holds the single drawn index.
    """
    # Generate random action, 0 or 1
    thisIntAction = self.randGenerator.randint(0, 1)
    returnAction = Action()
    returnAction.intArray = [thisIntAction]

    # Store the copies on the instance. They were previously bound to
    # local names and therefore lost when the method returned.
    self.last_action = copy.deepcopy(returnAction)
    self.last_observation = copy.deepcopy(observation)

    return returnAction
def agent_start(self, observation):
    """Return the e-greedy action for the first state of an episode.

    Reads the state from ``observation.intArray[0]`` and saves deep copies
    of the action and observation on the agent for the following step.
    """
    first_state = observation.intArray[0]
    returnAction = Action()
    returnAction.intArray = [self.egreedy(first_state)]

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_start(self, Obs):
    """Choose the opening action with ``self.epsilon_greedy``.

    The state is ``Obs.intArray[0]``. Deep copies of the action and of
    ``Obs`` are stored as ``self.lastaction`` and ``self.lastObs``.

    Returns:
        The ``Action`` carrying the chosen index in ``intArray``.
    """
    chosen = self.epsilon_greedy(Obs.intArray[0])

    returnaction = Action()
    returnaction.intArray = [chosen]

    # Snapshots consulted on the next step
    self.lastaction = copy.deepcopy(returnaction)
    self.lastObs = copy.deepcopy(Obs)
    return returnaction
def agent_start(self, observation):
    """Select the episode's first action e-greedily and remember it.

    Args:
        observation: First observation; only ``intArray[0]`` is used as
            the state.

    Returns:
        An ``Action`` with the selected index as its only ``intArray`` entry.
    """
    selected = self.egreedy(observation.intArray[0])
    returnAction = Action()
    returnAction.intArray = [selected]

    # Deep copies so later changes to the returned objects do not leak in
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_step(self, reward, observation):
    """Advance one step: prune a penalised action or apply the TD update.

    When ``reward`` equals ``self.Bad_Action_Penalty`` the previous action
    and its Q-value are removed from the tables for the previous state, a
    new action is chosen for that same state and returned. In that case
    ``self.lastObservation`` is left untouched and no value update is made.

    Otherwise the next action is chosen for the new state and, unless
    ``self.policyFrozen`` is set, Q(s, a) is moved towards
    ``reward + discount * Q(s', a')`` with step size ``self.stepsize``.

    Args:
        reward: Reward received for the previous action.
        observation: New observation; ``intArray`` is used as the state.

    Returns:
        An ``Action`` whose ``intArray`` is the chosen action as a list
        (tuples returned by ``self.egreedy`` are converted).

    Raises:
        ValueError: If the previous or the new action is not present in
            the allowed-action list of its state.
    """
    lastState = self.lastObservation.intArray
    lastAction = self.lastAction.intArray
    lastStateId = SamplingUtility.getStateId(lastState)
    allowedLast = self.all_allowed_actions[lastStateId]
    qLast = self.Q_value_function[lastStateId]
    lastActionIdx = allowedLast.index(tuple(lastAction))

    if reward == self.Bad_Action_Penalty:
        # Drop the penalised action and its value, then retry from the
        # same state.
        allowedLast.pop(lastActionIdx)
        qLast.pop(lastActionIdx)
        newAction = self.egreedy(lastState)
        # Normalise to a list here as well, matching the regular path below.
        if isinstance(newAction, tuple):
            newAction = list(newAction)
        returnAction = Action()
        returnAction.intArray = newAction
        self.lastAction = copy.deepcopy(returnAction)
        return returnAction

    newState = observation.intArray
    newAction = self.egreedy(newState)  # for a random player, egreedy is the random policy
    if isinstance(newAction, tuple):
        newAction = list(newAction)

    # Value of the action actually selected in the new state.
    newStateId = SamplingUtility.getStateId(newState)
    newActionIdx = self.all_allowed_actions[newStateId].index(tuple(newAction))
    Q_sprime_aprime = self.Q_value_function[newStateId][newActionIdx]

    Q_sa = qLast[lastActionIdx]
    new_Q_sa = Q_sa + self.stepsize * (
        reward + self.discount * Q_sprime_aprime - Q_sa)
    if not self.policyFrozen:
        # lastStateId / lastActionIdx are still valid: nothing was popped
        # on this path, so there is no need to look them up again.
        qLast[lastActionIdx] = new_Q_sa

    returnAction = Action()
    returnAction.intArray = newAction
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_step(self, reward, observation):
    """Feed the new screen to ``self.image`` and reply with a random action.

    The screen is ``observation.intArray[128:]`` reshaped to 210 rows. The
    action index is drawn with ``randrange(self.numActions)``; ``reward``
    is not used.

    Returns:
        An ``Action`` whose ``intArray`` holds the drawn index.
    """
    frame = np.reshape(observation.intArray[128:], (210, -1))
    self.image.new_image(frame)

    return_action = Action()
    return_action.intArray = [randrange(self.numActions)]

    # Snapshots of what was just sent and seen
    self.lastAction = copy.deepcopy(return_action)
    self.lastObservation = copy.deepcopy(observation)
    return return_action
def agent_start(self, observation):
    """Set up ``self.image`` from the first screen and reply with a random action.

    The screen is ``observation.intArray[128:]`` reshaped to 210 rows. It
    is passed to ``detect_maze`` and the result to ``pacman_image``. The
    action index is drawn with ``randrange(self.numActions)``.

    Returns:
        An ``Action`` whose ``intArray`` holds the drawn index.
    """
    frame = np.reshape(observation.intArray[128:], (210, -1))
    self.image = pacman_image(detect_maze(frame))

    return_action = Action()
    return_action.intArray = [randrange(self.numActions)]

    # Snapshots of what was just sent and seen
    self.lastAction = copy.deepcopy(return_action)
    self.lastObservation = copy.deepcopy(observation)
    return return_action
def agent_start(self, observation):
    """Choose the first action for the full observation vector.

    ``self.egreedy`` receives the whole ``observation.intArray``. A tuple
    result is converted to a list before being placed in the action's
    ``intArray``. Deep copies of action and observation are kept on the
    agent.
    """
    chosen = self.egreedy(observation.intArray)

    returnAction = Action()
    returnAction.intArray = list(chosen) if type(chosen) is tuple else chosen

    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_start(self, observation):
    """Reset per-episode bookkeeping and return a random first action.

    The action index is drawn with
    ``np.random.randint(0, self.num_actions)``. The start time, batch
    counter, loss list and last image are re-initialised.

    Returns:
        An ``Action`` whose ``intArray`` holds the drawn index.
    """
    return_action = Action()
    return_action.intArray = [np.random.randint(0, self.num_actions)]

    # Fresh counters for the new episode
    self.start_time = time.time()
    self.batch_counter = 0
    # NOTE(review): last_action is reset to 0, not to the action returned
    # above -- confirm this is intended.
    self.last_action = 0
    self.losses = []

    self.last_img = self.resize_image(observation.intArray)
    return return_action
def agent_step(self, reward, observation):
    """Choose the next action and, outside of testing, possibly train.

    Three cases are distinguished:
      * testing: accumulate the raw reward, act with epsilon 0.05 on the
        test data set and optionally sleep for ``self.pause`` seconds;
      * training with more than ``replay_start_size`` stored samples:
        decay epsilon (bounded below by ``epsilon_min``), act, and train
        every ``update_frequency`` steps;
      * otherwise: act with epsilon 1.0 while initial data is gathered.

    The reward passed to ``_choose_action`` is always clipped to [-1, 1].

    Arguments:
        reward - Real valued reward.
        observation - An observation of type rlglue.types.Observation

    Returns:
        An action of type rlglue.types.Action
    """
    self.step_counter += 1
    return_action = Action()
    cur_img = self._resize_observation(observation.intArray)

    if self.testing:
        self.total_reward += reward
        int_action = self._choose_action(
            self.test_data_set, .05, cur_img, np.clip(reward, -1, 1))
        if self.pause > 0:
            time.sleep(self.pause)
    elif len(self.data_set) > self.replay_start_size:
        self.epsilon = max(self.epsilon_min,
                           self.epsilon - self.epsilon_rate)
        int_action = self._choose_action(
            self.data_set, self.epsilon, cur_img, np.clip(reward, -1, 1))
        if self.step_counter % self.update_frequency == 0:
            loss = self._do_training()
            self.batch_counter += 1
            self.loss_averages.append(loss)
    else:
        # Still gathering initial random data...
        int_action = self._choose_action(
            self.data_set, 1.0, cur_img, np.clip(reward, -1, 1))

    return_action.intArray = [int_action]
    self.last_action = copy.deepcopy(return_action)
    self.last_img = cur_img
    return return_action
def agent_start(self, observation):
    """Return the first action of an episode, chosen by ``self.get_action``.

    The state is ``observation.intArray[0]``. Deep copies of the action and
    observation are stored as ``self.lastAction`` / ``self.lastObservation``.
    """
    choice = self.get_action(observation.intArray[0])

    action = Action()
    action.intArray = [choice]

    self.lastAction = copy.deepcopy(action)
    self.lastObservation = copy.deepcopy(observation)
    return action
def create_action(self, act):
    """Wrap a numeric action vector in an ``Action``.

    ``act`` is laid out with the integer components first (the first
    ``int_action_dims()`` entries) and the double components after them.

    Args:
        act: A scalar or a numpy array with ``action_dims()`` elements. The
            value as passed in is also remembered in ``self.last_act``.

    Returns:
        An ``Action`` whose ``intArray`` / ``doubleArray`` are set only
        when the matching dimension count is positive. Each is a
        one-element list wrapping the converted numpy slice.

    Raises:
        AssertionError: If ``act.size`` differs from ``action_dims()``.
    """
    self.last_act = act
    if np.isscalar(act):
        act = np.array([act])
    assert (act.size == self.action_dims()), 'illegal action dimension'

    n_int = self.int_action_dims()
    return_action = Action()
    if n_int > 0:
        return_action.intArray = [act[:n_int].astype(int)]
    if self.double_action_dims() > 0:
        # The doubles start right after the integers. Slicing from
        # double_action_dims() (as before) picked the wrong tail whenever
        # the two counts differed, and an empty one for pure-double actions.
        return_action.doubleArray = [act[n_int:].astype(float)]
    # NOTE(review): both arrays are lists wrapping one ndarray, not flat
    # lists of numbers -- kept as is; confirm what the consumers expect.
    return return_action
def create_action(self, act):
    """Convert a scalar or numpy action vector into an ``Action``.

    The first ``int_action_dims()`` entries of ``act`` are the integer
    part; everything after them is the double part.

    Args:
        act: A scalar or a numpy array with ``action_dims()`` elements. It
            is stored unchanged in ``self.last_act`` before conversion.

    Returns:
        An ``Action``. ``intArray`` and ``doubleArray`` are assigned only
        for a positive dimension count, each as a one-element list holding
        the converted slice.

    Raises:
        AssertionError: If ``act.size`` differs from ``action_dims()``.
    """
    self.last_act = act
    if np.isscalar(act):
        act = np.array([act])
    assert (act.size == self.action_dims()), 'illegal action dimension'

    int_dims = self.int_action_dims()
    return_action = Action()
    if int_dims > 0:
        return_action.intArray = [act[:int_dims].astype(int)]
    if self.double_action_dims() > 0:
        # Offset by the number of integer entries. The previous offset,
        # double_action_dims(), was only right when both counts were equal.
        return_action.doubleArray = [act[int_dims:].astype(float)]
    # NOTE(review): the slices are wrapped in a list rather than flattened;
    # left unchanged -- confirm what the consumers expect.
    return return_action
def agent_start(self, observation):
    """Start an episode: ask ``self.get_action`` for the first move.

    Args:
        observation: First observation; ``intArray[0]`` is the state.

    Returns:
        An ``Action`` with the chosen index as its only ``intArray`` entry.
    """
    action = Action()
    action.intArray = [self.get_action(observation.intArray[0])]

    # Remember independent copies for the next step
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(action)
    return action