class test_empty_agent(Agent):
    whichEpisode = 0
    emptyAction = Action(0, 0, 0)
    nonEmptyAction = Action(7, 3, 1)

    def agent_init(self, taskSpec):
        self.whichEpisode = 0
        self.nonEmptyAction.intArray = (0, 1, 2, 3, 4, 5, 6)
        self.nonEmptyAction.doubleArray = (0.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0)
        self.nonEmptyAction.charArray = ('a',)  # one-element tuple: Action(7, 3, 1) declares a single char

    def agent_start(self, observation):
        self.whichEpisode = self.whichEpisode + 1

        if self.whichEpisode % 2 == 0:
            return self.emptyAction
        else:
            return self.nonEmptyAction

    def agent_step(self, reward, observation):
        if self.whichEpisode % 2 == 0:
            return self.emptyAction
        else:
            return self.nonEmptyAction

    def agent_end(self, reward):
        pass

    def agent_cleanup(self):
        pass

    def agent_message(self, inMessage):
        return ""
Example #2
    def agent_step(self, reward, observation):
        """
        This method is called each time step. 

        Arguments: 
           reward      - Real valued reward.
           observation - An observation of type rlglue.types.Observation

        Returns: 
           An action of type rlglue.types.Action
        
        """
        # Generate random action
        this_int_action = self.randGenerator.randint(0, self.num_actions - 1)
        return_action = Action()
        return_action.intArray = [this_int_action]

        if self.show_ale:
            self._show_ale_color()
            #self._show_ale_gray()

        if self.saving:
            if self.int_states:
                self.states.append(self.last_observation.intArray)
            else:
                self.states.append(self.last_observation.doubleArray)

            self.actions.append(self.last_action.intArray[0])
            self.rewards.append(reward)
            self.absorbs.append(False)

        self.last_action = copy.deepcopy(return_action)
        self.last_observation = copy.deepcopy(observation)

        return return_action
Example #3
    def agent_start(self, observation):
        """Start an episode for the RL agent.

        Args:
            observation: The first observation of the episode. Should be an RLGlue Observation object.

        Returns:
            The first action the RL agent chooses to take, represented as an RLGlue Action object.
        """
        log = logging.getLogger('pyrl.agents.sarsa_lambda.agent_start')
        theState = numpy.array(list(observation.doubleArray))
        thisIntAction = self.getAction(theState,
                                       self.getDiscState(observation.intArray))

        returnAction = Action()
        returnAction.intArray = [thisIntAction]

        # Clear traces
        self.traces.fill(0.0)

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        log.debug("Action: %d", thisIntAction)
        log.debug("Start State: %s", theState)
        log.debug("Traces: %s", self.traces)
        return returnAction
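
The call to self.getDiscState above flattens the discrete part of the observation into a single index. A minimal sketch of what such a helper could look like, assuming self.discStates holds one (min, max) row per discrete observation variable; the actual pyrl implementation may differ:

    def getDiscState(self, intArray):
        # Hypothetical sketch: map a vector of discrete observation variables
        # onto a single flat index, given per-variable (min, max) ranges in
        # self.discStates (a numpy array of shape [num_vars, 2]).
        if len(intArray) == 0:
            return 0
        sizes = self.discStates[:, 1] - self.discStates[:, 0] + 1
        offsets = numpy.array(intArray) - self.discStates[:, 0]
        index = 0
        for size, offset in zip(sizes, offsets):
            index = index * size + offset
        return int(index)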
Example #4
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        self.step_counter = 0
        self.batch_counter = 0

        # We report the mean loss for every epoch.
        self.loss_averages = []

        self.start_time = time.time()
        this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        return_action = Action()
        return_action.intArray = [this_int_action]

        self.last_action = copy.deepcopy(return_action)

        self.last_img = np.array(self._resize_observation(observation.intArray))
        self.last_img = self.last_img.reshape(CROPPED_WIDTH, CROPPED_HEIGHT).T

        return return_action
Example #5
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        self.step_counter = 0
        self.batch_counter = 0

        # We report the mean loss for every epoch.
        self.loss_averages = []

        self.start_time = time.time()
        # this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        observation_matrix = np.asmatrix(observation.doubleArray,
                                         dtype='float32')
        actions = self.action_network.fprop(observation_matrix)
        return_action = Action()
        return_action.doubleArray = actions

        self.last_action = copy.deepcopy(actions)

        self.last_observation = observation.doubleArray

        return return_action
Example #6
    def agent_init(self, taskSpecString):
        TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpecString)
        if TaskSpec.valid:
            assert len(TaskSpec.getIntObservations()
                       ) == 1, "expecting 1-dimensional discrete observations"
            assert len(TaskSpec.getDoubleObservations()
                       ) == 0, "expecting no continuous observations"
            assert not TaskSpec.isSpecial(
                TaskSpec.getIntObservations()[0][0]
            ), " expecting min observation to be a number not a special value"
            assert not TaskSpec.isSpecial(
                TaskSpec.getIntObservations()[0][1]
            ), " expecting max observation to be a number not a special value"
            self.numStates = TaskSpec.getIntObservations()[0][1] + 1

            assert len(TaskSpec.getIntActions()
                       ) == 1, "expecting 1-dimensional discrete actions"
            assert len(TaskSpec.getDoubleActions()
                       ) == 0, "expecting no continuous actions"
            assert not TaskSpec.isSpecial(
                TaskSpec.getIntActions()[0][0]
            ), " expecting min action to be a number not a special value"
            assert not TaskSpec.isSpecial(
                TaskSpec.getIntActions()[0][1]
            ), " expecting max action to be a number not a special value"
            self.numActions = TaskSpec.getIntActions()[0][1] + 1

            self.value_function = numpy.zeros(
                [self.numStates, self.numActions])
        else:
            print "Task Spec could not be parsed: " + taskSpecString

        self.lastAction = Action()
        self.lastObservation = Observation()
Example #7
    def agent_init(self, taskSpecString):
        print "Agent Up"
        # print taskSpecString
        TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpecString)
        if TaskSpec.valid:
            print len(
                TaskSpec.getDoubleActions()), ": ", TaskSpec.getDoubleActions(
                ), '\n', len(TaskSpec.getDoubleObservations()
                             ), ": ", TaskSpec.getDoubleObservations()
            assert len(TaskSpec.getIntObservations()
                       ) == 0, "expecting no discrete observations"
            assert len(TaskSpec.getDoubleObservations(
            )) == 12, "expecting 12-dimensional continuous observations"

            assert len(
                TaskSpec.getIntActions()) == 0, "expecting no discrete actions"
            assert len(TaskSpec.getDoubleActions()
                       ) == 4, "expecting 4-dimensional continuous actions"

            self.obs_specs = TaskSpec.getDoubleObservations()
            self.actions_specs = TaskSpec.getDoubleActions()
            # print "Observations: ",self.obs_specs
            # print "actions_specs:", self.actions_specs

        else:
            print "Task Spec could not be parsed: " + taskSpecString

        self.lastAction = Action()
        self.lastObservation = Observation()
Example #8
    def agent_step(self, reward, observation):
        self.step_counter += 1
        self.total_reward += reward
        cur_img = self.resize_image(observation.intArray)

        if self.is_testing:
            int_action = self.choose_action(self.test_table,
                                            cur_img,
                                            np.clip(reward, -1, 1),
                                            testing_ep=0.05)
        else:
            if self.step_counter % self.reset_after == 0:
                self.network.reset_q_hat()

            int_action = self.choose_action(self.train_table,
                                            cur_img,
                                            np.clip(reward, -1, 1),
                                            testing_ep=None)
            if self.train_table.num_entries > max(self.learn_start,
                                                  self.batch_size):
                states, actions, rewards, next_states, terminals = self.train_table.get_minibatch(
                    self.batch_size)
                loss, qvals = self.network.train(states, actions, rewards,
                                                 next_states, terminals)
                self.losses.append(loss)
                self.qvals.append(np.mean(qvals))
                self.batch_counter += 1

        return_action = Action()
        return_action.intArray = [int_action]

        self.last_action = int_action
        self.last_img = cur_img

        return return_action
Example #9
    def agent_start(self, observation):
        if self.debug_flag: print('agent start')

        # Increment the step counter
        self.step_counter += 1

        # Shouldn't the state be cleared at the start of an episode?
        #self.state = np.zeros((1, self.n_frames, self.bdim)).astype(np.float32)
        self.state = np.zeros(
            (1, 2, self.n_rows, self.n_cols)).astype(np.float32)

        # kmori: Update the state using our own observation.
        # Partly follows the sample code; the rest was built a different way.
        self.update_state(observation)
        self.update_targetQ()

        if self.debug_flag: print('Deciding which move to play.')

        # Decide which move to play.
        int_action = self.select_int_action()
        action = Action()
        action.intArray = [int_action]
        if self.debug_flag: print('Updating eps.')

        # Update eps, the probability of playing a random O move.
        self.update_eps()

        # Save state (the board position) and action (where O was placed)
        self.last_state2 = copy.deepcopy(self.last_state)  # state two moves ago
        self.last_action2 = copy.deepcopy(self.last_action)  # action two moves ago
        self.last_state = copy.deepcopy(self.state)
        self.last_action = copy.deepcopy(int_action)

        return action
Example #10
    def agent_step(self, reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """

        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]
        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)
        newIntAction = self.getAction(newState, newDiscState)

        phi_t = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
        phi_tp = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
        phi_t[lastDiscState, :] = self.basis.computeFeatures(lastState)
        phi_tp[newDiscState, :] = self.basis.computeFeatures(newState)

        self.step_count += 1
        self.update(
            phi_t, phi_tp, reward,
            self.getCompatibleFeatures(phi_t, lastAction, reward, phi_tp,
                                       newIntAction))

        returnAction = Action()
        returnAction.intArray = [newIntAction]
        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        return returnAction
Example #11
	def agent_step(self,Reward,Obs):

		new_state = Obs.intArray[0]
		last_state = self.lastObs.intArray[0]
		last_action = self.lastaction.intArray[0]



		Q_sa = self.qfunction[last_state][last_action]
		Q_saprime = self.maxim(new_state)

		Q_new = Q_sa + self.learningrate*( Reward + self.gamma*Q_saprime - Q_sa)
		
		#if not self.pause:
		self.qfunction[last_state][last_action] = Q_new

		#To be taken
		new_action = self.epsilon_greedy(new_state)

		returnaction = Action()
		returnaction.intArray = [new_action]

		self.lastaction = copy.deepcopy(returnaction)
		self.lastObs = copy.deepcopy(Obs)

		return returnaction
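
This step relies on two helpers that are not shown here, self.maxim and self.epsilon_greedy. A minimal sketch of both for this tabular agent, assuming self.epsilon is set elsewhere and that the random module is imported (both assumptions):

    def maxim(self, state):
        # Hypothetical helper: largest Q-value available in `state`.
        return max(self.qfunction[state])

    def epsilon_greedy(self, state):
        # Hypothetical helper: random action with probability epsilon,
        # otherwise the greedy action for `state`.
        if random.random() < self.epsilon:
            return random.randint(0, self.numActions - 1)
        q_values = list(self.qfunction[state])
        return q_values.index(max(q_values))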
Example #12
	def agent_init(self,taskSpecString):

		self.numActions = 4
		self.numStates = 144
		self.qfunction = [self.numActions*[0.0] for i in range(self.numStates)]
		self.lastAction=Action()
		self.lastObs=Observation()
Example #13
	def agent_step(self,Reward,Obs):

		new_state = Obs.intArray[0]
		last_state = self.lastObs.intArray[0]
		last_action = self.lastaction.intArray[0]

		new_action = self.epsilon_greedy(new_state)

		

		Q_sa = self.qfunction[last_state][last_action]
		Q_saprime = self.qfunction[new_state][new_action]

		delta = Reward + self.gamma*Q_saprime - Q_sa

		self.efunction[last_state][last_action] = self.efunction[last_state][last_action] + 1

		self.qfunction = np.array(self.qfunction)
		self.efunction = np.array(self.efunction)

		self.qfunction = self.qfunction + self.learningrate*delta*self.efunction

		self.efunction = self.gamma*self.lamda*self.efunction
		

		returnaction = Action()
		returnaction.intArray = [new_action]

		self.lastaction = copy.deepcopy(returnaction)
		self.lastObs = copy.deepcopy(Obs)

		return returnaction
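
This SARSA(λ) step assumes an eligibility table self.efunction that is normally zeroed at the start of every episode. A minimal agent_start sketch under that assumption (illustrative only, not part of the original agent):

    def agent_start(self, Obs):
        # Illustrative sketch: clear the eligibility traces at the start of an
        # episode and pick the first action epsilon-greedily.
        self.efunction = np.zeros((self.numStates, self.numActions))
        first_state = Obs.intArray[0]
        first_action = self.epsilon_greedy(first_state)

        returnaction = Action()
        returnaction.intArray = [first_action]

        self.lastaction = copy.deepcopy(returnaction)
        self.lastObs = copy.deepcopy(Obs)
        return returnaction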
Example #14
    def agent_init(self, taskSpec):
        
        # taskspec check
        TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpec)
        if TaskSpec.valid:
            assert len(TaskSpec.getDoubleObservations())>0, "expecting at least one continuous observation"
            self.state_range = np.asarray(TaskSpec.getDoubleObservations())
            
            # Check action form, and then set number of actions
            assert len(TaskSpec.getIntActions())==0, "expecting no discrete actions"
            assert len(TaskSpec.getDoubleActions())==2, "expecting 2-dimensional continuous actions"

        else:
            print "Task Spec could not be parsed"
            
        self.lbounds=[]
        self.ubounds=[]
        
        for r in self.state_range:
            self.lbounds.append(r[0])
            self.ubounds.append(r[1])
            
        self.lbounds = np.array(self.lbounds)
        self.ubounds = np.array(self.ubounds)
        
        # Some initializations for rlglue
        self.lastAction = Action()

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate

        # Pick a DQN from DQN_class
        self.DQN = DQN_class()  
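
The lbounds/ubounds arrays built above are typically used to normalize observations before they are fed to the DQN. A minimal, hypothetical helper showing that use (the name scale_observation is an assumption):

    def scale_observation(self, obs):
        # Hypothetical helper: map each continuous observation component into
        # [0, 1] using the (min, max) ranges parsed from the task spec.
        obs = np.asarray(obs, dtype=np.float64)
        return (obs - self.lbounds) / (self.ubounds - self.lbounds)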
Example #15
    def agent_init(self, taskSpecString):

        self.numActions = 4
        self.numStates = 144
        self.qfunction = [
            self.numActions * [0.0] for i in range(self.numStates)
        ]
        #x coordinate
        self.phi1 = np.array([i for i in range(12)])
        #y coordinate
        self.phi2 = np.array([i for i in range(12)])

        #self.theta = np.array([ for i in range(4)])
        self.thetax = np.array([[
            random.random(),
            random.random(),
            random.random(),
            random.random()
        ] for i in range(12)])
        self.thetay = np.array([[
            random.random(),
            random.random(),
            random.random(),
            random.random()
        ] for i in range(12)])
        self.thetaxy = np.array([[[
            random.random(),
            random.random(),
            random.random(),
            random.random()
        ] for i in range(12)] for j in range(12)])

        self.lastAction = Action()
        self.lastObs = Observation()
Example #16
    def agent_step(self, reward, observation):

        # Preproces
        tmp = np.bitwise_and(
            np.asarray(observation.intArray[128:]).reshape([210, 160]),
            0b0001111)  # Get Intensity from the observation
        obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 -
                                                   8, :]  # Scaling
        obs_processed = np.maximum(
            obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        self.state = np.asanyarray(
            [self.state[1], self.state[2], self.state[3], obs_processed],
            dtype=np.uint8)
        state_ = cuda.to_gpu(
            np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

        # Exploration decays along the time sequence
        if self.policyFrozen is False:  # Learning ON/OFF
            if self.DQN.initial_exploration < self.time:
                self.epsilon -= 1.0 / 10**6
                if self.epsilon < 0.1:
                    self.epsilon = 0.1
                eps = self.epsilon
            else:  # Initial Exploation Phase
                print "Initial Exploration : %d/%d steps" % (
                    self.time, self.DQN.initial_exploration)
                eps = 1.0
        else:  # Evaluation
            print "Policy is Frozen"
            eps = 0.05

        # Generate an Action from e-greedy action selection
        returnAction = Action()
        action, Q_now = self.DQN.e_greedy(state_, eps)
        returnAction.intArray = [action]

        # Learning Phase
        if self.policyFrozen is False:  # Learning ON/OFF
            self.DQN.stockExperience(self.time, self.last_state,
                                     self.lastAction.intArray[0], reward,
                                     self.state, False)
            self.DQN.experienceReplay(self.time)

        # Simple text based visualization
        print ' Time Step %d /   ACTION  %d  /   REWARD %.1f   / EPSILON  %.6f  /   Q_max  %3f' % (
            self.time, self.DQN.action_to_index(action), np.sign(reward), eps,
            np.max(Q_now.get()))

        # Updates for next step
        self.last_observation = obs_array

        # Update for next step
        if self.policyFrozen is False:
            self.lastAction = copy.deepcopy(returnAction)
            self.last_state = self.state.copy()
            self.time += 1

        return returnAction
Example #17
    def agent_step(self, reward, obs):
        """ This function is called by the environment while the episode lasts.

        If learning is not frozen, the option-value function Q(s, o) is updated
        using intra-option learning.

        :param reward: The reward obtained as a result of the last transition.
        :param obs: An observation from the environment
        :type obs: :class:`rlglue.types.Observation`
        :returns: The primitive action to execute in the environment according to the
        behavior policy.
        :rtype: a primitive action, in the form of a :class:`rlglue.types.Action`

        """
        observation = np.array(obs.doubleArray)
        current_features = self.basis.computeFeatures(observation)

        if not self.finished_learning:
            self.intraoption_update(reward, current_features, observation)

        self.last_observation = observation
        self.last_features = current_features
        self.last_action = self.mu(observation,
                                   current_features).pi(observation)

        action = Action()
        action.intArray = [self.last_action]
        return action
Example #18
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        # this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        observation_matrix = np.asmatrix(observation.doubleArray,
                                         dtype='float32')
        actions = self.action_network.predict(observation_matrix)

        return_action = Action()
        return_action.doubleArray = actions

        self.last_action = copy.deepcopy(actions)
        self.last_state = np.asmatrix(observation.doubleArray, dtype=floatX)

        return return_action
Example #19
    def agent_start(self, observation):

        # Get intensity from current observation array
        tmp = np.bitwise_and(
            np.asarray(observation.intArray[128:]).reshape([210, 160]),
            0b0001111)  # Get Intensity from the observation
        obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 -
                                                   8, :]  # Scaling

        # Initialize State
        self.state = np.zeros((4, 84, 84), dtype=np.uint8)
        self.state[0] = obs_array
        state_ = cuda.to_gpu(
            np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

        # Generate an Action e-greedy
        returnAction = Action()
        action, Q_now = self.DQN.e_greedy(state_, self.epsilon)
        returnAction.intArray = [action]

        # Update for next step
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return returnAction
Example #20
    def agent_step(self, reward, observation):
        action = None

        self.window.erase()
        self.window.addstr('STATE: %s\n' % (observation.intArray))
        self.window.addstr('REWARD: %s\n' % (reward))
        self.window.addstr('HIT UP, DOWN, LEFT or RIGHT to move...\n')
        self.window.refresh()

        try:
            c = self.window.getch()
            if c == curses.KEY_UP:
                action = 'N'
            elif c == curses.KEY_DOWN:
                action = 'S'
            elif c == curses.KEY_LEFT:
                action = 'W'
            elif c == curses.KEY_RIGHT:
                action = 'E'

            self.window.refresh()

        except KeyboardInterrupt:
            RLGlue.RL_cleanup()

        a = Action()

        if action:
            a.charArray = [action]

        return a
Example #21
    def do_step(self, state, reward=None):
        """
        Runs the actual learning algorithm.
        In a separate function so it can be called both on start and on step.
        """
        #self.debug('do_step(', state, ',', reward, ')')

        #if not state in self.Q:
        # State not yet visited, initialize randomly
        #    self.Q[state] = self.random_actions()

        # Run the Q update if this isn't the first step
        action = None

        if reward is not None:
            action = self.update_Q(self.last_state, self.last_action, reward,
                                   state)

        # Action object
        a_obj = Action()

        if action is None:
            # Query the policy to find the best action
            action = self.policy(state)

        a_obj.charArray = list(action)

        # Save the current state-action pair for the next step's Q update.
        self.last_state = state
        self.last_action = action

        # And we're done
        return a_obj
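
do_step defers to self.policy(state), which is not shown. A minimal epsilon-greedy sketch consistent with the character actions and the self.Q / self.random_actions() names mentioned in the commented-out code above (random and self.epsilon are assumptions):

    def policy(self, state):
        # Hypothetical sketch: epsilon-greedy over a Q table kept as a dict of
        # state -> {action: value}, with single-character actions.
        if state not in self.Q:
            self.Q[state] = self.random_actions()  # as in the commented-out init above
        if random.random() < self.epsilon:
            return random.choice(list(self.Q[state].keys()))
        return max(self.Q[state], key=self.Q[state].get)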
Example #22
    def _select_action(self, phi=None):
        """
        Utility function for selecting an action.

        phi: ndarray
            Memory from which action should be selected.
        """
        if self.action_count % self.k == 0:
            if (np.random.rand() > self.epsilon) and phi is not None:
                # Get action from Q-function
                phi = np.array(phi)[:, :, :, None]
                action_int = self.action_func(phi)[0]
            else:
                # Get random action
                action_int = np.random.randint(0, len(self.action_map))
            self.action_log[action_int] += 1

            self.cmd = [0]*len(self.action_map)
            self.cmd[action_int] = 1

            # Map cmd to ALE action
            # 18 is the number of commands ALE accepts
            action = Action()
            action.intArray = [self.action_map[action_int]]
            self.action = action
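
Since _select_action stores its choice in self.action instead of returning it, the surrounding step method presumably looks roughly like the following sketch (purely illustrative; update_memory and self.memory-style frame handling are assumed names):

    def agent_step(self, reward, observation):
        # Purely illustrative: _select_action caches its choice in self.action,
        # so the step method only refreshes the frame memory, bumps the
        # counter, and returns the cached RLGlue Action.
        phi = self.update_memory(observation)  # assumed helper building the frame stack
        self._select_action(phi)
        self.action_count += 1
        return self.action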
Example #23
	def agent_step(self, reward, observation):
		observed_screen = self.preprocess_screen(observation)
		self.state = np.roll(self.state, 1, axis=0)
		self.state[0] = observed_screen

		########################### DEBUG ###############################
		# if self.total_time_step % 500 == 0 and self.total_time_step != 0:
		# 	self.dump_state()

		self.learn(reward)
		
		return_action = Action()
		q_max = None
		q_min = None
		if self.time_step % config.rl_action_repeat == 0:
			action, q_max, q_min = self.dqn.eps_greedy(self.reshape_state_to_conv_input(self.state), self.exploration_rate)
		else:
			action = self.last_action.intArray[0]
		return_action.intArray = [action]

		self.dump_result(reward, q_max, q_min)

		if self.policy_frozen is False:
			self.last_action = copy.deepcopy(return_action)
			self.last_state = self.state
			self.time_step += 1
			self.total_time_step += 1

		return return_action
Example #24
    def agent_step(self, reward, observation):
        newState = observation.intArray[0]
        lastState = self.lastObservation.intArray[0]
        lastAction = self.lastAction.intArray[0]

        Q_sa = self.value_function[lastState][lastAction]
        Q_sprime_aprime = -500000
        for a in range(self.numberOfActions):
            if self.value_function[newState][a] > Q_sprime_aprime:
                Q_sprime_aprime = self.value_function[newState][a]
        #updating Q function
        new_Q_sa = Q_sa + self.sarsa_stepsize * (
            reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)

        newIntAction = self.egreedy(newState)
        if not self.policyFrozen:
            self.value_function[lastState][lastAction] = new_Q_sa

        returnAction = Action()
        returnAction.intArray = [newIntAction]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)

        return returnAction
Example #25
    def agent_init(self, taskSpec):
        """Initialize the RL agent.

        Args:
            taskSpec: The RLGlue task specification string.
        """
        # (Re)initialize parameters (in case they have been changed during a trial)
        self.init_parameters()
        # Parse the task specification and set up the weights and such
        TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpec)
        if self.agent_supported(TaskSpec):
            self.numStates = len(TaskSpec.getDoubleObservations())
            self.discStates = numpy.array(TaskSpec.getIntObservations())
            self.numDiscStates = int(
                reduce(lambda a, b: a * (b[1] - b[0] + 1), self.discStates,
                       1.0))
            self.numActions = TaskSpec.getIntActions()[0][1] + 1

            self.model.model_init(self.numDiscStates, TaskSpec.getDoubleObservations(), \
                              self.numActions, TaskSpec.getRewardRange()[0])
            self.planner.planner_init(self.numDiscStates, TaskSpec.getDoubleObservations(), \
                              self.numActions, TaskSpec.getRewardRange()[0])

        else:
            print "Task Spec could not be parsed: " + taskSpecString

        self.lastAction = Action()
        self.lastObservation = Observation()
Example #26
    def agent_step(self, reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """
        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]

        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)

        phi_t = numpy.zeros((self.numStates + 1, ))
        phi_tp = numpy.zeros((self.numStates + 1, ))
        phi_t[0] = lastDiscState
        phi_t[1:] = lastState
        phi_tp[0] = newDiscState
        phi_tp[1:] = newState

        #print ','.join(map(str, lastState))

        self.planner.updateExperience(phi_t, lastAction, phi_tp, reward)

        newIntAction = self.getAction(newState, newDiscState)
        returnAction = Action()
        returnAction.intArray = [newIntAction]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        return returnAction
Example #27
    def agent_step(self, reward, observation):
        # Increment the step counter
        self.step_counter += 1

        self.update_state(observation)
        self.update_targetQ()

        # Decide which move to play.
        int_action = self.select_int_action()  # A return value of -1 means pass.
        action = Action()
        action.intArray = [int_action]
        self.reward = reward

        # Update eps
        self.update_eps()

        # Store the transition (state, action, reward, result)
        self.store_transition(terminal=False)

        if not self.frozen:
            # Run a training step
            if self.step_counter > self.learn_start:
                self.replay_experience()

        self.last_state = copy.deepcopy(self.state)
        self.last_action = copy.deepcopy(int_action)

        # Return the chosen O position
        return action
Example #28
    def agent_step(self, reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """

        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]

        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)

        # Update eligibility traces
        phi_t = numpy.zeros(self.traces.shape)
        phi_t[lastDiscState, :,
              lastAction] = self.basis.computeFeatures(lastState)

        self.update_traces(phi_t, None)
        self.update(phi_t, newState, newDiscState, reward)

        # QLearning can choose action after update
        newIntAction = self.getAction(newState, newDiscState)
        returnAction = Action()
        returnAction.intArray = [newIntAction]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        return returnAction
Example #29
    def agent_init(self, taskSpec):
        """Initialize the RL agent.

        Args:
            taskSpec: The RLGlue task specification string.
        """

        # (Re)initialize parameters (in case they have been changed during a trial)
        self.init_parameters()
        # Parse the task specification and set up the weights and such
        TaskSpec = TaskSpecVRLGLUE3.TaskSpecParser(taskSpec)
        if not self.agent_supported(TaskSpec):
            print "Task Spec could not be parsed: " + taskSpecString
            sys.exit(1)

        self.numStates = len(TaskSpec.getDoubleObservations())
        self.discStates = numpy.array(TaskSpec.getIntObservations())
        self.numDiscStates = int(
            reduce(lambda a, b: a * (b[1] - b[0] + 1), self.discStates, 1.0))
        self.numActions = TaskSpec.getIntActions()[0][1] + 1
        if self.numStates == 0:
            # Only discrete states
            self.numStates = 1
            if self.fa_name != "trivial":
                print "Selected basis requires at least one continuous feature. Using trivial basis."
                self.fa_name = "trivial"

        # Set up the function approximation
        if self.fa_name == 'fourier':
            self.basis = fourier.FourierBasis(self.numStates,
                                              TaskSpec.getDoubleObservations(),
                                              order=self.params.setdefault(
                                                  'fourier_order', 3))
        elif self.fa_name == 'rbf':
            num_functions = self.numStates if self.params.setdefault(
                'rbf_number', 0) == 0 else self.params['rbf_number']
            self.basis = rbf.RBFBasis(self.numStates,
                                      TaskSpec.getDoubleObservations(),
                                      num_functions=num_functions,
                                      beta=self.params.setdefault(
                                          'rbf_beta', 0.9))
        elif self.fa_name == 'tile':
            self.basis = tilecode.TileCodingBasis(
                self.numStates,
                TaskSpec.getDoubleObservations(),
                num_tiles=self.params.setdefault('tile_number', 100),
                num_weights=self.params.setdefault('tile_weights', 2048))
        else:
            self.basis = trivial.TrivialBasis(self.numStates,
                                              TaskSpec.getDoubleObservations())

        self.weights = numpy.zeros(
            (self.numDiscStates, self.basis.getNumBasisFunctions(),
             self.numActions))
        self.traces = numpy.zeros(self.weights.shape)
        self.init_stepsize(self.weights.shape, self.params)

        self.lastAction = Action()
        self.lastObservation = Observation()
Example #30
    def agent_step(self, reward, observation):
        self.stepCount = self.stepCount + 1
        action = Action()
        action.intArray = observation.intArray
        action.doubleArray = observation.doubleArray
        action.charArray = observation.charArray

        return action