	def agent_step(self, reward, observation):
		# Sigmoid-shaped exploration: epsilon is derived from the reward,
		# and exploration is switched off after 10000 episodes.
		if self.Episode_Counter > 10000:
			self.Epsilon = 0
		else:
			self.Epsilon = util.randomSigmoidEpsilon(reward, 0.02, 50)

		self.Steps += 1

		print reward

		thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastObservation = copy.deepcopy(observation)
		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
    def agent_step(self, reward, observation):
        """
        This method is called each time step. 

        Arguments: 
           reward      - Real valued reward.
           observation - An observation of type rlglue.types.Observation

        Returns: 
           An action of type rlglue.types.Action
        
        """
        # Generate random action
        this_int_action=self.randGenerator.randint(0,self.num_actions-1)
        return_action=Action()
        return_action.intArray=[this_int_action]
        
        if self.show_ale:
            self._show_ale_color()
            #self._show_ale_gray()

        if self.saving:
            if self.int_states:
                self.states.append(self.last_observation.intArray)
            else:
                self.states.append(self.last_observation.doubleArray)

            self.actions.append(self.last_action.intArray[0])
            self.rewards.append(reward)
            self.absorbs.append(False)

        self.last_action=copy.deepcopy(return_action)
        self.last_observation=copy.deepcopy(observation)

        return return_action
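As the docstring notes, agent_step is called once per time step by the RL-Glue experiment program rather than invoked directly. A minimal, hedged driver sketch using the standard rlglue Python codec (the episode count and step limit here are arbitrary):

from rlglue import RLGlue

RLGlue.RL_init()                 # handshake with the rl_glue core
for episode in xrange(10):       # arbitrary number of episodes
    RLGlue.RL_episode(1000)      # drives agent_start / agent_step / agent_end, up to 1000 steps
    print RLGlue.RL_return()     # cumulative reward of the episode
RLGlue.RL_cleanup()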
Example 3
    def agent_step(self, reward, obs):
        """ This function is called by the environment while the episode lasts.

        If learning is not frozen, the option-value function Q(s, o) is updated
        using intra-option learning.

        :param reward: The reward obtained as a result of the last transition.
        :param obs: An observation from the environment
        :type obs: :class:`rlglue.types.Observation`
        :returns: The primitive action to execute in the environment according to
            the behavior policy.
        :rtype: :class:`rlglue.types.Action`

        """
        observation = np.array(obs.doubleArray)
        current_features = self.basis.computeFeatures(observation)

        if not self.finished_learning:
            self.intraoption_update(reward, current_features, observation)

        self.last_observation = observation
        self.last_features = current_features
        self.last_action = self.mu(observation,
                                   current_features).pi(observation)

        action = Action()
        action.intArray = [self.last_action]
        return action
    def agent_step(self, reward, observation):
        lastState = self.lastObservation.intArray
        lastAction = self.lastAction.intArray
        lastStateId = SamplingUtility.getStateId(lastState)
        lastActionIdx = self.all_allowed_actions[lastStateId].index(tuple(lastAction))
        if reward == self.Bad_Action_Penalty:
            self.all_allowed_actions[lastStateId].pop(lastActionIdx)
            self.Q_value_function[lastStateId].pop(lastActionIdx)
            newAction = self.egreedy(self.lastObservation.intArray)
            returnAction = Action()
            returnAction.intArray = newAction
            self.lastAction = copy.deepcopy(returnAction)
            return returnAction

        newState = observation.intArray
        newAction = self.egreedy(newState)
        if type(newAction) is tuple:
            newAction = list(newAction)
        Q_sa = self.Q_value_function[lastStateId][lastActionIdx]
        Q_sprime_aprime = self.Q_value_function[SamplingUtility.getStateId(newState)][
                          self.all_allowed_actions[SamplingUtility.getStateId(newState)].index(tuple(newAction))]
        new_Q_sa = Q_sa + self.sarsa_stepsize * (reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)
        if not self.policyFrozen:
            self.Q_value_function[SamplingUtility.getStateId(lastState)][
            self.all_allowed_actions[SamplingUtility.getStateId(lastState)].index(tuple(lastAction))] = new_Q_sa
        returnAction = Action()
        returnAction.intArray = newAction
        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        return returnAction
Example 5
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        self.step_counter = 0
        self.batch_counter = 0

        # We report the mean loss for every epoch.
        self.loss_averages = []

        self.start_time = time.time()
        #this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        observation_matrix = np.asmatrix(observation.doubleArray, dtype='float32')
        actions = self.action_network.fprop(observation_matrix)
        return_action = Action()
        return_action.doubleArray = [actions]

        self.last_action = copy.deepcopy(return_action)

        self.last_observation = observation.doubleArray

        return return_action
Example 6
    def agent_step(self, reward, observation):

        # Preproces
        tmp = np.bitwise_and(
            np.asarray(observation.intArray[128:]).reshape([210, 160]),
            0b0001111)  # Get Intensity from the observation
        obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 -
                                                   8, :]  # Scaling
        obs_processed = np.maximum(
            obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        self.state = np.asanyarray(
            [self.state[1], self.state[2], self.state[3], obs_processed],
            dtype=np.uint8)
        state_ = cuda.to_gpu(
            np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

        # Exploration decays along the time sequence
        if self.policyFrozen is False:  # Learning ON/OFF
            if self.DQN.initial_exploration < self.time:
                self.epsilon -= 1.0 / 10**6
                if self.epsilon < 0.1:
                    self.epsilon = 0.1
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print "Initial Exploration : %d/%d steps" % (
                    self.time, self.DQN.initial_exploration)
                eps = 1.0
        else:  # Evaluation
            print "Policy is Frozen"
            eps = 0.05

        # Generate an Action from e-greedy action selection
        returnAction = Action()
        action, Q_now = self.DQN.e_greedy(state_, eps)
        returnAction.intArray = [action]

        # Learning Phase
        if self.policyFrozen is False:  # Learning ON/OFF
            self.DQN.stockExperience(self.time, self.last_state,
                                     self.lastAction.intArray[0], reward,
                                     self.state, False)
            self.DQN.experienceReplay(self.time)

        # Simple text based visualization
        print ' Time Step %d /   ACTION  %d  /   REWARD %.1f   / EPSILON  %.6f  /   Q_max  %3f' % (
            self.time, self.DQN.action_to_index(action), np.sign(reward), eps,
            np.max(Q_now.get()))

        # Updates for next step
        self.last_observation = obs_array

        # Update for next step
        if self.policyFrozen is False:
            self.lastAction = copy.deepcopy(returnAction)
            self.last_state = self.state.copy()
            self.time += 1

        return returnAction
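The exploration schedule above is linear: once the initial exploration phase ends, epsilon is decremented by 1e-6 per learning step down to a floor of 0.1 (about 900,000 decrements if it starts at 1.0). A closed-form sketch of that schedule, with the phase length and starting value as assumptions rather than values taken from this agent:

def annealed_epsilon(step, initial_exploration=5 * 10**4, start=1.0,
                     floor=0.1, decay=1.0 / 10**6):
    # Fully random while the replay memory is being filled (assumed phase length).
    if step <= initial_exploration:
        return start
    # Linear decay afterwards, clipped at the floor.
    return max(floor, start - decay * (step - initial_exploration))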
	def agent_step(self, reward, observation):
		self.Rewards += reward

		# Exploration is switched off once the training runs are exhausted.
		if self.Episode_Counter > Training_Runs:
			self.Epsilon = 0

		self.Steps += 1

		thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastObservation = copy.deepcopy(observation)
		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
Example 8
    def agent_step(self, reward, observation):
        self.step_counter += 1
        self.total_reward += reward
        cur_img = self.resize_image(observation.intArray)

        if self.is_testing:
            int_action = self.choose_action(self.test_table,
                                            cur_img,
                                            np.clip(reward, -1, 1),
                                            testing_ep=0.05)
        else:
            if self.step_counter % self.reset_after == 0:
                self.network.reset_q_hat()

            int_action = self.choose_action(self.train_table,
                                            cur_img,
                                            np.clip(reward, -1, 1),
                                            testing_ep=None)
            if self.train_table.num_entries > max(self.learn_start,
                                                  self.batch_size):
                states, actions, rewards, next_states, terminals = self.train_table.get_minibatch(
                    self.batch_size)
                loss, qvals = self.network.train(states, actions, rewards,
                                                 next_states, terminals)
                self.losses.append(loss)
                self.qvals.append(np.mean(qvals))
                self.batch_counter += 1

        return_action = Action()
        return_action.intArray = [int_action]

        self.last_action = int_action
        self.last_img = cur_img

        return return_action
Example 9
    def do_step(self, state, reward=None):
        """
        Runs the actual learning algorithm.
        In a separate function so it can be called both on start and on step.
        """
        #self.debug('do_step(', state, ',', reward, ')')

        #if not state in self.Q:
            # State not yet visited, initialize randomly
        #    self.Q[state] = self.random_actions()

        # Run the Q update if this isn't the first step
        action = None

        if reward is not None:
            action = self.update_Q(self.last_state, self.last_action, reward, state)

        # Action object
        a_obj = Action()

        if action is None:
            # Query the policy to find the best action
            action = self.policy(state)

        a_obj.charArray = list(action)

        # Save the current state-action pair for the next step's Q update.
        self.last_state = state
        self.last_action = action

        # And we're done
        return a_obj
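The update_Q helper called above is not part of this listing. A hedged sketch of a tabular Q-learning version that would fit this do_step, where self.alpha, self.gamma and the dict-of-dicts layout of self.Q are assumptions rather than details from the original agent:

    def update_Q(self, last_state, last_action, reward, state):
        # Hypothetical helper: one-step Q-learning update, assuming
        # self.Q[s] maps each action to its current value estimate.
        for s in (last_state, state):
            if s not in self.Q:
                self.Q[s] = self.random_actions()
        target = reward + self.gamma * max(self.Q[state].values())
        td_error = target - self.Q[last_state][last_action]
        self.Q[last_state][last_action] += self.alpha * td_error
        # Returning None lets do_step fall back to self.policy() for action selection.
        return None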
Example 10
    def agent_start(self, observation):
        if self.debug_flag: print('agent start')

        # Increment the step counter
        self.step_counter += 1

        # Shouldn't the state be cleared at the start of an episode?
        #self.state = np.zeros((1, self.n_frames, self.bdim)).astype(np.float32)
        self.state = np.zeros(
            (1, 2, self.n_rows, self.n_cols)).astype(np.float32)

        # kmori: update the state from our own observation.
        # Partly follows the sample code; the rest was built differently.
        self.update_state(observation)
        self.update_targetQ()

        if self.debug_flag: print('Deciding the move to play.')

        # Decide the move to play.
        int_action = self.select_int_action()
        action = Action()
        action.intArray = [int_action]
        if self.debug_flag: print('Updating eps.')

        # Update eps, the probability of playing the mark at random.
        self.update_eps()

        # Save state (the board position) and action (where the mark was placed).
        self.last_state2 = copy.deepcopy(self.last_state)  # state two moves ago
        self.last_action2 = copy.deepcopy(self.last_action)  # action two moves ago
        self.last_state = copy.deepcopy(self.state)
        self.last_action = copy.deepcopy(int_action)

        return action
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        self.step_counter = 0
        self.batch_counter = 0

        # We report the mean loss for every epoch.
        self.loss_averages = []

        self.start_time = time.time()
        # this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        observation_matrix = np.asmatrix(observation.doubleArray,
                                         dtype='float32')
        actions = self.action_network.fprop(observation_matrix)
        return_action = Action()
        return_action.doubleArray = actions

        self.last_action = copy.deepcopy(actions)

        self.last_observation = observation.doubleArray

        return return_action
class test_empty_agent(Agent):
    whichEpisode = 0
    emptyAction = Action(0, 0, 0)
    nonEmptyAction = Action(7, 3, 1)

    def agent_init(self, taskSpec):
        self.whichEpisode = 0
        self.nonEmptyAction.intArray = (0, 1, 2, 3, 4, 5, 6)
        self.nonEmptyAction.doubleArray = (0.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0)
        self.nonEmptyAction.charArray = ('a')

    def agent_start(self, observation):
        self.whichEpisode = self.whichEpisode + 1

        if self.whichEpisode % 2 == 0:
            return self.emptyAction
        else:
            return self.nonEmptyAction

    def agent_step(self, reward, observation):
        if self.whichEpisode % 2 == 0:
            return self.emptyAction
        else:
            return self.nonEmptyAction

    def agent_end(self, reward):
        pass

    def agent_cleanup(self):
        pass

    def agent_message(self, inMessage):
        return ""
Example 13
    def agent_start(self, observation):
        # Generate a random action: one integer per action dimension, drawn within its bounds
        return_action = Action()
        return_action.intArray = []
        for i in xrange(0, self.action_size):
            return_action.intArray += [self.rng.randint(self.action_bounds[i][0], self.action_bounds[i][1])]
        return return_action
Example 14
    def agent_step(self, reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """

        new_state = numpy.array(list(observation.doubleArray))
        last_state = numpy.array(list(self.last_observation.doubleArray))
        last_action = self.last_action.intArray[0]

        new_disc_state = self.getDiscState(observation.intArray)
        last_disc_state = self.getDiscState(self.last_observation.intArray)

        # Update eligibility traces
        phi_t = numpy.zeros(self.traces.shape)
        phi_t[last_disc_state, :, last_action] = self.basis.computeFeatures(last_state)

        self.update_traces(phi_t, None)
        self.update(phi_t, new_state, new_disc_state, reward)

        # QLearning can choose action after update
        new_int_action = self.getAction(new_state, new_disc_state)
        return_action = Action()
        return_action.intArray = [new_int_action]

        self.last_action = copy.deepcopy(return_action)
        self.last_observation = copy.deepcopy(observation)
        return return_action
	def agent_step(self, reward, observation):
		# Epsilon was meant to vary inversely with the reward received;
		# here it is simply zeroed after 10000 episodes.
		if self.Episode_Counter > 10000:
			self.Epsilon = 0.0

		self.Steps += 1

		print reward

		thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastObservation = copy.deepcopy(observation)
		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
Example 16
    def agent_step(self, reward, obs):
        """ This function is called by the environment while the episode lasts.

        If learning is not frozen, the option-value function Q(s, o) is updated
        using intra-option learning.

        :param reward: The reward obtained as a result of the last transition.
        :param obs: An observation from the environment
        :type obs: :class:`rlglue.types.Observation`
        :returns: The primitive action to execute in the environment according to
            the behavior policy.
        :rtype: :class:`rlglue.types.Action`

        """
        observation = np.array(obs.doubleArray)
        current_features = self.basis.computeFeatures(observation)

        if not self.finished_learning:
            self.intraoption_update(reward, current_features, observation)

        self.last_observation = observation
        self.last_features = current_features
        self.last_action = self.mu(observation, current_features).pi(observation)

        action = Action()
        action.intArray = [self.last_action]
        return action
Example 17
	def agent_step(self,Reward,Obs):

		new_state = Obs.intArray[0]
		last_state = self.lastObs.intArray[0]
		last_action = self.lastaction.intArray[0]



		Q_sa = self.qfunction[last_state][last_action]
		Q_saprime = self.maxim(new_state)

		Q_new = Q_sa + self.learningrate*( Reward + self.gamma*Q_saprime - Q_sa)
		
		#if not self.pause:
		self.qfunction[last_state][last_action] = Q_new

		#To be taken
		new_action = self.epsilon_greedy(new_state)

		returnaction = Action()
		returnaction.intArray = [new_action]

		self.lastaction = copy.deepcopy(returnaction)
		self.lastObs = copy.deepcopy(Obs)

		return returnaction
Example 18
	def agent_step(self,Reward,Obs):

		new_state = Obs.intArray[0]
		last_state = self.lastObs.intArray[0]
		last_action = self.lastaction.intArray[0]

		new_action = self.epsilon_greedy(new_state)

		

		Q_sa = self.qfunction[last_state][last_action]
		Q_saprime = self.qfunction[new_state][new_action]

		delta = Reward + self.gamma*Q_saprime - Q_sa

		self.efunction[last_state][last_action] = self.efunction[last_state][last_action] + 1

		self.qfunction = np.array(self.qfunction)
		self.efunction = np.array(self.efunction)

		self.qfunction = self.qfunction + self.learningrate*delta*self.efunction

		self.efunction = self.gamma*self.lamda*self.efunction
		

		returnaction = Action()
		returnaction.intArray = [new_action]

		self.lastaction = copy.deepcopy(returnaction)
		self.lastObs = copy.deepcopy(Obs)

		return returnaction
	def agent_step(self, reward, observation):
		self.reward += reward
		self.step += 1
		self.total_reward += reward

		thisDoubleAction = self.agent_step_action(observation.doubleArray)
		if self.isRisk(observation.doubleArray, thisDoubleAction):
			self.times += 1
			thisDoubleAction = util.baselinePolicy(observation.doubleArray)
			# Re-train the network towards the best-known network's output on this state.
			from pybrain.supervised.trainers import BackpropTrainer
			from pybrain.datasets import SupervisedDataSet
			ds = SupervisedDataSet(12, 4)
			ds.addSample(observation.doubleArray, self.best.activate(observation.doubleArray))
			trainer = BackpropTrainer(self.network, ds)
			trainer.train()

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastObservation = copy.deepcopy(observation)
		self.lastAction = copy.deepcopy(returnAction)

		self.lastReward = reward
		return returnAction
Example 20
	def agent_step(self, reward, observation):
		# Record the state difference used by the model approximation.
		self.states_diff_list.append([a - b for (a, b) in zip(observation.doubleArray, self.lastObservation.doubleArray)])

		self.lastObservation = copy.deepcopy(observation)

		self.approximateValueFunction()
		print reward
		# Test how the reward approximation works
		self.approximateRewardFunction(reward, observation)

		thisDoubleAction = self.approximateAction()

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.action_list.append(thisDoubleAction)

		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
Example 21
    def agent_step(self, reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """

        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]
        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)
        newIntAction = self.getAction(newState, newDiscState)

        phi_t = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
        phi_tp = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
        phi_t[lastDiscState, :] = self.basis.computeFeatures(lastState)
        phi_tp[newDiscState, :] = self.basis.computeFeatures(newState)

        self.step_count += 1
        self.update(
            phi_t, phi_tp, reward,
            self.getCompatibleFeatures(phi_t, lastAction, reward, phi_tp,
                                       newIntAction))

        returnAction = Action()
        returnAction.intArray = [newIntAction]
        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        return returnAction
Example 22
File: agent.py Project: npow/atari
  def agent_step(self, reward, observation):
    self.step_counter += 1
    self.total_reward += reward
    cur_img = self.resize_image(observation.intArray)

    if self.is_testing:
      int_action = self.choose_action(self.test_table, cur_img, np.clip(reward, -1, 1), testing_ep=0.05)
    else:
      if self.step_counter % self.reset_after == 0:
        self.network.reset_q_hat()

      int_action = self.choose_action(self.train_table, cur_img, np.clip(reward, -1, 1), testing_ep=None)
      if self.train_table.num_entries > max(self.learn_start, self.batch_size):
        states, actions, rewards, next_states, terminals = self.train_table.get_minibatch(self.batch_size)
        loss, qvals = self.network.train(states, actions, rewards, next_states, terminals)
        self.losses.append(loss)
        self.qvals.append(np.mean(qvals))
        self.batch_counter += 1

    return_action = Action()
    return_action.intArray = [int_action]

    self.last_action = int_action
    self.last_img = cur_img

    return return_action
Example 23
    def agent_step(self,reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """

        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]
        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)
        newIntAction = self.getAction(newState, newDiscState)

        phi_t = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
        phi_tp = numpy.zeros((self.weights.shape[0], self.weights.shape[1]))
        phi_t[lastDiscState, :] = self.basis.computeFeatures(lastState)
        phi_tp[newDiscState, :] = self.basis.computeFeatures(newState)

        self.step_count += 1
        self.update(phi_t, phi_tp, reward, self.getCompatibleFeatures(phi_t, lastAction, reward, phi_tp, newIntAction))

        returnAction=Action()
        returnAction.intArray=[newIntAction]
        self.lastAction=copy.deepcopy(returnAction)
        self.lastObservation=copy.deepcopy(observation)
        return returnAction
Example 24
    def agent_step(self, reward, observation):
        action = None
        
        self.window.erase()
        self.window.addstr('STATE: %s\n' % (observation.intArray))
        self.window.addstr('REWARD: %s\n' % (reward))
        self.window.addstr('HIT UP, DOWN, LEFT or RIGHT to move...\n')
        self.window.refresh()

        try:
            c = self.window.getch()
            if c == curses.KEY_UP:
                action = 'N'
            elif c == curses.KEY_DOWN:
                action = 'S'
            elif c == curses.KEY_LEFT:
                action = 'W'
            elif c == curses.KEY_RIGHT:
                action = 'E'
            
            self.window.refresh()
        
        except KeyboardInterrupt:
            RLGlue.RL_cleanup()
            
        
        a = Action()
        
        if action:
            a.charArray = [action]
        
        return a
Example 25
    def agent_start(self,observation):
        self.P = np.asarray([[0.0 for j in range(self.N_AC)] for i in range(self.N_PC)])

        theState=observation.doubleArray

        if dynamicEpsilon=='1':
            self.q_epsilon = 0.3-0.005*self.episode
        else:
            self.q_epsilon = 0.3

        r_PC = self.getProbGaussians(theState[0], theState[1]) 
        res = self.egreedy(theState, r_PC)
        phi_AC = res[0]
        r_1_AC = res[1]
        r_2_AC = []
        for i in xrange(self.N_AC):
            r_2_AC.append(math.exp( (-1*(phi_AC - 2*math.pi*i/self.N_AC)**2)/(2*self.sigma_AC**2) ))

        # Update P_ij
        for i in xrange(self.N_AC):
            for j in xrange(self.N_PC):
                self.P[j,i] = self.q_stepsize*self.P[j,i] + r_2_AC[i]*r_PC[j]

        returnAction=Action()
        returnAction.doubleArray=[phi_AC]
        
        # finding closest AC
        closest_AC = r_2_AC.index(max(r_2_AC))
        self.lastQ = r_1_AC[closest_AC]

        self.episode += 1

        return returnAction
Example 26
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        self.step_counter = 0
        self.batch_counter = 0

        # We report the mean loss for every epoch.
        self.loss_averages = []

        self.start_time = time.time()
        this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        return_action = Action()
        return_action.intArray = [this_int_action]

        self.last_action = copy.deepcopy(return_action)

        self.last_img = np.array(self._resize_observation(observation.intArray))
        self.last_img = self.last_img.reshape(CROPPED_WIDTH, CROPPED_HEIGHT).T

        return return_action
Example 27
    def agent_start(self, observation):
        """Start an episode for the RL agent.

        Args:
            observation: The first observation of the episode. Should be an RLGlue Observation object.

        Returns:
            The first action the RL agent chooses to take, represented as an RLGlue Action object.
        """
        log = logging.getLogger('pyrl.agents.sarsa_lambda.agent_start')
        theState = numpy.array(list(observation.doubleArray))
        thisIntAction = self.getAction(theState,
                                       self.getDiscState(observation.intArray))

        returnAction = Action()
        returnAction.intArray = [thisIntAction]

        # Clear traces
        self.traces.fill(0.0)

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        log.debug("Action: %d", thisIntAction)
        log.debug("Start State: %s", theState)
        log.debug("Traces: %s", self.traces)
        return returnAction
Example 28
    def agent_step(self, reward, observation):
        """
        This method is called each time step. 

        Arguments: 
           reward      - Real valued reward.
           observation - An observation of type rlglue.types.Observation

        Returns: 
           An action of type rlglue.types.Action
        
        """
        # Generate random action
        this_int_action = self.randGenerator.randint(0, self.num_actions - 1)
        return_action = Action()
        return_action.intArray = [this_int_action]

        if self.show_ale:
            self._show_ale_color()
            #self._show_ale_gray()

        if self.saving:
            if self.int_states:
                self.states.append(self.last_observation.intArray)
            else:
                self.states.append(self.last_observation.doubleArray)

            self.actions.append(self.last_action.intArray[0])
            self.rewards.append(reward)
            self.absorbs.append(False)

        self.last_action = copy.deepcopy(return_action)
        self.last_observation = copy.deepcopy(observation)

        return return_action
Example 29
    def agent_step(self,reward, observation):
        theState=observation.doubleArray

        r_PC = self.getProbGaussians(theState[0], theState[1])    
        res = self.egreedy(theState, r_PC)
        phi_AC = res[0]
        r_1_AC = res[1]
        r_2_AC = []
        for i in xrange(self.N_AC):
            r_2_AC.append(math.exp( (-1*(phi_AC - 2*math.pi*i/self.N_AC)**2)/(2*self.sigma_AC**2) ))

        # Calculate reward prediction error
        delta = reward + self.q_gamma*max(r_1_AC) - self.lastQ
        #print self.q_gamma*max(r_1_AC), self.lastQ

        # Update synaptic weights
        for i in xrange(self.N_PC):
            for j in xrange(self.N_AC):       
                self.W[i,j] = self.q_stepsize * delta * self.P[i,j]

        # Update P_ij
        for i in xrange(self.N_AC):
            for j in xrange(self.N_PC):
                self.P[j,i] = self.q_stepsize*self.P[j,i] + r_2_AC[i]*r_PC[j]

        returnAction=Action()
        returnAction.doubleArray=[phi_AC]
        
        # finding closest AC
        closest_AC = r_2_AC.index(max(r_2_AC))
        self.lastQ = r_1_AC[closest_AC]

        self.episode += 1

        return returnAction
Example 30
    def agent_step(self,reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """
        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]

        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)

        phi_t = numpy.zeros((self.numStates+1,))
        phi_tp = numpy.zeros((self.numStates+1,))
        phi_t[0] = lastDiscState
        phi_t[1:] = lastState
        phi_tp[0] = newDiscState
        phi_tp[1:] = newState

        #print ','.join(map(str, lastState))

        self.planner.updateExperience(phi_t, lastAction, phi_tp, reward)

        newIntAction = self.getAction(newState, newDiscState)
        returnAction=Action()
        returnAction.intArray=[newIntAction]

        self.lastAction=copy.deepcopy(returnAction)
        self.lastObservation=copy.deepcopy(observation)
        return returnAction
Example 31
    def create_action(self, action):
        if np.isscalar(action):
            action = np.array([action])
        return_action = Action()
        return_action.intArray = [
            action[:self.learner.dim_action()].astype(int)]
        return return_action
Example 32
    def agent_start(self, observation):

        # Get intensity from current observation array
        tmp = np.bitwise_and(
            np.asarray(observation.intArray[128:]).reshape([210, 160]),
            0b0001111)  # Get Intensity from the observation
        obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 -
                                                   8, :]  # Scaling

        # Initialize State
        self.state = np.zeros((4, 84, 84), dtype=np.uint8)
        self.state[0] = obs_array
        state_ = cuda.to_gpu(
            np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

        # Generate an Action e-greedy
        returnAction = Action()
        action, Q_now = self.DQN.e_greedy(state_, self.epsilon)
        returnAction.intArray = [action]

        # Update for next step
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return returnAction
Example 33
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        self.step_counter = 0
        self.batch_counter = 0

        # We report the mean loss for every epoch.
        self.loss_averages = []

        self.start_time = time.time()
        this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        return_action = Action()
        return_action.intArray = [this_int_action]

        self.last_action = copy.deepcopy(return_action)

        self.last_img = self._resize_observation(observation.intArray)

        return return_action
	def agent_start(self, observation):
		'''
		initialize the episode strategy
		'''
		#Generate action, query 0,0
		action = Action()
		action.charArray.append('q')
		action.intArray = [1, 0, 0]
		# increment strategy
		self.strategyIndex += 1
		#add 1st node (0,0) and North with arrow to the partial nodes
		initPartialNode = Node()
		self.partialStateNodes = [initPartialNode]
		#initialize new queue according to strategy
		self.newQueu()
		#set the agenda
		self.agenda = self.EAGLE
		#reset the pointer to the action path
		self.pathToGoalIndex = -1 
		self.visited.fill(False)
		self.depthq=[]
		# to measure performance
		self.numExpandedNodes=0
		self.startTime=time.time()
		#print 'End the method start'
		return action
Example 35
    def agent_step(self, reward, observation):
        action = None

        self.window.erase()
        self.window.addstr('STATE: %s\n' % (observation.intArray))
        self.window.addstr('REWARD: %s\n' % (reward))
        self.window.addstr('HIT UP, DOWN, LEFT or RIGHT to move...\n')
        self.window.refresh()

        try:
            c = self.window.getch()
            if c == curses.KEY_UP:
                action = 'N'
            elif c == curses.KEY_DOWN:
                action = 'S'
            elif c == curses.KEY_LEFT:
                action = 'W'
            elif c == curses.KEY_RIGHT:
                action = 'E'

            self.window.refresh()

        except KeyboardInterrupt:
            RLGlue.RL_cleanup()

        a = Action()

        if action:
            a.charArray = [action]

        return a
Example 36
	def agent_step(self, reward, observation):
		observed_screen = self.preprocess_screen(observation)
		self.state = np.roll(self.state, 1, axis=0)
		self.state[0] = observed_screen

		########################### DEBUG ###############################
		# if self.total_time_step % 500 == 0 and self.total_time_step != 0:
		# 	self.dump_state()

		self.learn(reward)
		
		return_action = Action()
		q_max = None
		q_min = None
		if self.time_step % config.rl_action_repeat == 0:
			action, q_max, q_min = self.dqn.eps_greedy(self.reshape_state_to_conv_input(self.state), self.exploration_rate)
		else:
			action = self.last_action.intArray[0]
		return_action.intArray = [action]

		self.dump_result(reward, q_max, q_min)

		if self.policy_frozen is False:
			self.last_action = copy.deepcopy(return_action)
			self.last_state = self.state
			self.time_step += 1
			self.total_time_step += 1

		return return_action
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        # this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        observation_matrix = np.asmatrix(observation.doubleArray,
                                         dtype='float32')
        actions = self.action_network.predict(observation_matrix)

        return_action = Action()
        return_action.doubleArray = actions

        self.last_action = copy.deepcopy(actions)
        self.last_state = np.asmatrix(observation.doubleArray, dtype=floatX)

        return return_action
Example 38
	def agent_step(self, reward, observation):
		self.lastObservation = copy.deepcopy(observation)

		self.next_observation_list.append(observation.doubleArray)
		self.approximateKernelFunction()
		print reward
		# Test how the reward approximation works
		self.approximateRewardFunction(reward, observation)

		thisDoubleAction = self.approximateAction()

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.action_list.append(thisDoubleAction)
		self.last_observation_list.append(observation.doubleArray)

		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
Example 39
    def do_step(self, state, reward=None):
        """
        Runs the actual learning algorithm.
        In a separate function so it can be called both on start and on step.
        """
        #self.debug('do_step(', state, ',', reward, ')')

        #if not state in self.Q:
        # State not yet visited, initialize randomly
        #    self.Q[state] = self.random_actions()

        # Run the Q update if this isn't the first step
        action = None

        if reward is not None:
            action = self.update_Q(self.last_state, self.last_action, reward,
                                   state)

        # Action object
        a_obj = Action()

        if action is None:
            # Query the policy to find the best action
            action = self.policy(state)

        a_obj.charArray = list(action)

        # Save the current state-action pair for the next step's Q update.
        self.last_state = state
        self.last_action = action

        # And we're done
        return a_obj
Example 40
    def _select_action(self, phi=None):
        """
        Utility function for selecting an action.

        phi: ndarray
            Memory from which action should be selected.
        """
        if self.action_count % self.k == 0:
            if (np.random.rand() > self.epsilon) and phi:
                # Get action from Q-function
                phi = np.array(phi)[:, :, :, None]
                action_int = self.action_func(phi)[0]
            else:
                # Get random action
                action_int = np.random.randint(0, len(self.action_map))
            self.action_log[action_int] += 1

            self.cmd = [0]*len(self.action_map)
            self.cmd[action_int] = 1

            # Map cmd to ALE action
            # 18 is the number of commands ALE accepts
            action = Action()
            action.intArray = [self.action_map[action_int]]
            self.action = action
Example 41
	def agent_step(self, reward, observation):
		# Exploration is switched off after 5000 episodes.
		if self.Episode_Counter > 5000:
			self.Epsilon = 0

		self.Steps += 1

		print reward

		thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastObservation = copy.deepcopy(observation)
		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
Example 42
    def agent_step(self, reward, observation):
        newState = observation.intArray[0]
        lastState = self.lastObservation.intArray[0]
        lastAction = self.lastAction.intArray[0]

        Q_sa = self.value_function[lastState][lastAction]
        Q_sprime_aprime = -500000
        for a in range(self.numberOfActions):
            if self.value_function[newState][a] > Q_sprime_aprime:
                Q_sprime_aprime = self.value_function[newState][a]
        #updating Q function
        new_Q_sa = Q_sa + self.sarsa_stepsize * (
            reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)

        newIntAction = self.egreedy(newState)
        if not self.policyFrozen:
            self.value_function[lastState][lastAction] = new_Q_sa

        returnAction = Action()
        returnAction.intArray = [newIntAction]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)

        return returnAction
Example 43
    def agent_step(self, reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """
        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]

        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)

        phi_t = numpy.zeros((self.numStates + 1, ))
        phi_tp = numpy.zeros((self.numStates + 1, ))
        phi_t[0] = lastDiscState
        phi_t[1:] = lastState
        phi_tp[0] = newDiscState
        phi_tp[1:] = newState

        #print ','.join(map(str, lastState))

        self.planner.updateExperience(phi_t, lastAction, phi_tp, reward)

        newIntAction = self.getAction(newState, newDiscState)
        returnAction = Action()
        returnAction.intArray = [newIntAction]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        return returnAction
Example 44
    def agent_step(self, reward, observation):
        # Increment the step counter
        self.step_counter += 1

        self.update_state(observation)
        self.update_targetQ()

        # Decide the move to play.
        int_action = self.select_int_action()  # A return value of -1 means "pass".
        action = Action()
        action.intArray = [int_action]
        self.reward = reward

        # Update eps
        self.update_eps()

        # Store the transition (state, action, reward, outcome)
        self.store_transition(terminal=False)

        if not self.frozen:
            # Run a learning update
            if self.step_counter > self.learn_start:
                self.replay_experience()

        self.last_state = copy.deepcopy(self.state)
        self.last_action = copy.deepcopy(int_action)

        # Return the position where the agent places its mark
        return action
Example 45
    def agent_step(self, reward, observation):
        """Take one step in an episode for the agent, as the result of taking the last action.

        Args:
            reward: The reward received for taking the last action from the previous state.
            observation: The next observation of the episode, which is the consequence of taking the previous action.

        Returns:
            The next action the RL agent chooses to take, represented as an RLGlue Action object.
        """

        newState = numpy.array(list(observation.doubleArray))
        lastState = numpy.array(list(self.lastObservation.doubleArray))
        lastAction = self.lastAction.intArray[0]

        newDiscState = self.getDiscState(observation.intArray)
        lastDiscState = self.getDiscState(self.lastObservation.intArray)

        # Update eligibility traces
        phi_t = numpy.zeros(self.traces.shape)
        phi_t[lastDiscState, :,
              lastAction] = self.basis.computeFeatures(lastState)

        self.update_traces(phi_t, None)
        self.update(phi_t, newState, newDiscState, reward)

        # QLearning can choose action after update
        newIntAction = self.getAction(newState, newDiscState)
        returnAction = Action()
        returnAction.intArray = [newIntAction]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)
        return returnAction
Example 46
    def agent_step(self,reward, observation):
        self.stepCount=self.stepCount+1
        action=Action()
        action.intArray=observation.intArray
        action.doubleArray=observation.doubleArray
        action.charArray=observation.charArray

        return action
Example 47
    def agent_start(self,observation):
        self.stepCount=0
        action=Action()
        action.intArray=observation.intArray
        action.doubleArray=observation.doubleArray
        action.charArray=observation.charArray

        return action
Example 48
    def agent_start(self, observation):
        self.stepCount = 0
        action = Action()
        action.intArray = observation.intArray
        action.doubleArray = observation.doubleArray
        action.charArray = observation.charArray

        return action
Example 49
    def agent_step(self, reward, observation):
        self.stepCount = self.stepCount + 1
        action = Action()
        action.intArray = observation.intArray
        action.doubleArray = observation.doubleArray
        action.charArray = observation.charArray

        return action
Example 50
    def agent_step(self, reward, observation):

        # Observation
        obs_array = np.array(observation.doubleArray)
        #print "state: %3f %3f %3f %3f" % (obs_array[0],obs_array[1],obs_array[2],obs_array[3])
        # Compose State : 4-step sequential observation
        #self.state = self.rescale_value(obs_array)
        self.state = obs_array
        #print("state2:"+str(self.state))
        #print "state: %3f %3f %3f %3f" % (self.state[0],self.state[1],self.state[2],self.state[3])
        state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1,12), dtype=np.float32))
        #print("state2_:"+str(state_))
        # Exploration decays along the time sequence
        if self.policyFrozen is False:  # Learning ON/OFF
            if self.DQN.initial_exploration < self.time:
                self.epsilon -= 1.0/10**6
                if self.epsilon < 0.1:
                    self.epsilon = 0.1
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print "Initial Exploration : %d / %d steps" % (self.time, self.DQN.initial_exploration)
                eps = 1.0
        else:  # Evaluation
            print "Policy is Frozen"
            eps = 0.05

        # Generate an Action by e-greedy action selection
        returnAction = Action()
        action = self.DQN.e_greedy(state_, eps)
        #print(str(action))
        returnAction.doubleArray = action[0].tolist()

        # Learning Phase
        if self.policyFrozen is False:  # Learning ON/OFF
            self.DQN.stockExperience(self.time,
                                     self.last_state,
                                     np.asarray(self.lastAction.doubleArray,dtype=np.float32),
                                     reward,
                                     self.state, False)
            self.DQN.experienceReplay(self.time)

        # Target model update
#        if self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0:
#            print "########### MODEL UPDATED ######################"
#            self.DQN.hard_target_model_update()

        # Simple text based visualization
        print 'Time Step %d / ACTION  %s / REWARD %.5f / EPSILON  %.5f' % (self.time,str(action[0]),reward,eps)

        # Updates for next step
        self.last_observation = obs_array

        if self.policyFrozen is False:
            self.lastAction = copy.deepcopy(returnAction)
            self.last_state = self.state.copy()
            self.time += 1

        return returnAction
Example 51
    def _getAction(self):
        """
        Return an RL-Glue action built from the numpy array yielded by
        the held pybrain agent.
        """
        action = RLGlueAction()
        action.doubleArray = self.agent.getAction().tolist()
        action.intArray = []
        return action
	def getCellsNeededForDiscovery(self, node):
		'''
		Generate the list of locations to discover and return an action containing them
		'''
		newPosition = self.newPosition(node.state.position, node.state.orintation)[1]
		action = Action()
		action.intArray = [1, newPosition[0], newPosition[1]]
		action.charArray.append('q')
		return action
Example 53
    def getRandomAction(self, mindir=-1, run=0):
        action = Action(3, 0)
        # direction (left: -1, right: 1, neither: 0)
        action.intArray[0] = random.randint(mindir, 1)
        # jumping (yes: 1, no: 0)
        action.intArray[1] = random.randint(0, 1)
        # speed button (on: 1, off: 0)
        action.intArray[2] = random.randint(run, 1)
        return action
Example 54
    def agent_step(self, reward, observation):
        global  save_flg2
        # Preproces
        tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111)  # Get Intensity from the observation
        obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :]  # Scaling

        # Overlay the previous frame on the current one
        obs_processed = np.maximum(obs_array, self.last_observation)  # Element-wise maximum of the two frames

        # Compose State : 4-step sequential observation
        self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_processed], dtype=np.uint8)  # Shift the frame buffer, dropping index 0
        state_ = np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32)

        # Exploration decays along the time sequence
        if self.policyFrozen is False:  # Learning ON/OFF
            if self.DQN.initial_exploration < self.time:
                self.epsilon -= 1.0/10**6
                if self.epsilon < 0.1:
                    self.epsilon = 0.1
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print "Initial Exploration : %d/%d steps" % (self.time, self.DQN.initial_exploration)
                eps = 1.0
        else:  # Evaluation
            print "Policy is Frozen"
            eps = 0.05

        # Generate an Action by e-greedy action selection
        returnAction = Action()
        action, Q_now = self.DQN.e_greedy(state_, eps)
        returnAction.intArray = [action]

        # Learning Phase
        loss_val = 0
        if self.policyFrozen is False:  # Learning ON/OFF
            self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.state, False)
            loss_val = self.DQN.experienceReplay(self.time)

        # Target model update
        if self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0:
            print "########### MODEL UPDATED ######################"
            self.DQN.target_model_update()

        # Simple text based visualization
        print ' Time Step %d /   ACTION  %d  /   REWARD %.1f   / EPSILON  %.6f  /   Q_max  %3f' % (self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now))
        logger.info("{},{},{},{},{},{}".format(dt.now().strftime("%Y-%m-%d_%H:%M:%S"), \
              self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now)))

        # Updates for next step
        self.last_observation = obs_array

        if self.policyFrozen is False:
            self.lastAction = copy.deepcopy(returnAction)
            self.last_state = self.state.copy()
            self.time += 1

        return returnAction
Example 55
    def agent_start(self,observation):

        self.optionCurrentlyOn = False
        theState=observation.intArray[0]
        s = self.valid_states.index(theState) # row index

        # Choose either a primitive action or option
        a = self.egreedy(s)

        if a<self.numActions:
            # Primitive action
            thisIntAction = a
            self.optionCurrentlyOn = False
            print 'Primitive action chosen'

        else:    
            # Composing an option from S_i to S_j
            self.optionCurrentlyOn = True
            self.currentOptionTime = 0
            self.curentOptionStartState = s
            self.currentOptionReward = 0.0

            # 1. Find the abstract state you belong to & going to
            self.option_S_i = self.absStateMembership[s] # initiation step
            self.option_S_j = a-self.numActions # actually, we will have to choose S_j based on SMDP

            #print 'Shape of first term: ',self.p_mat[s][0].shape
            #print self.option_S_j
            #print 'Shape of second term: ', (self.chi_mat.T[self.option_S_j]).T.shape

            #print 'Debug:'
            #print self.chi_mat[0,0]

            # 2. Choose action based on membership ascent
            thisIntAction=1
            maxVal = 0
            for a in xrange(4): 
                print 'Action: ',a,' ',max(self.normalizationC*(np.sum(np.dot(np.array(self.p_mat[s][a]),np.array(self.chi_mat.T[self.option_S_j].T))) - self.chi_mat[s,self.option_S_j]),0)
                action_pref = max(self.normalizationC*(np.sum(np.dot(np.array(self.p_mat[s][a]),np.array(self.chi_mat.T[self.option_S_j].T))) - self.chi_mat[s,self.option_S_j]),0)
                if action_pref > maxVal:
                    thisIntAction = a
                    maxVal = action_pref
                print 'Option chosen'

            self.currentOptionTime += 1

        print 'Action chosen: ',thisIntAction

        returnAction=Action()
        returnAction.intArray=[thisIntAction]
        
        self.lastAction=copy.deepcopy(returnAction)
        self.lastObservation=copy.deepcopy(observation)

        self.episode += 1
        return returnAction
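The membership-ascent preference above is computed in a single dense expression. A hedged restatement as a helper (attribute names mirror those used above; the method itself is not part of the original agent) makes the rule easier to read: the preference for action a in state s, toward abstract state S_j, is the expected next-state membership in S_j minus the current membership, scaled by normalizationC and clipped at zero.

    def option_action_preference(self, s, a, S_j):
        # Expected membership of the successor state in abstract state S_j...
        expected_membership = np.sum(
            np.dot(np.array(self.p_mat[s][a]), np.array(self.chi_mat.T[S_j].T)))
        # ...minus the current membership, scaled and clipped at zero.
        return max(self.normalizationC * (expected_membership - self.chi_mat[s, S_j]), 0)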
Example 56
    def agent_start(self, Obs):
        State = Obs.intArray[0]
        action = self.epsilon_greedy(State)
        returnaction = Action()
        returnaction.intArray = [action]
        self.lastaction = copy.deepcopy(returnaction)
        self.lastObs = copy.deepcopy(Obs)

        #might need to return something
        return returnaction
Example 57
    def agent_start(self, observation):
        theState = observation.intArray[0]
        thisIntAction = self.egreedy(theState)
        returnAction = Action()
        returnAction.intArray = [thisIntAction]

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)

        return returnAction