	def agent_step(self, reward, observation):
		# Record the element-wise difference between the new and previous observation.
		self.states_diff_list.append([a - b for (a, b) in zip(observation.doubleArray, self.lastObservation.doubleArray)])
		self.lastObservation = copy.deepcopy(observation)

		self.approximateValueFunction()

		print reward

		# Test how the reward approximation works.
		self.approximateRewardFunction(reward, observation)

		thisDoubleAction = self.approximateAction()

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.action_list.append(thisDoubleAction)

		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        self.step_counter = 0
        self.batch_counter = 0

        # We report the mean loss for every epoch.
        self.loss_averages = []

        self.start_time = time.time()
        # this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        observation_matrix = np.asmatrix(observation.doubleArray,
                                         dtype='float32')
        actions = self.action_network.fprop(observation_matrix)
        return_action = Action()
        return_action.doubleArray = actions

        self.last_action = copy.deepcopy(actions)

        self.last_observation = observation.doubleArray

        return return_action
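The snippets in this listing implement the RL-Glue Python agent interface. A minimal sketch of how such an agent is attached to RL-Glue with the standard Python codec; MyAgent is a placeholder for any of the agent classes shown here:

from rlglue.agent import AgentLoader

if __name__ == "__main__":
    # Connects the agent to rl_glue over the codec's socket protocol.
    AgentLoader.loadAgent(MyAgent())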
	def agent_step(self, reward, observation):
		# Original idea: set epsilon inversely to the reward received;
		# here exploration is simply switched off after 10000 episodes.
		if self.Episode_Counter > 10000:
			self.Epsilon = 0.0

		self.Steps += 1

		print reward

		thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastObservation = copy.deepcopy(observation)
		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
	def agent_step(self, reward, observation):
		# Stop exploring after 5000 episodes.
		if self.Episode_Counter > 5000:
			self.Epsilon = 0

		self.Steps += 1

		print reward

		thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastObservation = copy.deepcopy(observation)
		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        self.step_counter = 0
        self.batch_counter = 0

        # We report the mean loss for every epoch.
        self.loss_averages = []

        self.start_time = time.time()
        #this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        observation_matrix = np.asmatrix(observation.doubleArray, dtype='float32')
        actions = self.action_network.fprop(observation_matrix)
        return_action = Action()
        return_action.doubleArray = [actions]

        self.last_action = copy.deepcopy(return_action)

        self.last_observation = observation.doubleArray

        return return_action
Example #6
    def agent_start(self,observation):
        self.P = np.asarray([[0.0 for j in range(self.N_AC)] for i in range(self.N_PC)])

        theState=observation.doubleArray

        if dynamicEpsilon=='1':
            self.q_epsilon = 0.3-0.005*self.episode
        else:
            self.q_epsilon = 0.3

        r_PC = self.getProbGaussians(theState[0], theState[1]) 
        res = self.egreedy(theState, r_PC)
        phi_AC = res[0]
        r_1_AC = res[1]
        r_2_AC = []
        for i in xrange(self.N_AC):
            r_2_AC.append(math.exp( (-1*(phi_AC - 2*math.pi*i/self.N_AC)**2)/(2*self.sigma_AC**2) ))

        # Update P_ij
        for i in xrange(self.N_AC):
            for j in xrange(self.N_PC):
                self.P[j,i] = self.q_stepsize*self.P[j,i] + r_2_AC[i]*r_PC[j]

        returnAction=Action()
        returnAction.doubleArray=[phi_AC]
        
        # finding closest AC
        closest_AC = r_2_AC.index(max(r_2_AC))
        self.lastQ = r_1_AC[closest_AC]

        self.episode += 1

        return returnAction
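getProbGaussians is not shown in this listing; a plausible sketch, assuming Gaussian place-cell tuning curves centred on stored cell coordinates (self.PC_centers and self.sigma_PC are hypothetical names):

    def getProbGaussians(self, x, y):
        # Each place cell's activation decays with squared distance from its centre.
        r_PC = []
        for (cx, cy) in self.PC_centers:
            dist_sq = (x - cx) ** 2 + (y - cy) ** 2
            r_PC.append(math.exp(-dist_sq / (2 * self.sigma_PC ** 2)))
        return r_PC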
	def agent_step(self, reward, observation):
		self.reward += reward
		self.step += 1
		self.total_reward += reward

		thisDoubleAction = self.agent_step_action(observation.doubleArray)
		if self.isRisk(observation.doubleArray, thisDoubleAction):
			# Fall back to the baseline policy and retrain the current network
			# towards the best network's output for this observation.
			self.times += 1
			thisDoubleAction = util.baselinePolicy(observation.doubleArray)
			from pybrain.supervised.trainers import BackpropTrainer
			from pybrain.datasets import SupervisedDataSet
			ds = SupervisedDataSet(12, 4)
			ds.addSample(observation.doubleArray, self.best.activate(observation.doubleArray))
			trainer = BackpropTrainer(self.network, ds)
			trainer.train()

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastObservation = copy.deepcopy(observation)
		self.lastAction = copy.deepcopy(returnAction)

		self.lastReward = reward
		return returnAction
Example #8
    def agent_step(self,reward, observation):
        theState=observation.doubleArray

        r_PC = self.getProbGaussians(theState[0], theState[1])    
        res = self.egreedy(theState, r_PC)
        phi_AC = res[0]
        r_1_AC = res[1]
        r_2_AC = []
        for i in xrange(self.N_AC):
            r_2_AC.append(math.exp( (-1*(phi_AC - 2*math.pi*i/self.N_AC)**2)/(2*self.sigma_AC**2) ))

        # Calculate reward prediction error
        delta = reward + self.q_gamma*max(r_1_AC) - self.lastQ
        #print self.q_gamma*max(r_1_AC), self.lastQ

        # Update synaptic weights
        for i in xrange(self.N_PC):
            for j in xrange(self.N_AC):       
                self.W[i,j] = self.q_stepsize * delta * self.P[i,j]

        # Update P_ij
        for i in xrange(self.N_AC):
            for j in xrange(self.N_PC):
                self.P[j,i] = self.q_stepsize*self.P[j,i] + r_2_AC[i]*r_PC[j]

        returnAction=Action()
        returnAction.doubleArray=[phi_AC]
        
        # finding closest AC
        closest_AC = r_2_AC.index(max(r_2_AC))
        self.lastQ = r_1_AC[closest_AC]

        self.episode += 1

        return returnAction
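The egreedy helper is likewise not shown. One plausible sketch, given that it returns a heading phi_AC together with one Q-value per action cell (the projection through self.W and the use of the standard random module are assumptions):

    def egreedy(self, state, r_PC):
        # Q-value of each action cell: place-cell activity projected through the weights.
        r_1_AC = list(np.dot(np.asarray(r_PC), self.W))
        if random.random() < self.q_epsilon:
            chosen = random.randint(0, self.N_AC - 1)  # explore: random action cell
        else:
            chosen = r_1_AC.index(max(r_1_AC))         # exploit: greedy action cell
        phi_AC = 2 * math.pi * chosen / self.N_AC      # preferred direction of the chosen cell
        return phi_AC, r_1_AC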
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        # this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        observation_matrix = np.asmatrix(observation.doubleArray,
                                         dtype='float32')
        actions = self.action_network.predict(observation_matrix)

        return_action = Action()
        return_action.doubleArray = actions

        self.last_action = copy.deepcopy(actions)
        self.last_state = np.asmatrix(observation.doubleArray, dtype=floatX)

        return return_action
	def agent_step(self, reward, observation):
		self.lastObservation = copy.deepcopy(observation)

		# Update the kernel-based approximation from the stored observations.
		self.next_observation_list.append(observation.doubleArray)
		self.approximateKernelFunction()

		print reward

		# Test how the reward approximation works.
		self.approximateRewardFunction(reward, observation)

		thisDoubleAction = self.approximateAction()

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.action_list.append(thisDoubleAction)
		self.last_observation_list.append(observation.doubleArray)

		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
	def agent_step(self, reward, observation):
		self.Rewards += reward

		# Stop exploring once training is finished.
		if self.Episode_Counter > Training_Runs:
			self.Epsilon = 0

		self.Steps += 1

		thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastObservation = copy.deepcopy(observation)
		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
	def agent_step(self, reward, observation):
		# Epsilon follows a sigmoid of the reward while learning;
		# exploration is switched off after 10000 episodes.
		if self.Episode_Counter > 10000:
			self.Epsilon = 0
		else:
			self.Epsilon = util.randomSigmoidEpsilon(reward, 0.02, 50)

		self.Steps += 1

		print reward

		thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastObservation = copy.deepcopy(observation)
		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
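util.randomSigmoidEpsilon is not shown in this listing. Given the sigmoid naming and the call randomSigmoidEpsilon(reward, 0.02, 50), one purely illustrative reading (the parameter meanings are guesses) is an exploration rate that decays sigmoidally as the reward grows:

import math

def randomSigmoidEpsilon(reward, scale, offset):
    # scale: maximum epsilon; offset: reward level at which epsilon is halved.
    return scale / (1.0 + math.exp(reward - offset))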
Example #13
    def agent_step(self, reward, observation):
        self.stepCount = self.stepCount + 1
        action = Action()
        action.intArray = observation.intArray
        action.doubleArray = observation.doubleArray
        action.charArray = observation.charArray

        return action
    def agent_start(self,observation):
        self.stepCount=0
        action=Action()
        action.intArray=observation.intArray
        action.doubleArray=observation.doubleArray
        action.charArray=observation.charArray

        return action
    def agent_step(self,reward, observation):
        self.stepCount=self.stepCount+1
        action=Action()
        action.intArray=observation.intArray
        action.doubleArray=observation.doubleArray
        action.charArray=observation.charArray

        return action
Example #16
    def agent_start(self, observation):
        self.stepCount = 0
        action = Action()
        action.intArray = observation.intArray
        action.doubleArray = observation.doubleArray
        action.charArray = observation.charArray

        return action
Example #17
    def agent_step(self, reward, observation):

        # Observation
        obs_array = np.array(observation.doubleArray)
        #print "state: %3f %3f %3f %3f" % (obs_array[0],obs_array[1],obs_array[2],obs_array[3])
        # Compose State : 4-step sequential observation
        #self.state = self.rescale_value(obs_array)
        self.state = obs_array
        #print("state2:"+str(self.state))
        #print "state: %3f %3f %3f %3f" % (self.state[0],self.state[1],self.state[2],self.state[3])
        state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1,12), dtype=np.float32))
        #print("state2_:"+str(state_))
        # Exploration decays along the time sequence
        if self.policyFrozen is False:  # Learning ON/OFF
            if self.DQN.initial_exploration < self.time:
                self.epsilon -= 1.0/10**6
                if self.epsilon < 0.1:
                    self.epsilon = 0.1
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print "Initial Exploration : %d / %d steps" % (self.time, self.DQN.initial_exploration)
                eps = 1.0
        else:  # Evaluation
            print "Policy is Frozen"
            eps = 0.05

        # Generate an Action by e-greedy action selection
        returnAction = Action()
        action = self.DQN.e_greedy(state_, eps)
        #print(str(action))
        returnAction.doubleArray = action[0].tolist()

        # Learning Phase
        if self.policyFrozen is False:  # Learning ON/OFF
            self.DQN.stockExperience(self.time,
                                     self.last_state,
                                     np.asarray(self.lastAction.doubleArray,dtype=np.float32),
                                     reward,
                                     self.state, False)
            self.DQN.experienceReplay(self.time)

        # Target model update
#        if self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0:
#            print "########### MODEL UPDATED ######################"
#            self.DQN.hard_target_model_update()

        # Simple text based visualization
        print 'Time Step %d / ACTION  %s / REWARD %.5f / EPSILON  %.5f' % (self.time,str(action[0]),reward,eps)

        # Updates for next step
        self.last_observation = obs_array

        if self.policyFrozen is False:
            self.lastAction = copy.deepcopy(returnAction)
            self.last_state = self.state.copy()
            self.time += 1

        return returnAction
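Assuming self.epsilon starts at 1.0, the schedule above lowers it by 1.0/10**6 per learning step once the initial exploration phase ends, so it reaches its floor of 0.1 after (1.0 - 0.1) / 1e-6 = 900,000 steps.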
Example #18
 def _getAction(self):
     """
     Return a RLGlue action that is made out of a numpy array yielded by
     the hold pybrain agent.
     """
     action = RLGlueAction()
     action.doubleArray = self.agent.getAction().tolist()
     action.intArray = []
     return action
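This wrapper assumes a pybrain LearningAgent stored in self.agent; in pybrain's usual interaction loop the observation and reward are fed in separately, roughly (observation_array is a placeholder for the numpy observation):

     self.agent.integrateObservation(observation_array)
     action = self.agent.getAction()
     self.agent.giveReward(reward)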
 def exp_step(self, reward, observation, is_testing):
     return_action = Action()
     cur_observation = self._scale_inputs(observation.doubleArray, self.observation_ranges)
     double_action = self._choose_action(cur_observation, self.action_stdev, self.noise_stdev)
     loss = None
     if not is_testing:
         loss = self._do_training(np.asmatrix(reward, dtype=floatX), cur_observation, double_action, False)
     return_action.doubleArray = [copy.deepcopy(double_action)]
     return return_action if is_testing else (return_action, loss)
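_scale_inputs is not shown; a minimal sketch, assuming each observation component is mapped linearly into [-1, 1] using its (min, max) range (the target range is a guess):

 def _scale_inputs(self, values, ranges):
     scaled = []
     for v, (lo, hi) in zip(values, ranges):
         # Linearly map v from [lo, hi] to [-1, 1].
         scaled.append(2.0 * (v - lo) / (hi - lo) - 1.0)
     return scaled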
Example #21
    def agent_step(self, reward, observation):
        """
        This method is called each time step.

        Arguments:
           reward      - Real valued reward.
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action

        """

        self.step_counter += 1
        return_action = Action()
        cur_observation = observation.doubleArray

        #TESTING---------------------------
        if self.testing:
            self.total_reward += reward
            double_action = self._choose_action(self.test_data_set,
                                                cur_observation,
                                                np.clip(reward, -1, 1))
            # if self.pause > 0:
            #     time.sleep(self.pause)

        #NOT TESTING---------------------------
        else:
            double_action = self._choose_action(self.data_set, cur_observation,
                                                np.clip(reward, -1, 1),
                                                self.action_stdev)

            if len(self.data_set) > self.batch_size:
                loss = self._do_training()
                self.batch_counter += 1
                self.loss_averages.append(loss)

        return_action.doubleArray = [double_action]

        self.last_action = copy.deepcopy(return_action)
        self.last_observation = cur_observation

        # Rescale the chosen action (presumably produced in [0, 1]) into [-1, 1] before returning it.
        return_action.doubleArray = [double_action * 2 - 1]
        return return_action
    def agent_start(self, observation):
        # print "Observation: ",observation.doubleArray
        returnAction = Action()
        returnAction.doubleArray = self.agent_policy(observation)

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)

        self.write_data(observation.doubleArray, "observation")
        return returnAction
Example #23
    def agent_step(self, reward, observation):
        """
        This method is called each time step.

        Arguments:
           reward      - Real valued reward.
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action

        """

        self.step_counter += 1
        return_action = Action()
        cur_observation = observation.doubleArray

        #TESTING---------------------------
        if self.testing:
            self.total_reward += reward
            double_action = self._choose_action(self.test_data_set,
                                             cur_observation, np.clip(reward, -1, 1))
            # if self.pause > 0:
            #     time.sleep(self.pause)

        #NOT TESTING---------------------------
        else:
            double_action = self._choose_action(self.data_set, cur_observation,
                                             np.clip(reward, -1, 1), self.action_stdev)

            if len(self.data_set) > self.batch_size:
                loss = self._do_training()
                self.batch_counter += 1
                self.loss_averages.append(loss)

        return_action.doubleArray = [double_action]

        self.last_action = copy.deepcopy(return_action)
        self.last_observation = cur_observation

        # Rescale the chosen action (presumably produced in [0, 1]) into [-1, 1] before returning it.
        return_action.doubleArray = [double_action * 2 - 1]
        return return_action
 def exp_step(self, reward, observation, is_testing):
     return_action = Action()
     cur_observation = observation.doubleArray
     double_action = self._choose_action(cur_observation, np.clip(reward, -1, 1), self.action_stdev)
     loss = None
     if not is_testing:
         loss = self._do_training()
     self.last_action = copy.deepcopy(double_action)
     self.last_observation = cur_observation
     return_action.doubleArray = [double_action]
     return return_action if is_testing else (return_action, loss)
 def exp_step(self, reward, observation, is_testing):
     return_action = Action()
     cur_observation = self._scale_inputs(observation.doubleArray,
                                          self.observation_ranges)
     double_action = self._choose_action(cur_observation, self.action_stdev,
                                         self.noise_stdev)
     loss = None
     if not is_testing:
         loss = self._do_training(np.asmatrix(reward, dtype=floatX),
                                  cur_observation, double_action, False)
     return_action.doubleArray = [copy.deepcopy(double_action)]
     return return_action if is_testing else (return_action, loss)
Example #26
 def create_action(self, act):
     self.last_act = act
     if np.isscalar(act):
         act = np.array([act])
     assert (act.size == self.action_dims()), 'illegal action dimension'
     return_action = Action()
     if self.int_action_dims() > 0:
         # The first int_action_dims() entries are the integer action components.
         return_action.intArray = act[:self.int_action_dims()].astype(int).tolist()
     if self.double_action_dims() > 0:
         # The remaining entries (everything after the integer part) are the real-valued components.
         return_action.doubleArray = act[self.int_action_dims():].astype(float).tolist()
     return return_action
    def agent_step(self, reward, observation):
        # print "Observation: ",observation.doubleArray
        # print "Reward: ",reward
        returnAction = Action()
        returnAction.doubleArray = self.agent_policy(observation)

        self.lastAction = copy.deepcopy(returnAction)
        self.lastObservation = copy.deepcopy(observation)

        self.write_data(observation.doubleArray, "observation")
        self.write_data([reward], "reward")
        return returnAction
Example #29
 def exp_step(self, reward, observation, is_testing):
     return_action = Action()
     cur_observation = observation.doubleArray
     double_action = self._choose_action(cur_observation,
                                         np.clip(reward, -1, 1),
                                         self.action_stdev)
     loss = None
     if not is_testing:
         loss = self._do_training()
     self.last_action = copy.deepcopy(double_action)
     self.last_observation = cur_observation
     return_action.doubleArray = [double_action]
     return return_action if is_testing else (return_action, loss)
Example #30
 def postprocess_actions(self, action_values):
     action = Action()
     if self.continuous_actions:
         doubleactions = copy.deepcopy(action_values)
         action.doubleArray = doubleactions.tolist()[0]
     else:
         intactions = copy.deepcopy(action_values)
         minranges = self.action_ranges[:, 0].T
         maxranges = self.action_ranges[:, 1].T
         intactions = np.maximum(intactions, minranges)
         intactions = np.minimum(intactions, maxranges)
         intactions = np.rint(intactions).astype(np.int)
         action.intArray = intactions.tolist()[0]
     return action
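The clamp-and-round in the discrete branch is equivalent to a single np.clip call:

     intactions = np.rint(np.clip(intactions, minranges, maxranges)).astype(int)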
	def agent_start(self, observation):
		global mutation_rate, variance, flag, changeTopology
		self.lastAction = None
		self.lastObservation = None
		self.reward = 0.0
		self.episode_counter += 1.0
		self.step = 1.0
		self.times = 1

		# mutation_rate = mutation_rate*decay_rate
		if changeTopology:
			# Switch to the second topology: retrain a fresh seed network on the
			# collected dataset, then rebuild the population from it.
			print "another topology"
			mutation_rate = 0.1
			variance = 0.005
			flag = "two"
			from pybrain.supervised.trainers import BackpropTrainer
			self.seed = StateToActionNetwork()
			trainer = BackpropTrainer(self.seed, self.ds, learningrate=0.3)
			for i in range(0, 10):
				trainer.trainEpochs(epochs=5)

			print "after training"
			self.generation = self.firstGeneration()
			self.best = self.seed
			changeTopology = False

		if self.generation[15].fittness >= 0.5 and flag == "one":
			self.network = self.generation[15].network
			changeTopology = True
		else:
			self.network = self.getUnevaluatedGenome().network

		self.ds = SupervisedDataSet(12, 4)

		thisDoubleAction = self.agent_step_action(observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastAction = copy.deepcopy(returnAction)
		self.lastObservation = copy.deepcopy(observation)

		return returnAction
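StateToActionNetwork is not shown in this listing; since the paired dataset is SupervisedDataSet(12, 4), a plausible module-level sketch using pybrain's buildNetwork (the hidden-layer size is a guess):

from pybrain.tools.shortcuts import buildNetwork

def StateToActionNetwork():
    # 12 observation inputs -> hidden layer -> 4 action outputs.
    return buildNetwork(12, 8, 4)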
Example #32
def getAction(dir, isJump, isSpeed):
    #-1, 0, 1 for direction, 1 is to the right
    #0, 1 for jump
    #0, 1 for speed
    action = Action()
    action.numInts = 3
    action.numDoubles = 0
    action.numChars = 0
    action.intArray = []
    action.doubleArray = []
    action.charArray = []
    action.intArray.append(dir)
    action.intArray.append(isJump)
    action.intArray.append(isSpeed)
    return action
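For example, an action that moves right and jumps, without the speed button:

action = getAction(1, 1, 0)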
	def agent_step(self, reward, observation):
		print reward

		thisDoubleAction = self.agent_step_action(observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastObservation = copy.deepcopy(observation)
		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
	def agent_start(self, observation):
		self.lastAction = None
		self.lastObservation = None

		print " "
		thisDoubleAction = self.agent_step_action(observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastAction = copy.deepcopy(returnAction)
		self.lastObservation = copy.deepcopy(observation)

		return returnAction
	def agent_step(self, reward, observation):
		# Generate a random action.
		print reward

		thisDoubleAction = self.randomAction()

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastObservation = copy.deepcopy(observation)
		self.lastAction = copy.deepcopy(returnAction)

		return returnAction
	def agent_start(self, observation):
		print " "
		thisDoubleAction = self.approximateAction()

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		# Bookkeeping for the value-function approximation.
		# self.last_observation_list.append(observation.doubleArray)
		self.action_list.append(thisDoubleAction)

		self.lastAction = copy.deepcopy(returnAction)
		self.lastObservation = copy.deepcopy(observation)

		return returnAction
	def agent_start(self, observation):
		self.Episode_Counter += 1

		self.lastAction = None
		self.lastObservation = None

		self.Steps = 1
		print " "
		thisDoubleAction = self.agent_action_start(observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastAction = copy.deepcopy(returnAction)
		self.lastObservation = copy.deepcopy(observation)

		return returnAction
    def select_action(self, policy, state):
        """
            Action selection based on linear policies
    """
        state_vector = np.array(state.doubleArray)
        action_vector = np.matmul(policy, state_vector)

        # print(f"State: {state_vector}")
        # print(f"Action: {action_vector}")

        # Constraint
        for i in range(len(action_vector)):
            if action_vector[i] > self.max_u:
                action_vector[i] = self.max_u
            elif action_vector[i] < -self.max_u:
                action_vector[i] = -self.max_u

        action_selected = Action(numDoubles=action_vector.size)
        action_selected.doubleArray = action_vector.tolist()

        return action_selected
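The constraint loop above is equivalent to clipping the whole vector at once:

        action_vector = np.clip(action_vector, -self.max_u, self.max_u)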
	def agent_start(self, observation):
		global mutation_rate, variance
		self.lastAction = None
		self.lastObservation = None
		self.reward = 0.0
		self.episode_counter += 1.0
		self.step = 1.0
		self.times = 1
		# Decay the mutation rate each episode and evaluate the next genome's network.
		mutation_rate = mutation_rate * decay_rate

		self.network = self.getUnevaluatedGenome().network
		thisDoubleAction = self.agent_step_action(observation.doubleArray)

		returnAction = Action()
		returnAction.doubleArray = thisDoubleAction

		self.lastAction = copy.deepcopy(returnAction)
		self.lastObservation = copy.deepcopy(observation)

		return returnAction
Example #40
    def agent_start(self, observation):

        # Observation
        obs_array = np.array(observation.doubleArray)

        # Initialize State
        #self.state = self.rescale_value(obs_array)
        self.state = obs_array
        #print("state1:"+str(self.state))
        state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1,12), dtype=np.float32))
        
        # Generate an Action e-greedy
        returnAction = Action()
        action = self.DQN.e_greedy(state_, self.epsilon)
        #print(str(action))
        returnAction.doubleArray = action[0].tolist()

        # Update for next step
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return returnAction
    def agent_start(self, observation):
        """
        This method is called once at the beginning of each episode.
        No reward is provided, because reward is only available after
        an action has been taken.

        Arguments:
           observation - An observation of type rlglue.types.Observation

        Returns:
           An action of type rlglue.types.Action
        """

        # this_int_action = self.randGenerator.randint(0, self.num_actions-1)
        observation_matrix = np.asmatrix(observation.doubleArray, dtype='float32')
        actions = self.action_network.predict(observation_matrix)

        return_action = Action()
        return_action.doubleArray = actions

        self.last_action = copy.deepcopy(actions)
        self.last_state = np.asmatrix(observation.doubleArray, dtype=floatX)

        return return_action