def agent_step(self, reward, observation):
    self.states_diff_list.append([a - b for (a, b) in zip(observation.doubleArray, self.lastObservation.doubleArray)])
    self.lastObservation = copy.deepcopy(observation)
    self.approximateValueFunction()
    #end of test
    print reward
    #test how reward approximation works
    self.approximateRewardFunction(reward, observation)
    #end of test
    thisDoubleAction = self.approximateAction()
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    #approximate value function
    self.action_list.append(thisDoubleAction)
    #end of test
    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def agent_start(self, observation): """ This method is called once at the beginning of each episode. No reward is provided, because reward is only available after an action has been taken. Arguments: observation - An observation of type rlglue.types.Observation Returns: An action of type rlglue.types.Action """ self.step_counter = 0 self.batch_counter = 0 # We report the mean loss for every epoch. self.loss_averages = [] self.start_time = time.time() # this_int_action = self.randGenerator.randint(0, self.num_actions-1) observation_matrix = np.asmatrix(observation.doubleArray, dtype='float32') actions = self.action_network.fprop(observation_matrix) return_action = Action() return_action.doubleArray = actions self.last_action = copy.deepcopy(actions) self.last_observation = observation.doubleArray return return_action
def agent_step(self, reward, observation):
    import math
    # try to set the epsilon inverse of the reward that we have got
    if self.Episode_Counter > 10000:
        self.Epsilon = 0.0
    self.Steps += 1
    print reward
    thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def agent_step(self, reward, observation):
    import math
    if self.Episode_Counter > 5000:
        self.Epsilon = 0
    self.Steps += 1
    print reward
    thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def agent_start(self, observation): """ This method is called once at the beginning of each episode. No reward is provided, because reward is only available after an action has been taken. Arguments: observation - An observation of type rlglue.types.Observation Returns: An action of type rlglue.types.Action """ self.step_counter = 0 self.batch_counter = 0 # We report the mean loss for every epoch. self.loss_averages = [] self.start_time = time.time() #this_int_action = self.randGenerator.randint(0, self.num_actions-1) observation_matrix = np.asmatrix(observation.doubleArray, dtype='float32') actions = self.action_network.fprop(observation_matrix) return_action = Action() return_action.doubleArray = [actions] self.last_action = copy.deepcopy(return_action) self.last_observation = observation.doubleArray return return_action
def agent_start(self, observation):
    self.P = np.asarray([[0.0 for j in range(self.N_AC)] for i in range(self.N_PC)])
    theState = observation.doubleArray
    if dynamicEpsilon == '1':
        self.q_epsilon = 0.3 - 0.005*self.episode
    else:
        self.q_epsilon = 0.3
    r_PC = self.getProbGaussians(theState[0], theState[1])
    res = self.egreedy(theState, r_PC)
    phi_AC = res[0]
    r_1_AC = res[1]
    r_2_AC = []
    for i in xrange(self.N_AC):
        r_2_AC.append(math.exp((-1*(phi_AC - 2*math.pi*i/self.N_AC)**2)/(2*self.sigma_AC**2)))
    # Update P_ij
    for i in xrange(self.N_AC):
        for j in xrange(self.N_PC):
            self.P[j, i] = self.q_stepsize*self.P[j, i] + r_2_AC[i]*r_PC[j]
    returnAction = Action()
    returnAction.doubleArray = [phi_AC]
    # finding closest AC
    closest_AC = r_2_AC.index(max(r_2_AC))
    self.lastQ = r_1_AC[closest_AC]
    self.episode += 1
    return returnAction
def agent_step(self, reward, observation):
    self.reward += reward
    self.step += 1
    self.total_reward += reward
    thisDoubleAction = self.agent_step_action(observation.doubleArray)
    if self.isRisk(observation.doubleArray, thisDoubleAction):
        self.times += 1
        thisDoubleAction = util.baselinePolicy(observation.doubleArray)
        from pybrain.supervised.trainers import BackpropTrainer
        from pybrain.datasets import SupervisedDataSet
        ds = SupervisedDataSet(12, 4)
        ds.addSample(observation.doubleArray, self.best.activate(observation.doubleArray))
        trainer = BackpropTrainer(self.network, ds)
        trainer.train()
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(returnAction)
    self.lastReward = reward
    return returnAction
def agent_step(self, reward, observation):
    theState = observation.doubleArray
    r_PC = self.getProbGaussians(theState[0], theState[1])
    res = self.egreedy(theState, r_PC)
    phi_AC = res[0]
    r_1_AC = res[1]
    r_2_AC = []
    for i in xrange(self.N_AC):
        r_2_AC.append(math.exp((-1*(phi_AC - 2*math.pi*i/self.N_AC)**2)/(2*self.sigma_AC**2)))
    # Calculate reward prediction error
    delta = reward + self.q_gamma*max(r_1_AC) - self.lastQ
    #print self.q_gamma*max(r_1_AC), self.lastQ
    # Update synaptic weights
    for i in xrange(self.N_PC):
        for j in xrange(self.N_AC):
            self.W[i, j] = self.q_stepsize * delta * self.P[i, j]
    # Update P_ij
    for i in xrange(self.N_AC):
        for j in xrange(self.N_PC):
            self.P[j, i] = self.q_stepsize*self.P[j, i] + r_2_AC[i]*r_PC[j]
    returnAction = Action()
    returnAction.doubleArray = [phi_AC]
    # finding closest AC
    closest_AC = r_2_AC.index(max(r_2_AC))
    self.lastQ = r_1_AC[closest_AC]
    self.episode += 1
    return returnAction
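# A minimal numpy sketch of the same action-cell tuning and P update performed by the
# nested loops above, shown only to clarify what they compute. N_AC, sigma_AC, q_stepsize,
# r_PC and P are assumed to have the same meaning and shapes as in the agent code
# (P has shape (N_PC, N_AC)); this is not the original implementation.
import numpy as np

def update_traces(P, r_PC, phi_AC, N_AC, sigma_AC, q_stepsize):
    # Gaussian tuning of each action cell around its preferred direction 2*pi*i/N_AC
    preferred = 2 * np.pi * np.arange(N_AC) / N_AC
    r_2_AC = np.exp(-(phi_AC - preferred) ** 2 / (2 * sigma_AC ** 2))
    # P[j, i] <- q_stepsize * P[j, i] + r_2_AC[i] * r_PC[j], written as an outer product
    P = q_stepsize * P + np.outer(r_PC, r_2_AC)
    return P, r_2_AC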
def agent_start(self, observation): """ This method is called once at the beginning of each episode. No reward is provided, because reward is only available after an action has been taken. Arguments: observation - An observation of type rlglue.types.Observation Returns: An action of type rlglue.types.Action """ # this_int_action = self.randGenerator.randint(0, self.num_actions-1) observation_matrix = np.asmatrix(observation.doubleArray, dtype='float32') actions = self.action_network.predict(observation_matrix) return_action = Action() return_action.doubleArray = actions self.last_action = copy.deepcopy(actions) self.last_state = np.asmatrix(observation.doubleArray, dtype=floatX) return return_action
def agent_step(self,reward, observation): #Generate random action, 0 or 1 #print "actual===reward",reward #print "actual observation==== ",observation.doubleArray #approximate value function self.lastObservation=copy.deepcopy(observation) self.next_observation_list.append(observation.doubleArray) self.approximateKernelFunction() #end of test print reward #test how reward approximation works self.approximateRewardFunction(reward,observation) #end of test thisDoubleAction=self.approximateAction() returnAction=Action() returnAction.doubleArray = thisDoubleAction #approximate value function self.action_list.append(thisDoubleAction) self.last_observation_list.append(observation.doubleArray) #end of test self.lastAction=copy.deepcopy(returnAction) return returnAction
def agent_step(self, reward, observation):
    import math
    self.Rewards += reward
    # function sigmoid
    if self.Episode_Counter > Training_Runs:
        self.Epsilon = 0
    self.Steps += 1
    thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def agent_step(self, reward, observation):
    import math
    # function sigmoid
    if self.Episode_Counter > 10000:
        self.Epsilon = 0
    else:
        self.Epsilon = util.randomSigmoidEpsilon(reward, 0.02, 50)
    self.Steps += 1
    print reward
    thisDoubleAction = self.agent_action_step(reward, observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def agent_step(self, reward, observation):
    self.stepCount = self.stepCount + 1
    action = Action()
    action.intArray = observation.intArray
    action.doubleArray = observation.doubleArray
    action.charArray = observation.charArray
    return action
def agent_start(self, observation):
    self.stepCount = 0
    action = Action()
    action.intArray = observation.intArray
    action.doubleArray = observation.doubleArray
    action.charArray = observation.charArray
    return action
def agent_step(self, reward, observation):
    self.stepCount = self.stepCount + 1
    action = Action()
    action.intArray = observation.intArray
    action.doubleArray = observation.doubleArray
    action.charArray = observation.charArray
    return action
def agent_start(self, observation):
    self.stepCount = 0
    action = Action()
    action.intArray = observation.intArray
    action.doubleArray = observation.doubleArray
    action.charArray = observation.charArray
    return action
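# A minimal sketch of how a skeleton agent built from the methods above is typically wired
# into RL-Glue, assuming the standard Python codec (rlglue.agent.Agent, rlglue.agent.AgentLoader,
# rlglue.types.Action). The class name here is a placeholder and not part of the original snippets.
from rlglue.agent.Agent import Agent
from rlglue.agent import AgentLoader as AgentLoader
from rlglue.types import Action

class SkeletonAgent(Agent):
    def agent_init(self, taskSpecString):
        self.stepCount = 0

    def agent_start(self, observation):
        self.stepCount = 0
        action = Action()
        action.intArray = observation.intArray
        action.doubleArray = observation.doubleArray
        action.charArray = observation.charArray
        return action

    def agent_step(self, reward, observation):
        self.stepCount = self.stepCount + 1
        action = Action()
        action.intArray = observation.intArray
        action.doubleArray = observation.doubleArray
        action.charArray = observation.charArray
        return action

    def agent_end(self, reward):
        pass

    def agent_cleanup(self):
        pass

    def agent_message(self, message):
        return "SkeletonAgent does not respond to messages."

if __name__ == "__main__":
    AgentLoader.loadAgent(SkeletonAgent())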
def agent_step(self, reward, observation):
    # Observation
    obs_array = np.array(observation.doubleArray)
    #print "state: %3f %3f %3f %3f" % (obs_array[0], obs_array[1], obs_array[2], obs_array[3])

    # Compose State : 4-step sequential observation
    #self.state = self.rescale_value(obs_array)
    self.state = obs_array
    #print("state2:" + str(self.state))
    #print "state: %3f %3f %3f %3f" % (self.state[0], self.state[1], self.state[2], self.state[3])
    state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 12), dtype=np.float32))
    #print("state2_:" + str(state_))

    # Exploration decays along the time sequence
    if self.policyFrozen is False:  # Learning ON/OFF
        if self.DQN.initial_exploration < self.time:
            self.epsilon -= 1.0/10**6
            if self.epsilon < 0.1:
                self.epsilon = 0.1
            eps = self.epsilon
        else:  # Initial Exploration Phase
            print "Initial Exploration : %d / %d steps" % (self.time, self.DQN.initial_exploration)
            eps = 1.0
    else:  # Evaluation
        print "Policy is Frozen"
        eps = 0.05

    # Generate an Action by e-greedy action selection
    returnAction = Action()
    action = self.DQN.e_greedy(state_, eps)
    #print(str(action))
    returnAction.doubleArray = action[0].tolist()

    # Learning Phase
    if self.policyFrozen is False:  # Learning ON/OFF
        self.DQN.stockExperience(self.time, self.last_state,
                                 np.asarray(self.lastAction.doubleArray, dtype=np.float32),
                                 reward, self.state, False)
        self.DQN.experienceReplay(self.time)

        # Target model update
        # if self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0:
        #     print "########### MODEL UPDATED ######################"
        #     self.DQN.hard_target_model_update()

    # Simple text based visualization
    print 'Time Step %d / ACTION %s / REWARD %.5f / EPSILON %.5f' % (self.time, str(action[0]), reward, eps)

    # Updates for next step
    self.last_observation = obs_array
    if self.policyFrozen is False:
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.time += 1

    return returnAction
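# A small standalone restatement of the epsilon schedule used in the step above: epsilon stays
# at 1.0 during the initial exploration phase, then decays by 1e-6 per step down to a floor of
# 0.1, and a fixed 0.05 is used while the policy is frozen for evaluation. The helper name and
# signature are illustrative only.
def annealed_epsilon(time_step, epsilon, initial_exploration, policy_frozen):
    if policy_frozen:
        return epsilon, 0.05              # evaluation: mostly greedy
    if time_step <= initial_exploration:
        return epsilon, 1.0               # initial exploration: fully random
    epsilon = max(epsilon - 1.0 / 10 ** 6, 0.1)  # linear decay with a floor of 0.1
    return epsilon, epsilon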
def _getAction(self): """ Return a RLGlue action that is made out of a numpy array yielded by the hold pybrain agent. """ action = RLGlueAction() action.doubleArray = self.agent.getAction().tolist() action.intArray = [] return action
def exp_step(self, reward, observation, is_testing):
    return_action = Action()
    cur_observation = self._scale_inputs(observation.doubleArray, self.observation_ranges)
    double_action = self._choose_action(cur_observation, self.action_stdev, self.noise_stdev)
    loss = None
    if not is_testing:
        loss = self._do_training(np.asmatrix(reward, dtype=floatX), cur_observation, double_action, False)
    return_action.doubleArray = [copy.deepcopy(double_action)]
    return return_action if is_testing else (return_action, loss)
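# The _scale_inputs helper used above is not shown in these snippets. A plausible minimal
# version, assuming observation_ranges is a sequence of (low, high) pairs and the network
# expects inputs in [-1, 1], might look like the following; this is purely illustrative and
# not the original code.
import numpy as np

def _scale_inputs(values, observation_ranges):
    scaled = []
    for v, (low, high) in zip(values, observation_ranges):
        # map [low, high] linearly onto [-1, 1]
        scaled.append(2.0 * (v - low) / (high - low) - 1.0)
    return np.asarray(scaled, dtype='float32')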
def agent_step(self, reward, observation): """ This method is called each time step. Arguments: reward - Real valued reward. observation - An observation of type rlglue.types.Observation Returns: An action of type rlglue.types.Action """ self.step_counter += 1 return_action = Action() cur_observation = observation.doubleArray #TESTING--------------------------- if self.testing: self.total_reward += reward double_action = self._choose_action(self.test_data_set, cur_observation, np.clip(reward, -1, 1)) # if self.pause > 0: # time.sleep(self.pause) #NOT TESTING--------------------------- else: double_action = self._choose_action(self.data_set, cur_observation, np.clip(reward, -1, 1), self.action_stdev) if len(self.data_set) > self.batch_size: loss = self._do_training() self.batch_counter += 1 self.loss_averages.append(loss) return_action.doubleArray = [double_action] self.last_action = copy.deepcopy(return_action) self.last_observation = cur_observation return_action.doubleArray = [double_action * 2 - 1] return return_action
def agent_start(self, observation): # print "Observation: ",observation.doubleArray returnAction = Action() returnAction.doubleArray = self.agent_policy(observation) self.lastAction = copy.deepcopy(returnAction) self.lastObservation = copy.deepcopy(observation) self.write_data(observation.doubleArray, "observation") return returnAction
def exp_step(self, reward, observation, is_testing):
    return_action = Action()
    cur_observation = observation.doubleArray
    double_action = self._choose_action(cur_observation, np.clip(reward, -1, 1), self.action_stdev)
    loss = None
    if not is_testing:
        loss = self._do_training()
    self.last_action = copy.deepcopy(double_action)
    self.last_observation = cur_observation
    return_action.doubleArray = [double_action]
    return return_action if is_testing else (return_action, loss)
def create_action(self, act):
    self.last_act = act
    if np.isscalar(act):
        act = np.array([act])
    assert (act.size == self.action_dims()), 'illegal action dimension'
    return_action = Action()
    if self.int_action_dims() > 0:
        return_action.intArray = [act[:self.int_action_dims()].astype(int)]
    if self.double_action_dims() > 0:
        # the double-valued components follow the integer-valued ones in act
        return_action.doubleArray = [act[self.int_action_dims():].astype(float)]
    return return_action
def agent_step(self, reward, observation): # print "Observation: ",observation.doubleArray # print "Reward: ",reward returnAction = Action() returnAction.doubleArray = self.agent_policy(observation) self.lastAction = copy.deepcopy(returnAction) self.lastObservation = copy.deepcopy(observation) self.write_data(observation.doubleArray, "observation") self.write_data([reward], "reward") return returnAction
def postprocess_actions(self, action_values):
    action = Action()
    if self.continuous_actions:
        doubleactions = copy.deepcopy(action_values)
        action.doubleArray = doubleactions.tolist()[0]
    else:
        intactions = copy.deepcopy(action_values)
        minranges = self.action_ranges[:, 0].T
        maxranges = self.action_ranges[:, 1].T
        intactions = np.maximum(intactions, minranges)
        intactions = np.minimum(intactions, maxranges)
        intactions = np.rint(intactions).astype(np.int)
        action.intArray = intactions.tolist()[0]
    return action
def agent_start(self, observation):
    #Generate random action, 0 or 1
    global mutation_rate, variance, flag, changeTopology
    self.lastAction = None
    self.lastObservation = None
    self.reward = 0.0
    self.episode_counter += 1.0
    self.step = 1.0
    self.times = 1
    #mutation_rate = mutation_rate*decay_rate
    if changeTopology:
        print "another topology"
        mutation_rate = 0.1
        variance = 0.005
        flag = "two"
        from pybrain.supervised.trainers import BackpropTrainer
        self.seed = StateToActionNetwork()
        trainer = BackpropTrainer(self.seed, self.ds, learningrate=0.3)
        for i in range(0, 10):
            trainer.trainEpochs(epochs=5)
        print "after training"
        self.generation = self.firstGeneration()
        self.best = self.seed
        changeTopology = False
    if self.generation[15].fittness >= 0.5 and flag == "one":
        self.network = self.generation[15].network
        changeTopology = True
    else:
        self.network = self.getUnevaluatedGenome().network
    self.ds = SupervisedDataSet(12, 4)
    thisDoubleAction = self.agent_step_action(observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def getAction(dir, isJump, isSpeed):
    # -1, 0, 1 for direction, 1 is to the right
    # 0, 1 for jump
    # 0, 1 for speed
    action = Action()
    action.numInts = 3
    action.numDoubles = 0
    action.numChars = 0
    action.intArray = []
    action.doubleArray = []
    action.charArray = []
    action.intArray.append(dir)
    action.intArray.append(isJump)
    action.intArray.append(isSpeed)
    return action
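# Example use of getAction above, following the encoding given in its comments
# (direction, jump, speed): move right at full speed while jumping.
run_right_and_jump = getAction(1, 1, 1)
# run_right_and_jump.intArray is now [1, 1, 1]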
def agent_step(self, reward, observation):
    print reward
    thisDoubleAction = self.agent_step_action(observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def agent_start(self, observation):
    #Generate random action, 0 or 1
    self.lastAction = None
    self.lastObservation = None
    print " "
    thisDoubleAction = self.agent_step_action(observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_step(self, reward, observation):
    #Generate random action, 0 or 1
    print reward
    thisDoubleAction = self.randomAction()
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastObservation = copy.deepcopy(observation)
    self.lastAction = copy.deepcopy(returnAction)
    return returnAction
def agent_start(self,observation): #Generate random action, 0 or 1 print " " thisDoubleAction=self.approximateAction() returnAction=Action() returnAction.doubleArray = thisDoubleAction #test for approximate value function # self.last_observation_list.append(observation.doubleArray) self.action_list.append(thisDoubleAction) #end of test self.lastAction=copy.deepcopy(returnAction) self.lastObservation=copy.deepcopy(observation) return returnAction
def agent_start(self, observation):
    self.Episode_Counter += 1
    self.lastAction = None
    self.lastObservation = None
    self.Steps = 1
    print " "
    thisDoubleAction = self.agent_action_start(observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def select_action(self, policy, state): """ Action selection based on linear policies """ state_vector = np.array(state.doubleArray) action_vector = np.matmul(policy, state_vector) # print(f"State: {state_vector}") # print(f"Action: {action_vector}") # Constraint for i in range(len(action_vector)): if action_vector[i] > self.max_u: action_vector[i] = self.max_u elif action_vector[i] < -self.max_u: action_vector[i] = -self.max_u action_selected = Action(numDoubles=action_vector.size) action_selected.doubleArray = action_vector.tolist() return action_selected
def agent_start(self, observation):
    #Generate random action, 0 or 1
    global mutation_rate, variance
    self.lastAction = None
    self.lastObservation = None
    self.reward = 0.0
    self.episode_counter += 1.0
    self.step = 1.0
    self.times = 1
    mutation_rate = mutation_rate*decay_rate
    self.network = self.getUnevaluatedGenome().network
    thisDoubleAction = self.agent_step_action(observation.doubleArray)
    returnAction = Action()
    returnAction.doubleArray = thisDoubleAction
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_start(self, observation):
    # Observation
    obs_array = np.array(observation.doubleArray)

    # Initialize State
    #self.state = self.rescale_value(obs_array)
    self.state = obs_array
    #print("state1:" + str(self.state))
    state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 12), dtype=np.float32))

    # Generate an Action e-greedy
    returnAction = Action()
    action = self.DQN.e_greedy(state_, self.epsilon)
    #print(str(action))
    returnAction.doubleArray = action[0].tolist()

    # Update for next step
    self.lastAction = copy.deepcopy(returnAction)
    self.last_state = self.state.copy()
    self.last_observation = obs_array
    return returnAction