import numpy as np

# Bot (base class) and MLP (the function approximator) are assumed to be
# provided by sibling modules of this project.


class Bot_RL_MLP(Bot):

    def __init__(self, size_x, size_y, beta, hidden, learning_rate, reward):
        Bot.__init__(self)
        self.bot_name = "Bot_RL_MLP"
        self.mlp = MLP(size_x * size_y, hidden, size_x * size_y, learning_rate)
        self.reward = reward[:]
        # high value for beta (50?): exploitation
        # low value for beta: exploration
        self.beta = beta

    def get_action(self, world):
        """Returns an action depending on the given world."""
        self.info = world.get_sensor_info()
        self.h = self.mlp.get_action(self.info)
        # occupied fields must not be selected: push their scores far down
        for i in range(len(self.h)):
            if self.info[i] > 0:
                self.h[i] = -10

        # workaround: if only one move is left, play it automatically
        moves = world.get_moves()
        if len(moves) == 1:
            self.act = moves[0]
        else:
            # repeat the selection until a valid move has been chosen
            validation = False
            while not validation:
                self.act = self.rand_winner(self.h, self.beta)  # choose action
                x = self.act % world.size_x
                y = self.act // world.size_x
                validation = world.check_action(x, y)

        # convert the 1D action index into 2D board coordinates
        # (row-major board of width size_x)
        x = self.act % world.size_x
        y = self.act // world.size_x
        return (x, y)

    def evaluate_action(self, world_new):
        """Adapts the MLP considering the results (world_new) of its last action."""
        # build the one-hot action vector (currently unused below)
        act_vec = np.zeros(self.mlp.input_size)
        act_vec[self.act] = 1.0

        # Q-values of the chosen action before and after the move
        q0 = self.h[self.act]
        q1 = self.mlp.get_action(world_new.get_sensor_info())[self.act]

        # reward on the new board
        r = self.get_reward(world_new.get_winner())     # read reward
        if r == self.get_reward(1):     # This is cleaner than defining
            target = r                  # target as r + 0.9 * q1,
        else:                           # because weights now converge.
            target = 0.9 * q1           # gamma = 0.9
        delta = target - q0             # prediction error

        # important: only the delta at the action's position is treated as an
        # error; for every other position the error is 0
        error = np.zeros(self.mlp.input_size)
        error[self.act] = delta

        # important: learning uses the error and the world BEFORE the action
        self.mlp.evaluate_action(self.info, error)

    def rand_winner(self, S_from, beta):
        """Selects an action by softmax (Boltzmann) sampling over the scores."""
        total = 0.0
        p_i = 0.0
        rnd = np.random.random()
        d_r = len(S_from)
        sel = 0
        try:
            for i in range(d_r):
                total += np.exp(beta * min(S_from[i], 200))
            # if every term underflowed to 0, give all fields the score 1
            # so that each gets a probability greater than 0
            if total == 0:
                total = d_r
                S_from = [1] * d_r
            for i in range(d_r):
                p_i += np.exp(beta * min(S_from[i], 200)) / total
                if p_i > rnd:
                    sel = i
                    rnd = 1.1   # out of reach, so no later index gets selected
        except Exception:
            print(beta, S_from[i], S_from, total)
        return sel

    def get_reward(self, winner):
        """Calculates the reward for the actual board setup."""
        if 0 <= winner <= 2:
            return self.reward[int(winner)]
        else:
            return 0.0

    # """ Loads """
    # def load_data(self, filename):
    #     fo = open(filename, "r")
    #     data = json_tricks.load(fo.read())
    #     fo.close()
    #     return data

    # """ Saves """
    # def save_data(self, filename):
    #     data = {"bot": "Bot_RL_MLP",
    #             "version": 1,
    #             "mlp": self.mlp}
    #     fo = open(filename, "w")
    #     fo.write(json_tricks.dumps(data))
    #     fo.close()
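
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file). It restates the two
# core mechanisms of this bot with plain numpy so they can be read in
# isolation: (1) the softmax/Boltzmann action selection that rand_winner()
# implements, and (2) the TD(0) target computed in evaluate_action(). The
# helper name boltzmann_select and all numbers below are made up for
# illustration; the exponent clip at 200 and gamma = 0.9 come from the code
# above. Running this block requires Bot and MLP to be importable.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    def boltzmann_select(scores, beta):
        # same idea as rand_winner(): clip the exponent against overflow and
        # fall back to a uniform distribution if every term underflows to 0
        z = np.exp(beta * np.minimum(scores, 200))
        if z.sum() == 0:
            z = np.ones_like(z)
        return np.random.choice(len(scores), p=z / z.sum())

    # (1) beta trade-off: high beta -> exploitation, low beta -> exploration
    scores = np.array([0.1, 0.5, -10.0, 0.4])   # -10.0 marks an occupied field
    for beta in (0.1, 5.0, 50.0):
        picks = [boltzmann_select(scores, beta) for _ in range(1000)]
        print("beta=%5.1f  best move chosen %4d/1000 times" % (beta, picks.count(1)))

    # (2) TD(0) update with made-up values: only on a win is the raw reward
    # used as the target, otherwise the discounted successor Q-value
    q0, q1, r, win_reward = 0.30, 0.50, 0.0, 1.0
    target = r if r == win_reward else 0.9 * q1
    print("target=%.2f  delta=%.2f" % (target, target - q0))  # 0.45, 0.15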