class MainKeras:
    def __init__(self, missionXML, n_games=500, max_retries=3,
                 starting_zombies=1, XSize=10, ZSize=10,
                 aggregate_episode_every=5, agent_search_resolution=30,
                 load_model=False):
        # keras attributes
        self.n_games = n_games
        self._init_logger()

        # keras
        self.n_actions = 4
        self.agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.0005,
                           input_dims=7, n_actions=self.n_actions,
                           mem_size=1000000, batch_size=64, epsilon_end=0.01)
        self._load_dqn_model(load_model)
        self.scores = []
        self.eps_history = []
        self.aggregate_episode_every = aggregate_episode_every

        # qtable
        self.Qtb = {}
        self._load_qtable(load_model)
        self.epsilon = 0.01  # chance of taking a random action instead of the best

        # agent
        self.agent_host = MalmoPython.AgentHost()
        try:
            self.agent_host.parse(sys.argv)
        except RuntimeError as e:
            print('ERROR:', e)
            print(self.agent_host.getUsage())
            exit(1)

        # mission
        self.missionXML = missionXML
        # self._validate_mission()
        self.max_retries = max_retries

        # adding clients
        self.my_client_pool = None
        # self._add_starters()
        self._add_default_client()
        self.world_state = None

        # mission generator
        self.mission_generator = MissionGenerator(self.missionXML)
        self.starting_zombies = starting_zombies
        self.num_zombies = starting_zombies
        self.zombie_difference = 0  # for reward calculation
        self.XSize = XSize
        self.ZSize = ZSize

        # canvas
        self.visual = Visualizer(arena_width=self.XSize,
                                 arena_breadth=self.ZSize)

        # direction learner variables
        self.agent_search_resolution = agent_search_resolution
        self.agent_stepsize = 1
        self.agent_edge_weight = -100
        self.agent_mob_weight = -10
        self.agent_turn_weight = 0  # Negative values to penalise turning, positive to encourage.
        self.turning_diff = 0

        # for visualization
        self.flash = False
        self.current_life = 0

        # main loop variables
        self.self_x = 0
        self.self_z = 0
        self.current_yaw = 0
        self.ob = None
        self.all_zombies_dead = False
        self.num_heals = 0
        self.life_decrease_penalty = 0
        self.TimeAlive = 0
        self.time_rewards = 0
        self.heal_rewards = 0
        self.move_backwards_reward = 0

    def _init_logger(self):
        self.logger = logging.getLogger(__name__)
        if False:  # True if you want to see more information
            self.logger.setLevel(logging.DEBUG)
        else:
            self.logger.setLevel(logging.INFO)
        self.logger.handlers = []
        self.logger.addHandler(logging.StreamHandler(sys.stdout))

    def _load_dqn_model(self, load_model):
        if load_model:
            self.agent.load_model()

    def _load_qtable(self, load_model):
        if load_model:
            with open('QTable.txt') as json_file:
                self.Qtb = json.load(json_file)

    def _exportQTable(self):
        with open('QTable.txt', 'w') as outfile:
            json.dump(self.Qtb, outfile)

    def _updateQTable(self, reward, current_state):
        """Change q_table to reflect what we have learnt."""
        # retrieve the old action value from the Q-table
        # (indexed by the previous state and the previous action)
        old_q = self.Qtb[self.prev_s][self.prev_a]
        # TODO: what should the new action value be? try to modify my calculate reward method
        new_q = reward
        # assign the new action value to the Q-table
        self.Qtb[self.prev_s][self.prev_a] = new_q

    def _updateQTableFromTerminatingState(self, reward):
        """Change q_table to reflect what we have learnt, after reaching a terminal state."""
        # retrieve the old action value from the Q-table
        # (indexed by the previous state and the previous action)
        old_q = self.Qtb[self.prev_s][self.prev_a]
        # TODO: what should the new action value be?
        new_q = reward
        # assign the new action value to the Q-table
        self.Qtb[self.prev_s][self.prev_a] = new_q
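    # --- Hedged sketch (not part of the original code) -----------------------
    # Both update methods above leave the TODO unresolved and simply overwrite
    # the cell with the raw reward. For reference, the classic one-step
    # Q-learning target looks like the helper below; `alpha` and `gamma` are
    # illustrative constants here, not attributes of this class.
    @staticmethod
    def _q_update_sketch(old_q, reward, next_state_values, alpha=0.1, gamma=0.95):
        """Return old_q nudged towards reward + gamma * best future value."""
        best_future = max(next_state_values) if next_state_values else 0.0
        return old_q + alpha * (reward + gamma * best_future - old_q)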
    def _add_default_client(self):
        self.my_client_pool = MalmoPython.ClientPool()
        self.my_client_pool.add(MalmoPython.ClientInfo('127.0.0.1', 10000))

    def _generate_new_mission(self):
        self.mission_generator.restartXML()
        self.xcoords, self.zcoords = self.mission_generator.getCoords(
            self.XSize, self.ZSize)
        self.mission_generator.drawEntity("Zombie", self.starting_zombies)
        self.mission_generator.randomStart()
        # self.mission_generator.spawnItems()

    def _add_starters(self):
        # self.my_mission.removeAllCommandHandlers()
        self.my_mission.allowAllContinuousMovementCommands()
        self.my_mission.setViewpoint(0)
        # self.my_mission.allowAllDiscreteMovementCommands()
        # self.my_mission.requestVideo(320, 240)  # use default size instead

    def _validate_mission(self):
        self.my_mission = MalmoPython.MissionSpec(
            self.mission_generator.getXML(), True)
        # self.my_mission_record = MalmoPython.MissionRecordSpec()

    def _drawBoundaries(self):
        # connect consecutive corner coordinates with fences, wrapping around
        for i in range(len(self.xcoords)):
            self.my_mission.drawLine(
                self.xcoords[i % len(self.xcoords)], 4,
                self.zcoords[i % len(self.xcoords)],
                self.xcoords[(i + 1) % len(self.xcoords)], 4,
                self.zcoords[(i + 1) % len(self.xcoords)], "fence")

    def _retry_start_mission(self):
        self.my_mission_record = MalmoPython.MissionRecordSpec()
        # self.my_mission_record = malmoutils.get_default_recording_object(
        #     self.agent_host, "Mission_" + str(len(self.scores) - 1))
        for retry in range(self.max_retries):
            try:
                # Attempt to start the mission:
                self.my_mission.forceWorldReset()  # force world to reset for each iteration
                self.agent_host.startMission(self.my_mission,
                                             self.my_client_pool,
                                             self.my_mission_record, 0,
                                             "ZombieKiller")
                break
            except RuntimeError as e:
                if retry == self.max_retries - 1:
                    print("Error starting mission", e)
                    print("Is the game running?")
                    exit(1)
                else:
                    time.sleep(2)
        self._get_valid_worldstate()

    def _start_mission(self):
        self._generate_new_mission()
        self._validate_mission()
        self._add_starters()
        self._drawBoundaries()
        self._retry_start_mission()

    def _get_valid_worldstate(self):
        # Loop until mission starts:
        print("Waiting for the mission to start ", end=' ')
        self.world_state = self.agent_host.getWorldState()
        while not self.world_state.has_mission_begun:
            print(".", end="")
            time.sleep(0.1)
            self.world_state = self.agent_host.getWorldState()
            for error in self.world_state.errors:
                print("Error:", error.text)
        print()

    def _assign_observation(self):
        if self.world_state.number_of_observations_since_last_state > 0:
            msg = self.world_state.observations[-1].text
            self.ob = json.loads(msg)

    def _get_next_observation(self):
        self.world_state = self.agent_host.getWorldState()
        if self.world_state.number_of_observations_since_last_state > 0:
            msg = self.world_state.observations[-1].text
            return json.loads(msg)
        return self.ob

    def _get_position_and_orientation(self):
        if u'Yaw' in self.ob:
            self.current_yaw = self.ob[u'Yaw']
        if u'XPos' in self.ob:
            self.self_x = self.ob[u'XPos']
        if u'ZPos' in self.ob:
            self.self_z = self.ob[u'ZPos']

    def _calculate_turning_difference_from_zombies(self):
        x_pull, z_pull, current_yaw = self._get_diagonal_difference_from_zombies()
        yaw = -180 * math.atan2(x_pull, z_pull) / math.pi
        difference = yaw - current_yaw
        while difference < -180:
            difference += 360
        while difference > 180:
            difference -= 360
        # print("turn difference: ", difference / 180.0)
        return difference / 180.0

    def _get_diagonal_difference_from_zombies(self):
        if u'entities' in self.ob:
            entities = self.ob["entities"]
            # print(f'Entities: {entities}')
            return self._get_pull_from_entities(entities)
        # fall back to "no pull" if no entity data is available yet
        return 0, 0, self.current_yaw
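    # --- Hedged sketch (not part of the original code) -----------------------
    # _calculate_turning_difference_from_zombies maps the pull vector to a
    # yaw with atan2, wraps the difference into [-180, 180], and scales it to
    # [-1, 1] so it can be fed straight into a "turn" command. The standalone
    # helper below shows that wrapping step in isolation.
    @staticmethod
    def _wrap_turn_sketch(target_yaw, current_yaw):
        """E.g. _wrap_turn_sketch(170, -170) == -0.111..., the short way round."""
        difference = target_yaw - current_yaw
        while difference < -180:
            difference += 360
        while difference > 180:
            difference -= 360
        return difference / 180.0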
    def _get_pull_from_entities(self, entities):
        # Get our position/orientation:
        current_yaw = self.current_yaw
        if u'Yaw' in self.ob:
            current_yaw = self.ob[u'Yaw']
        if u'XPos' in self.ob:
            self.self_x = self.ob[u'XPos']
        if u'ZPos' in self.ob:
            self.self_z = self.ob[u'ZPos']
        num_zombie, x_pull, z_pull = 0, 0, 0
        for e in entities:
            if e["name"] == "Zombie":
                num_zombie += 1
                # Each zombie contributes to the direction we should head in...
                dist = max(0.0001,
                           (e["x"] - self.self_x) * (e["x"] - self.self_x) +
                           (e["z"] - self.self_z) * (e["z"] - self.self_z))
                # Weight each zombie by inverse distance. Max zombie health
                # is 20, according to the Minecraft wiki...
                weight = 20 / dist
                x_pull += weight * (e["x"] - self.self_x) / dist
                z_pull += weight * (e["z"] - self.self_z) / dist
        return x_pull, z_pull, current_yaw

    def _check_num_zombies(self):
        if u'entities' in self.ob:
            num_zombie = 0
            entities = self.ob["entities"]
            for e in entities:
                if e["name"] == "Zombie":
                    num_zombie += 1
            self._update_num_zombies(num_zombie)

    def _update_num_zombies(self, new_num_zombies):
        if new_num_zombies < self.num_zombies:
            self.zombie_difference = self.num_zombies - new_num_zombies
            self.num_zombies = new_num_zombies
            self.num_heals += 1
        else:
            self.zombie_difference = 0

    def _get_current_rewards(self, current_rewards):
        for reward in self.world_state.rewards:
            current_rewards += reward.getValue()
            print(f"INSIDE FOR: {reward.getValue()}")
        # life decrease penalty
        current_rewards += self.life_decrease_penalty
        print("life decrease penalty: " + str(self.life_decrease_penalty))
        # increase time rewards
        self._increase_time_reward()
        current_rewards += self.time_rewards
        print(f"increase_time: {self.time_rewards}")
        # healing rewards (added once; the original added them twice)
        current_rewards += self.heal_rewards
        print(f"healing rewards: {self.heal_rewards}")
        current_rewards += self._kill_zombie_reward()
        # print(f"kill zombie reward: {self._kill_zombie_reward()}")
        current_rewards += self.move_backwards_reward
        return current_rewards

    def _increase_time_reward(self):
        if "TimeAlive" in self.ob:
            t = self.ob[u'TimeAlive']
            if t > self.TimeAlive:
                self.time_rewards += (t - self.TimeAlive) * .2
            self.TimeAlive = t

    def _kill_zombie_reward(self):
        return self.zombie_difference * 100

    def _move_towards_zombies(self, difference_from_zombie):
        self.agent_host.sendCommand("turn " + str(difference_from_zombie))
        # move slower when turning faster - helps with "orbiting" problem
        move_speed = 1.0 if abs(difference_from_zombie) < 0.5 else 0
        self.agent_host.sendCommand("move " + str(move_speed))
        self.turning_diff = 0
        # print("move " + str(move_speed))

    def _move_away_from_zombies(self, difference_from_zombie):
        self.agent_host.sendCommand("turn " + str(difference_from_zombie))
        # move slower when turning faster - helps with "orbiting" problem
        move_speed = 1.0 if abs(difference_from_zombie) < 0.5 else 0
        self.agent_host.sendCommand("move -" + str(move_speed))
        self.turning_diff = 0
        # self.move_backwards_reward = -0.45
        # print("move -" + str(move_speed))

    def _attack(self):
        self.agent_host.sendCommand("attack 1")
        self.agent_host.sendCommand("attack 0")
        print('attack')

    def _heal(self):
        if self.num_heals > 0:
            if self.current_life <= 14:
                self.heal_rewards += 20
            self.agent_host.sendCommand(
                "chat /effect ZombieKiller instant_health 3")
            # penalise wasting a heal at high health, reward it at low health
            if self.ob[u'Life'] >= 15:
                self.heal_rewards = -25
            else:
                self.heal_rewards = 20
            self.num_heals -= 1
        else:
            # no heals available: penalise choosing this action
            self.heal_rewards -= 20

    def _translate_actions(self, action_num, difference_from_zombie):
        if action_num == 0:
            self._move_away_from_zombies(difference_from_zombie)
        elif action_num == 1:
            self._move_towards_zombies(difference_from_zombie)
        elif action_num == 2:
            self._attack()
        elif action_num == 3:
            self._heal()
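    # --- Hedged sketch (not part of the original code) -----------------------
    # _translate_actions dispatches on a bare integer; a name table such as
    # the one below (purely illustrative, unused by the class) makes logs
    # easier to read and documents the n_actions == 4 assumption in __init__.
    ACTION_NAMES_SKETCH = {
        0: "move_away_from_zombies",
        1: "move_towards_zombies",
        2: "attack",
        3: "heal",
    }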
    def _basic_observation_to_array(self, ob):
        # always append a value (defaulting to 0) so the input vector keeps
        # a fixed length; the original only appended when the key existed
        obs_array = []
        obs_array.append(ob['TimeAlive'] if 'TimeAlive' in ob else 0)
        obs_array.append(ob['Life'] if 'Life' in ob else 0)
        obs_array.append(ob['XPos'] if 'XPos' in ob else 0)
        obs_array.append(ob['YPos'] if 'YPos' in ob else 0)
        obs_array.append(ob['ZPos'] if 'ZPos' in ob else 0)
        return obs_array

    def _complete_observation_to_array(self, observation):
        observation.append(self.num_zombies)
        observation.append(self.turning_diff)
        return np.array(observation)

    def _observation_to_array(self, ob):
        ob = self._basic_observation_to_array(ob)
        return self._complete_observation_to_array(ob)

    def _check_all_zombies_dead(self):
        zombies_alive = False
        if u'entities' in self.ob:
            entities = self.ob["entities"]
            for e in entities:
                if e["name"] == "Zombie":
                    zombies_alive = True
                    break
        if not zombies_alive:
            print("quitting mission")
            self.agent_host.sendCommand("chat /kill @e")

    # parts of direction learner
    def _findUs(self, entities):
        # the only non-zombie entity in the arena is the agent itself
        for ent in entities:
            if ent["name"] == 'Zombie':
                continue
            return ent

    def _getBestAngle(self, current_yaw):
        '''Find ourselves and normalise the current yaw, ready for the 360-degree scan.'''
        if u'entities' in self.ob:
            us = self._findUs(self.ob['entities'])
            # Normalise current yaw:
            while current_yaw < 0:
                current_yaw += 360
            while current_yaw > 360:
                current_yaw -= 360
            return us, current_yaw

    def _look_for_best_option(self, us, current_yaw):
        '''Scan through 360 degrees, looking for the best direction in which to take the next step.'''
        scores = []
        for i in range(self.agent_search_resolution):
            # Calculate cost of turning:
            ang = 2 * math.pi * (old_div(i, float(self.agent_search_resolution)))
            yaw = i * 360.0 / float(self.agent_search_resolution)
            yawdist = min(abs(yaw - current_yaw), 360 - abs(yaw - current_yaw))
            turncost = self.agent_turn_weight * yawdist
            score = turncost
            # Calculate entity proximity cost for the new (x, z).
            # Minecraft yaw 0 faces +z, so a step along yaw `ang` moves
            # x by -sin(ang) and z by +cos(ang):
            x = us["x"] - self.agent_stepsize * math.sin(ang)
            z = us["z"] + self.agent_stepsize * math.cos(ang)
            if u'entities' in self.ob:
                for ent in self.ob['entities']:
                    dist = (ent["x"] - x) * (ent["x"] - x) + \
                           (ent["z"] - z) * (ent["z"] - z)
                    if dist == 0:
                        continue
                    weight = 0.0
                    if ent["name"] == 'Zombie':
                        weight = self.agent_mob_weight
                        dist -= 1  # assume mobs are moving towards us
                        if dist <= 0:
                            dist = 0.1
                    score += old_div(weight, float(dist))
            # add edge-proximity costs; exactly one score per candidate
            # direction (the original appended twice, which broke the
            # index-to-angle mapping in _find_best_score_get_angle)
            scores.append(self._calculate_turning_costs(score, x, z))
        return scores

    def _calculate_turning_costs(self, score, x, z):
        # Calculate cost of proximity to edges
        distRight = (2 + old_div(self.XSize, 2)) - x
        distLeft = (-2 - old_div(self.XSize, 2)) - x
        distTop = (2 + old_div(self.ZSize, 2)) - z
        distBottom = (-2 - old_div(self.ZSize, 2)) - z
        if distRight > 0:
            score += old_div(self.agent_edge_weight,
                             float(distRight * distRight * distRight * distRight))
        if distLeft > 0:
            score += old_div(self.agent_edge_weight,
                             float(distLeft * distLeft * distLeft * distLeft))
        if distTop > 0:
            score += old_div(self.agent_edge_weight,
                             float(distTop * distTop * distTop * distTop))
        if distBottom > 0:
            score += old_div(self.agent_edge_weight,
                             float(distBottom * distBottom * distBottom * distBottom))
        return score

    def _find_best_score_get_angle(self, scores):
        # Find best score:
        i = scores.index(max(scores))
        # Return as an angle in degrees:
        return i * 360.0 / float(self.agent_search_resolution)

    def _process_direction(self):
        us, current_yaw = self._getBestAngle(self.current_yaw)
        scores = self._look_for_best_option(us, current_yaw)
        angle = self._find_best_score_get_angle(scores)
        return angle
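    # --- Hedged sketch (not part of the original code) -----------------------
    # _look_for_best_option above scores one candidate heading per index i.
    # The helper below shows how an index is turned into a yaw and a probe
    # position, assuming Minecraft's convention that yaw 0 faces +z and x
    # grows along -sin(yaw). Values and the helper itself are illustrative.
    @staticmethod
    def _candidate_step_sketch(i, resolution, x0, z0, stepsize=1.0):
        """Return (yaw_degrees, probe_x, probe_z) for scan index i."""
        ang = 2 * math.pi * i / float(resolution)
        yaw = i * 360.0 / float(resolution)
        return yaw, x0 - stepsize * math.sin(ang), z0 + stepsize * math.cos(ang)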
    def _turn(self):
        if "entities" in self.ob:
            best_yaw = self._process_direction()
            difference = best_yaw - self.current_yaw
            while difference < -180:
                difference += 360
            while difference > 180:
                difference -= 360
            difference /= 180.0
            self.agent_host.sendCommand("move 1")
            self.agent_host.sendCommand("turn " + str(difference))
            self.turning_diff = difference
            # print('turning')
        else:
            self.turning_diff = 0

    def _plot_dqn_results(self, scores, eps_history,
                          filename='zombie_kill.png', lines=None):
        x = [i + 1 for i in range(self.n_games)]
        print("Plotting results...")
        self._plotLearning(x, scores, eps_history, filename, lines=lines)

    def _plotLearning(self, x, scores, epsilons, filename, lines=None):
        fig = plt.figure()
        ax = fig.add_subplot(111, label="1")
        ax2 = fig.add_subplot(111, label="2", frame_on=False)

        ax.plot(x, epsilons, color="C0")
        ax.set_xlabel("Game", color="C0")
        ax.set_ylabel("Epsilon", color="C0")
        ax.tick_params(axis='x', colors="C0")
        ax.tick_params(axis='y', colors="C0")

        N = len(scores)
        running_avg = np.empty(N)
        for t in range(N):
            running_avg[t] = np.mean(scores[max(0, t - 20):(t + 1)])

        ax2.scatter(x, running_avg, color="C1")
        # ax2.xaxis.tick_top()
        ax2.axes.get_xaxis().set_visible(False)
        ax2.yaxis.tick_right()
        # ax2.set_xlabel('x label 2', color="C1")
        ax2.set_ylabel('Score', color="C1")
        # ax2.xaxis.set_label_position('top')
        ax2.yaxis.set_label_position('right')
        # ax2.tick_params(axis='x', colors="C1")
        ax2.tick_params(axis='y', colors="C1")

        if lines is not None:
            for line in lines:
                plt.axvline(x=line)

        plt.savefig(filename)

    def _count_num_of_zombies(self):
        """Count the number of zombies remaining under the current observation."""
        count = 0
        if u'entities' in self.ob:
            entities = self.ob["entities"]
            for e in entities:
                if e["name"] == "Zombie":
                    count += 1
        return count

    def run_dqn(self):
        for i in range(1, self.n_games + 1):
            self.agent.tensorboard.step = i
            self._start_mission()
            score = 0
            done = False
            self.ob = None
            self.num_heals = 2
            self.agent_host.sendCommand("chat /effect ZombieKiller strength 500000")

            while self.world_state.is_mission_running:
                current_reward = 0
                # initialize rewards/penalties
                self.move_backwards_reward = 0
                self.life_decrease_penalty = 0
                self.time_rewards = 0
                self.heal_rewards = 0
                self.world_state = self.agent_host.getWorldState()

                if self.world_state.number_of_observations_since_last_state > 0:
                    # get observation
                    msg = self.world_state.observations[-1].text
                    self.ob = json.loads(msg)

                    # Check if life has dropped
                    if "Life" in self.ob:
                        life = self.ob[u'Life']
                        if life < self.current_life:
                            print("aaaaaaaaaaargh!!")
                            # life decrease penalty
                            self.life_decrease_penalty += life - self.current_life
                            self.flash = True
                        self.current_life = life

                    self._get_position_and_orientation()
                    difference = self._calculate_turning_difference_from_zombies()

                    # agent chooses action
                    ob_array = self._observation_to_array(self.ob)
                    # print(f'prev_ob: {ob_array}')
                    action = self.agent.choose_action(ob_array)
                    print("action", action)
                    self._translate_actions(action, difference)
                    time.sleep(0.1)

                    # keras calculations
                    observation_ = self._get_next_observation()
                    self._check_num_zombies()
                    new_ob_array = self._observation_to_array(observation_)
                    # print(f'next_ob: {new_ob_array}')
                    current_reward = self._get_current_rewards(current_reward)
                    score += current_reward
                    # self.visual.drawStats(score, self._count_num_of_zombies(), i)

                    # NOTE: done stays False here; terminal transitions are
                    # only signalled by the mission ending.
                    self.agent.remember(ob_array, action, current_reward,
                                        new_ob_array, done)
                    self.agent.learn(done)

                    # Visualization
                    self.visual.drawMobs(self.ob['entities'], self.flash,
                                         score, self._count_num_of_zombies(), i)
                    self.flash = False
                    self._check_all_zombies_dead()
                elif self.all_zombies_dead:
                    self.all_zombies_dead = False

            self.eps_history.append(self.agent.epsilon)
            self.scores.append(score)
            avg_score = np.mean(self.scores[max(0, i - 100):(i + 1)])
            print('episode', i, 'score %.2f' % score,
                  'average score %.2f' % avg_score)

            if not i % self.aggregate_episode_every or i == 1:
                self.agent.tensorboard.update_stats(
                    reward_avg=avg_score,
                    reward_min=np.min(self.scores[max(0, i - 100):(i + 1)]),
                    reward_max=np.max(self.scores[max(0, i - 100):(i + 1)]),
                    epsilon=self.agent.epsilon)
                print(f"TensorBoard logdir: {self.agent.log_dir}")

            if i % 10 == 0 and i > 0:
                self.agent.save_model()
                print('Saved Model :D')

        self._plot_dqn_results(self.scores, self.eps_history)
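    # --- Hedged sketch (not part of the original code) -----------------------
    # _act below implements epsilon-greedy selection with random tie-breaking
    # over the Q-table row. The same logic in compact form (an illustrative
    # helper, not called anywhere in this class):
    def _epsilon_greedy_sketch(self, q_row):
        """Return a random action with probability epsilon, else a best one."""
        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)
        best = max(q_row)
        return random.choice([a for a, q in enumerate(q_row) if q == best])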
    def _act(self, world_state, agent_host, current_r):
        """Take one action in response to the current world state."""
        obs_text = world_state.observations[-1].text
        self.ob = json.loads(obs_text)  # most recent observation
        # self._assign_observation()
        self.logger.debug(self.ob)
        if u'XPos' not in self.ob or u'ZPos' not in self.ob:
            self.logger.error("Incomplete observation received")
            return 0
        current_s = "%d:%d" % (int(self.ob[u'XPos']), int(self.ob[u'ZPos']))
        self.logger.debug("State: %s (x = %.2f, z = %.2f)" %
                          (current_s, float(self.ob[u'XPos']),
                           float(self.ob[u'ZPos'])))
        if current_s not in self.Qtb:
            self.Qtb[current_s] = ([0] * self.n_actions)

        # update Q values
        if self.prev_s is not None and self.prev_a is not None:
            self._updateQTable(current_r, current_s)

        # select the next action
        rnd = random.random()
        if rnd < self.epsilon:
            action = random.randint(0, self.n_actions - 1)
        else:
            m = max(self.Qtb[current_s])
            self.logger.debug("Current values: %s" %
                              ",".join(str(x) for x in self.Qtb[current_s]))
            l = list()
            for x in range(0, self.n_actions):
                if self.Qtb[current_s][x] == m:
                    l.append(x)
            y = random.randint(0, len(l) - 1)
            action = l[y]

        # try to send the selected action, only updating prev_s if this succeeds
        try:
            difference = self._calculate_turning_difference_from_zombies()
            self._translate_actions(action, difference)
            self.prev_s = current_s
            self.prev_a = action
        except RuntimeError as e:
            self.logger.error("Failed to send command: %s" % e)

        return current_r
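    # --- Hedged sketch (not part of the original code) -----------------------
    # _act keys the Q-table on the agent's integer grid cell, "x:z". A row is
    # lazily initialised to n_actions zeros, so QTable.txt serialises roughly
    # as {"3:-2": [0, 0, 0, 0], ...}. The helper below just shows the format.
    @staticmethod
    def _state_key_sketch(x_pos, z_pos):
        """E.g. _state_key_sketch(3.7, -2.2) == '3:-2' (int() truncates)."""
        return "%d:%d" % (int(x_pos), int(z_pos))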
    def run_qlearning(self):
        cumulative_rewards = []
        for i in range(self.n_games):
            self._start_mission()
            total_reward = 0
            is_first_action = True
            self.prev_s = None
            self.prev_a = None

            while self.world_state.is_mission_running:
                current_r = 0
                if is_first_action:
                    # wait until the first valid observation arrives
                    while True:
                        time.sleep(0.1)
                        self.world_state = self.agent_host.getWorldState()
                        for error in self.world_state.errors:
                            self.logger.error("Error: %s" % error.text)
                        for reward in self.world_state.rewards:
                            current_r += reward.getValue()
                        if (self.world_state.is_mission_running and
                                len(self.world_state.observations) > 0 and
                                self.world_state.observations[-1].text != "{}"):
                            total_reward += self._act(self.world_state,
                                                      self.agent_host,
                                                      current_r)
                            break
                        if not self.world_state.is_mission_running:
                            break
                    is_first_action = False
                    print(f"current reward = {current_r}")
                else:
                    # wait for a non-zero reward
                    while self.world_state.is_mission_running and current_r == 0:
                        time.sleep(0.1)
                        self.world_state = self.agent_host.getWorldState()
                        for error in self.world_state.errors:
                            self.logger.error("Error: %s" % error.text)
                        for reward in self.world_state.rewards:
                            current_r += reward.getValue()
                    # print("waiting to stabilize")
                    # allow time to stabilise after the action
                    while True:
                        time.sleep(0.1)
                        self.world_state = self.agent_host.getWorldState()
                        for error in self.world_state.errors:
                            self.logger.error("Error: %s" % error.text)
                        for reward in self.world_state.rewards:
                            current_r += reward.getValue()
                        if (self.world_state.is_mission_running and
                                len(self.world_state.observations) > 0 and
                                self.world_state.observations[-1].text != "{}"):
                            total_reward += self._act(self.world_state,
                                                      self.agent_host,
                                                      current_r)
                            break
                        if not self.world_state.is_mission_running:
                            break
                self._check_all_zombies_dead()

            # process the final reward before reporting the total
            self.logger.debug("Final reward: %d" % current_r)
            total_reward += current_r
            print('Cumulative reward: %d' % total_reward)

            # update Q values
            if self.prev_s is not None and self.prev_a is not None:
                self._updateQTableFromTerminatingState(current_r)

            self._exportQTable()  # export the Q table after each iteration
            cumulative_rewards += [total_reward]

        print("Cumulative rewards for all %d runs:" % self.n_games)
        print(cumulative_rewards)
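# --- Hedged usage sketch (not part of the original file) ----------------------
# How this class would typically be driven; `EXAMPLE_MISSION_XML` is a
# placeholder name, and the project's real entry point may differ.
# if __name__ == "__main__":
#     trainer = MainKeras(EXAMPLE_MISSION_XML, n_games=500, load_model=False)
#     trainer.run_dqn()          # DQN training loop
#     # trainer.run_qlearning()  # or: tabular Q-learning loop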