# Evaluation script for a trained Tetris DQN agent.
from typing import List

from keras.models import load_model  # assuming Keras, consistent with the rest of this repo

# AgentConf, Tetris and DQNAgent come from this project's own modules.


def run_eval(dir_name: str, episodes: int = 100, render: bool = False) -> List[int]:
    agent_conf = AgentConf()
    env = Tetris()
    agent = DQNAgent(env.get_state_size(),
                     n_neurons=agent_conf.n_neurons,
                     activations=agent_conf.activations,
                     epsilon_stop_episode=agent_conf.epsilon_stop_episode,
                     mem_size=agent_conf.mem_size,
                     discount=agent_conf.discount,
                     replay_start_size=agent_conf.replay_start_size)

    # timestamp_str = "20190730-165821"
    # log_dir = f'logs/tetris-nn={str(agent_conf.n_neurons)}-mem={agent_conf.mem_size}' \
    #           f'-bs={agent_conf.batch_size}-e={agent_conf.epochs}-{timestamp_str}'
    # tetris-20190731-221411-nn=[32, 32]-mem=25000-bs=512-e=1 good
    log_dir = 'logs/' + dir_name

    # Load the trained model and disable exploration for evaluation.
    agent.model = load_model(f'{log_dir}/model.hdf')
    agent.epsilon = 0

    scores = []

    for episode in range(episodes):
        env.reset()
        done = False

        while not done:
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())

            # Find the action that corresponds to the best state.
            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            _, done = env.hard_drop([best_action[0], 0], best_action[1], render=render)

        scores.append(env.score)
        # Print results at the end of the episode.
        print(f'episode {episode} => {env.score}')

    return scores
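# A minimal usage sketch for run_eval: point it at a run directory under
# logs/ that contains a saved model.hdf. The directory name below is the run
# the comments above flag as good; the episode count here is arbitrary.
if __name__ == '__main__':
    eval_scores = run_eval('tetris-20190731-221411-nn=[32, 32]-mem=25000-bs=512-e=1',
                           episodes=10, render=False)
    print(f'mean score over {len(eval_scores)} episodes: '
          f'{sum(eval_scores) / len(eval_scores):.1f}')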
state_size = env.OBSERVATION_SPACE_VALUES
action_size = env.ACTION_SPACE_SIZE
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 64
best_score = -1
render = False

if PLAY:
    play_game(agent, False)
else:
    if LOAD_MODEL:
        print("Loading Model...")
        agent.load(load_file_name)
        agent.epsilon = 1.0
        agent.learning_rate = 0.001
        agent.epsilon_decay = 0.990
        agent.gamma = 0.95

    # Phase loop: env.number_of_grids = phase + 3, so this run covers
    # phase 7 only (a 10-grid board).
    for phase in range(7, 8):
        env.number_of_grids = phase + 3
        agent.epsilon = 1.0                # reset exploration for the new phase
        agent.memory = deque(maxlen=2000)  # fresh replay buffer per phase
        phase_scores = deque(maxlen=5)

        for e in range(EPISODES):
            done = False
            state = env.reset()
            # env.seed(0)
            state = np.reshape(state, [1, state_size])
            score = 0
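# The snippet above tunes agent.gamma, agent.epsilon_decay and the replay
# buffer, but the replay step itself lives inside DQNAgent. This standalone
# sketch shows the standard DQN update those hyperparameters feed into; the
# attribute names mirror the ones set above, and epsilon_min is assumed.
# It is an illustration, not this repo's DQNAgent.replay.
import random

import numpy as np


def replay_sketch(agent, batch_size):
    batch = random.sample(agent.memory, batch_size)
    for state, action, reward, next_state, done in batch:
        target = reward
        if not done:
            # Bellman target: r + gamma * max_a' Q(s', a')
            target = reward + agent.gamma * np.amax(agent.model.predict(next_state)[0])
        target_f = agent.model.predict(state)  # current Q-values
        target_f[0][action] = target           # overwrite the taken action only
        agent.model.fit(state, target_f, epochs=1, verbose=0)
    if agent.epsilon > agent.epsilon_min:
        agent.epsilon *= agent.epsilon_decay   # decay exploration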
def dqn():
    episodes = 10000
    max_steps = None
    epsilon_stop_episode = 7000
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    render_every = 1000
    log_every = 20
    replay_start_size = 2000
    train_every = 1
    n_neurons = [32, 32]
    render_delay = None
    activations = ['relu', 'relu', 'linear']

    env = Tetris()

    # Optionally resume from a pickled agent instead of building a fresh one:
    # with open(r"saved_agents/pickled_new_agent_10000_7000", "rb") as input_file:
    #     agent = pickle.load(input_file)
    # agent.epsilon = 0

    agent = DQNAgent(env.get_state_size(),
                     n_neurons=n_neurons, activations=activations,
                     epsilon_stop_episode=epsilon_stop_episode, mem_size=mem_size,
                     discount=discount, replay_start_size=replay_start_size)
    agent.epsilon = 0

    # Adversarial "hateris" agent experiment, currently disabled:
    # hateris = DQNAgent(env.get_state_size() + 1,
    #                    n_neurons=n_neurons, activations=activations,
    #                    epsilon_stop_episode=epsilon_stop_episode, mem_size=mem_size,
    #                    discount=discount, replay_start_size=replay_start_size)
    # env.hater = hateris

    log_dir = (f'logs/tetris-nn={str(n_neurons)}-mem={mem_size}-bs={batch_size}'
               f'-e={epochs}-{datetime.now().strftime("%Y%m%d-%H%M%S")}')
    log = CustomTensorBoard(log_dir=log_dir)

    scores = []
    tot_max_score = 0

    for episode in tqdm(range(episodes)):
        current_state = env.reset()
        done = False
        steps = 0

        if render_every and episode % render_every == 0:
            render = True
        else:
            render = False

        # Game
        while not done and (not max_steps or steps < max_steps):
            next_states = env.get_next_states(env.current_piece)
            best_state = agent.best_state(next_states.values())

            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            reward, done = env.play(best_action[0], best_action[1],
                                    render=render, render_delay=render_delay)

            # agent.add_to_memory(current_state, next_states[best_action], reward, done)
            # hateris.add_to_memory(current_state + [env.current_piece],
            #                       next_states[best_action] + [env.current_piece],
            #                       -reward, done)
            current_state = next_states[best_action]
            steps += 1

        scores.append(env.get_game_score())

        # Train
        # if episode % train_every == 0:
        #     agent.train(batch_size=batch_size, epochs=epochs)
        #     hateris.train(batch_size=batch_size, epochs=epochs)

        # Logs
        # if log_every and episode and episode % log_every == 0 and episode > 101:
        if log_every and episode and episode % log_every == 0:
            avg_score = mean(scores[-log_every:])
            min_score = min(scores[-log_every:])
            max_score = max(scores[-log_every:])
            print(str(episode) + " " + str(avg_score) + " "
                  + str(min_score) + " " + str(max_score))

            # if tot_max_score < max_score:
            #     agent.save("dqnAgentMax10000.h5", episode)
            #     tot_max_score = max_score
            # agent.save("dqnAgent10000.h5", episode)
            # with open("saved_agents/pickled_new_agent_10000_7000", "wb") as input_file:
            #     pickle.dump(agent, input_file)

    plt.plot(scores)
    plt.show()
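# Both loops above recover best_action by re-scanning next_states after
# agent.best_state picks a value. An equivalent single-pass sketch, assuming
# the agent can score one state at a time (predict_value is a hypothetical
# name, not part of this repo's DQNAgent):
def best_action_sketch(agent, next_states):
    # next_states maps action -> resulting board features
    return max(next_states.items(),
               key=lambda item: agent.predict_value(item[1]))[0]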
    print(  # beginning of this call truncated in the source
        ' || average-score %.2f' % avg_score, 'max-score %.2f' % max_score,
        ' || epsilon %.2f' % agent_dqn.epsilon, 'num_steps', num_steps)

    if score > max_score:
        max_score = score
        # Possible improvement: track a short moving average of max_score instead.
        agent_dqn.save_network_checkpoints()

print("Training done!")
plot_metrics(steps_arr, score_history, eps_history, figure_file_name)

# Capturing the agent playing
if create_capture:
    img_arr = []
    agent_dqn.epsilon = 0.0001  # near-greedy policy for the recording
    observation = env.reset()
    for i in range(1000):
        frame = np.uint8(observation[0] * 255)  # grayscale float -> 8-bit
        frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)
        img_arr.append(frame)
        action = agent_dqn.choose_action(observation)
        observation, _, _, _ = env.step(action)

    out = cv2.VideoWriter('captures/' + agent_dqn.env_name + '.avi',
                          cv2.VideoWriter_fourcc(*'DIVX'), 25, (84, 84))
    for i in range(len(img_arr)):
        out.write(img_arr[i])
    out.release()  # finalize the .avi container
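# plot_metrics is called above but not defined in this snippet; a minimal
# matplotlib sketch of what it might do, assuming the score and epsilon
# histories share the steps_arr x-axis (illustrative only):
import matplotlib.pyplot as plt


def plot_metrics_sketch(steps, scores, epsilons, file_name):
    fig, ax1 = plt.subplots()
    ax1.plot(steps, scores, color='tab:blue')
    ax1.set_xlabel('environment steps')
    ax1.set_ylabel('score', color='tab:blue')
    ax2 = ax1.twinx()  # epsilon on a second y-axis
    ax2.plot(steps, epsilons, color='tab:orange')
    ax2.set_ylabel('epsilon', color='tab:orange')
    fig.savefig(file_name)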
model.add(Dense(8, activation="relu", name="layer2"))
model.add(Dense(action_size, activation="linear", name="layer3"))
model.compile(loss='mse', optimizer=Adam(lr=learning_rate))
model.summary()

# Alternative, deeper architecture (disabled):
# model.add(Dense(8, input_dim=state_size, activation='relu'))
# model.add(Dense(16, activation='relu'))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(action_size, activation='linear'))
# model.compile(loss='mse', optimizer=Adam(lr=learning_rate))

agent = DQNAgent(state_size, action_size, model)
agent.epsilon = 0.75
done = False
batch_size = 64
EPISODES = 100
counter = 0
total_reward = 0
# env.turn_off_display()

for e in range(EPISODES):
    # if e == 2:
    #     env.turn_on_display()
    summary = []
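# For reference, a self-contained builder matching the three-layer network
# compiled above; the input layer is an assumption, since the original
# "layer1" line is not shown in this snippet:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


def build_model_sketch(state_size, action_size, learning_rate=0.001):
    model = Sequential()
    model.add(Dense(8, input_dim=state_size, activation='relu', name='layer1'))  # assumed
    model.add(Dense(8, activation='relu', name='layer2'))
    model.add(Dense(action_size, activation='linear', name='layer3'))
    model.compile(loss='mse', optimizer=Adam(lr=learning_rate))
    return model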
def main(model=None, mode='train', start_episode=0):
    my_xml = '''<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
    <Mission xmlns="http://ProjectMalmo.microsoft.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
      <About>
        <Summary>Hill Descent.</Summary>
      </About>
      <ModSettings>
        <MsPerTick>20</MsPerTick>
      </ModSettings>
      <ServerSection>
        <ServerInitialConditions>
          <Time><StartTime>1</StartTime></Time>
        </ServerInitialConditions>
        <ServerHandlers>
          <DefaultWorldGenerator seed="-999595225643433963" forceReset="false" destroyAfterUse="false" />
          <ServerQuitFromTimeUp timeLimitMs="100000000"/>
          <ServerQuitWhenAnyAgentFinishes/>
        </ServerHandlers>
      </ServerSection>
      <AgentSection mode="Survival">
        <Name>Bob</Name>
        <AgentStart>
          <Placement x="28.5" y="87" z="330.5" pitch="-90" yaw="0"/>
        </AgentStart>
        <AgentHandlers>
          <DiscreteMovementCommands/>
          <MissionQuitCommands quitDescription="done"/>
          <ChatCommands/>
          <ObservationFromFullStats/>
          <ObservationFromGrid>
            <Grid name="sight">
              <min x="{}" y="{}" z="{}"/>
              <max x="{}" y="{}" z="{}"/>
            </Grid>
            <Grid name="feet">
              <min x="0" y="-1" z="0"/>
              <max x="0" y="-1" z="0"/>
            </Grid>
          </ObservationFromGrid>
          <AgentQuitFromTouchingBlockType>
            <Block type="cobblestone" />
          </AgentQuitFromTouchingBlockType>
        </AgentHandlers>
      </AgentSection>
    </Mission>'''.format(-(grid_width - 1) // 2, -grid_height, -(grid_width - 1) // 2,
                         (grid_width - 1) // 2, grid_height, (grid_width - 1) // 2)

    batch_size = 100
    agent = DQNAgent(state_size, action_size, learning_rate, discount_rate,
                     epsilon, epsilon_min, epsilon_decay)
    if model is not None:
        agent.load(model)
        if mode == 'test':
            agent.epsilon = 0.0
        print('loaded model: {}'.format(model))
    else:
        clear_csv('./data/results.csv')
        clear_csv('./data/moves.csv')

    my_client_pool = MalmoPython.ClientPool()
    my_client_pool.add(MalmoPython.ClientInfo("127.0.0.1", 10001))
    agent_host = MalmoPython.AgentHost()

    for e in range(start_episode + 1, episodes + 1):
        my_mission = MalmoPython.MissionSpec(my_xml, True)
        my_mission_record = MalmoPython.MissionRecordSpec()
        my_mission.requestVideo(800, 500)
        my_mission.setViewpoint(2)

        print("Waiting for the mission to start", end=' ')
        agent_host.startMission(my_mission, my_mission_record)
        world_state = agent_host.getWorldState()
        while not world_state.has_mission_begun:
            print(".", end="")
            time.sleep(0.1)
            world_state = agent_host.getWorldState()
            for error in world_state.errors:
                print("Error:", error.text)
        print()

        # Clear nearby passive mobs so they cannot interfere with the run.
        agent_host.sendCommand('chat /kill @e[type=Chicken]')
        agent_host.sendCommand('chat /kill @e[type=Pig]')
        agent_host.sendCommand('chat /kill @e[type=Cow]')

        moves = 0
        episode_reward = 0
        while world_state.is_mission_running:
            world_state = agent_host.getWorldState()
            if world_state.number_of_observations_since_last_state > 0:
                try:
                    obvsText = world_state.observations[-1].text
                    data = json.loads(obvsText)
                except:
                    print("Error when getting state")
                    continue

                state = get_state(data)
                prev_x = data.get(u'XPos', 0)
                prev_y = data.get(u'YPos', 0)
                prev_z = data.get(u'ZPos', 0)
                # Subset of the "sight" grid the policy actually uses.
                useful_state = [state[2], state[6], state[7], state[8],
                                state[10], state[11], state[13],
                                state[14], state[16], state[17],
                                state[18], state[22]]
                action = agent.act(useful_state)
                # Jump when the block in the chosen direction is air; walk otherwise.
                if ((action == 0 and state[grid_center - grid_width] == 0)
                        or (action == 1 and state[grid_center + 1] == 0)
                        or (action == 2 and state[grid_center + grid_width] == 0)
                        or (action == 3 and state[grid_center - 1] == 0)):
                    agent_host.sendCommand(jump_directions[action])
                else:
                    agent_host.sendCommand(directions[action])
                time.sleep(0.25)
                # print("North:", state[grid_center - grid_width],
# " East:", state[grid_center + 1], \ # " South:", state[grid_center + grid_width], \ # " West:", state[grid_center - 1]) try: world_state = wait_world_state(agent_host, world_state) obvsText = world_state.observations[-1].text data = json.loads(obvsText) except: print("Error when getting state") continue current_x = data.get(u'XPos', 0) current_y = data.get(u'YPos', 0) current_z = data.get(u'ZPos', 0) damage_taken = calculate_damage(prev_y, current_y) next_state = get_state(data) useful_next_state = [state[2], state[6], state[7], state[8], \ state[10], state[11], state[13], \ state[14], state[16], state[17], \ state[18], state[22]] # print("previous and current y", prev_y, current_y) # print("damage taken", damage_taken) #print("X:", prev_x, current_x, "\n", \ # "Y:", prev_y, current_y, "\n", \ # "Z:", prev_z, current_z, "\n") reward = 2 * ( prev_y - current_y ) - 50 * damage_taken - 1 if prev_x != current_x or prev_y != current_y or prev_z != current_z else -1000 episode_reward += reward done = True if current_y <= goal_height or not world_state.is_mission_running or data[ 'Life'] <= 0 else False agent.remember(useful_state, action, reward, useful_next_state, done) if ((action == 0 and state[grid_center - grid_width] == 0) or (action == 1 and state[grid_center + 1] == 0) or (action == 2 and state[grid_center + grid_width] == 0) or (action == 3 and state[grid_center - 1] == 0)): print( 'episode {}/{}, action: {}, reward: {}, e: {:.2}, move: {}, done: {}' .format(e, episodes, jump_directions[action], reward, agent.epsilon, moves, done)) else: print( 'episode {}/{}, action: {}, reward: {}, e: {:.2}, move: {}, done: {}' .format(e, episodes, directions[action], reward, agent.epsilon, moves, done)) moves += 1 if mode == 'train' or model == None: write_to_csv('./data/moves.csv', [e, current_x, current_y, current_z, reward]) if e > batch_size: agent.replay(batch_size) if done or moves > max_moves: agent_host.sendCommand("quit") if (mode == 'train' or model == None) and (e in checkpoints or agent.epsilon <= epsilon_min): print('saving model at episode {}'.format(e)) agent.save('./models/model_{}'.format(e)) if agent.epsilon <= epsilon_min: break time.sleep(1) # my_mission.forceWorldReset() if mode == 'train' or model == None: write_to_csv('./data/results.csv', [e, episode_reward, moves, int(episode_reward > 0)])