def run_game():
    env = Tetris()
    episodes = 2000
    max_steps = None
    discount = 0.98
    replay_mem_size = 20000
    minibatch_size = 512
    epsilon = 1
    epsilon_min = 0
    epsilon_stop_episode = 1500
    learning_rate = 5e-3
    epochs = 1
    show_every = 50
    log_every = 50
    replay_start_size = 2000
    train_every = 1
    hidden_dims = [64, 64]
    activations = ['relu', 'relu', 'linear']

    agent = DQNAgent(env.get_state_size(),
                     discount=discount,
                     replay_mem_size=replay_mem_size,
                     minibatch_size=minibatch_size,
                     epsilon=epsilon,
                     # epsilon_decay=epsilon_decay,
                     epsilon_min=epsilon_min,
                     epsilon_stop_episode=epsilon_stop_episode,
                     learning_rate=learning_rate,
                     hidden_dims=hidden_dims,
                     activations=activations,
                     replay_start_size=replay_start_size)

    log_dir = f'log/tetris-{datetime.now().strftime("%Y%m%d-%H%M%S")}-nn={str(hidden_dims)}-mem={replay_mem_size}-bs={minibatch_size}-discount={discount}'
    log = ModifiedTensorBoard(log_dir=log_dir)

    scores = []

    for episode in tqdm(range(episodes)):
        current_state = env.reset_game()
        done = False
        step = 0
        log.step = episode

        if show_every and episode % show_every == 0:
            show = True
        else:
            show = False

        # Run the game until either game over or we've hit max number of steps
        while not done and (not max_steps or step < max_steps):
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())

            best_action = None
            # action is (x, i), state is [lines_cleared, holes, total_bumpiness, sum_height]
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            # reward is the score, done is gameover status
            reward, done = env.play_game(best_action[0], best_action[1], show=show)
            if show:
                env.show()

            agent.update_replay_memory(current_state, best_action, next_states[best_action], reward, done)

            # move to next timestep
            current_state = next_states[best_action]
            step += 1

        if show:
            # After game is completed, collect the final score
            print("Episode %d score: %d epsilon: %.2f" % (episode, env.get_game_score(), agent.epsilon))

        scores.append(env.get_game_score())

        agent.train(epochs=epochs)

        if log_every and episode % log_every == 0:
            avg_score = mean(scores[-log_every:])
            min_score = min(scores[-log_every:])
            max_score = max(scores[-log_every:])
            log.update_stats(avg_score=avg_score, min_score=min_score, max_score=max_score)

        if env.get_game_score() >= MIN_SCORE:
            if not os.path.exists('models/'):
                os.makedirs('models/')
            agent.model.save(
                f'models/eps_{str(episode)}nn_{str(hidden_dims)}__bs{minibatch_size}__score_{env.get_game_score()}__{int(time.time())}.h5'
            )
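# A minimal sketch, assuming the DQNAgent.best_state helper used above simply
# scores every candidate placement with the value network and keeps the one
# with the highest predicted value; the project's real agent may also mix in
# epsilon-greedy exploration. The model and state_size arguments are assumed names.
import numpy as np

def best_state(model, state_size, states):
    """Return the candidate state with the highest predicted value."""
    best_value = None
    best = None
    for state in states:
        # The network maps the 4-feature board summary to a single scalar value
        value = model.predict(np.reshape(state, (1, state_size)), verbose=0)[0][0]
        if best_value is None or value > best_value:
            best_value = value
            best = state
    return best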
            # Exploit: take the action with the highest predicted Q-value
            action = np.argmax(agent.get_qs(current_state))
        else:
            # Explore: get a random action
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)

        new_state, reward, done = env.step(action)

        # Accumulate the reward collected in this episode
        episode_reward += reward

        if SHOW_PREVIEW:  # and not episode % AGGREGATE_STATS_EVERY:
            env.render()

        # Every step we update replay memory and train the main network
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done, step)

        current_state = new_state
        step += 1

    # Append episode reward to a list and log stats (every given number of episodes)
    ep_rewards.append(episode_reward)
    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:]) / len(ep_rewards[-AGGREGATE_STATS_EVERY:])
        min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
        max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
        agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward,
                                       reward_max=max_reward, epsilon=epsilon)

    # Save a checkpoint every 50 episodes (reward stats come from the latest aggregation)
    if episode % 50 == 0:
        agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')
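# Hedged sketch of the per-episode exploration decay this fragment assumes but
# does not show; EPSILON_DECAY and MIN_EPSILON are hypothetical constants, not
# taken from the source.
if epsilon > MIN_EPSILON:
    epsilon *= EPSILON_DECAY
    epsilon = max(MIN_EPSILON, epsilon)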
    env = FireResetEnv(env)
    env = ProcessFrame84(env)
    env = FrameStack(env, 4)
    env = ClippedRewardsWrapper(env)
    return env


env = wrap_dqn(gym.make('PongNoFrameskip-v4'))
agent = DQNAgent(env=env, num_actions=NUM_ACTIONS, lr=LR, discount=GAMMA)

# Load model
# agent.load_model(weights_file="snaps/model")

# Train agent
agent.train(TRAIN_STEPS, weights_file="snaps/model")

# Evaluate
success = 0
for tr in range(TRIALS):
    state = env.reset()
    t = 0
    acc_r = 0
    while True:
        env.render()
        action = agent.act(state)
        state, reward, done, _ = env.step(action)
        acc_r += reward
        t += 1
        if done:
            print("Trial {} finished after {} timesteps".format(tr, t))
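# A minimal sketch, assuming ClippedRewardsWrapper follows the common DQN
# convention of clipping every reward to {-1, 0, +1}; the project's own wrapper
# may differ in detail.
import gym
import numpy as np

class ClippedRewardsWrapper(gym.RewardWrapper):
    def reward(self, reward):
        # Keep only the sign of the raw Atari score change
        return float(np.sign(reward))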
"epsilon_min": epsilon_min, "batch_size": batch_size, "memory_size": memory_size, "name": name }, name=name, allow_val_change=True) # Utilize the hyperparameters of the model like this: config.parameter config = wandb.config model = DQNAgent(env, config, epsilon, training_episodes, testing_episodes, frames) hyper_param_counter += 1 model.train() print("Run {} of {}.".format(hyper_param_counter, total_runs)) model_dir = "saved_models" model_save_name = model_dir + "LR_{}_LS_{}_BS_{}_MS_{}_Timestamp_{}".format( learning_rate, layer_size, batch_size, memory_size, int( time.time())) + "sb.h5" model.save(model_save_name) #--------------------------------------------------------------------------------------- # Model previously finished in 242 Episodes learning_rate = 0.001 layer_size = 256 batch_size = 64 memory_size = 50_000
            premio = 0
            informacion = env.get_info()
            antiguo_statistics = informacion['statistics']
            board = env.board()
            mejor_estado = board_prop(board)[:]

            if lineas_completadas < informacion['number_of_lines'] and not terminado:
                premio = premio + 40 * (-lineas_completadas + informacion['number_of_lines'])
                lineas_completadas = informacion['number_of_lines']

            state, reward, terminado, info = env.step(0)

            agent.add_memoria(estado, mejor_estado, premio, terminado)
            agent.add_memoria(estado, mejor_estado, premio, terminado)

            estado = mejor_estado[:]
            pieza_colocada = True
            eliminado = False

    puntuacion = informacion['score']
    file.write(str(puntuacion) + ",")
    if puntuacion > puntuacion_max:
        puntuacion_max = puntuacion
        agent.modelo.save('modelo_max.modelo')

    if episodio % entrenar_cada == 0:
        agent.train(batch_size=128, epochs=1)

file.close()
env.close()
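# Hedged sketch of the board_prop feature extractor the script relies on,
# assuming it reduces the raw board (rows x cols, nonzero = filled cell) to
# [completed_lines, holes, bumpiness, aggregate_height]; the real project may
# order or scale the features differently.
import numpy as np

def board_prop(board):
    board = np.asarray(board) != 0
    rows, cols = board.shape

    # Rows that are completely filled
    completed_lines = int(np.sum(board.all(axis=1)))

    heights = np.zeros(cols, dtype=int)
    holes = 0
    for c in range(cols):
        col = board[:, c]
        filled = np.flatnonzero(col)
        if filled.size:
            top = filled[0]
            heights[c] = rows - top
            # Empty cells below the highest filled cell of the column
            holes += int(np.sum(~col[top:]))

    # Sum of absolute height differences between neighbouring columns
    bumpiness = int(np.sum(np.abs(np.diff(heights))))
    aggregate_height = int(np.sum(heights))

    return [completed_lines, holes, bumpiness, aggregate_height]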
                                             informacion['number_of_lines'])
                lineas_completadas = informacion['number_of_lines']

            state, reward, terminado, info = env.step(0)

            agent.add_memoria(estado, mejor_estado, premio, terminado)
            # movimientos.append([estado, mejor_estado, premio, terminado, valor])
            agent.add_memoria(estado, mejor_estado, premio, terminado)
            # ant_casillas_para_completar = casillas_para_completar
            # movimientos.append([estado, mejor_estado, premio, terminado, valor])

            estado = mejor_estado[:]
            pieza_colocada = True

    puntuacion = informacion['score']
    file.write(str(puntuacion) + ",")

    # if puntuacion > puntuacion_max * 0.5:
    if puntuacion > puntuacion_max:
        puntuacion_max = puntuacion
        agent.modelo.save('modelo_max.modelo')

    # Train the agent on the data collected from the game
    if episodio % entrenar_cada == 0:
        agent.train(batch_size=500, epochs=1, puntuacion=lineas_completadas, q_actual=valor)

fin = time.perf_counter()
print(fin - inicio)
file.close()
env.close()
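# Hedged sketch of the add_memoria/train pair both Tetris scripts call:
# transitions are stored as (estado, siguiente_estado, premio, terminado) and
# replayed with a one-step bootstrapped target. The real agent's train() also
# accepts puntuacion and q_actual, which are not modelled here; DQNAgentSketch
# and its attributes are assumed names.
import random
from collections import deque

import numpy as np

class DQNAgentSketch:
    def __init__(self, modelo, discount=0.99, mem_size=20_000):
        self.modelo = modelo              # Keras model mapping board features to a value
        self.discount = discount
        self.memoria = deque(maxlen=mem_size)

    def add_memoria(self, estado, siguiente_estado, premio, terminado):
        self.memoria.append((estado, siguiente_estado, premio, terminado))

    def train(self, batch_size=128, epochs=1):
        if len(self.memoria) < batch_size:
            return
        batch = random.sample(list(self.memoria), batch_size)
        estados = np.array([t[0] for t in batch])
        siguientes = np.array([t[1] for t in batch])
        next_q = self.modelo.predict(siguientes, verbose=0).flatten()
        # Target: reward plus the discounted value of the resulting state
        targets = np.array([premio + (0.0 if terminado else self.discount * nq)
                            for (_, _, premio, terminado), nq in zip(batch, next_q)])
        self.modelo.fit(estados, targets, batch_size=batch_size, epochs=epochs, verbose=0)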