def test_PLACE_SHIPS_looks_reasonable_VERTICAL(n=1000):
    w = 10
    h = 10
    env = BattleshipEnv(width=w, height=h)
    for _ in range(n):
        env.reset(vert_probability=1)
        # With vert_probability=1 every ship is placed vertically, so a given
        # ship id must never appear in more than one column.
        seen_in_previous_cols = []
        for col in range(w):
            newly_seen = []
            for row in range(h):
                element = env.state[row][col]
                if element != 0:
                    if element in seen_in_previous_cols:
                        print("\nBAD!!! Saw a %s in multiple cols. Showing the offending board:" % element)
                        show_ships(env.state)
                        raise Exception("BAD!!! Saw a %s in multiple cols." % element)
                    else:
                        newly_seen.append(element)
            seen_in_previous_cols += newly_seen
    print("\nVERTICAL")
    show_ships(env.state)
def test_PLACE_SHIPS_looks_reasonable_TOP_RIGHT(n=1000):
    w = 10
    h = 10
    cumulative_ships = np.zeros((h, w), dtype=np.int32)
    env = BattleshipEnv(width=w, height=h)
    for _ in range(n):
        env.reset(favor_top=100, favor_right=100)
        cumulative_ships += env.state > 0
    print("\nTOP RIGHT")
    show_ships(cumulative_ships)
def test_PLACE_SHIPS_looks_reasonable_NO_FAVORS(n=1000):
    w = 10
    h = 10
    cumulative_ships = np.zeros((h, w), dtype=np.int32)
    env = BattleshipEnv(width=w, height=h)
    for _ in range(n):
        env.reset()
        cumulative_ships += env.state > 0
    print("\nNO FAVORS")
    show_ships(cumulative_ships)
def test_PLACE_SHIPS_looks_reasonable_TOP_and_BOTTOM_with_STEEP_GRADIENT(n=1000):
    w = 10
    h = 10
    cumulative_ships = np.zeros((h, w), dtype=np.int32)
    env = BattleshipEnv(width=w, height=h)
    for _ in range(n):
        env.reset(favor_top=1, favor_bottom=1, gradient_coef=lambda x: x**10)
        cumulative_ships += env.state > 0
    print("\nTOP and BOTTOM with STEEP GRADIENT")
    show_ships(cumulative_ships)
def test_PLACE_SHIPS_looks_reasonable_LEFT_with_UNSTEEP_GRADIENT(n=1000):
    w = 10
    h = 10
    cumulative_ships = np.zeros((h, w), dtype=np.int32)
    env = BattleshipEnv(width=w, height=h)
    for _ in range(n):
        env.reset(favor_left=10, gradient_coef=lambda x: x**(1/2))
        cumulative_ships += env.state > 0
    print("\nLEFT with UNSTEEP GRADIENT")
    show_ships(cumulative_ships)
def test_PLACE_SHIPS_looks_reasonable_CENTER_with_STEEP_GRADIENT(n=1000):
    """
    Note: To get a really defined center, the gradient coefficient needs to be
    something that grows slowly, like lambda x: x**(1/10).
    """
    w = 10
    h = 10
    cumulative_ships = np.zeros((h, w), dtype=np.int32)
    env = BattleshipEnv(width=w, height=h)
    for _ in range(n):
        env.reset(favor_top=-100000, favor_right=-100000, favor_left=-100000,
                  favor_bottom=-100000, gradient_coef=lambda x: x**(1/1000))
        cumulative_ships += env.state > 0
    print("\nCENTER with STEEP GRADIENT")
    show_ships(cumulative_ships)
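# Hedged sketch, not part of the original test suite: the tests above suggest
# that gradient_coef maps a relative position in [0, 1] to a placement weight,
# so fast-growing functions (x**10) pin ships hard against the favored edge
# while slow-growing ones (x**(1/10)) spread the bias out. The exact semantics
# of env.reset's favor_*/gradient_coef arguments are assumed from the calls above.
def compare_gradient_steepness(n=1000, w=10, h=10):
    for label, coef in [("STEEP (x**10)", lambda x: x**10),
                        ("GENTLE (x**(1/10))", lambda x: x**(1 / 10))]:
        cumulative_ships = np.zeros((h, w), dtype=np.int32)
        env = BattleshipEnv(width=w, height=h)
        for _ in range(n):
            env.reset(favor_top=1, gradient_coef=coef)
            cumulative_ships += env.state > 0
        print("\nTOP with %s gradient" % label)
        show_ships(cumulative_ships)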
def evaluate_agents(episodes=100):
    env = BattleshipEnv()
    results = pd.DataFrame(columns=[
        'min', 'median', 'max', 'mean', 'std', 'avg_time', 'episodes'
    ])
    print('Agents to be evaluated: {}'.format(list(AGENT_DICT.keys())))
    for agent_name in AGENT_DICT:
        agent = AGENT_DICT[agent_name]()
        shots_fired_totals = []
        start = time.time()
        for _ in tqdm.tqdm(range(episodes),
                           desc='Evaluating {} agent'.format(agent_name)):
            agent.reset()
            obs = env.reset()
            done = False
            total_reward = 0
            shots_fired = 0
            while not done:
                action = agent.select_action(obs)
                obs, reward, done, info = env.step(action)
                total_reward += reward
                shots_fired += 1
            shots_fired_totals.append(shots_fired)
        # Compute per-agent statistics over all evaluation episodes.
        avg_time = (time.time() - start) / episodes
        min_sf = min(shots_fired_totals)
        max_sf = max(shots_fired_totals)
        median_sf = statistics.median(shots_fired_totals)
        mean_sf = statistics.mean(shots_fired_totals)
        std_sf = statistics.stdev(shots_fired_totals)
        agent_results = pd.Series(data={
            'min': min_sf, 'median': median_sf, 'max': max_sf,
            'mean': mean_sf, 'std': std_sf,
            'avg_time': avg_time, 'episodes': episodes
        }, name=agent_name)
        # DataFrame.append was removed in pandas 2.0; assign the row by label.
        results.loc[agent_name] = agent_results
    print(results)
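# Hedged usage sketch: evaluate_agents assumes a module-level AGENT_DICT that
# maps a display name to a zero-argument callable returning an agent with
# reset() and select_action(obs). The entries below are illustrative only.
# AGENT_DICT = {
#     'random': lambda: RandomBattleshipAgent(delay=0),
#     'probabilistic': lambda: ProbabilisticAgent(delay=0),
# }
# evaluate_agents(episodes=50)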
def test_generate_particle_WHEN_board_is_valid(n=1000):
    for check in range(n):
        if check % 10 == 0:
            print(check)
        # Play games up to turn_limit until you get one that isn't done yet,
        # so the particle is checked against a mid-game observation.
        done = True
        while done:
            env = BattleshipEnv()
            agent = RandomBattleshipAgent(delay=0)
            obs = env.reset()
            turn = 0
            turn_limit = random.randrange(0, 100)
            done = False
            while not done and turn <= turn_limit:
                action = agent.select_action(obs)
                obs, reward, done, info = env.step(action)
                turn += 1
        # IMPORTANT: the particle has 0s where there is no ship and some
        # positive number where there is a ship.
        particle = generate_particle(obs, agent.unsunk_ships)
        for i in range(len(obs)):
            for j in range(len(obs[i])):
                cell = obs[i][j]
                if (cell == HIT or cell == SUNK) and particle[i][j] == 0:
                    raise Exception(
                        'The particle is missing a ship on (%s, %s)' % (i, j))
                if cell == MISS and particle[i][j] != 0:
                    raise Exception(
                        'The particle should not have a ship on (%s, %s)' % (i, j))
    print("Found %s particles successfully!" % n)
def basic_example():
    delay = .1
    env = BattleshipEnv()
    obs = env.reset()
    # agent = RandomWithPreferredActionsAgent(delay=delay)
    agent = ProbabilisticAgent(delay=delay)
    done = False
    total_reward = 0
    while not done:
        action = agent.select_action(obs)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        env.render()
    shots_fired = -total_reward
    print('{} shots fired'.format(shots_fired))
def Train(board_dimension=BOARD_DIMENSION, hidden_size=HIDDEN_SIZE,
          params=DEFAULT_PARAMS):
    # Use the hidden_size argument (rather than the global HIDDEN_SIZE) so the
    # parameter actually takes effect.
    q_function = Q_function_CNN(board_dimension=board_dimension,
                                hidden_size=hidden_size)
    q_function.to(device)
    env = BattleshipEnv()
    agent = Q_learning_agent(q_function=q_function, params=params,
                             board_dimension=board_dimension)
    # Linear epsilon decay from 1.0 down to a floor of 0.1.
    epsilon_func = lambda a: np.float64(max(1 - a * .90 / 1000000, .1))
    num_training_episodes = 50000
    global_step = 0
    game_steps_list = list()
    num_of_shots_history = list()
    num_updates = 1
    C = 10000  # target-network update period, in gradient updates
    agent.update_target_network()
    test_history = list()
    for episode_num in range(1, num_training_episodes):
        agent.reset()
        obs = env.reset()
        done = False
        total_reward = 0
        agent.params['epsilon'] = epsilon_func(num_updates)
        test = False
        game_steps = 0
        while not done:
            old_obs = obs
            action = agent.select_action(obs, env)
            obs, reward, done, info = env.step(action)
            agent.add_experience([old_obs, action, reward, obs])
            # Start training once the replay buffer has a reasonable amount
            # of experience.
            if len(agent.experience['states']) > 500:
                agent.train_batch()
                num_updates += 1
                if num_updates % C == 0:
                    agent.update_target_network()
            total_reward += reward
            game_steps += 1
            global_step += 1
        if episode_num % 50 == 0:
            test = True
        num_of_shots_history.append(game_steps)
        # Rolling window of the last 100 game lengths.
        if len(game_steps_list) >= 100:
            game_steps_list.pop(0)
        game_steps_list.append(game_steps)
        average_shots = np.array(game_steps_list).mean()
        if episode_num % 10 == 0:
            print('EPISODE NUM: {0}, avg # of shots (last 100 games) {1} \t\t\t epsilon: {2}'
                  .format(episode_num, average_shots.round(2),
                          agent.params['epsilon'].round(4)))
        if test:
            avg_test_shots = agent.evaluate(agent, env, episode_num, average_shots,
                                            agent.params['test_episodes'])
            test_history.append(avg_test_shots)


# Train(10, 512)
def train_q_agent(board_dimension=BOARD_DIMENSION, hidden_size=HIDDEN_SIZE,
                  params=DEFAULT_PARAMS, ships=DEFAULT_SHIPS, use_canon=False,
                  save_name='test_save.pt'):
    q_function = Q_function_CNN(board_dimension=board_dimension,
                                hidden_size=hidden_size)
    q_function.to(device)
    env = BattleshipEnv(board_dimension, board_dimension, ships=ships)
    agent = Q_learning_agent(q_function=q_function, params=params,
                             board_dimension=board_dimension)
    # Linear epsilon decay from 1.0 down to a floor of 0.1.
    epsilon_func = lambda a: np.float64(max(1 - a * .90 / 200000, .1))
    num_training_episodes = params['num_episodes']
    global_step = 0
    game_steps_list = list()
    num_of_shots_history = list()
    num_updates = 1
    agent.update_target_network()
    test_history = list()
    performance_dict = {'reward_hist': list(), 'avg_shot_hist': list()}
    canon = Canonicalizer(board_dimension) if use_canon else None
    if use_canon:
        log_fname = '{0}x{0}_hidden{1}_canon_log.json'.format(
            board_dimension, hidden_size)
    else:
        log_fname = '{0}x{0}_hidden{1}_log.json'.format(
            board_dimension, hidden_size)

    for episode_num in range(1, num_training_episodes):
        agent.reset()
        obs = env.reset()
        if use_canon:
            c_obs = canon.canon_obs(obs)
        done = False
        total_reward = 0
        agent.params['epsilon'] = epsilon_func(num_updates)
        test = False
        game_steps = 0
        while not done:
            if use_canon:
                # Act in the canonicalized frame, then map the action back to
                # the original board coordinates.
                old_c_obs = c_obs
                c_obs, c_tuple = canon.canon_obs(obs, return_tuple=True)
                c_action = agent.select_action(c_obs)
                action = canon.uncanon_action(c_action, c_tuple)
            else:
                old_obs = obs
                action = agent.select_action(obs)
            obs, reward, done, info = env.step(action)
            if done:
                game_steps += 1
                global_step += 1
                total_reward += reward
                break
            if use_canon:
                agent.add_experience([old_c_obs, c_action, reward, c_obs])
            else:
                agent.add_experience([old_obs, action, reward, obs])
            if len(agent.experience['states']) > 500:
                agent.train_batch_base()
                num_updates += 1
                if num_updates % agent.params['C'] == 0:
                    agent.update_target_network()
            total_reward += reward
            game_steps += 1
            global_step += 1
        if episode_num % 50 == 0:
            test = True
        num_of_shots_history.append(game_steps)
        # Rolling window of the last 100 game lengths.
        if len(game_steps_list) >= 100:
            game_steps_list.pop(0)
        game_steps_list.append(game_steps)
        average_shots = np.array(game_steps_list).mean()
        if episode_num % 10 == 0:
            print('EPISODE NUM: {0}, avg # of shots (last 100 games) {1} \t\t\t epsilon: {2}'
                  .format(episode_num, average_shots.round(2),
                          agent.params['epsilon'].round(4)))
        if test:
            avg_test_shots, avg_reward = agent.evaluate(
                env, agent.params['test_episodes'], canon=canon)
            performance_dict['reward_hist'].append(avg_reward)
            performance_dict['avg_shot_hist'].append(avg_test_shots)
            test_history.append(avg_test_shots)
            # Persist intermediate results after every evaluation pass.
            with open(log_fname, "w") as f:
                json.dump(performance_dict, f)

    # Final dump of the performance log and the trained model.
    with open(log_fname, "w") as f:
        json.dump(performance_dict, f)
    if use_canon:
        agent.save_agent('{0}x{0}_hidden{1}_episodes{2}_canon_model.pt'.format(
            board_dimension, hidden_size, num_training_episodes))
    else:
        agent.save_agent('{0}x{0}_hidden{1}_episodes{2}_model.pt'.format(
            board_dimension, hidden_size, num_training_episodes))
    return loss.detach().item()


# =============================================================================
# class Q_data(Dataset):
#     def __init__(self, experience):
#         self.experience = experience
#
#     def __len__(self):
#         return len(self.experience)
#
#     def __getitem__(self, index):
#         return None
# =============================================================================

q_function = Q_function_FC(BOARD_DIMENSION**2, BOARD_DIMENSION**2)
env = BattleshipEnv()
params = {'alpha': .001, 'epsilon': 1, 'epsilon_decay': .99,
          'epsilon_length': 1000, 'replay_size': 50000, 'gamma': .99,
          'criterion': nn.MSELoss(), 'optim': torch.optim.Adam,
          'num_of_Q_epochs': 2}
agent = Q_learning_agent(q_function=q_function, params=params)
# Exponential epsilon decay toward a floor of 0.1.
epsilon_func = lambda a: np.exp(-a / 30000) + .1
num_training_episodes = 50000
shots_fired_totals = list()
global_step = 0
game_steps_list = list()
num_of_shots_history = list()
# plt.figure()
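# Hedged sketch (assumes matplotlib is available, as the commented-out
# plt.figure() above suggests): plot the exploration schedule defined by
# epsilon_func to sanity-check how quickly epsilon approaches its 0.1 floor.
def plot_epsilon_schedule(epsilon_func, num_updates=200000):
    import matplotlib.pyplot as plt
    steps = np.arange(num_updates)
    plt.plot(steps, [epsilon_func(s) for s in steps])
    plt.xlabel('update step')
    plt.ylabel('epsilon')
    plt.title('Epsilon schedule')
    plt.show()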