import os
from collections import deque

# EnvironmentSelector, World, shift_list, serialize, deserialize and train are
# assumed to be provided by the project's own modules.


def evaluate(agent_profile, agent_new_path, agent_old_path, games_num,
             experience_path=None, acceptance_rate=0.6,
             verbose=True, debug=False, max_steps=None,
             self_play_examples_deque=deque([])):
    print("Evaluating model with games_num %d and acceptance_rate %f"
          % (games_num, acceptance_rate))

    env_selector = EnvironmentSelector()

    agent = env_selector.get_agent(agent_profile)
    agent.set_exploration_enabled(False)

    agent_profile = env_selector.get_profile(agent_profile)
    game = env_selector.get_game(agent_profile.game)

    # one copy of the old model per player seat
    agents = []
    for idx in range(game.get_players_num()):
        old_agent = agent.clone()
        old_agent.load(agent_old_path)
        agents.append(old_agent)

    # the new model takes seat 0
    agent.load(agent_new_path)
    agents[0] = agent

    arena_games_results = [0] * len(agents)
    arena_examples = []
    arena_games_n = int(games_num / game.get_players_num())

    world = World()

    # rotate the seats so every agent plays each position the same number of times
    for jdx in range(game.get_players_num()):
        playing_agents = shift_list(agents, jdx)

        sess_arena_examples, games_results = world.execute_games(
            playing_agents, game, arena_games_n,
            max_game_steps_n=max_steps,
            verbose=verbose, show_every_turn=debug)

        # shift the results back so index 0 always refers to the new agent
        games_results = shift_list(games_results, -jdx)
        for index in range(len(arena_games_results)):
            arena_games_results[index] += games_results[index]

        arena_examples.extend(sess_arena_examples)

    self_play_examples_deque += arena_examples

    if experience_path:
        serialize(self_play_examples_deque, experience_path)

    cur_rewards = arena_games_results[0]
    other_rewards = sum(arena_games_results) - cur_rewards

    print("Current agent got rewards: %d\n"
          "Total reward across all other agents: %d"
          % (cur_rewards, other_rewards))

    # accept the new model only if it collected at least acceptance_rate
    # of the total reward scored in the arena
    total_rewards = cur_rewards + other_rewards
    updated = total_rewards > 0 and cur_rewards / total_rewards >= acceptance_rate
    return updated
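
# A minimal usage sketch (assumption: the profile name and .h5 paths below are
# placeholders, not files shipped with the project). The candidate model is
# promoted only when evaluate() returns True.
def _example_gate_candidate():
    accepted = evaluate("my_game_agent_profile",  # hypothetical profile name
                        agent_new_path="workspace/model_candidate.h5",
                        agent_old_path="workspace/model_best.h5",
                        games_num=40,
                        acceptance_rate=0.6,
                        max_steps=200,
                        verbose=False)
    if accepted:
        print("Candidate model accepted as the new best model")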
def generate_self_play(opt_agent_profile, agent_path, games_num,
                       experience_path, max_steps,
                       verbose, debug, exploration_decay_steps,
                       optimize_for_inference=False,
                       self_play_examples_deque=deque([])):
    world = World()

    env_selector = EnvironmentSelector()

    agent = env_selector.get_agent(opt_agent_profile)
    agent.load(agent_path)

    agent_profile = env_selector.get_profile(opt_agent_profile)
    game = env_selector.get_game(agent_profile.game)

    if optimize_for_inference:
        agent.disable_training_capability()

    self_play_examples = world.generate_self_play(
        agent, game, games_num,
        max_game_steps_n=max_steps,
        verbose=verbose,
        show_every_turn=debug,
        exploration_decay_steps=exploration_decay_steps)

    self_play_examples_deque += self_play_examples

    serialize(self_play_examples_deque, experience_path)
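
# A minimal usage sketch (assumption: same placeholder profile and paths as in
# the example above). Plays games with the current best model and writes the
# collected examples to a new experience file.
def _example_generate_games():
    generate_self_play("my_game_agent_profile",  # hypothetical profile name
                       agent_path="workspace/model_best.h5",
                       games_num=100,
                       experience_path="workspace/memory_latest.pkl",
                       max_steps=200,
                       verbose=False,
                       debug=False,
                       exploration_decay_steps=30,
                       optimize_for_inference=True)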
def fuse_memory(old_memory_path, new_memory_path, out_memory_path):
    if os.path.isfile(old_memory_path) and os.path.isfile(new_memory_path):
        try:
            serialize(
                deserialize(new_memory_path) + deserialize(old_memory_path),
                out_memory_path)
        except Exception:
            print("Could not deserialize new + old. Trying reverse order")
            serialize(
                deserialize(old_memory_path) + deserialize(new_memory_path),
                out_memory_path)
    elif os.path.isfile(new_memory_path):
        serialize(deserialize(new_memory_path), out_memory_path)
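
# A minimal usage sketch (assumption: the .pkl paths are placeholders). Merges
# the previous training memory with the newest self-play memory; if the old
# file does not exist yet, the new memory is simply copied to the output path.
def _example_fuse_memories():
    fuse_memory(old_memory_path="workspace/memory_old.pkl",
                new_memory_path="workspace/memory_latest.pkl",
                out_memory_path="workspace/memory_train.pkl")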
        if n_memory != 0:
            print('Deserializing memory from %s' % memory)
            des_mem = deserialize(memory)
            print(type(des_mem))
            # serialize(des_mem, train_memory_file)

            if n_memory == -1 or n_memory > len(memories):
                # use the full history of memory files
                for file in memories:
                    print('Deserializing memory from %s' % file)
                    des_mem.extend(deserialize(file))
                    # fuse_memory(train_memory_file, file, train_memory_file)
            elif n_memory > 0:
                # use only the n_memory most recent memory files
                for file in memories[-n_memory:]:
                    print('Deserializing memory from %s' % file)
                    des_mem.extend(deserialize(file))
                    # fuse_memory(train_memory_file, file, train_memory_file)

            serialize(des_mem, train_memory_file)
        else:
            # train only on the freshly generated memory
            train_memory_file = memory

        memories.append(memory)

        # train with selected memory
        new_agent_path = workspace + '/model_updated_%d.h5' % i
        train(options.agent_profile, train_memory_file, cur_agent_path, new_agent_path,
              train_distributed=options.train_distributed,
              train_distributed_native=options.train_distributed_native,
              epochs=options.epochs)
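
# A minimal sketch of the replay-window selection used in the loop above
# (assumption: `memories` holds the paths of earlier memory files, oldest
# first). n_memory == -1 keeps the whole history, n_memory > 0 keeps only the
# most recent files, and n_memory == 0 trains on the current iteration's
# memory alone.
def _example_replay_window(memories, n_memory):
    if n_memory == -1 or n_memory > len(memories):
        return list(memories)
    if n_memory > 0:
        return memories[-n_memory:]
    return []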