def __init__(self, flags):
    """Initialize runner."""
    self.flags = flags
    self.agent_config = {'players': flags['players']}
    self.environment = rl_env.make('Hanabi-Full',
                                   num_players=flags['players'])
    self.agent_class = AGENT_CLASSES[flags['agent_class']]
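# A minimal driver sketch for the initializer above. The enclosing class
# name ('Runner') and its run() method are assumptions here, not shown in
# the snippet.
if __name__ == '__main__':
    flags = {'players': 2, 'num_episodes': 1, 'agent_class': 'SimpleAgent'}
    runner = Runner(flags)
    runner.run()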
def __init__(self, flags):
    """Initialize runner."""
    self.flags = flags
    self.agent_config = {'players': flags['players']}
    self.environment = rl_env.make('Hanabi-Full',
                                   num_players=flags['players'])
    self.agent_class = AGENT_CLASSES[flags['agent_class']]
    self.game_state_wrappers = []
    self.v = vec.ObservationVectorizer(self.environment)
def __init__(self, flags):
    """Initialize runner."""
    self.flags = flags
    self.env = rl_env.make('Hanabi-Full', num_players=flags['players'])
    self.agent_config = {
        'players': flags['players'],
        'num_moves': self.env.num_moves(),
        'observation_size': self.env.vectorized_observation_shape()[0]
    }
    self.agent_class = AGENT_CLASSES[flags['agent_class']]
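# Sanity check for the config values above (a sketch; the numbers are what
# the standard Hanabi Learning Environment build reports for a 2-player
# 'Hanabi-Full' game, not a spec):
env = rl_env.make('Hanabi-Full', num_players=2)
assert env.num_moves() == 20
assert env.vectorized_observation_shape()[0] == 658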
def __init__(self, flags):
    """Initialize runner."""
    self.flags = flags
    self.agent_config = {
        'players': flags['players'],
        'player_id': 0,
        'mcts_types': flags['mcts_types']
    }
    self.environment = make('Hanabi-Full', num_players=flags['players'])
    self.agent_classes = [
        AGENT_CLASSES[agent_class] for agent_class in flags['agent_classes']
    ]
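# A sketch of how the per-player classes above might be instantiated, giving
# each agent its own player_id; the loop itself is an assumption, not part
# of the original snippet.
self.agents = [
    agent_class({**self.agent_config, 'player_id': pid})
    for pid, agent_class in enumerate(self.agent_classes)
]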
def create_environment(game_type='Hanabi-Small', num_players=4):
    """Creates the Hanabi environment.

    Args:
      game_type: Type of game to play. Currently the following are supported:
        Hanabi-Full: Regular game.
        Hanabi-Small: The small version of Hanabi, with 2 cards and 2 colours.
      num_players: Int, number of players to play this game.

    Returns:
      A Hanabi environment.
    """
    return rl_env.make(
        environment_name=game_type, num_players=num_players,
        pyhanabi_path=None)
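# A usage sketch for create_environment, following rl_env's dict-observation
# API: reset() returns a dict with 'current_player' and per-player
# 'player_observations'.
env = create_environment(game_type='Hanabi-Full', num_players=2)
observations = env.reset()
cur = observations['current_player']
print('Current player:', cur)
print('Legal moves:',
      observations['player_observations'][cur]['legal_moves'])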
def run(ix, initialize=False):
    # initialize env
    env = rl_env.make('Hanabi-Full', num_players=flags['players'])
    agent_config = {
        'players': flags['players'],
        'num_moves': env.num_moves(),
        'observation_size': env.vectorized_observation_shape()[0],
        'model_name': str(ix),
        'initialize': initialize
    }
    agent = NeuroEvoAgent(agent_config)

    avg_reward = 0
    avg_steps = 0
    for eps in range(flags['num_episodes']):
        obs = env.reset()  # Observation of all players
        done = False
        agent_id = 0
        while not done:
            ob = obs['player_observations'][agent_id]
            try:
                action = agent.act(ob)
            except ValueError:
                print('Something went wrong. Try to reinitialize the agents '
                      'pool by using --initialize True')
                exit()
            obs, reward, done, _ = env.step(action)
            avg_reward += reward
            avg_steps += 1
            if done:
                break
            # change player
            agent_id = (agent_id + 1) % flags['players']
    n_eps = float(flags['num_episodes'])
    avg_steps /= n_eps
    avg_reward /= n_eps
    agent.save(model_name=str(ix))
    scores[ix] = avg_reward * 1000 + avg_steps
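# scores[ix] above implies state shared across worker processes. A minimal
# sketch of one possible driver, assuming a fork-based multiprocessing
# setup; the 'population_size' flag and the shared Array are assumptions.
import multiprocessing as mp

scores = mp.Array('d', flags['population_size'])
procs = [mp.Process(target=run, args=(ix,))
         for ix in range(flags['population_size'])]
for p in procs:
    p.start()
for p in procs:
    p.join()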
def __init__(self, flags):
    """Initialize runner."""
    self.flags = flags
    self.env = rl_env.make('Hanabi-Full', num_players=flags['players'])
    # create configurations
    self.agent_config, self.agent_2_config = self.generate_config(flags)
    # use configurations to create agent
    self.agent = load_agent(flags['agent_class'])(self.agent_config)
    if flags['agent2_class'] != flags['agent_class']:
        # use configurations to create second agent
        self.agent2 = load_agent(flags['agent2_class'])(self.agent_2_config)
def __init__(self, args, game_type='Hanabi-Full', num_players=2,
             num_unique_agents=6, num_games=10):  # Changed from None to 10
    self.game_type = game_type
    self.num_players = num_players
    self.num_unique_agents = num_unique_agents
    self.num_games = num_games
    self.environment = rl_env.make(game_type, num_players=self.num_players)
    self.agent_config = {
        'players': self.num_players,
        'num_moves': self.environment.num_moves(),
        'observation_size': self.environment.vectorized_observation_shape()[0]
    }
    self.available_agents = import_agents(args.agentdir, num_unique_agents,
                                          self.agent_config)
def __init__(self, flags):
    """Initialize runner."""
    self.flags = flags
    self.agent_config = {'players': flags['players']}
    self.environment = rl_env.make('Hanabi-Full',
                                   num_players=flags['players'])
    self.agent_class = [
        SimpleAgent,
        RandomAgent,
        LossAverseAgent,
        lambda config: create_tf_agent(
            self.environment, 'Rainbow',
            'agents/rainbow/tmp/hanabi_rainbow/checkpoints'),
        lambda config: create_tf_agent(
            self.environment, 'DQN',
            'agents/rainbow/tmp/hanabi_dqn/checkpoints'),
        lambda config: create_tf_agent(
            self.environment, 'Rainbow', 'agents/rainbow/tmp/pretrained/'),
        HeuristicAgent,
    ]
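# The entries above mix plain agent classes and lambdas, but every entry
# accepts a single config argument, so they can be instantiated uniformly.
# A sketch; the surrounding loop is an assumption:
agents = [make_agent(self.agent_config) for make_agent in self.agent_class]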
def __init__(self, agent_class, numAgents=-1, load=False, size=1000000):
    """
    Args:
        agent_class (string): the class of the agent, which can be one of:
            - 'SimpleAgent'
            - 'RainbowAgent'
            - 'RandomAgent'
        numAgents (int, optional): the number of agents
        load (boolean, optional): whether to load possibly existing data
            for the given class of agents.
        size (int, optional): how many steps are going to be saved; the
            default is 1M. This size is used to allocate memory at the
            beginning.
    """
    self.size = size
    self.ptr = 0
    self.ep_start_id = self.ptr
    self.full = False
    # 'self.path' is assumed to be a base directory set on the class.
    self.path = os.path.join(self.path, agent_class)

    if not load and numAgents == -1:
        print("Bad parameter initialization. Use either 'numAgents' or "
              "'load' to initialize the object.")
        exit()

    if load:
        # load the configurations from file
        self.config = pickle.load(
            open(os.path.join(self.path, "config.pickle"), "rb"))
        numAgents = self.config["numAgents"]
    else:
        self.config = {}  # create empty dict
        self.config["numAgents"] = numAgents  # insert config data

    try:
        # detect the size of the observations
        env = rl_env.make(num_players=numAgents)
        obs = env.reset()
        self.config["size_obs"] = len(
            obs['player_observations'][0]['vectorized'])
        # detect the size of a move
        self.n_moves = env.num_moves()
        # initialize matrices for all values
        self.moves = np.empty(size, dtype=np.uint8)
        self.rs = np.empty(size)
        self.obs = np.empty((size, self.config["size_obs"]), dtype=bool)
        self.eps = []
        # initialize last episode
        self.last_ep = -1
    except BaseException:
        # if the environment can't be created, we can still load
        if numAgents == 2 or numAgents == 3:
            self.n_cards = 5
        elif numAgents == 4 or numAgents == 5:
            self.n_cards = 4
        else:
            print("ERROR: invalid number of players")
            return
        # 2 play/discard moves per card plus 10 hint moves per other player
        self.n_moves = (numAgents - 1) * 10 + self.n_cards * 2
        print("WARNING: the environment could not be created.")
        print("Some functionality may be compromised. You CAN still "
              "load data.")
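# A quick check of the fallback move count used above (a sketch): for a
# 2-player full game the environment reports 20 legal move ids, matching
# 2 * 5 play/discard moves plus (2 - 1) * 10 hint moves.
env = rl_env.make(num_players=2)
assert env.num_moves() == 2 * 5 + (2 - 1) * 10  # == 20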
def __init__(self, numAgents, numEpisodes):
    self.eps = numEpisodes
    self.players = numAgents
    self.env = rl_env.make(num_players=numAgents)
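# A run-loop sketch that would pair with the __init__ above (hypothetical
# method; the random stand-in policy is an assumption). It follows rl_env's
# protocol: step() takes one of the current player's legal moves.
import random

def run(self):
    for _ in range(self.eps):
        observations = self.env.reset()
        done = False
        while not done:
            current = observations['current_player']
            obs = observations['player_observations'][current]
            action = random.choice(obs['legal_moves'])  # stand-in policy
            observations, reward, done, _ = self.env.step(action)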
def load_env(variant="Hanabi-Full", players=4):
    pyhanabi_env = rl_env.make(environment_name=variant, num_players=players)
    py_env = pyhanabi_env_wrapper.PyhanabiEnvWrapper(pyhanabi_env)
    return py_env
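# Because load_env returns a wrapped PyEnvironment, it can be sanity-checked
# with TF-Agents' validator. A sketch, assuming tf_agents is installed and
# PyhanabiEnvWrapper implements the PyEnvironment interface.
from tf_agents.environments import utils

py_env = load_env(variant='Hanabi-Full', players=4)
utils.validate_py_environment(py_env, episodes=3)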
def main(args):
    """Observation & action generation.

    Generate binary observations and one-hot encoded action vectors based on
    game logs from running the WTFWT agent. Observations are saved in the
    following format:

                     turn 0 ... turn n           turn 0 ... turn n
        Game 0 [[[obs_0], ..., [obs_n]], [[act_0], ..., [act_n]],
        Game 1  [[obs_0], ..., [obs_n]], [[act_0], ..., [act_n]],
        ...
        Game m  [[obs_0], ..., [obs_n]], [[act_0], ..., [act_n]]]

    Arguments:
        - args: Namespace
            Arguments taken from the command line. For details, run
            python3 create_WTFWT_data.py --help

    Raises:
        - AssertionError for mismatches between the WTFWT and DM HanabiEnv.
        - ValueError when parsing unknown formats.
    """
    print('Seed used: %d' % args.seed)

    # Handled by build_env.sh
    # Make hanabi_env & import it
    # run('(cd {}/ && cmake -Wno-dev . && make)'.format(PATH_HANABI_ENV), args.q)
    import rl_env

    random.seed(args.seed)
    combined_data = []
    # For the specified number of games
    for i in range(args.num_games):
        game_data = [[], []]
        # Generate the game logs and decks
        s = random.randint(0, 2**31 - 1)  # seed for WTFWT
        cmd = ('cargo run -q --manifest-path {}/WTFWT/Cargo.toml -- -n 1 -o 1 '
               '-s {} -p {} -g info').format(PATH_ORIGINAL_AGENTS, s,
                                             args.num_players)
        debug = ['', ' -l debug'][args.debug]
        run(cmd + debug, args.q)

        with open('dk_cards.csv') as f_dk, open('rust_agent.csv') as f_log:
            reader = csv.reader(f_dk)
            dk = next(reader)[0].upper()
            # The deck in the Rust env starts from the right and is indexed
            # from 1
            dk = [x[0] + str(int(x[1]) - 1) for x in dk.split('-')[::-1]]
            env = rl_env.make('Hanabi-Full', num_players=args.num_players)
            obs = env.reset(dk)
            header = (['pid', 'turn'] +
                      ['p%d_cards' % i for i in range(args.num_players)] +
                      ['discards', 'action', 'firework', 'rem_life',
                       'rem_info', 'rem_deck'])
            reader = csv.reader(f_log)
            # For each turn in a game
            for row in reader:
                row = dict(zip(header, row))
                if args.debug:
                    comp_test(env, row, obs, args)
                action = parse_action(row, args.num_players)
                # Store the data
                cur_obs = obs['player_observations'][obs['current_player']]
                vec_act = one_hot_vectorized_action(action, env.num_moves(),
                                                    cur_obs)
                game_data[0].append(b2int.convert(cur_obs['vectorized']))
                game_data[1].append(vec_act)
                # Advance the state
                obs, reward, done, info = env.step(action)
        assert (done is True)
        combined_data.append(game_data)

    savepath = os.path.join(
        args.savedir,
        'wtfwt_' + str(args.num_players) + '_' + str(args.num_games) + '.pkl')
    with open(savepath, 'wb') as f:
        pickle.dump(combined_data, f)
    os.remove('dk_cards.csv')
    os.remove('rust_agent.csv')
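# A sketch of reading the pickle written above: combined_data[g][0] holds
# the per-turn encoded observations for game g and combined_data[g][1] the
# one-hot actions. The filename below is an example for 2 players, 10 games.
import pickle

with open('wtfwt_2_10.pkl', 'rb') as f:
    combined_data = pickle.load(f)
game0_obs, game0_acts = combined_data[0]
print(len(game0_obs), 'turns in game 0')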
def load_hanabi_env(env_name="Hanabi-Full", num_players=4):
    pyhanabi_env = rl_env.make(environment_name=env_name,
                               num_players=num_players)
    py_env = pyhanabi_env_wrapper.PyhanabiEnvWrapper(pyhanabi_env)
    return py_env
def run(self):
    """Run episodes."""
    gin_files = ['agents/rainbow/configs/hanabi_rainbow_explicit.gin']
    run_experiment.load_gin_configs(gin_files, [])
    environment = rl_env.make('Hanabi-Full-CardKnowledge', num_players=2)
    obs_stacker = run_experiment.create_obs_stacker(environment)
    agent = run_experiment.create_agent(environment, obs_stacker)

    # restore the Rainbow checkpoint
    base_dir = "agents/rainbow/data"
    checkpoint_file_prefix = "ckpt"
    checkpoint_dir = '{}/checkpoints'.format(base_dir)
    experiment_logger = logger.Logger('{}/logs'.format(base_dir))
    run_experiment.initialize_checkpointing(agent, experiment_logger,
                                            checkpoint_dir,
                                            checkpoint_file_prefix)
    obs_stacker.reset_stack()
    # Full game observation, not to be passed to agents
    observations = environment.reset()
    current_player, legal_moves, observation_vector = (
        run_experiment.parse_observations(observations,
                                          environment.num_moves(),
                                          obs_stacker))
    has_played = {current_player}
    action = agent.begin_episode(current_player, legal_moves,
                                 observation_vector)

    # Stage-compliant printing:
    # observations["player_observations"] has an element for every player
    hands_list = get_list_cards(observations)
    pp(hands_list[0])
    pp(hands_list[1])
    hand_count = len(hands_list[0])
    step_list = []
    is_done = False
    reward_since_last_action = np.zeros(environment.players)
    score = 0
    while not is_done:
        pp("~~~~")
        # convert the integer action into its dict form
        lm = observations["player_observations"][current_player][
            "legal_moves"]
        lmi = observations["player_observations"][current_player][
            "legal_moves_as_int"]
        action_unf = None
        for ind in range(len(lm)):
            if lmi[ind] == action:
                action_unf = lm[ind]
        assert action_unf is not None

        # do the step (important!)
        observations, reward, is_done, _ = environment.step(action.item())
        pp(observations["current_player"], "doing", action_unf)
        i = 0
        for x in observations["player_observations"]:
            i += 1
            pp(i, "player")
            for k in x["observed_hands"]:
                pret_list(k)
        cpa = action_unf["action_type"]
        cpacopy = action_unf.copy()
        if cpa == "PLAY" or cpa == "DISCARD":
            new_card = find_new_card(observations)
            pp("new_card:", new_card)
            cpacopy["new_card"] = new_card
        step_list.append(cpacopy)
        pp(observations["player_observations"][0]["fireworks"])

        # quit if done
        if is_done:
            fire = observations["player_observations"][0]["fireworks"]
            score = sum(fire[x] for x in fire)
            print("Score=", score, fire)
            break

        current_player, legal_moves, observation_vector = (
            run_experiment.parse_observations(observations,
                                              environment.num_moves(),
                                              obs_stacker))
        if current_player in has_played:
            action = agent.step(reward_since_last_action[current_player],
                                current_player, legal_moves,
                                observation_vector)
        else:
            # Each player begins the episode on their first turn (which may
            # not be the first move of the game).
            action = agent.begin_episode(current_player, legal_moves,
                                         observation_vector)
            has_played.add(current_player)

    # Stage-compliant output: write the initial hands and the step list
    s = ""
    for h in hands_list:
        s += format_hand(h) + "\n"
    for x in step_list:
        s += format_step(x, hand_count) + "\n"
    with open("test_{}".format(score), "w") as f:
        f.write(s)
    print(s)