def main():
    '''Simple function to bootstrap a game.

       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    # print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    agent_list = [
        agents.PlayerAgent(),
        agents.RandomAgent(),
        agents.RandomAgent(),
        agents.RandomAgent(),
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
        print('Episode {} finished'.format(i_episode))
    env.close()
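# The snippets in this file define main() but omit the script entry point;
# a minimal guard (assumed here, matching the upstream pommerman examples) is:
if __name__ == '__main__':
    main()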
def main():
    '''Simple function to bootstrap a game.

       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.RandomAgent(),
        # agents.DockerAgent("multiagentlearning/hakozakijunctions", port=12345),
        # agents.DockerAgent("multiagentlearning/eisenach", port=12345),
        agents.DockerAgent("multiagentlearning/skynet955", port=12345),
    ]
    # Make the "Team Competition" environment using the agent list
    env = pommerman.make('PommeTeamCompetition-v1', agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
        print('Episode {} finished'.format(i_episode))
        print("Final Result: ", info)
    env.close()
def main():
    '''Simple function to bootstrap a game.'''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    agent_list = [
        MyAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
    ]
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            # This renders the game
            env.render()
            # This is where we give an action to the environment
            actions = env.act(state)
            # This performs the step and gives back the new information
            state, reward, done, info = env.step(actions)
        print('Episode: {:2d} finished'.format(i_episode))
    env.close()
def run_game(self, env_name):
    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    # Limit the agents for one vs one
    if 'oneVsOne' in env_name:
        agent_list = agent_list[:2]

    env = pommerman.make(env_name, agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            # env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
        print('Episode {} finished'.format(i_episode))
    env.close()
def main():
    '''Simple function to bootstrap a game.

       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        env.render()
        img = env._viewer.get_buffer().get_texture().get_image_data()
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            # data = img.get_data("RGB", img.width * 3)
            # arr = np.frombuffer(data, dtype=np.uint8)
            # reshaped_array = arr.reshape(img.width, img.height, 3)
        print('Episode {} finished'.format(i_episode))
    env.close()
def main():
    '''Simple function to bootstrap a game.

       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    shape = (14, 11, 11)
    n_actions = 6
    n_filters_per_layer = 64
    n_cnn_layers = 4
    nn_model = CNNBatchNorm(input_feature_shape=shape,
                            n_actions=n_actions,
                            n_filters_per_layer=n_filters_per_layer,
                            n_cnn_layers=n_cnn_layers)
    # CHANGE THIS to your actual checkpoint file. Currently assumes you are
    # calling the script from the main dir.
    nn_path = './output/NN_MODELS/ppo_CNN4_64_199.pt'
    nn_model.load_state_dict(
        torch.load(nn_path, map_location=lambda storage, loc: storage))
    selection = 'softmax'
    nn_agent = NNAgent(nn_model, action_selection=selection, is_training=False)
    nn_agent2 = NNAgent(nn_model, action_selection=selection, is_training=False)

    idx = 0
    team_id = (idx + 2) % 4
    # env_id = "PommeFFACompetition-v0"
    # env_id = "PommeTeamCompetition-v0"
    env_id = "SimpleTeam-v0"
    agent_list = [
        agents.RandomAgent(),
        agents.SlowRandomAgentNoBomb(),
        agents.RandomAgent(),
        agents.SlowRandomAgentNoBomb(),
        # agents.PlayerAgent(),
        # agents.RandomAgent(),
    ]
    agent_list[idx] = nn_agent
    agent_list[team_id] = nn_agent2

    # Make the environment using the agent list
    env = pommerman.make(env_id, agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            # a = nn_agent.act(state[idx], env.action_space, 'softmax') if nn_agent.is_alive else 0
            # actions[idx] = a
            # print('actions', actions, 'nn alive', nn_agent.is_alive)
            state, reward, done, info = env.step(actions)
            # if not nn_agent.is_alive: print('dead')
        print('Episode {} finished'.format(i_episode))
        print("Final Result: ", info)
    env.close()
def main():
    # Print all possible environments in the Pommerman registry
    print(pommerman.registry)

    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFA-v0', agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
        print('Episode {} finished'.format(i_episode))
    env.close()
def main():
    opponents = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.RandomAgent(),
    ]
    _train(opponents, train_from_scratch=True)
    _test(opponents, 100, render=False)
def main():
    '''Simple function to bootstrap a game.

       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    lstm_nn_model = CNN_LSTM(input_feature_shape=(9, 11, 11),
                             n_actions=6,
                             n_filters_per_layer=64,
                             n_cnn_layers=6)
    lstm_nn_model.load_state_dict(
        torch.load(
            '/home/cgao3/pommerman/pommerman/agents/LOGS/ppo_cnn_lstm_cnn_6_64_27.pt',
            map_location=lambda storage, loc: storage))
    # torch.load('my_file.pt', map_location=lambda storage, loc: storage)
    # maps CUDA tensors to CPU
    nn_agent = NNAgent(lstm_nn_model)
    nn_agent2 = NNAgent(lstm_nn_model)

    idx = 0
    team_id = (idx + 2) % 4
    # env_id = "PommeFFACompetition-v0"
    env_id = "PommeTeamCompetition-v0"
    agent_list = [
        # nn_agent,
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        # agents.SimpleAgent(),
        agents.RandomAgent(),
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    agent_list[idx] = nn_agent
    agent_list[team_id] = nn_agent2

    # Make the "Team Competition" environment using the agent list
    env = pommerman.make(env_id, agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(20):
        state = env.reset()
        done = False
        while not done:
            # env.render()
            actions = env.act(state)
            # a = nn_agent.act(state[idx], env.action_space, 'softmax') if nn_agent.is_alive else 0
            # actions[idx] = a
            print('actions', actions, 'nn alive', nn_agent.is_alive)
            state, reward, done, info = env.step(actions)
            # if not nn_agent.is_alive: print('dead')
        print('Episode {} finished'.format(i_episode))
    env.close()
def __init__(self, config=None):
    # self._observation_spec = ['board', 'bomb_blast_strength', 'bomb_life',
    #     'position', 'ammo', 'blast_strength', 'can_kick', 'teammate',
    #     'enemies', 'message']
    self.adversarial = config['adversarial']

    # Create a set of agents (exactly four)
    agent_list = [
        agents.RandomAgent(),
        agents.RandomAgent(),
        agents.RandomAgent(),
        agents.RandomAgent(),
    ]
    # Make the "Radio Competition" environment using the agent list
    self._env = pommerman.make('PommeRadioCompetition-v2', agent_list)
def main():
    """Simple function to bootstrap a game"""
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.HttpAgent(port=10080, host="localhost"),
    ]
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
        print('Episode {} finished'.format(i_episode))
    env.close()
def __init__(self,
             env_id,
             random_side=True,
             agent_list=None,
             rule_agents=[],
             replay_dir=None,
             n_player=4):
    self.n_player = n_player
    self.base_agents = [agents.RandomAgent() for _ in range(n_player)]
    if agent_list is None:
        self.agent_list = self.base_agents
    else:
        assert isinstance(agent_list, str)
        agent_list = agent_list.split(',')
        assert len(agent_list) == n_player
        self.agent_list = [
            helpers.make_agent_from_string(agent, i)
            for i, agent in enumerate(agent_list)
        ]
    # Make the environment using the agent list
    env = pommerman.make(env_id, self.agent_list)
    if agent_list is not None:
        for id_, agent in enumerate(self.base_agents):
            agent.init_agent(id_, env.spec._kwargs['game_type'])
    super(PommeBase, self).__init__(env)
    self.rule_agents = rule_agents
    self._random_side = random_side
    self.random_side()
    self._uuid = str(uuid.uuid1())[:8]
    self._replay_dir = replay_dir
    self._replay_data = {"mode": str(env_id)}
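# Hypothetical usage of the wrapper above; the "module::class" agent-string
# format is an assumption borrowed from the deep_agents string used further
# down in this file:
#
#   env = PommeBase('PommeFFACompetition-v0',
#                   agent_list='test::agents.SimpleAgent,'
#                              'test::agents.RandomAgent,'
#                              'test::agents.SimpleAgent,'
#                              'test::agents.RandomAgent')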
def main():
    # Instantiate the environment
    agent_list = [
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        ddpg_agent,
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    env = pommerman.make(args.env_name, agent_list)
    env.seed(RANDOM_SEED)  # Random seed
    agent_num = 0
    env = EnvWrapper(env, num_agent=agent_num)

    # Generate training data
    stimulator = save_episodes(env)
    stimulator.stimulate()

    observations = []
    actions = []
    rewards = []
    for episode in stimulator.episodes:
        observations.append(episode.observations)
        actions.append(episode.actions)
        rewards.append(episode.reward)

    observations_merged = np.concatenate(observations)
    actions_merged = np.concatenate(actions)
    rewards_merged = np.concatenate(rewards)

    np.save(train_data_obs, observations_merged)
    np.save(train_data_labels, actions_merged)
    np.save(train_data_reward, rewards_merged)
def main():
    '''Simple function to bootstrap a game.

       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Generate a json every 5 episodes
    json_check = 5

    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
    ]
    deep_agents = ('test::agents.SimpleAgent,test::agents.RandomAgent,'
                   'test::agents.RandomAgent,test::agents.SimpleAgent')
    # agents.DockerAgent("multiagentlearning/hakozakijunctions", port=12345),
    # agents.DockerAgent("multiagentlearning/eisenach", port=12345),
    # agents.DockerAgent("multiagentlearning/skynet955", port=12345),

    # Make the "Free-For-All" environment using the agent list
    config = 'PommeFFACompetition-v0'
    # config = 'PommeTeamCompetition-v1'
    env = pommerman.make(config, agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(20):
        if i_episode % json_check == 0:
            # GIVES ME ERROR DURING env.save_json for anything except FFA
            fight.run(config,
                      deep_agents,
                      record_json_dir="test_json/test_json" + str(i_episode))
        else:
            state = env.reset()
            done = False
            while not done:
                actions = env.act(state)
                state, reward, done, info = env.step(actions)
            print('Episode {} finished'.format(i_episode))
            print("Final Result: ", info)
    env.close()
def _thunk():
    agent_list = [
        # agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.BaseAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent()
    ]
    env = pommerman.make(env_id, agent_list)
    return env
def main():
    tf.reset_default_graph()
    # Print all possible environments in the Pommerman registry
    # print(pommerman.registry)
    sess = tf.Session()
    # sess.run(tf.global_variables_initializer())
    # sess = tf_debug.TensorBoardDebugWrapperSession(sess, 'localhost:6064')

    # Create a set of agents (exactly four)
    ddpg_agent = DdpgAgent(id=3, sess=sess)
    agent_list = [
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        ddpg_agent,
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    env = pommerman.make(args.env_name, agent_list)
    env.seed(RANDOM_SEED)

    ddpg_agent.train_transformer(sess, env)

    r_sum = np.zeros(1)
    for i in range(args.num_steps):
        # Make the "Free-For-All" environment using the agent list
        env.reset()
        # Run the episodes just like OpenAI Gym
        for i_episode in range(args.max_episode_length):
            state = env.reset()
            done = False
            while not done:
                # if args.display:
                #     env.render()
                actions = env.act(state)
                state, reward, done, info = env.step(actions)
                r_sum[i] += reward[0]
            if i_episode > 300:
                break
        print('Game {} finished'.format(i))
    np.savetxt(args.outdir + '/result_2simple_2random.csv', r_sum, fmt='%1.4e')
    env.close()
def generate_data(EPISODES, save_file_nm, shuffle_agents=False):
    rnn_agent = RNN_Agent()

    # Init dataset
    dset = dataset(rnn_agent.RNN_SEQUENCE_LENGTH, save_file_nm,
                   rnn_agent.utils)
    if os.path.exists(save_file_nm):
        dset.load()

    agent_list = [
        rnn_agent,
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent()
    ]
    if shuffle_agents:
        shuffle(agent_list)
    # Look up the index after shuffling, so it stays valid
    rnn_agent_index = agent_list.index(rnn_agent)
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    wins = {}
    iter_num = 0
    for an_episode in range(EPISODES):
        state = env.reset()
        # -------------------------------------------------------------------
        done = False
        episode_obs = []
        episode_acts = []
        # while not done and rnn_agent.is_alive:
        while not done:
            # env.render()
            actions = env.act(state)
            episode_acts.append(actions[rnn_agent_index])
            episode_obs.append(rnn_agent.utils.input(state[rnn_agent_index]))
            state, reward, done, info = env.step(actions)
            iter_num += 1
        # -------------------------------------------------------------------
        # Final timestep observation
        episode_obs.append(rnn_agent.utils.input(state[rnn_agent_index]))
        dset.add_episode(episode_obs, episode_acts)
        # print(info)
        # print("Median Act Time: {} seconds".format(np.median(np.array(rnn_agent.act_times))))

    env.close()
    dset.save()
    rnn_agent.sess.close()
    tf.reset_default_graph()
def load_game(self):
    self.hide()
    with open('./replay/000.pickle', 'rb') as f:
        replay_game = pickle.load(f)
    num_players = replay_game.pop()
    agents_list = [agents.RandomAgent() for i in range(num_players)]
    env = pommerman.make('PommeFFACompetition-v0',
                         agents_list,
                         game_state_file='./replay/000.json')
    # Run the episodes just like OpenAI Gym
    env.reset()
    for actions in replay_game:
        env.render()
        env.step(actions)
    env.close()
    self.show()
def main():
    env = pommerman.make('PommeFFACompetition-v0', [
        agents.PlayerAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        StoppingAgent(),
    ])

    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            # 'winners' is only present when the game did not end in a tie
            if done and 'winners' in info:
                win_player = info['winners'][0] + 1
                print(f'win {win_player}P')
        print(f'Episode {i_episode} finished')
    env.close()
def main():
    # Print all possible environments in the Pommerman registry
    print(pommerman.registry)
    # sess = tf.Session()
    # sess = tf_debug.TensorBoardDebugWrapperSession(sess, 'localhost:6064')

    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.RandomAgent(),
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    # Create the Estimator
    estimator_nn1 = tf.estimator.Estimator(model_fn=model_NN1,
                                           model_dir="/tmp/sa_nn1")
    # Set up logging for predictions
    tensors_to_logNN1 = {"probabilities": "softmax_tensor"}
    logging_hook_nn1 = tf.train.LoggingTensorHook(tensors=tensors_to_logNN1,
                                                  every_n_iter=50)

    # Create the Estimator
    estimator_nn2 = tf.estimator.Estimator(model_fn=model_NN2,
                                           model_dir="/tmp/sa_nn2")
    # Set up logging for predictions
    tensors_to_logNN2 = {"probabilities": "softmax_tensor"}
    logging_hook_nn2 = tf.train.LoggingTensorHook(tensors=tensors_to_logNN2,
                                                  every_n_iter=50)

    r_sum = np.zeros(1)
    for i in range(1):
        # Make the "Free-For-All" environment using the agent list
        env.reset()
        # Run the episodes just like OpenAI Gym
        for i_episode in range(1):
            state = env.reset()
            done = False
            curr_state = None
            prev_state = None
            graph = np.random.rand(4, 30).astype("float32") + 0.0001
            # print(graph)
            pr_action = None
            pr_pr_action = None
            while not done:
                # env.render()
                actions = env.act(state)
                state, reward, done, info = env.step(actions)
                # as basic implementation I consider only one agent
                r_sum[i] += reward[0]
                prev_state = curr_state
                curr_state = state
                if pr_pr_action is not None:
                    # Train the model
                    for agent_num in range(4):
                        train_input_NN2 = tf.estimator.inputs.numpy_input_fn(
                            x={
                                "state1": np.resize(
                                    state_to_matrix_with_action(
                                        curr_state[agent_num],
                                        action=pr_action[agent_num])
                                    .astype("float32"), (1, 49 * 11)),
                                "graph": np.resize(graph, (1, 4 * 30))
                            },
                            y=np.asarray([actions[agent_num]]),
                            batch_size=1,
                            num_epochs=None,
                            shuffle=True)
                        train_input_NN1 = tf.estimator.inputs.numpy_input_fn(
                            x={
                                "state1": np.resize(
                                    state_to_matrix_with_action(
                                        prev_state[agent_num],
                                        action=pr_pr_action[agent_num])
                                    .astype("float32"), (1, 49 * 11)),
                                "state2": np.resize(
                                    state_to_matrix_with_action(
                                        curr_state[agent_num],
                                        action=pr_action[agent_num])
                                    .astype("float32"), (1, 49 * 11))
                            },
                            y=np.asmatrix(graph.flatten()),
                            batch_size=1,
                            num_epochs=None,
                            shuffle=True)
                        estimator_nn1.train(input_fn=train_input_NN1,
                                            steps=200,
                                            hooks=[logging_hook_nn1])
                        estimator_nn2.train(input_fn=train_input_NN2,
                                            steps=200,
                                            hooks=[logging_hook_nn2])
                        predictions = estimator_nn2.predict(
                            input_fn=train_input_NN2)
                        # next_action = np.array(list(p['classes'] for p in predictions))
                pr_pr_action = pr_action
                pr_action = actions
            if i_episode > 300:
                break
        print('Game {} finished'.format(i))
    np.savetxt('result_2simple_2random.csv', r_sum, fmt='%1.4e')
    env.close()
def train_C_generate_data(EPISODES,
                          save_file_nm,
                          chk_point_folder,
                          sess_save_step=100,
                          load_model=None,
                          shuffle_agents=False,
                          record=False,
                          plot_reward=False,
                          add_agents=[
                              agents.SimpleAgent(),
                              agents.RandomAgent(),
                              agents.SimpleAgent()
                          ],
                          encourage_win=False,
                          learn=True):
    if plot_reward:
        plt.xlabel('Episode #')
        plt.ylabel('Average reward for last 100 episodes')

    # Init the agent
    rnn_agent = RNN_Agent(model_training='C')

    # For saving model
    saver = tf.train.Saver()
    if not os.path.exists(chk_point_folder):
        os.makedirs(chk_point_folder)

    # Try to recover previous model
    if load_model is not None:
        load_folder = load_model
    else:
        load_folder = chk_point_folder
    latest_model = tf.train.latest_checkpoint(load_folder)
    if latest_model is not None:
        saver.restore(rnn_agent.sess, latest_model)
        print("Restored ", latest_model)

    # Init dataset
    if record:
        dset = dataset(rnn_agent.RNN_SEQUENCE_LENGTH, save_file_nm,
                       rnn_agent.utils)
        if os.path.exists(save_file_nm):
            dset.load()

    # TensorBoard writer
    experimentFolder = datetime.now().isoformat(timespec='minutes')
    C_writer = tf.summary.FileWriter(
        './tboard/train_C_{}_{}'.format(
            save_file_nm.split('.')[0], experimentFolder),
        rnn_agent.sess.graph)
    rnn_agent.summary_writer = C_writer

    agent_list = [rnn_agent] + add_agents
    if shuffle_agents:
        shuffle(agent_list)
    rnn_agent_index = agent_list.index(rnn_agent)
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    mean_rewards_list = []
    episode_history = deque(maxlen=100)
    ties = deque(maxlen=100)
    rnn_wins = deque(maxlen=100)
    other_wins = deque(maxlen=100)
    for i_episode in range(EPISODES):
        # initialize
        state = env.reset()
        prev_state = np.copy(state)
        total_rewards = 0
        # -------------------------------------------------------------------
        done = False
        episode_obs = []
        episode_acts = []
        # while not done and rnn_agent.is_alive:
        t = 0
        wins = {}
        while not done and rnn_agent.is_alive:
            t += 1
            # env.render()
            actions = env.act(state)
            episode_acts.append(actions[rnn_agent_index])
            episode_obs.append(rnn_agent.utils.input(state[rnn_agent_index]))
            state, reward, done, info = env.step(actions)
            if not encourage_win:
                reward[rnn_agent_index] = reward[
                    rnn_agent_index] if not rnn_agent.is_alive else 0.1
            else:
                reward[rnn_agent_index] = reward[
                    rnn_agent_index] if not rnn_agent.is_alive else 0.09
            if encourage_win and done and 'winners' in info:
                reward[rnn_agent_index] = 5 if info['winners'][
                    0] == rnn_agent_index else -5
            # print("t: {} \t reward: {}\t Agent alive: {}".format(t, reward[rnn_agent_index], rnn_agent.is_alive))
            total_rewards += reward[rnn_agent_index]
            rnn_agent.storeRollout(
                np.concatenate(
                    (rnn_agent.utils.input(prev_state[rnn_agent_index]),
                     rnn_agent.rnn_state)),
                actions[rnn_agent_index],
                reward[rnn_agent_index])
            prev_state = np.copy(state)
        # -------------------------------------------------------------------
        if 'winners' in info:
            rnn_wins.append(1 if info['winners'][0] == rnn_agent_index else 0)
            other_wins.append(
                1 if info['winners'][0] != rnn_agent_index else 0)
            wins_ratio = np.mean(other_wins) / np.mean(rnn_wins)
            tflog('Other wins/agent wins ratio (100 wins)', wins_ratio)
            # print('Other wins/agent wins ratio (100 wins)', wins_ratio)
        ties.append(1 if 'Tie' in info else 0)
        tie_ratio = np.mean(ties) / np.mean(rnn_wins)
        # tflog('ties/agent wins ratio (100 steps)', tie_ratio)

        # Final timestep observation
        episode_obs.append(rnn_agent.utils.input(state[rnn_agent_index]))
        if record:
            dset.add_episode(episode_obs, episode_acts)

        rnn_agent.update_C()

        episode_history.append(total_rewards)
        mean_rewards = np.mean(episode_history)
        print("Episode {}".format(i_episode))
        print("Finished after {} timesteps".format(t + 1))
        print("Reward for this episode: {}".format(total_rewards))
        print("Average reward for last 100 episodes: {:.2f}".format(
            mean_rewards))
        mean_rewards_list.append(mean_rewards)
        # tflog('Iteration Number', rnn_agent.train_iteration)
        tflog('Average reward for last 100 episodes', mean_rewards)

        # Save the model
        if i_episode % sess_save_step == 0:
            if learn:
                saver.save(rnn_agent.sess,
                           chk_point_folder,
                           global_step=rnn_agent.C_step)
            if record:
                dset.save()
            # Plot rewards
            if plot_reward:
                x = np.arange(i_episode + 1)
                # Linear Reg
                fit = np.polyfit(x, mean_rewards_list, 1)
                fit_fn = np.poly1d(fit)
                plt.plot(x, mean_rewards_list, '.', x, fit_fn(x), '--k')
                plt.savefig("test.png")
                plt.gcf().clear()

    # print(info)
    print("Median Act Time: {} seconds".format(
        np.median(np.array(rnn_agent.act_times))))
    env.close()
    rnn_agent.sess.close()
    tf.reset_default_graph()
def main():
    if platform.system() == 'Darwin':
        os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
    process = psutil.Process(os.getpid())

    # MARK: - Create the environment
    agent_list = [
        agents.RandomAgent(),
        agents.SimpleAgent(),
        # agents.SimpleAgent(),
        # agents.SimpleAgent(),
    ]
    env = pommerman.make('OneVsOne-v0', agent_list, render_mode='human')

    # MARK: - Allowing to save the model
    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    checkpoint_path = os.path.join(".", "models", now, "-{epoch:04d}.ckpt")

    # MARK: - Log for tensorboard
    log_dir = os.path.join("logs", now)
    tensorflow_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                         profile_batch=5,
                                                         histogram_freq=1)
    file_writer_rewards = tf.summary.create_file_writer(log_dir + "/metrics")

    # =========== (HYPER)PARAMETERS AND VARIABLES =========== #
    LIST_SIZE = 10000
    D = deque(maxlen=LIST_SIZE)
    DISCOUNT_RATE = 0.8
    TAU = 0
    MAX_TAU = 1000
    ACTION_SPACE = env.action_space.n
    TIME_CHANNELS_SIZE = 1
    INPUT_SHAPE = list(env.get_observation_space()) + [TIME_CHANNELS_SIZE]
    BATCH_SIZE = 32
    N = BATCH_SIZE
    N_EPISODES = 1000
    EXPLORATION_BASE = 1.02
    EXPLORATION_RATE = 1
    MINIMAL_EXPLORATION_RATE = 0.01
    TD_ERROR_DEFAULT = 0
    FRAME_COUNT = 0
    print(f"Pixel space of the game {INPUT_SHAPE}")

    # ================== CONTINUE TRAIN FROM LOADED MODEL ==================== #
    # approximator_model = create_model(INPUT_SHAPE, ACTION_SPACE)
    # target_model = create_model(INPUT_SHAPE, ACTION_SPACE)
    #
    # MODEL_PATH = "models/20200119-121818"
    # latest = tf.train.latest_checkpoint(MODEL_PATH)
    # print(f"Loading model from {latest}")
    #
    # approximator_model.load_weights(latest)
    # target_model.load_weights(latest)
    # ======================================================================== #

    # =================== START WITH NEW MODEL =============================== #
    approximator_model = create_model(INPUT_SHAPE, ACTION_SPACE)
    target_model = create_model(INPUT_SHAPE, ACTION_SPACE)
    # ======================================================================== #

    # ===== INITIALISATION ======
    acc_nonzeros = []
    actions_available = [
        str(action).split(".")[1] for action in constants.Action
    ]

    print("Running the init")
    for n in range(N):
        if FRAME_COUNT > BATCH_SIZE:
            break
        state_obs = env.reset()
        done = False
        while not done:
            FRAME_COUNT += 1
            actions_all_agents = env.act(state_obs)
            state_obs, reward, done, info, pixels = env.step2(
                actions_all_agents)
            D.append([
                preprocess(pixels), reward[0], actions_all_agents[0],
                reward[0], done
            ])
        print('Init episode {} finished'.format(n))

    for episode in range(N_EPISODES):
        start_time = time.time()
        if TAU >= MAX_TAU:
            TAU = 0
            # Copy the weights from policy model to target model
            target_model.set_weights(approximator_model.get_weights())
            print("===> Updated weights")

        EXPLORATION_RATE = np.power(
            EXPLORATION_BASE, -episode
        ) if EXPLORATION_RATE > MINIMAL_EXPLORATION_RATE else MINIMAL_EXPLORATION_RATE
        # EXPLORATION_RATE = 1 - (episode * 1 / N_EPISODES) if EXPLORATION_RATE > MINIMAL_EXPLORATION_RATE else MINIMAL_EXPLORATION_RATE
        print(
            f"Running episode {episode} with exploration rate: {EXPLORATION_RATE}"
        )

        # Initial step for the episode
        state_obs = env.reset()
        actions = env.act(state_obs)
        initial_observation, reward, done, info, pixels = env.step2(
            actions, render=True)
        state = preprocess(pixels)
        done = False
        # next_state = initial_state.copy()  # To remove all the information of the last episode

        episode_rewards = []
        frame_cnt = 0
        acc_qs = []
        acc_actions = []
        acc_frames = []
        action_str = ""
        while not done:
            # https://danieltakeshi.github.io/2016/11/25/frame-skipping-and-preprocessing-for-deep-q-networks-on-atari-2600-games/
            frame_cnt += 1
            TAU += 1
            actions_all_agents = env.act(state_obs)
            action = actions_all_agents[0]
            q_values = np.zeros((1, ACTION_SPACE))
            is_explore = random.choices(
                (True, False), (EXPLORATION_RATE, 1 - EXPLORATION_RATE))[0]
            if not is_explore:
                # Greedy action
                init_mask = tf.ones([1, ACTION_SPACE])
                init_state = state
                q_values = approximator_model.predict(
                    [tf.reshape(init_state, [1] + INPUT_SHAPE), init_mask])
                action = np.argmax(q_values)
                actions_all_agents[0] = action
            acc_qs.append(q_values[0])
            # print(action_str) if action_str != f"Action taken: {actions_available[action]}" else None

            state_obs, reward, done, info, pixels = env.step2(
                actions_all_agents)
            flipped = np.flip(pixels, (0))
            acc_frames.append(flipped)
            acc_actions.append(action)
            episode_rewards.append(reward[0])
            state = preprocess(pixels)
            D.append([
                state, reward[0], actions_all_agents[0], TD_ERROR_DEFAULT,
                done
            ])
            if (episode + 1) % 5 == 0:
                action_str = f"Action taken: {actions_available[action]} was {'greedy' if not is_explore else 'explored'}"
                print(action_str)

        memory_length = len(D)
        print(f"Number of frames in memory {memory_length}")
        # experience_batch = take_sample(D, approximator_model, target_model, BATCH_SIZE, ACTION_SPACE)
        ids, importance, max_td_err = take_sample(
            D, BATCH_SIZE, beta=1 - (episode / N_EPISODES))
        TD_ERROR_DEFAULT = max_td_err
        experience_batch = [(D[idx], D[idx + 1]) if idx < memory_length - 1
                            else (D[idx - 1], D[idx]) for idx in ids]

        set_of_batch_states = tf.constant(
            [exp[0][0] for exp in experience_batch])
        set_of_batch_next_states = tf.constant(
            [exp[1][0] for exp in experience_batch])
        # Gather actions for each batch item
        set_of_batch_actions = tf.one_hot(
            [exp[0][2] for exp in experience_batch], ACTION_SPACE)

        # Maybe unnecessary - We are using the double q mask instead.
        next_q_mask = tf.ones([BATCH_SIZE, ACTION_SPACE])
        set_of_batch_states = tf.cast(tf.reshape(
            set_of_batch_states, set_of_batch_states.shape + [1]),
            dtype=tf.float32)
        double_q_mask = tf.one_hot(
            tf.argmax(approximator_model.predict(
                [set_of_batch_states, next_q_mask]),
                axis=1), ACTION_SPACE)  # http://arxiv.org/abs/1509.06461
        set_of_batch_next_states = tf.cast(tf.reshape(
            set_of_batch_next_states, set_of_batch_next_states.shape + [1]),
            dtype=tf.float32)
        next_q_values = tf.constant(
            target_model.predict([set_of_batch_next_states, double_q_mask]))
        # Gather rewards for each batch item
        set_of_batch_rewards = tf.constant(
            [exp[0][1] for exp in experience_batch],
            dtype=next_q_values.dtype)
        episode_nonzero_reward_states = (
            tf.math.count_nonzero(set_of_batch_rewards) / BATCH_SIZE) * 100
        print(
            f"Number of information yielding states: {episode_nonzero_reward_states}"
        )
        is_terminal = tf.constant(
            [0 if exp[1][4] else 1 for exp in experience_batch],
            dtype=next_q_values.dtype)
        next_q = set_of_batch_rewards + \
            (DISCOUNT_RATE * tf.reduce_max(next_q_values, axis=1)) * is_terminal
        init_q_values = approximator_model.predict(
            [set_of_batch_states, set_of_batch_actions])
        init_q = tf.reduce_sum(init_q_values, axis=1)
        td_error = (next_q - init_q).numpy()

        history = approximator_model.fit(
            [set_of_batch_states, set_of_batch_actions],
            next_q,
            batch_size=BATCH_SIZE,
            verbose=1,
            callbacks=[tensorflow_callback],
            sample_weight=importance)
        for idx, exp in enumerate(experience_batch):
            exp[0][3] = td_error[idx]

        # Wrap up
        loss = history.history.get("loss", [0])[0]
        time_end = np.round(time.time() - start_time, 2)
        memory_usage = process.memory_info().rss
        print(f"Current memory consumption is {memory_usage}")
        print(
            f"Loss of episode {episode} is {loss} and took {time_end} seconds")

        random_experience_idx = random.choice(range(len(experience_batch) - 1))
        random_experience = experience_batch[random_experience_idx][0]
        random_experience_next = experience_batch[random_experience_idx][1]
        # print(tmp.shape)
        episode_image = plot_to_image(
            image_grid_pommerman(random_experience, random_experience_next,
                                 [action for action in constants.Action]))
        image_qs = utils.plot_to_image(
            utils.plot_q(np.array(acc_qs),
                         [action for action in constants.Action]))
        image_pommerman = utils.plot_to_image(
            utils.show_pommerman_game(acc_frames, acc_actions,
                                      [action for action in constants.Action]))

        with file_writer_rewards.as_default():
            tf.summary.scalar('episode_rewards',
                              np.sum(episode_rewards),
                              step=episode)
            tf.summary.scalar('episode_loss', loss, step=episode)
            tf.summary.scalar('episode_time_in_secs', time_end, step=episode)
            tf.summary.scalar('episode_nr_frames', frame_cnt, step=episode)
            tf.summary.scalar('episode_exploration_rate',
                              EXPLORATION_RATE,
                              step=episode)
            tf.summary.scalar('episode_mem_usage', memory_usage, step=episode)
            tf.summary.scalar('episode_frames_per_sec',
                              np.round(frame_cnt / time_end, 2),
                              step=episode)
            tf.summary.histogram('q-values', next_q_values, step=episode)
            tf.summary.image('q-values-over-time', image_qs, step=episode)
            tf.summary.image('pommerman-game', image_pommerman, step=episode)
            tf.summary.scalar('episode_mem_usage_in_GB',
                              np.round(memory_usage / 1024 / 1024 / 1024),
                              step=episode)
            tf.summary.image('episode_example_state',
                             episode_image,
                             step=episode)
            if (episode + 1) % 5 == 0:
                acc_nonzeros.append(episode_nonzero_reward_states)
                tf.summary.histogram('episode_nonzero_reward_states',
                                     acc_nonzeros,
                                     step=(episode + 1) // 5)
            else:
                acc_nonzeros.append(episode_nonzero_reward_states)

        if (episode + 1) % 50 == 0:
            model_target_dir = checkpoint_path.format(epoch=episode)
            approximator_model.save_weights(model_target_dir)
            print(f"Model was saved under {model_target_dir}")
def run(self):
    # If we move this to "init", we get an error on recursion depth
    self.A3CAgent = A3CAgent(self.lnet)
    self.agentList = [
        self.A3CAgent,
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.RandomAgent()
    ]
    self.env = env = pommerman.make('PommeFFACompetition-v0', self.agentList)

    total_step = 1
    while self.g_ep.value < MAX_EP:
        # Step 2). worker interacts with environment
        s_act = self.env.reset()
        max_ammo = old_max_ammo = 1
        ep_r = 0.
        self.render = False  # self.g_ep.value % 20 == 0
        self.A3CAgent.reset_lstm()

        if self.name == 'w0':
            enc1 = abs(torch.sum(self.gnet.encoder1.weight.data).item())
            enc2 = abs(torch.sum(self.gnet.encoder2.weight.data).item())
            enc3 = abs(torch.sum(self.gnet.encoder3.weight.data).item())
            conv1 = abs(torch.sum(self.gnet.conv1.weight.data).item())
            conv2 = abs(torch.sum(self.gnet.conv2.weight.data).item())
            conv3 = abs(torch.sum(self.gnet.conv3.weight.data).item())
            conv4 = abs(torch.sum(self.gnet.conv4.weight.data).item())
            cl = abs(torch.sum(self.gnet.critic_linear.weight.data).item())
            alstm1 = abs(
                torch.sum(self.gnet.actor_lstm.weight_ih_l0.data).item())
            alstm2 = abs(
                torch.sum(self.gnet.actor_lstm.weight_hh_l0.data).item())
            aout = abs(torch.sum(self.gnet.actor_out.weight.data).item())
            f = open("AbsSummedWeights_ActorCritic_v2.txt", "a")
            f.write(
                '{0:.5f} \t {1:.5f} \t {2:.5f} \t {3:.5f} \t {4:.5f} \t {5:.5f} \t {6:.5f} \t {7:.5f} \t {8:.5f} '
                '\t {9:.5f} \t {10:.5f} \n'.format(enc1, enc2, enc3, conv1,
                                                   conv2, conv3, conv4,
                                                   alstm1, alstm2, aout, cl))
            f.close()

        while True:
            # only render worker 0
            if self.name == 'w0' and self.render:
                self.env.render()
            agent_actions = self.env.act(s_act)
            a = agent_actions[self.agent_nr]
            self.saved_oh_actions[:, :-1] = self.saved_oh_actions[:, 1:]  # time shift
            self.saved_oh_actions[:, -1] = self.empty_oh_action[:, 0]  # erase last value
            self.saved_oh_actions[a, -1] = 1  # insert new one-hot
            s_new, rewards, done, _ = self.env.step(agent_actions)

            # not(10 in s_new[self.agent_nr]['alive'])  # if done or agent 10 is dead
            done = done or rewards[self.agent_nr] == -1
            max_ammo = max(max_ammo, s_act[self.agent_nr]['ammo'])

            # reward and buffer
            r = rewards[self.agent_nr]
            # if (10 in s_act[self.agent_nr]['alive']) and total_step != 1:
            #     r = get_reward(s_new, s_act, self.agent_nr, r, max_ammo, old_max_ammo, a, a_old, self.saved_oh_actions)
            ep_r += r
            self.A3CAgent.add_reward(r)

            if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                # update global and assign to local net
                update_glob_net(self.opt, self.lnet, self.gnet, self.A3CAgent,
                                GAMMA)
                if done:
                    record(self.g_ep, self.g_ep_r, ep_r, self.res_queue,
                           self.global_nr_steps,
                           s_new[self.agent_nr]['step_count'], self.name)
                    break

            s_act = s_new
            old_max_ammo = max_ammo
            a_old = a
            total_step += 1
    self.res_queue.put(None)
    return (id + 10) in alive_agents

def reset(self):
    self.prev_obs = self.env.reset()
    obs = {}
    self.reset_stat()
    for i in range(4):
        if self.is_agent_alive(i):
            obs[i] = featurize(self.prev_obs[i])
    return obs


if __name__ == '__main__':
    agent_list = [
        agents.RandomAgent(),
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent()
    ]
    env = pommerman.make(
        'PommeTeam-v0',
        agent_list,
        # '/home/lucius/working/projects/pomme_rllib/resources/one_line_state.json'
    )

    obs = env.reset()
    while True:
        features = featurize(obs[0])
        for i in range(17):
            print(features[i])
class PomFFA(gym.Env):
    agent_list = [
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent()
    ]
    all_obs = None
    all_action = None
    cur_obs = None
    alive_agents = [10, 11, 12, 13]
    player_agent_id = 10

    def __init__(self, env_config=None):
        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v

        print("pomme_config: ")
        print(pomme_config['env_kwargs'])

        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or (env_config
                              and env_config.get("is_training", True)):
            # initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        obs = self.preprocess(obs)
        return obs

    def get_reward(self, obs, action, agent_id):
        if len(obs["alive"]) == 1:
            # An agent won. Give them +1, others -1.
            if agent_id in obs['alive']:
                return 1
            else:
                return -1
        if obs["step_count"] >= 500:
            # Game is over from time. Everyone gets -1.
            return -1
        # Game running: 0 for alive, -1 for dead.
        if agent_id not in obs['alive']:
            return -1
        # x, y = obs["position"]
        # blast = obs["bomb_blast_strength"]
        # for w in range(11):
        #     if blast[x][w] > int(math.fabs(w - y)):
        #         return -10
        #     if blast[w][y] > int(math.fabs(w - x)):
        #         return -10
        return 0

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)
        # print(obs)
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']
        if (self.player_agent_id not in self.alive_agents) \
                or obs["step_count"] >= 500:
            done = True
        obs = self.preprocess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.player_agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs

    def init_observation_space(self, env_config):
        """observations for agents

        board: n^2
        bomb blast strength: n^2
        bomb life: n^2
        """
        board_size = env_config['board_size'] or 11
        num_items = env_config['num_items'] or 11
        print("env config: {}".format(env_config))
        # board_size = 11
        board = spaces.Box(low=0,
                           high=len(constants.Item),
                           shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0,
                                         high=num_items,
                                         shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2,))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1,))
        ammo = spaces.Box(low=0, high=num_items, shape=(1,))
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength
        })

    @staticmethod
    def preprocess(obs):
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])
        return obs

    def render(self):
        self.pomme.render()
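# A hypothetical usage sketch for the wrapper above (the env_config keys are
# assumptions; reset/step/action_space come from the class itself):
#
#   env = PomFFA({"is_training": True})
#   obs = env.reset()
#   obs, reward, done, _ = env.step(env.action_space.sample())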
def __init__(self, sparrer_type, agent_id, model=None):
    super().__init__()
    if sparrer_type == constants.SIMPLE_SPARRER:
        self.modelled_env = pommerman.make(
            'PommeTeamCompetition-v0',
            agent_list=[agents.SimpleAgent() for _ in range(4)])
    elif sparrer_type == constants.MODEL_SPARRER:
        # FIXME: may require changes
        self.modelled_env = pommerman.make(
            'PommeTeamCompetition-v0',
            agent_list=[agents.SmartAgent(model) for _ in range(4)])
    elif sparrer_type == constants.RANDOM_SPARRER:
        # FIXME: may require changes
        self.modelled_env = pommerman.make(
            'PommeTeamCompetition-v0',
            agent_list=[agents.RandomAgent() for _ in range(4)])
    else:
        raise ValueError('Invalid sparrer type')
    self.training_examples = []
    self.memory = None
    self.modelled_env.reset()
    self.agent_id = agent_id
def main():
    # Create the environment
    agent_list = [
        agents.RandomAgent(),
        agents.SimpleAgent(),
        # agents.SimpleAgent(),
        # agents.SimpleAgent(),
    ]
    env = pommerman.make('OneVsOne-v0', agent_list, render_mode='human')

    if platform.system() == 'Darwin':
        print("MacBook Pro user detected. U rule.")
        os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    # =========== STATS =========== #
    global now
    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    accumulated_frames = []
    episode_rewards = []

    # =========== (HYPER)PARAMETERS AND VARIABLES =========== #
    ACTION_SPACE = env.action_space.n
    TIME_CHANNELS_SIZE = 1
    INPUT_SHAPE = list(env.get_observation_space()) + [TIME_CHANNELS_SIZE]
    N_EPISODES = 10000

    MODEL_PATH = "models/20200122-104419"
    latest = tf.train.latest_checkpoint(MODEL_PATH)
    print(f"Loading model from {latest}")

    ## - Comment 2 lines below for running Random/SimpleAgent - ##
    restored_model = create_model(INPUT_SHAPE, ACTION_SPACE)
    restored_model.load_weights(latest)

    actions_available = [
        str(action).split(".")[1] for action in constants.Action
    ]

    for episode in range(N_EPISODES):
        start_time = time.time()
        print(f"Running episode {episode}.")

        # Initial step for the episode
        state_obs = env.reset()
        actions = env.act(state_obs)
        initial_observation, reward, done, info, pixels = env.step2(
            actions, render=True)
        state = preprocess(pixels)
        done = False
        frame_cnt = 0
        accumulated_reward = 0
        action_str = ""
        while not done:
            frame_cnt += 1
            actions_all_agents = env.act(state_obs)
            ## - Comment out from here - ##
            init_mask = tf.ones([1, ACTION_SPACE])
            init_state = state
            q_values = restored_model.predict(
                [tf.reshape(init_state, [1] + INPUT_SHAPE), init_mask])
            action = np.argmax(q_values)
            # print(q_values)
            # print(f"Action taken: {actions_available[action]}") if action_str != f"Action taken: {actions_available[action]}" else None
            actions_all_agents[0] = action
            ## - Until here, when you want to use a Random/SimpleAgent instead - ##
            state_obs, reward, done, info, pixels = env.step2(
                actions_all_agents)
            state = preprocess(pixels)
            accumulated_reward += reward[0]
            action_str = f"Action taken: {actions_available[action]}"

        time_end = np.round(time.time() - start_time, 2)
        accumulated_frames.append(frame_cnt)
        episode_rewards.append(accumulated_reward)
        save_json(accumulated_frames=accumulated_frames,
                  episode_rewards=episode_rewards)
        print(f"Running at {np.round(frame_cnt / time_end)} frames per second")
def __init__(self):
    self._agent = agents.RandomAgent()