def test_marl(task, VectorEnv, obs_type='ram'):
    """
    Test parallel DummyVectorEnv (no multiprocessing) and SubprocVectorEnv
    (multiprocessing) for multi-agent PettingZoo games, using the vectorized
    environment wrappers from Tianshou.
    """
    # env = eval(task).parallel_env(obs_type=obs_type)
    env_num = 2
    envs = VectorEnv(
        [lambda: make_env(task, obs_type=obs_type) for _ in range(env_num)])
    print(envs.action_space)
    assert len(envs) == env_num
    # envs.seed(2)  # which is equivalent to the next line
    envs.seed(np.random.randint(1000, size=env_num).tolist())
    # envs.seed([2, 3, 4, 5, 6, 7, 8, 9])  # set a specific seed for each env
    obs = envs.reset()  # reset all environments
    # obs = envs.reset([0, 5, 7])  # reset 3 specific environments
    for i in range(30000):
        print(i)
        actions = [{
            'first_0': np.random.randint(18),
            'second_0': np.random.randint(18)
        } for _ in range(env_num)]
        obs, r, done, info = envs.step(actions)  # step all environments synchronously
        envs.render()  # render all environments
        print(r)
    envs.close()  # close all environments
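# A minimal usage sketch for the test above. Hedged: the PettingZoo Atari task
# name 'boxing_v1' (an 18-action, two-player game matching the 'first_0'/'second_0'
# keys used here) is an assumption for illustration, not taken from this repo.
# DummyVectorEnv runs the env copies serially in one process; SubprocVectorEnv
# runs them in worker processes behind the same API.
if __name__ == '__main__':
    from tianshou.env import DummyVectorEnv, SubprocVectorEnv

    task = 'boxing_v1'  # hypothetical task name
    test_marl(task, DummyVectorEnv, obs_type='ram')    # single-process check
    test_marl(task, SubprocVectorEnv, obs_type='ram')  # multi-process check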
def test_marl_baseline(task, VectorEnv, obs_type='ram'):
    """
    Test parallel DummyVectorEnv (no multiprocessing) and SubprocVectorEnv
    (multiprocessing) for multi-agent PettingZoo games, using the vectorized
    environment wrappers from Stable Baselines.
    """
    # env = eval(task).parallel_env(obs_type=obs_type)
    env_num = 2
    envs = VectorEnv(
        [lambda: make_env(task, obs_type=obs_type) for _ in range(env_num)])
    envs.seed(2)  # seed all environments at once
    obs = envs.reset()  # reset all environments
    for i in range(30000):
        print(i)
        actions = [{
            'first_0': np.random.randint(18),
            'second_0': np.random.randint(18)
        } for _ in range(env_num)]
        obs, r, done, info = envs.step(actions)  # step all environments synchronously
        # envs.render()  # Stable Baselines vectorized wrappers cannot render here
    envs.close()  # close all environments
parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME, help="Environment name to use, default=" + DEFAULT_ENV_NAME) parser.add_argument("-r", "--record", help="Directory to store video recording") parser.add_argument("--no-visualize", default=True, action='store_false', dest='visualize', help="Disable visualization of the game play") args = parser.parse_args() env = wrappers.make_env(args.env) if args.record: env = gym.wrappers.Monitor(env, args.record) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n) optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE) root = tf.train.Checkpoint( optimizer=optimizer, model=net, optimizer_step=tf.train.get_or_create_global_step()) root.restore(tf.train.latest_checkpoint(args.model)) state = env.reset() total_reward = 0.0 c = collections.Counter()
def parallel_rollout(id, env_name, model, writer, max_eps, max_timesteps, selfplay_interval, render, \
    model_path, against_baseline=False, selfplay=False, fictitious=False, seed=0, obs_type='ram'):
    """
    Parallel rollout for multi-agent games, in contrast to the iterative rollout manner.

    Parallel: all multi-agent actions are executed in one call of env.step():
        observations_, rewards, dones, infos = env.step(actions)
    actions, observations_, rewards, dones and infos are all dictionaries,
    keyed by agent name.
    """
    # obs_type is assumed to be a keyword argument here (default 'ram'); it was
    # referenced below without being defined in this scope.
    env = make_env(env_name, seed, obs_type=obs_type)
    env.reset()  # required to populate env.agents
    score = {a: 0.0 for a in env.agents}
    print_interval = 20
    save_interval = 100
    epi_len = []
    for n_epi in range(max_eps):
        observations = env.reset()
        for t in range(max_timesteps):
            actions, logprobs = model.choose_action(observations)
            observations_, rewards, dones, infos = env.step(actions)
            if render:
                env.render()
            model.put_data((observations, actions, rewards, observations_,
                            logprobs, dones))
            observations = observations_
            for agent_name in env.agents:
                score[agent_name] += rewards[agent_name]
            if np.any(np.array(list(dones.values()))):
                break  # any agent being done terminates the episode
            # if not env.agents:  # according to the official docs (https://www.pettingzoo.ml/api), a single
            #     break           # agent is removed once it has received done, while the others remain
        model.train_net()
        epi_len.append(t)
        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}".format(n_epi))
            record_score, record_length = {}, {}
            for agent_name in env.agents:
                avg_score = score[agent_name] / float(print_interval)
                avg_length = int(np.mean(epi_len))
                print("id : {}, agent :{}, avg score : {:.3f}, avg epi length : {}"
                      .format(id, agent_name, avg_score, avg_length))
                record_score[agent_name] = avg_score
                record_length[agent_name] = avg_length
            writer.add_scalars("ID {}/Scores".format(id), record_score, n_epi)
            writer.add_scalars("ID {}/Episode Length".format(id), record_length,
                               n_epi)
            score = {a: 0.0 for a in env.agents}
            epi_len = []
        if n_epi % save_interval == 0 and n_epi != 0:
            model.save_model('model/mappo_mp')
    model.save_model('model/mappo_mp')
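# Hedged illustration of the dict-keyed "parallel" step described in the
# docstring above, using make_env directly. 'slimevolley_v0' and the positional
# seed argument are taken from the other entry points in this repo; the
# per-agent Space.sample() calls are illustrative only.
env = make_env('slimevolley_v0', 0, obs_type='ram')
observations = env.reset()  # dict: agent name -> observation
actions = {agent: env.action_spaces[agent].sample() for agent in env.agents}
observations_, rewards, dones, infos = env.step(actions)  # one call advances every agent
env.close()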
def main():
    args = get_args()
    log_dir = create_log_dir(args)
    if not args.test:
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    SEED = 721
    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    env = make_env(args.env, SEED, obs_type=obs_type)
    state_spaces = env.observation_spaces
    action_spaces = env.action_spaces
    print('state_spaces: ', state_spaces, ', action_spaces: ', action_spaces)

    learner_args = {'device': args.device}
    env.reset()  # required to populate env.agents
    print(env.agents)
    agents = env.agents
    if args.train_both:
        fixed_agents = []
    else:
        fixed_agents = [
            'first_0'
        ]  # SlimeVolley: the first agent is the fixed opponent, the second agent is the learnable one

    if obs_type == 'ram':
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'MLP',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(args.device)
    else:
        # model = PPODiscrete(state_space, action_space, 'CNN', learner_args, **hyperparams).to(device)
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'CNN',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(args.device)
    load_model(model, args)

    # share network parameters and optimizer state so all worker processes update the same model
    for individual_model in model.agents.values():
        individual_model.policy.share_memory()
        individual_model.policy_old.share_memory()
        individual_model.value.share_memory()
        ShareParameters(individual_model.optimizer)

    path = 'model/' + args.env
    os.makedirs(path, exist_ok=True)
    if args.fictitious:
        path = path + '/fictitious_'

    processes = []
    for p in range(args.num_envs):
        process = Process(target=parallel_rollout,
                          args=(p, args.env, model, writer, max_eps,
                                max_timesteps, selfplay_interval, args.render,
                                path, args.against_baseline, args.selfplay,
                                args.fictitious, SEED))  # args contain both shared and unshared objects
        process.daemon = True  # child processes exit when the main process stops
        processes.append(process)
    [p.start() for p in processes]
    [p.join() for p in processes]  # wait for all processes to finish
    env.close()
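# ShareParameters is not defined in this section. A minimal sketch of what such
# a helper typically does (in the style of PyTorch A3C/multiprocessing examples)
# is to move the Adam optimizer's per-parameter state into shared memory,
# mirroring policy.share_memory() on the networks. This is an assumption about
# its behavior, not the repo's confirmed implementation.
import torch

def ShareParameters(adamoptim):
    """Sketch: move Adam's per-parameter state tensors into shared memory."""
    for group in adamoptim.param_groups:
        for p in group['params']:
            state = adamoptim.state[p]
            # Adam creates its state lazily on the first step(); initialize it
            # here so the shared tensors already exist before workers fork.
            state['step'] = 0
            state['exp_avg'] = torch.zeros_like(p.data)
            state['exp_avg_sq'] = torch.zeros_like(p.data)
            state['exp_avg'].share_memory_()
            state['exp_avg_sq'].share_memory_()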
def main():
    args = get_args()
    log_dir = create_log_dir(args)
    if not args.test:
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    SEED = 721
    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    # env = make_env(args.env, SEED, obs_type=obs_type)
    # https://github.com/thu-ml/tianshou/blob/master/tianshou/env/venvs.py
    VectorEnv = [DummyVectorEnv, SubprocVectorEnv][1]
    envs = VectorEnv([
        lambda: make_env(args.env, obs_type=obs_type)
        for _ in range(args.num_envs)
    ])
    envs.seed(np.random.randint(1000, size=args.num_envs).tolist())  # random seeding

    state_spaces = envs.observation_spaces[0]  # same for all env instances, so just take one
    action_spaces = envs.action_spaces[0]  # same for all env instances, so just take one
    print('state_spaces: ', state_spaces, ', action_spaces: ', action_spaces)

    learner_args = {'device': args.device}
    envs.reset()
    agents = envs.agents[0]  # same for all env instances, so just take one
    print('agents: ', agents)
    if args.train_both:
        fixed_agents = []
    else:
        fixed_agents = [
            'first_0'
        ]  # SlimeVolley: opponent is the first, the second agent is the learnable one

    if obs_type == 'ram':
        model = ParallelMultiPPODiscrete(args.num_envs, agents, state_spaces,
                                         action_spaces, 'MLP', fixed_agents,
                                         learner_args,
                                         **hyperparams).to(args.device)
    else:
        model = ParallelMultiPPODiscrete(args.num_envs, agents, state_spaces,
                                         action_spaces, 'CNN', fixed_agents,
                                         learner_args,
                                         **hyperparams).to(args.device)
    load_model(model, args)

    path = f'model/{args.env}/'
    os.makedirs(path, exist_ok=True)
    if args.fictitious:
        path = path + 'fictitious_'

    parallel_rollout(envs, model, writer, max_eps=max_eps,
                     max_timesteps=max_timesteps,
                     selfplay_interval=selfplay_interval,
                     render=args.render, model_path=path,
                     against_baseline=args.against_baseline,
                     selfplay=args.selfplay, fictitious=args.fictitious,
                     test=args.test, args=args)
    envs.close()
    next_state_values = tf.where(dones == 0, next_state_values,
                                 tf.zeros_like(next_state_values))
    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return tf.losses.mean_squared_error(state_action_values,
                                        expected_state_action_values)


if __name__ == "__main__":
    writer = tf.contrib.summary.create_file_writer(
        logdir='runs',
        flush_millis=10000,
        filename_suffix="-dqn-turtlebot3-followline")

    env = wrappers.make_env('Turtlebot3FollowLineCameraEnv-v0')

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None
    state = env.reset()

    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
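# A small numeric check of the done-masking in the loss above (hedged sketch,
# not part of the training script): bootstrapped values are zeroed for terminal
# transitions before forming the Bellman target r + gamma * max_a' Q(s', a').
import numpy as np

GAMMA_DEMO = 0.99                        # stands in for the script's GAMMA
next_vals = np.array([2.0, 5.0, 1.5])    # max_a' Q(s', a') from the target net
rewards = np.array([1.0, 0.0, -1.0])
dones = np.array([0, 1, 0])              # the second transition ended its episode

masked = np.where(dones == 0, next_vals, 0.0)   # keep values only for non-terminal states
targets = masked * GAMMA_DEMO + rewards
print(targets)  # [2.98  0.    0.485]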
def main():
    parser = argparse.ArgumentParser(description='Train or test arguments.')
    parser.add_argument('--env', type=str, help='Environment', required=True)
    parser.add_argument('--ram', dest='ram_obs', action='store_true', default=False)
    parser.add_argument('--render', dest='render', action='store_true',
                        help='Enable openai gym real-time rendering')
    parser.add_argument('--seed', dest='seed', type=int, default=1234,
                        help='Random seed')
    parser.add_argument('--load_agent', dest='load_agent', type=str, default=None,
                        help='Load agent models by specifying: 1, 2, or both')
    parser.add_argument('--against_baseline', dest='against_baseline',
                        action='store_true', default=False)
    parser.add_argument('--fictitious', dest='fictitious', action='store_true',
                        default=False)
    args = parser.parse_args()

    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    env = make_env(args.env, args.seed, obs_type=obs_type)

    exploit_eps = 1000  # episodes to train the exploiter
    evaluate_eps = 10  # episodes to evaluate the exploiter after training
    state_spaces = env.observation_spaces
    action_spaces = env.action_spaces
    print('state_spaces: ', state_spaces, ', action_spaces: ', action_spaces)

    device_idx = 0
    device = torch.device(
        "cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu")
    learner_args = {'device': device}

    env.reset()
    print(env.agents)
    agents = env.agents
    fixed_agents = [
        'second_0'
    ]  # both the model and the exploiter fix the second agent, so the exploiter's first agent can learn

    if obs_type == 'ram':
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'MLP',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(device)
        exploiter_ = copy.deepcopy(model)
    else:
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'CNN',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(device)
        exploiter_ = copy.deepcopy(model)

    if args.fictitious:
        model_dir = 'model/{}/fictitious_selfplay/'.format(args.env)
        exploiter_dir = 'model/{}/fictitious_selfplay/exploiter/'.format(args.env)
    else:
        model_dir = 'model/{}/selfplay/noise/'.format(args.env)
        exploiter_dir = 'model/{}/selfplay/exploiter/noise/'.format(args.env)
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(exploiter_dir, exist_ok=True)

    # Parse all models saved during training, ordered by training episode
    filelist, epi_list = [], []
    for filename in os.listdir(model_dir):
        if filename.endswith("policy"):
            filelist.append('_'.join(filename.split('_')[:-1]))  # remove the trailing '_policy'
            epi_list.append(int(filename.split('mappo')[0]))
    sort_idx = np.argsort(epi_list).tolist()
    filelist = [x for _, x in sorted(zip(epi_list, filelist))]  # sort filelist by episode index
    epi_list.sort()  # a lexicographic filelist.sort() would not order episodes numerically
    print(epi_list)

    # Evaluate/exploit all models saved during training, in order
    eval_data = {}
    for f, i in zip(filelist, epi_list):
        print('load model: ', i, f)
        print(model_dir + f)
        exploiter = copy.deepcopy(exploiter_)
        model.load_model(agent_name=fixed_agents[0], path=model_dir + f)
        exploiter_path = exploiter_dir + f
        r, l = exploit(env, model, exploiter, exploit_eps=exploit_eps,
                       eval_eps=evaluate_eps, max_timesteps=max_timesteps,
                       render=args.render, exploiter_path=exploiter_path,
                       against_baseline=args.against_baseline)
        print(f"Evaluate Avg. Reward: {r}, Avg. Length: {l}")
        eval_data[str(i)] = [r, l]

    save_dir = 'data/{}/'.format(args.env)
    os.makedirs(save_dir, exist_ok=True)
    if args.fictitious:
        save_dir += '/fictitious_eval_data.npy'
    else:
        save_dir += '/eval_data.npy'
    np.save(save_dir, eval_data)
    env.close()
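# Hedged illustration of the checkpoint-sorting logic above, with hypothetical
# filenames of the form '<episode>mappo_<agent>_policy' (the exact naming is an
# assumption inferred from the parsing code, not confirmed elsewhere here).
filenames = ['1000mappo_second_0_policy', '200mappo_second_0_policy',
             '3000mappo_second_0_policy']

stems = ['_'.join(f.split('_')[:-1]) for f in filenames]  # drop the '_policy' suffix
episodes = [int(f.split('mappo')[0]) for f in filenames]  # numeric episode index
stems = [x for _, x in sorted(zip(episodes, stems))]      # numeric, not lexicographic, order
episodes.sort()
print(episodes)  # [200, 1000, 3000]
print(stems)     # ['200mappo_second_0', '1000mappo_second_0', '3000mappo_second_0']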
def main():
    parser = argparse.ArgumentParser(description='Train or test arguments.')
    parser.add_argument('--train', dest='train', action='store_true', default=False)
    parser.add_argument('--test', dest='test', action='store_true', default=False)
    parser.add_argument('--env', type=str, help='Environment', required=True)
    parser.add_argument('--ram', dest='ram_obs', action='store_true', default=False)
    parser.add_argument('--render', dest='render', action='store_true',
                        help='Enable openai gym real-time rendering')
    parser.add_argument('--seed', dest='seed', type=int, default=1234,
                        help='Random seed')
    parser.add_argument('--load_agent', dest='load_agent', type=str, default=None,
                        help='Load agent models by specifying: 1, 2, or both')
    parser.add_argument('--against_baseline', dest='against_baseline',
                        action='store_true', default=False)
    parser.add_argument('--fictitious', dest='fictitious', action='store_true',
                        default=False)
    args = parser.parse_args()

    SEED = np.random.randint(1000)
    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    env = make_env(args.env, SEED, obs_type=obs_type)

    # max_eps = 500000
    # max_timesteps = 10000
    # selfplay_interval = 3000  # interval (in episodes) to checkpoint a policy and replace its opponent in selfplay
    eval_eps = 100
    state_spaces = env.observation_spaces
    action_spaces = env.action_spaces
    print('state_spaces: ', state_spaces, ', action_spaces: ', action_spaces)

    device_idx = 0
    device = torch.device(
        "cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu")
    learner_args = {'device': device}

    env.reset()
    print(env.agents)
    agents = env.agents
    fixed_agents = [
        'first_0', 'second_0'
    ]  # fix both agents for evaluation (no training here)

    if obs_type == 'ram':
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'MLP',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(device)
    else:
        # model = PPODiscrete(state_space, action_space, 'CNN', learner_args, **hyperparams).to(device)
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'CNN',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(device)

    if args.fictitious:
        model_dir = 'model/{}/fictitious_selfplay/'.format(args.env)
    else:
        model_dir = 'model/{}/selfplay/'.format(args.env)
    os.makedirs(model_dir, exist_ok=True)

    filelist, epi_list = [], []
    for filename in os.listdir(model_dir):
        if filename.endswith("policy"):
            filelist.append('_'.join(filename.split('_')[:-1]))  # remove the trailing '_policy'
            epi_list.append(int(filename.split('mappo')[0]))
    sort_idx = np.argsort(epi_list).tolist()
    filelist = [x for _, x in sorted(zip(epi_list, filelist))]  # sort filelist by episode index
    epi_list.sort()  # a lexicographic filelist.sort() would not order episodes numerically
    print(epi_list)

    r_list, l_list = [], []
    eval_data = {}
    for f, i in zip(filelist, epi_list):
        print('episode: ', i, f)
        # if i > 17000:
        print(model_dir + f)
        model.load_model(agent_name='second_0', path=model_dir + f)
        r, l = parallel_rollout(env, model, max_eps=eval_eps,
                                max_timesteps=max_timesteps,
                                selfplay_interval=selfplay_interval,
                                render=args.render, model_path=None,
                                against_baseline=args.against_baseline)
        eval_data[str(i)] = [r, l]

    save_dir = 'data/{}'.format(args.env)
    os.makedirs(save_dir, exist_ok=True)
    if args.fictitious:
        save_dir += '/fictitious_eval_data.npy'
    else:
        save_dir += '/eval_data.npy'
    np.save(save_dir, eval_data)
    env.close()
def main():
    args = get_args()
    log_dir = create_log_dir(args)
    if not args.test:
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    SEED = 721
    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    # TODO: this single env instance is only used for providing spaces info;
    # the SubprocVecEnv wrapper could be modified to expose these instead
    env = make_env(args.env, SEED, obs_type=obs_type)

    # https://stable-baselines.readthedocs.io/en/master/guide/vec_envs.html?highlight=multiprocessing
    envs = SubprocVecEnv([
        lambda: make_env(args.env, obs_type=obs_type)
        for _ in range(args.num_envs)
    ], start_method='spawn')
    # envs.seed(np.random.randint(1000, size=args.num_envs).tolist())  # random seeding
    envs.seed(SEED)  # fixed seeding

    state_spaces = env.observation_spaces
    action_spaces = env.action_spaces
    print('state_spaces: ', state_spaces, ', action_spaces: ', action_spaces)

    learner_args = {'device': args.device}
    env.reset()
    agents = env.agents
    print('agents: ', agents)
    if args.train_both:
        fixed_agents = []
    else:
        fixed_agents = [
            'first_0'
        ]  # SlimeVolley: opponent is the first, the second agent is the learnable one

    if obs_type == 'ram':
        model = ParallelMultiPPODiscrete(args.num_envs, agents, state_spaces,
                                         action_spaces, 'MLP', fixed_agents,
                                         learner_args,
                                         **hyperparams).to(args.device)
    else:
        model = ParallelMultiPPODiscrete(args.num_envs, agents, state_spaces,
                                         action_spaces, 'CNN', fixed_agents,
                                         learner_args,
                                         **hyperparams).to(args.device)
    load_model(model, args)

    path = f"model/{args.env}/"
    os.makedirs(path, exist_ok=True)
    if args.fictitious:
        path = path + 'fictitious_'

    parallel_rollout(envs, model, writer, max_eps=max_eps,
                     max_timesteps=max_timesteps,
                     selfplay_interval=selfplay_interval,
                     render=args.render, model_path=path,
                     against_baseline=args.against_baseline,
                     selfplay=args.selfplay, fictitious=args.fictitious,
                     test=args.test)
    envs.close()
def main():
    args = get_args()
    print_args(args)
    log_dir = create_log_dir(args)
    if not args.test:
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    SEED = 721
    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    env = make_env(args.env, SEED, obs_type=obs_type)
    state_spaces = env.observation_spaces
    action_spaces = env.action_spaces
    print('state_spaces: ', state_spaces, ', action_spaces: ', action_spaces)

    learner_args = {'device': args.device}
    env.reset()
    print(env.agents)
    agents = env.agents
    if args.train_both:
        fixed_agents = []
    else:
        fixed_agents = [
            'first_0'
        ]  # SlimeVolley: opponent is the first, the second agent is the learnable one

    path = f"model/{args.env}/"
    os.makedirs(path, exist_ok=True)
    data_path = f"data/{args.env}/"
    os.makedirs(data_path, exist_ok=True)

    if obs_type == 'ram':
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'MLP',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(args.device)
    else:
        # model = PPODiscrete(state_space, action_space, 'CNN', learner_args, **hyperparams).to(device)
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'CNN',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(args.device)
        path = path + 'cnn_'

    if args.selfplay:
        os.makedirs(path + 'selfplay/', exist_ok=True)
    load_model(model, args)
    if args.fictitious:
        path = path + 'fictitious_'

    eval_env = make_env(args.env, np.random.randint(0, 100), obs_type=obs_type)
    evaluater = Evaluater(eval_env, max_timesteps)

    parallel_rollout(env, model, writer, evaluater=evaluater, max_eps=max_eps,
                     max_timesteps=max_timesteps,
                     selfplay_interval=selfplay_interval,
                     render=args.render, model_path=path,
                     against_baseline=args.against_baseline,
                     selfplay=args.selfplay, fictitious=args.fictitious,
                     test=args.test)
    env.close()
    next_state_values = tf.where(dones == 0, next_state_values,
                                 tf.zeros_like(next_state_values))
    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return tf.losses.mean_squared_error(state_action_values,
                                        expected_state_action_values)


if __name__ == "__main__":
    writer = tf.contrib.summary.create_file_writer(
        logdir='runs',
        flush_millis=10000,
        filename_suffix="-dqn-f1-followline")

    env = wrappers.make_env('F1FollowLineCameraEnv-v0')
    env.load_checkpoints('adjusted_checkpoints.json')

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None
    state = env.reset()

    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)