def create_environment(self):
    """
    Create environment instance
    :return: environment (gym interface), env_name, task_name, n_agents,
             observation_sizes, action_sizes, discrete_actions
    """
    # load scenario from script
    if self.arglist.partial_observable:
        scenario = scenarios.load(self.arglist.scenario + "_partial_observable.py").POScenario()
    elif self.arglist.observation_noise:
        scenario = scenarios.load(self.arglist.scenario + "_observation_noise.py").ONScenario()
    elif self.arglist.environment_noise:
        scenario = scenarios.load(self.arglist.scenario + "_env_noise.py").ENScenario()
    else:
        scenario = scenarios.load(self.arglist.scenario + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    env_name = "mape"
    task_name = "mape_" + self.arglist.scenario
    n_agents = env.n
    print("Observation spaces: ", [env.observation_space[i] for i in range(n_agents)])
    print("Action spaces: ", [env.action_space[i] for i in range(n_agents)])
    observation_sizes = self.extract_sizes(env.observation_space)
    action_sizes = self.extract_sizes(env.action_space)
    discrete_actions = True
    return (
        env,
        env_name,
        task_name,
        n_agents,
        observation_sizes,
        action_sizes,
        discrete_actions,
    )
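# A possible implementation of the extract_sizes helper used above (a sketch,
# not from the original source): flatten a list of gym spaces into per-agent
# integer sizes, treating Discrete spaces as one-hot and Box spaces as flat
# vectors.
import numpy as np
from gym import spaces


def extract_sizes(spaces_list):
    sizes = []
    for space in spaces_list:
        if isinstance(space, spaces.Discrete):
            sizes.append(space.n)
        else:
            sizes.append(int(np.prod(space.shape)))
    return sizes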
def get_env(self, world, reset_callback=None, reward_callback=None,
            observation_callback=None, info_callback=None, done_callback=None,
            shared_viewer=True, discrete_action_space=True):
    return MultiAgentEnv(world, reset_callback, reward_callback,
                         observation_callback, info_callback, done_callback,
                         shared_viewer,
                         discrete_action_space=discrete_action_space)
def create_env(env_name):
    if env_name not in all_envs():
        raise RuntimeError("Invalid multi-agent environment: " + env_name)
    # load scenario from script
    scenario = scenarios.load(env_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(
        world,
        scenario.reset_world,
        scenario.reward,
        scenario.observation,
        info_callback=None,
        shared_viewer=False,
    )
    return env
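# A sketch of the all_envs helper referenced above (hypothetical, assuming the
# scenario scripts live in the multiagent.scenarios package): list the scenario
# names that can be passed to create_env.
import os
import multiagent.scenarios as scenarios


def all_envs():
    scenario_dir = os.path.dirname(scenarios.__file__)
    return [
        f[:-3] for f in os.listdir(scenario_dir)
        if f.endswith(".py") and not f.startswith("__")
    ]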
def make_env(args): """Make multi-agent particle environment Ref: https://github.com/openai/maddpg/blob/master/experiments/train.py """ scenario = scenarios.load(args.env_name + ".py").Scenario() world = scenario.make_world() done_callback = None env = MultiAgentEnv( world, reset_callback=scenario.reset_world, reward_callback=scenario.reward, observation_callback=scenario.observation, done_callback=done_callback) assert env.discrete_action_space is False, "For cont. action, this flag must be False" return env
def create_env(env_name):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(env_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(
        world,
        scenario.reset_world,
        scenario.reward,
        scenario.observation,
        info_callback=None,
        shared_viewer=False,
    )
    return env
def make_multiagent_env(env_id, num_agents, dist_threshold, arena_size,
                        identity_size):
    scenario = scenarios.load(env_id + ".py").Scenario(
        num_agents=num_agents,
        dist_threshold=dist_threshold,
        arena_size=arena_size,
        identity_size=identity_size)
    world = scenario.make_world()
    env = MultiAgentEnv(
        world=world,
        reset_callback=scenario.reset_world,
        reward_callback=scenario.reward,
        observation_callback=scenario.observation,
        info_callback=scenario.info if hasattr(scenario, 'info') else None,
        discrete_action=True,
        done_callback=scenario.done,
        cam_range=arena_size)
    return env
def make_env(args):
    # set scenario
    scenario = scenarios.load(args['scenario'] + ".py").Scenario()
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    # both good and bad agents
    args['n_players'] = env.n
    # train only good agents
    args['n_agents'] = env.n - args['n_enemies']
    # obtain shapes of individual observations of agents
    args['obs_shape'] = [
        env.observation_space[i].shape[0] for i in range(args['n_agents'])
    ]
    action_shape = []
    for content in env.action_space:
        action_shape.append(content.n)
    args['action_shape'] = action_shape[:args['n_agents']]
    args['high_action'] = 1
    args['low_action'] = -1
    return env, args
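# Example invocation (hypothetical argument values, not from the original
# source): make_env fills the args dict in place with the per-agent shapes
# that the trainers later read back.
args = {'scenario': 'simple_tag', 'n_enemies': 3}
env, args = make_env(args)
print(args['n_agents'], args['obs_shape'], args['action_shape'])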
def make_env(args): """Load multi-agent particle environment This code is modified from: https://github.com/openai/maddpg/blob/master/experiments/train.py """ # Check github branch check_github(path="./thirdparty/multiagent-particle-envs", branch_name="opponent") # Load multi-agent particle env scenario = scenarios.load(args.env_name + ".py").Scenario() world = scenario.make_world() done_callback = None env = MultiAgentEnv(world, reset_callback=scenario.reset_world, reward_callback=scenario.reward, observation_callback=scenario.observation, done_callback=done_callback) assert env.discrete_action_space is False, "For cont. action, this flag must be False" return env
def make_env(scenario_name, args):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # set fixed partition
    if args.partition != 'rand':
        print('>> Ensemble Partition Type = {}!'.format(args.partition))
        scenario.partition = args.partition
    if args.partition_flag is not None:
        print('>> Partition Flag = {}!'.format(args.partition_flag))
        scenario.partition_flag = args.partition_flag
    if args.evaluate and args.measure_success:
        print('>> Evaluating Success Rate!')
        scenario.measure_success = True
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    return env
def make_env(scenario_name, arglist, benchmark=False):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios
    global scenario

    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation)
    env.window_pos = 'right'
    env.force_discrete_action = False
    return env
def train(scenario):
    path_to_save = 'models/' + scenario.__module__.split('.')[-1] + '/simple'
    if not os.path.exists(path_to_save):
        os.makedirs(path_to_save)
    world = scenario.make_world()
    env = MultiAgentEnv(
        world,
        reset_callback=scenario.reset_world,
        reward_callback=scenario.reward,
        observation_callback=scenario.observation,
        info_callback=None,
        done_callback=scenario.done,
        shared_viewer=True,
    )
    with U.single_threaded_session() as sess:
        simple_agents = [VectorAgent(env, 0, 1), StayAgent(env, 1)]  # good agent
        evaluator = evaluate_models.Evaluator(args, scenario,
                                              save=scenario.name + '/' + str(1))
        evaluator.evaluate(env, simple_agents, 0)
def _make_env(scenario_name, horizon, monitor_enabled, video_frequency):
    if scenario_name in CUSTOM_SCENARIOS:
        # Scenario file must exist locally
        file_path = os.path.join(os.path.dirname(__file__), scenario_name + '.py')
        scenario = imp.load_source('', file_path).Scenario()
    else:
        scenario = scenarios.load(scenario_name + '.py').Scenario()
    world = scenario.make_world()
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    env.metadata['video.frames_per_second'] = 8
    env = ParticleEnvRenderWrapper(env, horizon)
    if not monitor_enabled:
        return env
    return wrappers.Monitor(env, './logs/videos', resume=True,
                            video_callable=_video_callable(video_frequency))
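# A minimal sketch of the _video_callable helper used above (hypothetical, not
# shown in the snippet): gym's Monitor accepts a predicate over episode ids,
# so recording every video_frequency-th episode can look like this.
def _video_callable(video_frequency):
    def should_record(episode_id):
        return episode_id % video_frequency == 0
    return should_record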
def train(scenario):
    train_n = 0
    world = scenario.make_world()
    env = MultiAgentEnv(
        world,
        reset_callback=scenario.reset_world,
        reward_callback=scenario.reward,
        observation_callback=scenario.observation,
        info_callback=None,
        done_callback=scenario.done,
        collision_callback=scenario.is_collision,
        shared_viewer=True,
    )
    evaluator = evaluate_models.Evaluator(args, scenario,
                                          save=scenario.name + '/' + str(train_n))
    simple_agents = [StayAgent(env, 1), VectorAgent(env, 0, 1)]  # good agent
    policies = [simple_agents[1], simple_agents[0]]
    print('agents are created')
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["q_{}".format(i) for i in range(env.n)])
    statistics = utilities.Time_Series_Statistics_Store(statistics_header)
    statistics.dump("{}_{}.csv".format(
        args.experiment_prefix + scenario.__module__.split('.')[-1], 0))
    evaluator.evaluate(env, policies, 0)
def make_env():
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    args = parse_args()
    # load scenario from script
    scenario = scenarios.load(args.scenario + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation, info_callback=None,
                        shared_viewer=True)
    args.action_dimension = env.action_space[0].n
    args.observation_dimension = env.observation_space[0].shape[0]
    args.low_action = 0
    args.high_action = 1
    return env, args
def make_env(scenario_name, arglist, benchmark=False):
    global scenario
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    scenario.individual_reward = arglist.indv_rew
    scenario.cooperative_reward = arglist.coop_rew
    scenario.crash_punishment = arglist.crash_pun
    # create world
    world = scenario.make_world()
    # create multiagent environment
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation)
    env.window_pos = 'left'
    env.force_discrete_action = False
    return env
def make_env(scenario_name, benchmark=False):
    '''
    Creates a MultiAgentEnv object as env. This can be used similar to a gym
    environment by calling env.reset() and env.step().
    Use env.render() to view the environment on the screen.

    Input:
        scenario_name : name of the scenario from ./scenarios/ to be loaded
                        (without the .py extension)
        benchmark     : whether you want to produce benchmarking data
                        (usually only done during evaluation)

    Some useful env properties (see environment.py):
        .observation_space : returns the observation space for each agent
        .action_space      : returns the action space for each agent
        .n                 : returns the number of agents
    '''
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    world.dim_c = 0
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation)
    env.discrete_action_space = False
    env.discrete_action_input = False
    scenario.reset_world(world)
    return env, scenario, world
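# A short usage sketch (an assumption, not part of the original source): since
# discrete_action_space is switched off above, each agent is driven by a
# continuous force vector of size world.dim_p at every step.
import numpy as np

env, scenario, world = make_env("simple_spread")
obs_n = env.reset()
for _ in range(25):
    # random continuous actions, one vector per agent
    act_n = [np.random.uniform(-1.0, 1.0, world.dim_p) for _ in range(env.n)]
    obs_n, reward_n, done_n, info_n = env.step(act_n)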
    with torch.no_grad():
        prediction = net(obs, actions)
        loss = F.mse_loss(torch.tensor(next_obs).to(args.device),
                          prediction).item()
    net.train()
    return loss


if __name__ == '__main__':
    args = parse_commandline()
    print(args)
    scenario = scenarios.load(args.scenario).Scenario()
    world = scenario.make_world()
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation, info_callback=None,
                        shared_viewer=False)

    buffer, eval_set = create_buffer(env, size=args.buffer_size)
    with open('buffer.pkl', 'wb') as dst:
        pickle.dump([buffer, eval_set], dst)
    with open('buffer.pkl', 'rb') as src:
        buffer, eval_set = pickle.load(src)
    print(f'Buffer created with size {len(buffer)}')

    net = Network()
    net.to(args.device)
    # net.load('model.torch')
    net = train(net, buffer, eval_set, args)
    net.save('model.torch')
# parse arguments
parser = argparse.ArgumentParser(description=None)
parser.add_argument('-s', '--scenario', default='simple.py',
                    help='Path of the scenario Python script.')
args = parser.parse_args()

# load scenario from script
scenario = scenarios.load(args.scenario).Scenario()
# create world
world = scenario.make_world()
# create multiagent environment
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                    scenario.observation, info_callback=None,
                    shared_viewer=False)

# define the DQN-based algorithm
RL = DeepQNetwork(
    n_actions=env.action_space[0].n,
    n_features=env.observation_space[0].shape[0],
    learning_rate=0.01,
    e_greedy=0.9,
    replace_target_iter=100,
    memory_size=2000,
    e_greedy_increment=0.0008,
)

total_steps = 0
import numpy as np
from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios
from keras.models import load_model
from time import sleep

# load scenario from script
scenario = scenarios.load("simple" + ".py").Scenario()
# create world
world = scenario.make_world()
# create multiagent environment
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                    scenario.observation)

nb_agent = len(env.agents)
action_space_n = env.action_space
observation_space_n = env.observation_space
agents = [load_model("../agent-" + str(i) + ".model") for i in range(nb_agent)]

while True:
    state_n = env.reset()
    action_space_n = env.action_space
    env.discrete_action_input = True
    state_n = [
        np.reshape(state_n[i], (1, observation_space_n[i].shape[0]))
        for i in range(nb_agent)
    ]
import multiagent.scenarios as scenarios

sys.path.insert(1, os.path.join(sys.path[0], '..'))

if __name__ == '__main__':
    # parse arguments
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-s', '--scenario', default='simple.py',
                        help='Path of the scenario Python script.')
    args = parser.parse_args()

    # load scenario from script
    scenario = scenarios.load(args.scenario).Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation, info_callback=None,
                        shared_viewer=False)
    # render call to create viewer window (necessary only for interactive policies)
    env.render()
    # create interactive policies for each agent
    policies = [InteractivePolicy(env, i) for i in range(env.n)]
    # execution loop
    obs_n = env.reset()
    while True:
        # query for action from each agent's policy
        act_n = []
        for i, policy in enumerate(policies):
            act_n.append(policy.action(obs_n[i]))
        # step environment
        obs_n, reward_n, done_n, _ = env.step(act_n)
        # render all agent views
        env.render()
from multiagent.environment import MultiAgentEnv
from multiagent.policy import InteractivePolicy
import multiagent.scenarios as scenarios

if __name__ == '__main__':
    # parse arguments
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-s', '--scenario', default='simple.py',
                        help='Path of the scenario Python script.')
    args = parser.parse_args()

    # load scenario from script
    scenario = scenarios.load(args.scenario).Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation, info_callback=None,
                        shared_viewer=False)
    # render call to create viewer window (necessary only for interactive policies)
    env.render()
    # create interactive policies for each agent
    policies = [InteractivePolicy(env, i) for i in range(env.n)]
    # execution loop
    obs_n = env.reset()
    while True:
        # query for action from each agent's policy
        act_n = []
        for i, policy in enumerate(policies):
            act_n.append(policy.action(obs_n[i]))
        # step environment
        obs_n, reward_n, done_n, _ = env.step(act_n)
        # render all agent views
        env.render()
import pickle

from maddpg import MADDPG

# load the setting of the environment.
scenario = scenarios.load(
    '/home/zw/lyy/maddpg/multiagent-particle-envs/multiagent/scenarios/simple_tag_non_adv_4.py'
).Scenario()
output = open('data_saq_test.pkl', 'wb')
# create world
world = scenario.make_world()
# create multiagent environment
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                    scenario.observation, info_callback=None,
                    shared_viewer=True)
# world.train_or_test = True
n_agents = env.n

# some initial training parameters
n_actions = world.dim_p
# the capacity of the experience memory
capacity = 1000000
batch_size = 1000
totalTime = 0
n_episode = 3000
max_steps = 100
# before training, we will store the experience of all agents' state
# information for the next training process.
def main():
    # Experiment Configuration
    episodes = 5000
    steps_per_episode = 200
    output_dir = '../data/'

    # Load the simulation scenario
    scenario = scenarios.load("decentralized_safe.py").Scenario()
    world = scenario.make_world()

    # Environment Setup
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation, info_callback=None,
                        constraint_callback=scenario.constraints,
                        shared_viewer=True)

    # The scenario parameters
    env_params = env.get_env_parameters()
    state_dim = env_params["state_dim"]
    action_dim = env_params["act_dim"]
    constraint_dim = env_params["constraint_dim"]
    num_agents = env_params["num_agents"]

    # Data Storage Containers
    size = episodes * (steps_per_episode - 1)
    state_buf = np.zeros([size, state_dim * num_agents])
    action_buf = np.zeros([size, action_dim * num_agents])
    constraint_diff = np.zeros([size, constraint_dim * num_agents])

    # Simulate the environment and generate a dataset for the constraint networks
    for episode in range(episodes):
        print(f'episode={episode}')

        # Episode "Preprocessing"
        state = env.reset()
        constraint_old = np.zeros([constraint_dim])

        for step in range(steps_per_episode):
            # Simulation
            action = np.random.uniform(-1, 1, action_dim * num_agents)
            action = np.split(action, num_agents)
            # Deep copy the agents' actions (otherwise they are altered in env.step())
            action_copy = copy.deepcopy(action)
            next_state, reward, _, _, constraint = env.step(action_copy)

            # Omit first simulation step
            if step == 0:
                constraint_old = constraint
                continue

            # Constraint diff
            diff = list(map(operator.sub, constraint, constraint_old))
            constraint_old = constraint

            # Store the transition to the buffers for training
            idx = episode * (steps_per_episode - 1) + step - 1
            state_buf[idx, :] = np.concatenate(state)
            action_buf[idx, :] = np.concatenate(action)
            constraint_diff[idx, :] = np.concatenate(diff)

            # update state
            state = next_state

    # Export Results for training
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    pd.DataFrame(state_buf).to_csv(output_dir + "D_state_decentralized.csv")
    pd.DataFrame(action_buf).to_csv(output_dir + "D_action_decentralized.csv")
    pd.DataFrame(constraint_diff).to_csv(output_dir + "D_constraint_decentralized.csv")
    print("Done... Data saved")
'''define the scenario name'''
scenario_name = 'simple_spread'

'''define the special property'''
# independentArgs = namedtuple( 'independentArgs', [] )
aux_args = AuxArgs[model_name]()
alias = '_new_6'

'''load scenario from script'''
scenario = scenarios.load(scenario_name + ".py").Scenario()

'''create world'''
world = scenario.make_world()

'''create multiagent environment'''
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                    scenario.observation, info_callback=None,
                    shared_viewer=True)
env = GymWrapper(env)

MergeArgs = namedtuple('MergeArgs', Args._fields + AuxArgs[model_name]._fields)

# under the offline trainer, setting batch_size = replay_buffer_size = update_freq
# gives epoch-wise updates
args = Args(model_name=model_name,
            agent_num=env.get_num_of_agents(),
            hid_size=32,
            obs_size=np.max(env.get_shape_of_obs()),
            continuous=False,
            action_dim=np.max(env.get_output_shape_of_act()),
            init_std=0.1,
            policy_lrate=1e-3,
            value_lrate=1e-2,
            max_steps=200,
def train_function(config):
    # ----------- Alg parameters ----------------- #
    experiment = config['experiment']
    if experiment == "particle":
        scenario_name = config['scenario']
    seed = config['seed']
    np.random.seed(seed)
    random.seed(seed)
    # Curriculum stage
    stage = config['stage']
    port = config['port']
    dir_name = config['dir_name']
    dir_restore = config['dir_restore']
    use_alg_credit = config['use_alg_credit']
    use_qmix = config['use_qmix']
    use_Q_credit = config['use_Q_credit']
    # If 1, then uses Q-net and global reward
    use_Q = config['use_Q']
    use_V = config['use_V']
    if experiment == "sumo":
        dimensions = config['dimensions_sumo']
    elif experiment == "particle":
        dimensions = config['dimensions_particle']
    # If 1, then restores variables from same stage
    restore_same_stage = config['restore_same_stage']
    # If 1, then does not restore variables, even if stage > 1
    train_from_nothing = config['train_from_nothing']
    # Name of model to restore
    model_name = config['model_name']
    # Total number of training episodes
    N_train = config['N_train']
    period = config['period']
    # Number of evaluation episodes to run every <period>
    N_eval = config['N_eval']
    summarize = config['summarize']
    alpha = config['alpha']
    lr_Q = config['lr_Q']
    lr_V = config['lr_V']
    lr_actor = config['lr_actor']
    dual_buffer = config['dual_buffer']
    buffer_size = config['buffer_size']
    threshold = config['threshold']
    batch_size = config['batch_size']
    pretrain_episodes = config['pretrain_episodes']
    steps_per_train = config['steps_per_train']
    max_steps = config['max_steps']
    # Probability of using random configuration
    prob_random = config['prob_random']
    epsilon_start = config['epsilon_start']
    epsilon_end = config['epsilon_end']
    epsilon_div = config['epsilon_div']
    epsilon_step = (epsilon_start - epsilon_end) / float(epsilon_div)

    if experiment == "sumo":
        # ----------- SUMO parameters ---------------- #
        with open('config_sumo_stage%d.json' % stage) as f:
            config_sumo = json.load(f)
        n_agents = config_sumo["n_agents"]
        list_goals_fixed = config_sumo['goal_lane']
        list_routes_fixed = config_sumo['route']
        list_lanes_fixed = config_sumo['lane']
        list_goal_pos = config_sumo['goal_pos']
        list_speeds = config_sumo['speed']
        init_positions = config_sumo['init_position']
        list_id = config_sumo['id']
        list_vtypes = config_sumo['vtypes']
        depart_mean = config_sumo['depart_mean']
        depart_stdev = config_sumo['depart_stdev']
        total_length = config_sumo['total_length']
        total_width = config_sumo['total_width']
        save_threshold = config_sumo['save_threshold']
        map_route_idx = {'route_ramp': 0, 'route_straight': 1}
        sim = sumo_simulator.Simulator(port, list_id=list_id,
                                       other_lc_mode=0b1000000001,
                                       sublane_res=0.8, seed=seed)
        for i in range(int(2 / sim.dt)):
            sim.step()
    elif experiment == 'particle':
        with open(config["particle_config"]) as f:
            config_particle = json.load(f)
        n_agents = config_particle['n_agents']
        scenario = scenarios.load(scenario_name + ".py").Scenario()
        world = scenario.make_world(n_agents, config_particle, prob_random)
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, None, scenario.done,
                            max_steps=max_steps)
    elif experiment == 'checkers':
        with open("config_checkers_stage%d.json" % stage) as f:
            config_checkers = json.load(f)
        n_agents = config_checkers['n_agents']
        dimensions = config_checkers['dimensions']
        init = config_checkers['init']
        env = checkers.Checkers(init['n_rows'], init['n_columns'],
                                init['n_obs'], init['agents_r'],
                                init['agents_c'], n_agents, max_steps)

    l_action = dimensions['l_action']
    l_goal = dimensions['l_goal']

    # Create entire computational graph
    # Creation of new trainable variables for new curriculum
    # stage is handled by networks.py, given the stage number
    if use_alg_credit:
        if experiment == 'checkers':
            alg = alg_credit_checkers.Alg(experiment, dimensions, stage,
                                          n_agents, lr_V=lr_V, lr_Q=lr_Q,
                                          lr_actor=lr_actor,
                                          use_Q_credit=use_Q_credit,
                                          use_V=use_V,
                                          nn=config_checkers['nn'])
        else:
            alg = alg_credit.Alg(experiment, dimensions, stage, n_agents,
                                 lr_V=lr_V, lr_Q=lr_Q, lr_actor=lr_actor,
                                 use_Q_credit=use_Q_credit, use_V=use_V,
                                 nn=config['nn'])
    elif not use_qmix:
        if experiment == 'checkers':
            alg = alg_baseline_checkers.Alg(experiment, dimensions, stage,
                                            n_agents, lr_V=lr_V, lr_Q=lr_Q,
                                            lr_actor=lr_actor, use_Q=use_Q,
                                            use_V=use_V, alpha=alpha,
                                            nn=config_checkers['nn'],
                                            IAC=config['IAC'])
        else:
            alg = alg_baseline.Alg(experiment, dimensions, stage, n_agents,
                                   lr_V=lr_V, lr_Q=lr_Q, lr_actor=lr_actor,
                                   use_Q=use_Q, use_V=use_V, alpha=alpha,
                                   nn=config['nn'], IAC=config['IAC'])
    else:
        print("Using QMIX")
        if experiment == 'checkers':
            alg = alg_qmix_checkers.Alg(experiment, dimensions, stage,
                                        n_agents, lr_Q=lr_Q,
                                        nn=config_checkers['nn'])
        else:
            alg = alg_qmix.Alg(experiment, dimensions, stage, n_agents,
                               lr_Q=lr_Q)
    print("Initialized computational graph")

    list_variables = tf.trainable_variables()
    if stage == 1 or restore_same_stage or train_from_nothing:
        saver = tf.train.Saver()
    elif stage == 2:
        # to_restore = [v for v in list_variables if ('stage-%d'%stage not in v.name.split('/') and 'Policy_target' not in v.name.split('/'))]
        to_restore = []
        for v in list_variables:
            list_split = v.name.split('/')
            if ('stage-%d' % stage not in list_split) and \
                    ('Policy_target' not in list_split) and \
                    ('Q_credit_main' not in list_split) and \
                    ('Q_credit_target' not in list_split):
                to_restore.append(v)
        saver = tf.train.Saver(to_restore)
    else:
        # restore only those variables that were not
        # just created at this curriculum stage
        to_restore = [
            v for v in list_variables
            if 'stage-%d' % stage not in v.name.split('/')
        ]
        saver = tf.train.Saver(to_restore)

    # TF session configuration, named so it does not shadow the config dict
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    tf.set_random_seed(seed)
    sess = tf.Session(config=tf_config)
    writer = tf.summary.FileWriter('../saved/%s' % dir_name, sess.graph)
    sess.run(tf.global_variables_initializer())
    print("Initialized variables")

    if train_from_nothing == 0:
        print("Restoring variables from %s" % dir_restore)
        saver.restore(sess, '../saved/%s/%s' % (dir_restore, model_name))
        if stage == 2 and use_alg_credit and use_Q_credit:
            # Copy weights of Q_global to Q_credit at the start of Stage 2
            sess.run(alg.list_initialize_credit_ops)
        for var in list_variables:
            if var.name == 'Q_global_main/Q_branch1/kernel:0':
                print("Q_global")
                print(sess.run(var))
                print("")
            if var.name == 'Q_credit_main/Q_branch1/kernel:0':
                print("Q_credit")
                print(sess.run(var))
                print("")

    # initialize target networks to equal main networks
    sess.run(alg.list_initialize_target_ops)

    # save everything without exclusion
    saver = tf.train.Saver(max_to_keep=None)

    epsilon = epsilon_start
    # For computing average over 100 episodes
    reward_local_century = np.zeros(n_agents)
    reward_global_century = 0

    # Write log headers
    header = "Episode,r_global"
    header_c = "Century,r_global_avg"
    for idx in range(n_agents):
        header += ',r_%d' % idx
        header_c += ',r_avg_%d' % idx
    header_c += ",r_global_eval"
    for idx in range(n_agents):
        header_c += ',r_eval_%d' % idx
    if experiment == 'sumo':
        for idx in range(n_agents):
            header += ',route_%d,lane_%d,goal_%d' % (idx, idx, idx)
    header_c += ',r_eval_local,duration (s)'
    header += '\n'
    header_c += '\n'
    if not os.path.exists('../log/%s' % dir_name):
        os.makedirs('../log/%s' % dir_name)
    with open('../log/%s/log.csv' % dir_name, 'w') as f:
        f.write(header)
    with open('../log/%s/log_century.csv' % dir_name, 'w') as f:
        f.write(header_c)

    if dual_buffer:
        buf = replay_buffer_dual.Replay_Buffer(size=buffer_size)
    else:
        buf = replay_buffer.Replay_Buffer(size=buffer_size)

    t_start = time.time()
    dist_action = np.zeros(l_action)
    step = 0

    # Each iteration is a training episode
    for idx_episode in range(1, N_train + 1):
        # print("Episode", idx_episode)
        if experiment == "sumo":
            t_ms = sim.traci.simulation.getCurrentTime()
            # SUMO time functions return negative values after 24 days
            # (in milliseconds) of simulation time. Hence use 0 for departure
            # time, essentially triggering an immediate departure
            if 0 < t_ms and t_ms < 2073600e3:
                depart_times = [
                    np.random.normal(t_ms / 1000.0 + depart_mean[idx],
                                     depart_stdev)
                    for idx in range(n_agents)
                ]
            else:
                depart_times = [0 for idx in range(n_agents)]
            # Goals for input to policy and value function
            goals = np.zeros([n_agents, l_goal])
            list_routes = ['route_straight'] * n_agents
            list_lanes = [0] * n_agents
            list_goal_lane = [0] * n_agents
            rand_num = random.random()
            if rand_num < prob_random:
                # Random settings for route, lane and goal
                init = 'Random'
                for idx in range(n_agents):
                    route = 'route_straight'
                    lane = np.random.choice([0, 1, 2, 3], p=np.ones(4) * 0.25)
                    goal_lane = np.random.choice(np.arange(l_goal),
                                                 p=np.ones(l_goal) / float(l_goal))
                    list_routes[idx] = route
                    list_lanes[idx] = lane
                    list_goal_lane[idx] = goal_lane
                    goals[idx, goal_lane] = 1
            else:
                init = 'Preset'
                # Use predetermined values for route, lane, goal
                for idx in range(n_agents):
                    list_routes[idx] = list_routes_fixed[idx]
                    goal_lane = list_goals_fixed[idx]
                    list_goal_lane[idx] = goal_lane
                    list_lanes[idx] = list_lanes_fixed[idx]
                    goals[idx, goal_lane] = 1
            env = multicar_simple.Multicar(sim, n_agents, list_goal_lane,
                                           list_goal_pos, list_routes,
                                           list_speeds, list_lanes,
                                           init_positions, list_id,
                                           list_vtypes, depart_times,
                                           total_length=total_length,
                                           total_width=total_width,
                                           safety=True)
            global_state, local_others, local_self, done = env.reset()
        elif experiment == "particle":
            global_state, local_others, local_self, done = env.reset()
            goals = np.zeros([n_agents, l_goal])
            for idx in range(n_agents):
                goals[idx] = env.world.landmarks[idx].state.p_pos
        elif experiment == "checkers":
            if n_agents == 1:
                if np.random.randint(2) == 0:
                    goals = np.array([[1, 0]])
                else:
                    goals = np.array([[0, 1]])
            else:
                goals = np.eye(n_agents)
            global_state, local_others, local_self_t, local_self_v, done = env.reset(goals)

        actions_prev = np.zeros(n_agents, dtype=np.int)
        reward_global = 0
        reward_local = np.zeros(n_agents)
        # step = 0
        summarized = False
        if dual_buffer:
            buf_episode = []

        while not done:
            if idx_episode < pretrain_episodes and (stage == 1 or train_from_nothing == 1):
                # Random actions when filling replay buffer
                actions = np.random.randint(0, l_action, n_agents)
            else:
                # Run actor network for all agents as batch
                if experiment == 'checkers':
                    actions = alg.run_actor(actions_prev, local_others,
                                            local_self_t, local_self_v,
                                            goals, epsilon, sess)
                else:
                    actions = alg.run_actor(local_others, local_self, goals,
                                            epsilon, sess)
            dist_action[actions[0]] += 1

            if experiment == 'sumo':
                # check feasible actions
                actions = env.check_actions(actions)

            # step environment
            if experiment == 'checkers':
                next_global_state, next_local_others, next_local_self_t, \
                    next_local_self_v, reward, local_rewards, done = env.step(actions)
            else:
                next_global_state, next_local_others, next_local_self, \
                    reward, local_rewards, done = env.step(actions)
            step += 1

            # store transition into memory
            if dual_buffer:
                if experiment == 'checkers':
                    buf_episode.append(np.array([
                        global_state[0], global_state[1],
                        np.array(local_others), np.array(local_self_t),
                        np.array(local_self_v), actions_prev, actions,
                        reward, local_rewards, next_global_state[0],
                        next_global_state[1], np.array(next_local_others),
                        np.array(next_local_self_t),
                        np.array(next_local_self_v), done, goals
                    ]))
                else:
                    buf_episode.append(np.array([
                        global_state, np.array(local_others),
                        np.array(local_self), actions, reward, local_rewards,
                        next_global_state, np.array(next_local_others),
                        np.array(next_local_self), done, goals
                    ]))
            else:
                if experiment == 'checkers':
                    buf.add(np.array([
                        global_state[0], global_state[1],
                        np.array(local_others), np.array(local_self_t),
                        np.array(local_self_v), actions_prev, actions,
                        reward, local_rewards, next_global_state[0],
                        next_global_state[1], np.array(next_local_others),
                        np.array(next_local_self_t),
                        np.array(next_local_self_v), done, goals
                    ]))
                else:
                    buf.add(np.array([
                        global_state, np.array(local_others),
                        np.array(local_self), actions, reward, local_rewards,
                        next_global_state, np.array(next_local_others),
                        np.array(next_local_self), done, goals
                    ]))

            if (idx_episode >= pretrain_episodes) and (step % steps_per_train == 0):
                # Sample batch of transitions from replay buffer
                batch = buf.sample_batch(batch_size)
                if summarize and idx_episode % period == 0 and not summarized:
                    # Write TF summary every <period> episodes,
                    # at the first <steps_per_train> step
                    alg.train_step(sess, batch, epsilon, idx_episode,
                                   summarize=True, writer=writer)
                    summarized = True
                else:
                    alg.train_step(sess, batch, epsilon, idx_episode,
                                   summarize=False, writer=None)

            global_state = next_global_state
            local_others = next_local_others
            if experiment == 'checkers':
                local_self_t = next_local_self_t
                local_self_v = next_local_self_v
                actions_prev = actions
            else:
                local_self = next_local_self

            reward_local += local_rewards
            reward_global += reward

        if dual_buffer:
            if experiment == 'sumo':
                buf.add(buf_episode, np.sum(reward_local) < threshold)
            elif experiment == 'particle':
                buf.add(buf_episode, scenario.collisions != 0)

        if idx_episode >= pretrain_episodes and epsilon > epsilon_end:
            epsilon -= epsilon_step

        reward_local_century += reward_local
        reward_global_century += reward_global

        # ----------- Log performance --------------- #
        if idx_episode % period == 0:
            dist_action = dist_action / np.sum(dist_action)
            t_end = time.time()
            print("\n Evaluating")
            if experiment == 'sumo':
                r_local_eval, r_global_eval = evaluate.test(
                    N_eval, sim, sess, depart_mean, depart_stdev, n_agents,
                    l_goal, list_routes_fixed, list_lanes_fixed,
                    list_goals_fixed, prob_random, list_goal_pos,
                    list_speeds, init_positions, list_id, list_vtypes, alg)
                if np.all(r_local_eval > save_threshold):
                    saver.save(sess, '../saved/%s/model_good_%d.ckpt'
                               % (dir_name, idx_episode))
            elif experiment == 'particle':
                r_local_eval, r_global_eval = evaluate.test_particle(
                    N_eval, env, sess, n_agents, l_goal, alg, render=False)
            elif experiment == 'checkers':
                r_local_eval, r_global_eval = evaluate.test_checkers(
                    N_eval, env, sess, n_agents, alg)
                if stage == 1 and np.sum(r_local_eval) > 9.0:
                    saver.save(sess, '../saved/%s/model_good_%d.ckpt'
                               % (dir_name, idx_episode))

            s = '%d,%.2f,' % (idx_episode, reward_global_century / float(period))
            s += ','.join(['{:.2f}'.format(val / float(period))
                           for val in reward_local_century])
            s += ',%.2f,' % (r_global_eval)
            s += ','.join(['{:.2f}'.format(val) for val in r_local_eval])
            s += ',%.2f,%d' % (np.sum(r_local_eval), int(t_end - t_start))
            s += '\n'
            print(s)
            with open('../log/%s/log_century.csv' % dir_name, 'a') as f:
                f.write(s)
            reward_local_century = np.zeros(n_agents)
            reward_global_century = 0
            print("Action distribution ", dist_action)
            if dual_buffer:
                print("length buffer good %d, length buffer others %d, epsilon %.3f"
                      % (len(buf.memory_2), len(buf.memory_1), epsilon))
            else:
                print("epsilon %.3f" % epsilon)
            dist_action = np.zeros(l_action)
            t_start = time.time()

        s = '%d,%.2f,' % (idx_episode, reward_global)
        s += ','.join(['{:.2f}'.format(val) for val in reward_local])
        if experiment == 'sumo':
            for idx in range(n_agents):
                s += ',%d,%d,%d' % (map_route_idx[list_routes[idx]],
                                    list_lanes[idx], list_goal_lane[idx])
        s += '\n'
        with open('../log/%s/log.csv' % dir_name, 'a') as f:
            f.write(s)

    print("Saving stage %d variables" % stage)
    if not os.path.exists('../saved/%s' % dir_name):
        os.makedirs('../saved/%s' % dir_name)
    saver.save(sess, '../saved/%s/model_final.ckpt' % dir_name)
def make_env(scenario_name, local_observation=True, benchmark=False,
             discrete_action=True):
    '''
    Creates a MultiAgentEnv object as env. This can be used similar to a gym
    environment by calling env.reset() and env.step().
    Use env.render() to view the environment on the screen.

    Input:
        scenario_name : name of the scenario from ./scenarios/ to be loaded
                        (without the .py extension)
        benchmark     : whether you want to produce benchmarking data
                        (usually only done during evaluation)

    Some useful env properties (see environment.py):
        .observation_space : returns the observation space for each agent
        .action_space      : returns the action space for each agent
        .n                 : returns the number of agents

    Supported scenarios: simple_spread, simple_reference,
    simple_speaker_listener, collect_treasure, multi_speaker_listener
    '''
    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    if local_observation:
        if scenario_name == 'simple_spread':
            scenario.observation = local_obs_simple_spread.__get__(scenario)
        elif scenario_name == 'simple_reference':
            scenario.observation = local_obs_simple_reference.__get__(scenario)
        elif scenario_name == 'simple_speaker_listener':
            scenario.observation = local_obs_simple_speaker_listener.__get__(scenario)
        elif scenario_name == 'multi_speaker_listener':
            # scenario.observation = local_obs_multi_speaker_listener.__get__(scenario)
            print('origin')
        elif scenario_name == 'fullobs_collect_treasure':
            scenario.observation = local_obs_collect_treasure.__get__(scenario)
        else:
            print('error: unsupported scenario!')
    # create world
    world = scenario.make_world()
    world.collaborative = False  # to get individual reward
    # create multiagent environment
    if hasattr(scenario, 'post_step'):
        post_step = scenario.post_step
    else:
        post_step = None
    if benchmark:
        env = MultiAgentEnv(world,
                            reset_callback=scenario.reset_world,
                            reward_callback=scenario.reward,
                            observation_callback=scenario.observation,
                            post_step_callback=post_step,
                            info_callback=scenario.benchmark_data,
                            discrete_action=discrete_action)
    else:
        env = MultiAgentEnv(world,
                            reset_callback=scenario.reset_world,
                            reward_callback=scenario.reward,
                            observation_callback=scenario.observation,
                            post_step_callback=post_step,
                            discrete_action=discrete_action)
    env.force_discrete_action = True
    return env
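# A sketch of how the local-observation overrides above could look (the real
# local_obs_* functions are not shown in this snippet; this hypothetical
# version only illustrates the scenario.observation rebinding via __get__):
import numpy as np

def local_obs_simple_spread(self, agent, world):
    # observe only the agent's own velocity and position plus landmark
    # positions relative to it, dropping other agents' states
    entity_pos = [entity.state.p_pos - agent.state.p_pos
                  for entity in world.landmarks]
    return np.concatenate([agent.state.p_vel, agent.state.p_pos] + entity_pos)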
        AgentSpec(type=0),
        AgentSpec(type=1),
        AgentSpec(type=2),
        AgentSpec(type=0),
        AgentSpec(type=1),
        AgentSpec(type=2),
        AgentSpec(type=0),
        AgentSpec(type=1),
        AgentSpec(type=2)
    ],
    reward_type=reward_type,
    reward_type_string=visualization._type_string,
    shuffle_on_reset=True
)
scenario = Scenario()
world = scenario.make_world(env_spec)
env = MultiAgentEnv(
    world,
    env_spec,
    scenario.reset_world,
    scenario.reward,
    scenario.observation,
    info_callback=None,
    done_callback=scenario.done,
    shared_viewer=True
)
# env.viewers[0].cam_range = env_spec.zoom

"""
# interactive policy test
policies = [InteractivePolicy(env, i) for i in range(env.n)]
obs_n = env.reset()
world.agents[0].state.p_pos = [0., 0.]
while True:
    act_n = []
    for i, policy in enumerate(policies):
        act_n.append(policy.action(obs_n[i]))
import pdb

if __name__ == '__main__':
    import multiagent.scenarios as scenarios
    scenario = scenarios.load(
        "multiagent-particle-envs/multiagent/scenarios/simple_tag.py"
    ).Scenario()
    world = scenario.make_world()

    from multiagent.environment import MultiAgentEnv
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    n_agents = env.n
    dim_act = world.dim_p * 2 + 1
    obs = env.reset()
    n_states = len(obs[0])
    n_episode = 20000
    max_steps = 1200

    from maddpg import *
    maddpg = MADDPG(n_agents, n_states, dim_act)

    for i_episode in range(n_episode):
        obs = env.reset()
        obs = np.stack(obs)
        max_steps = 1200
        total_reward = 0
        adversaries_reward = 0
        goodagent_reward = 0
        for t in range(max_steps):
            actions = maddpg.produce_action(obs)
# parse arguments
parser = argparse.ArgumentParser(description=None)
parser.add_argument('-s', '--scenario', default='simple.py',
                    help='Path of the scenario Python script.')
args = parser.parse_args()

# load scenario from script
scenario = scenarios.load(args.scenario).Scenario()
# create world
world = scenario.make_world()  # world is an instance of the World class
# create multiagent environment
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                    scenario.observation, info_callback=None,
                    shared_viewer=False)
# render call to create viewer window (necessary only for interactive policies)
env.render()
# create interactive policies, one for each agent in the simulation world
policies = [InteractivePolicy(env, i) for i in range(env.n)]
# execution loop
obs_n = env.reset()
while True:
    # query for action from each agent's policy
    act_n = []
    for i, policy in enumerate(policies):
        act_n.append(policy.action(obs_n[i]))
    # step environment
    dropout = tf.layers.dropout(inputs=dense, rate=0.4, training=True)
    context = tf.layers.dense(inputs=dropout, units=num_outputs)
    return context


def make_env(scenario_name, arglist, benchmark=False):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation)
    return env


def get_trainers(env, num_adversaries, obs_shape_n, obs_map_shape_n, arglist):
    trainers = []
    model = mlp_model
    map_model = CNN_model
    trainer = MADDPGAgentTrainer
    for i in range(num_adversaries):
        trainers.append(trainer(
            "agent_%d" % i, model, map_model, obs_shape_n, obs_map_shape_n,
            env.action_space, i, arglist,
            local_q_func=(arglist.adv_policy == 'ddpg')))
    for i in range(num_adversaries, env.n):
        trainers.append(trainer(
            "agent_%d" % i, model, map_model, obs_shape_n, obs_map_shape_n,
            env.action_space, i, arglist,
            local_q_func=(arglist.good_policy == 'ddpg')))
    return trainers
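# The mlp_model referenced above is not included in this snippet. In the
# reference MADDPG implementation it is a small fully connected network,
# roughly like this sketch (assuming TF1 with tf.contrib.layers):
import tensorflow as tf
import tensorflow.contrib.layers as layers


def mlp_model(input, num_outputs, scope, reuse=False, num_units=64):
    # a two-hidden-layer MLP used for both policies and critics
    with tf.variable_scope(scope, reuse=reuse):
        out = input
        out = layers.fully_connected(out, num_outputs=num_units,
                                     activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_units,
                                     activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_outputs,
                                     activation_fn=None)
        return out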
def train(scenario):
    path_to_save = 'models/' + scenario.__module__.split('.')[-1] + '/ddpg'
    train_n = 1
    if not os.path.exists(path_to_save):
        os.makedirs(path_to_save)
    world = scenario.make_world()
    env = MultiAgentEnv(
        world,
        reset_callback=scenario.reset_world,
        reward_callback=scenario.reward,
        observation_callback=scenario.observation,
        info_callback=None,
        done_callback=scenario.done,
        collision_callback=scenario.is_collision,
        shared_viewer=True,
    )
    evaluator = evaluate_models.Evaluator(args, scenario,
                                          save=scenario.name + '/' + str(train_n))
    with U.single_threaded_session() as sess:
        simple_agents = [StayAgent(env, 1)]  # good agent
        agents_with_nn = [
            DDPGAgent(
                env,
                0,
                sess,
                batch_size=args.batch_size,
                memory_size=args.memory_size,
                noise_type=args.noise_type,  # good agent
                actor_lr=args.actor_lr,
                critic_lr=args.critic_lr,
                layer_norm=True,
                nb_layers=args.nb_layers,
                nb_neurons=args.nb_neurons)
        ]
        policies = [agents_with_nn[0], simple_agents[0]]
        print('agents are created')
        # for agent in agents_with_nn:
        #     agent.agent.initialize(sess)
        saver = tf.train.Saver()
        if args.load_weights:
            saver.restore(sess, 'models/' + scenario.name + '/ddpg/model')
        sess.graph.finalize()
        # for agent in agents_with_nn:
        #     agent.agent.reset()

        statistics_header = ["episode"]
        statistics_header.append("steps")
        statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
        statistics_header.extend(["q_{}".format(i) for i in range(env.n)])
        statistics = utilities.Time_Series_Statistics_Store(statistics_header)

        for episode in range(args.episodes):
            if episode % 500 == 0:
                print('episode ' + str(episode))
            # reset
            for agent in policies:
                agent.reset()
            states = env.reset()
            step = 0
            # per-episode accumulators
            episode_q = np.zeros(env.n)
            episode_rewards = np.zeros(env.n)
            while True:
                step += 1
                env_done = False
                # choose actions
                if args.render:
                    env.render()
                actions = [None for _ in range(len(world.policy_agents))]
                for agent in simple_agents:
                    actions[agent.agent_index] = agent.action(states[agent.agent_index])
                    episode_q[0] += 0
                for agent in agents_with_nn:
                    action, q = agent.action(states[agent.agent_index],
                                             apply_noise=True, compute_Q=True)
                    actions[agent.agent_index] = action
                    episode_q[agent.agent_index] += q
                # step
                states_next, rewards, done, info = env.step(actions)
                episode_rewards += rewards
                # save to memory
                # print(rewards)
                for agent in agents_with_nn:
                    agent.agent.store_transition(states[agent.agent_index],
                                                 actions[agent.agent_index],
                                                 rewards[agent.agent_index],
                                                 states_next[agent.agent_index],
                                                 done[agent.agent_index])
                if step >= args.max_steps:
                    env_done = True
                for agent in agents_with_nn:
                    if done[agent.agent_index]:
                        env_done = True
                states = states_next
                if env_done:
                    episode_rewards = episode_rewards / step
                    episode_losses = episode_q / step
                    statistic = [episode]
                    statistic.append(step)
                    statistic.extend([episode_rewards[i] for i in range(env.n)])
                    statistic.extend([episode_q[i] for i in range(env.n)])
                    statistics.add_statistics(statistic)
                    break

            # learn
            # Adapt param noise, if necessary.
            for t_train in range(args.nb_train_steps):
                for agent in agents_with_nn:
                    if agent.agent.memory.nb_entries >= args.batch_size:
                        if episode % args.param_noise_adaption_interval == 0:
                            distance = agent.agent.adapt_param_noise()
                        # print('train')
                        cl, al = agent.agent.train()
                        agent.agent.update_target_net()

            if episode % args.save_every_n_episodes == 0:
                saver.save(sess, 'models/' + scenario.__module__.split('.')[-1]
                           + '/ddpg/model')
            if args.evaluate_every_n_episodes != 0 and \
                    episode % args.evaluate_every_n_episodes == 0:
                statistics.dump("{}_{}.csv".format(
                    args.experiment_prefix + scenario.__module__.split('.')[-1],
                    episode))
                evaluator.evaluate(env, policies, episode)

        saver.save(sess, 'models/' + scenario.__module__.split('.')[-1]
                   + '/ddpg/model')