def init_variables(self, info):
    # Here you have the information of the game (virtual init() in random_walk.cpp)
    # List: game_time, goal, number_of_robots, penalty_area, codewords,
    #       robot_height, robot_radius, max_linear_velocity, field, team_info,
    #       {rating, name}, axle_length, resolution, ball_radius
    #
    self.game_time = info['game_time']
    self.field = info['field']
    self.robot_size = 2 * info['robot_radius']
    self.goal = info['goal']
    self.max_linear_velocity = info['max_linear_velocity']
    self.number_of_robots = info['number_of_robots']
    self.end_of_frame = False

    self.cur_my = []
    self.cur_ball = []

    self.state_dim = 2  # relative ball
    self.history_size = 2  # frame history size
    self.action_dim = 2  # 2

    self.arglist = Argument()
    self.state_shape = (self.state_dim * self.history_size, )  # state dimension
    self.act_space = [Discrete(self.action_dim * 2 + 1)]
    self.trainers = MADDPGAgentTrainer('agent_moving', self.mlp_model,
                                       self.state_shape, self.act_space, 0,
                                       self.arglist, local_q_func=False)

    # for tensorboard
    self.summary_placeholders, self.update_ops, self.summary_op = \
        self.setup_summary()
    self.summary_writer = \
        tf.summary.FileWriter('summary/moving_test', U.get_session().graph)

    U.initialize()

    # Load previous results, if necessary
    if self.arglist.load_dir == "":
        self.arglist.load_dir = self.arglist.save_dir
    if self.arglist.restore:
        print('Loading previous state... %s' % self.arglist.load_dir)
        U.load_state(self.arglist.load_dir)

    self.saver = tf.train.Saver(max_to_keep=1100)

    self.state = np.zeros([self.state_dim * self.history_size])  # histories
    self.train_step = 216000
    self.wheels = np.zeros(self.number_of_robots * 2)
    self.action = np.zeros(self.action_dim * 2 + 1)  # not np.zeros(2)
    self.stats_steps = 6000  # for tensorboard
    self.rwd_sum = 0
    self.done = False
    self.control_idx = 0
    return
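# The trainer above is built from self.mlp_model, which is not defined in this
# file. A minimal sketch, assuming the conventional MADDPG feed-forward model
# (two hidden ReLU layers via tf.contrib.layers); num_units=64 is an assumed
# default, not taken from the code above.
import tensorflow as tf
import tensorflow.contrib.layers as layers

def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, rnn_cell=None):
    # Takes an observation (or observation-action) tensor and returns one
    # value per output unit (action logits for the actor, a Q-value for the critic).
    with tf.variable_scope(scope, reuse=reuse):
        out = input
        out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None)
        return out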
def train(self):
    print("==========================================================")
    print("Initializing constraint model training...")
    print("==========================================================")
    U.initialize()
    for epoch in range(self.epochs):
        # Just sample episodes for the whole epoch
        self._sample_steps(self.steps_per_epoch)
        # Do the update from memory
        losses = np.mean(
            np.concatenate([
                self._update_batch(batch)
                for batch in self.replay_buffer.get_sequential(self.batch_size)
            ]).reshape(-1, self.num_constraints),
            axis=0)
        self.replay_buffer.clear()
        self._train_global_step += 1
        print(f"Finished epoch {epoch} with losses: {losses}. Running validation ...")
        self.evaluate()
        print("----------------------------------------------------------")
    print("==========================================================")
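# replay_buffer.get_sequential(batch_size) is assumed above but defined
# elsewhere. A minimal sketch of a buffer with that interface; the class name
# and storage layout are assumptions for illustration.
class SequentialReplayBuffer:
    def __init__(self):
        self._storage = []

    def add(self, transition):
        self._storage.append(transition)

    def get_sequential(self, batch_size):
        # yield contiguous batches in insertion order (the last may be short)
        for start in range(0, len(self._storage), batch_size):
            yield self._storage[start:start + batch_size]

    def clear(self):
        self._storage = []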
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        # if not (arglist.display or arglist.restore or arglist.benchmark):
        #     U.save_state(arglist.save_dir, saver=saver)
        #     print("Saved first checkpoint")
        current_game_experiences = []
        t0 = time.time()

        print('Starting iterations...')
        while True:
            new_experiences = load_new_experiences()
            for exp in new_experiences:
                obs_n, action_n, rew_n, new_obs_n, done_n, terminal = exp
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                     new_obs_n[i], done_n[i], terminal)

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            U.save_state(arglist.save_dir, saver=saver)
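# load_new_experiences() is not defined in this file. A plausible sketch,
# assuming worker processes drop pickled transition lists into a shared
# directory (the later worker loop in this file writes to
# "../../worker_experiences/"); the path handling and error policy below are
# assumptions for illustration.
import glob
import os
import pickle

def load_new_experiences(exp_dir="../../worker_experiences/"):
    experiences = []
    for path in sorted(glob.glob(os.path.join(exp_dir, "*.pkl"))):
        try:
            with open(path, 'rb') as fp:
                experiences.extend(pickle.load(fp))
            os.remove(path)  # consume the file so it is not replayed twice
        except (EOFError, pickle.UnpicklingError):
            pass  # file is probably still being written; retry on the next pass
    return experiences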
def init_variables(self, info):
    # Here you have the information of the game (virtual init() in random_walk.cpp)
    # List: game_time, goal, number_of_robots, penalty_area, codewords,
    #       robot_height, robot_radius, max_linear_velocity, field, team_info,
    #       {rating, name}, axle_length, resolution, ball_radius
    #
    self.game_time = info['game_time']
    self.field = info['field']
    self.robot_size = 2 * info['robot_radius']
    self.goal = info['goal']
    self.max_linear_velocity = info['max_linear_velocity']
    self.number_of_robots = info['number_of_robots']
    self.end_of_frame = False

    self.cur_my_posture = []
    self.cur_op_posture = []
    self.cur_ball = []
    self.pre_ball = [0, 0]

    self.state_dim = 2  # my robots, relative to the ball position
    self.history_size = 2  # frame history size
    self.action_dim = 2  # 2

    self.arglist = Argument()
    self.obs_shape_n = [(self.state_dim * self.history_size,)
                        for _ in range(1)]  # state dimension
    self.action_space = [spaces.Discrete(self.action_dim * 2 + 1)
                         for _ in range(1)]
    self.trainers = self.get_trainers(1, self.obs_shape_n, self.action_space,
                                      self.arglist)

    # for tensorboard
    self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
    self.summary_writer = tf.summary.FileWriter('summary/aiwc_maddpg',
                                                U.get_session().graph)

    U.initialize()

    # Load previous results, if necessary
    if self.arglist.load_dir == "":
        self.arglist.load_dir = self.arglist.save_dir
    if self.arglist.display or self.arglist.restore or self.arglist.benchmark:
        print('Loading previous state...')
        U.load_state(self.arglist.load_dir)

    self.final_ep_rewards = []  # sum of rewards for training curve
    self.final_ep_ag_rewards = []  # agent rewards for training curve
    self.agent_info = [[[]]]  # placeholder for benchmarking info
    self.saver = tf.train.Saver()
    self.obs_n = [np.zeros([self.state_dim * self.history_size])
                  for _ in range(1)]  # histories
    self.train_step = 0
    self.wheels = np.zeros(self.number_of_robots * 2)
    self.action_n = [np.zeros(self.action_dim * 2 + 1) for _ in range(1)]
    self.save_every_steps = 12000  # save the model every 10 minutes
    self.stats_steps = 6000  # for tensorboard
    self.reward_sum = 0
    self.score_sum = 0
    self.active_flag = [[False for _ in range(5)], [False for _ in range(5)]]
    self.inner_step = 0
    self.done = False
    self.control_idx = 0
    return
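# setup_summary() is referenced above but defined elsewhere. A minimal sketch
# of the usual placeholder-fed TensorBoard pattern, assuming the two stats
# tracked here (reward_sum, score_sum); the tag names are assumptions for
# illustration.
def setup_summary(self):
    reward_sum = tf.Variable(0.)
    score_sum = tf.Variable(0.)
    tf.summary.scalar('Reward Sum / Stats Window', reward_sum)
    tf.summary.scalar('Score Sum / Stats Window', score_sum)
    summary_vars = [reward_sum, score_sum]
    summary_placeholders = [tf.placeholder(tf.float32) for _ in summary_vars]
    # assign each placeholder into its variable so merged summaries pick it up
    update_ops = [summary_vars[i].assign(summary_placeholders[i])
                  for i in range(len(summary_vars))]
    summary_op = tf.summary.merge_all()
    return summary_placeholders, update_ops, summary_op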
def play(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = get_num_adversaries(env)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()
        load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        agent_info = [[[]]]  # placeholder for benchmarking info
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience (disabled during play)
            # for i, agent in enumerate(trainers):
            #     agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                print("train step: {}, episode reward: {}, time: {}".format(
                    train_step, np.mean(episode_rewards[-1:]),
                    round(time.time() - t_start, 3)))
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for displaying learned policies
            time.sleep(0.1)
            env.render()
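# get_num_adversaries() is assumed above. A one-line sketch, assuming the
# scenario marks adversarial agents with an `adversary` attribute, as the
# multiagent-particle-envs scenarios do:
def get_num_adversaries(env):
    return len([agent for agent in env.agents
                if getattr(agent, 'adversary', False)])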
def init_variables(self, info):
    # Here you have the information of the game (virtual init() in random_walk.cpp)
    # List: game_time, goal, number_of_robots, penalty_area, codewords,
    #       robot_height, robot_radius, max_linear_velocity, field, team_info,
    #       {rating, name}, axle_length, resolution, ball_radius
    #
    self.game_time = info['game_time']
    self.field = info['field']
    self.robot_size = 2 * info['robot_radius']
    self.goal = info['goal']
    self.max_linear_velocity = info['max_linear_velocity']
    self.number_of_robots = info['number_of_robots']
    self.end_of_frame = False

    self.cur_my_posture = []
    self.cur_op_posture = []
    self.cur_ball = []
    self.pre_ball = [0, 0]

    self.state_dim = 2  # relative ball
    self.history_size = 2  # frame history size
    self.action_dim = 2  # 2

    self.arglist = Argument()
    self.obs_shape_n = [(self.state_dim * self.history_size, )
                        for _ in range(1)]  # state dimension
    self.action_space = [Discrete(self.action_dim * 2 + 1) for _ in range(1)]
    self.trainers = self.get_trainers(1, self.obs_shape_n, self.action_space,
                                      self.arglist)

    U.initialize()

    # Load previous results, if necessary
    if self.arglist.load_dir == "":
        self.arglist.load_dir = self.arglist.save_dir
    if self.arglist.display or self.arglist.restore or self.arglist.benchmark:
        print('Loading previous state...')
        U.load_state(self.arglist.load_dir)

    self.obs_n = [np.zeros([self.state_dim * self.history_size])
                  for _ in range(self.number_of_robots)]  # histories
    self.wheels = np.zeros(self.number_of_robots * 2)
    self.action_n = [np.zeros(self.action_dim * 2 + 1)
                     for _ in range(self.number_of_robots)]  # not np.zeros(2)
    self.distances = [[i for i in range(5)],
                      [i for i in range(5)]]  # distances to the ball
    self.idxs = [[i for i in range(5)], [i for i in range(5)]]
    self.shoot_plan = [0 for _ in range(self.number_of_robots)]
    self.deadlock_cnt = 0
    self.avoid_deadlock_cnt = 0
    self.global_step = 0
    return
def init_variables(self, info):
    # Here you have the information of the game (virtual init() in random_walk.cpp)
    # List: game_time, goal, number_of_robots, penalty_area, codewords,
    #       robot_height, robot_radius, max_linear_velocity, field, team_info,
    #       {rating, name}, axle_length, resolution, ball_radius
    #
    self.game_time = info['game_time']
    self.field = info['field']
    self.robot_size = 2 * info['robot_radius']
    self.goal = info['goal']
    self.max_linear_velocity = info['max_linear_velocity']
    self.number_of_robots = info['number_of_robots']
    self.end_of_frame = False

    self.cur_my_posture = []
    self.cur_op_posture = []
    self.cur_ball = []
    self.pre_ball = [0, 0]

    self.state_dim = 5  # ball, goal, theta
    self.history_size = 2  # frame history size
    self.action_dim = 2  # 2

    self.arglist = Argument()
    self.obs_shape_n = [(self.state_dim * self.history_size, )
                        for _ in range(1)]  # state dimension
    self.action_space = [Discrete(self.action_dim * 2 + 1) for _ in range(1)]
    self.trainers = self.get_trainers(1, self.obs_shape_n, self.action_space,
                                      self.arglist)

    U.initialize()

    # Load previous results, if necessary
    if self.arglist.load_dir == "":
        self.arglist.load_dir = self.arglist.save_dir
    if self.arglist.display or self.arglist.restore or self.arglist.benchmark:
        print('Loading previous state...')
        U.load_state(self.arglist.load_dir)

    self.episode_rewards = [0.0]  # sum of rewards for all agents
    self.agent_rewards = [[0.0] for _ in range(self.number_of_robots)]  # individual agent reward
    self.final_ep_rewards = []  # sum of rewards for training curve
    self.final_ep_ag_rewards = []  # agent rewards for training curve
    self.agent_info = [[[]]]  # placeholder for benchmarking info
    self.obs_n = [np.zeros([self.state_dim * self.history_size])
                  for _ in range(self.number_of_robots)]  # histories
    self.wheels = np.zeros(self.number_of_robots * 2)
    self.action_n = [np.zeros(self.action_dim * 2 + 1)
                     for _ in range(self.number_of_robots)]  # not np.zeros(2)
    return
def init(self, arglist, env):
    num_thread = 1
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=num_thread,
                               intra_op_parallelism_threads=num_thread)
    self.sess = tf.InteractiveSession(config=tf_config)

    # To make sure that training and testing are based on different seeds
    if arglist.restore:
        create_seed(np.random.randint(2))
    else:
        create_seed(arglist.seed)

    # Create agent trainers
    self.obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
    self.num_adversaries = min(env.n, arglist.num_adversaries)
    self.trainers = get_trainers(env, self.num_adversaries, self.obs_shape_n,
                                 arglist)
    print('Using good policy {} and adv policy {}'.format(
        arglist.good_policy, arglist.adv_policy))

    # Initialize
    U.initialize()

    # Load previous results, if necessary
    if arglist.load_dir == "":
        arglist.load_dir = arglist.save_dir
    if arglist.restore or arglist.benchmark:
        print('Loading previous state...')
        U.load_state(arglist.load_dir)

    self.episode_rewards = [0.0]  # sum of rewards for all agents
    self.agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
    self.final_ep_rewards = []  # sum of rewards for training curve
    self.final_ep_ag_rewards = []  # agent rewards for training curve
    self.agent_info = [[[]]]  # placeholder for benchmarking info
    self.saver = tf.train.Saver()
    self.obs_n = env.reset()
    self.train_step = 0
    self.t_start = time.time()
    self.new_episode = True  # start of a new episode (used for replay buffer)
    self.start_saving_comm = False

    if arglist.graph:
        print("Setting up graph writer!")
        self.writer = tf.summary.FileWriter("learning_curves/graph",
                                            self.sess.graph)

    if arglist.analysis:
        print("Starting analysis on {}...".format(arglist.analysis))
        if arglist.analysis != 'video':
            analyze.run_analysis(arglist, env, self.trainers)
        return  # should be a single run
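# create_seed() is assumed above. A minimal sketch that seeds the RNGs this
# training stack touches (random, NumPy, TensorFlow); the exact seeding scope
# is an assumption for illustration.
import random

def create_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)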
def train(self):
    print("==========================================================")
    print("Initializing constraint model training...")
    print("==========================================================")
    U.initialize()
    for epoch in range(self.epochs):
        # Just sample episodes for the whole epoch
        self._sample_steps(self.steps_per_epoch)
        # Do the update from memory
        # if len(self.replay_buffer) < self.max_replay_buffer:
        #     # replay buffer is not large enough
        #     continue
        # if not epoch % 100 == 0:
        #     # only update every 100 steps
        #     continue
        self.replay_sample_index = self.replay_buffer.make_index(self.batch_size)
        # collect replay sample from all agents
        index = self.replay_sample_index
        action, obs, c, c_next = self.replay_buffer.sample_index(index)
        obs = np.squeeze(obs, axis=1)
        action = np.squeeze(action, axis=1)
        c = np.squeeze(c, axis=1)
        c_next = np.squeeze(c_next, axis=1)
        # train the c_next network
        c_next_loss = [
            self.c_next_train[_](*([obs] + [action] + [c] + [c_next]))
            for _ in range(self.num_constraints)
        ]
        self.replay_buffer.clear()
        self._train_global_step += 1
        print(f"Finished epoch {epoch} with losses: {c_next_loss}. Running validation ...")
        print("----------------------------------------------------------")
    print("==========================================================")
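# make_index()/sample_index() mirror the sampling interface of the MADDPG
# replay buffer. A sketch of that pair, under the assumption that each stored
# entry is an (action, obs, c, c_next) tuple; the class name is hypothetical.
import random

class ConstraintReplayBuffer:
    def __init__(self):
        self._storage = []

    def add(self, action, obs, c, c_next):
        self._storage.append((action, obs, c, c_next))

    def make_index(self, batch_size):
        # uniform random indices into the current storage
        return [random.randint(0, len(self._storage) - 1)
                for _ in range(batch_size)]

    def sample_index(self, idxes):
        actions, obses, cs, c_nexts = [], [], [], []
        for i in idxes:
            action, obs, c, c_next = self._storage[i]
            actions.append(action)
            obses.append(obs)
            cs.append(c)
            c_nexts.append(c_next)
        return (np.array(actions), np.array(obses),
                np.array(cs), np.array(c_nexts))

    def clear(self):
        self._storage = []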
def train(arglist):
    # To make sure that training and testing are based on different seeds
    if arglist.restore:
        create_seed(np.random.randint(2))
    else:
        create_seed(arglist.seed)

    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        if arglist.analysis:
            print("Starting analysis on {}...".format(arglist.analysis))
            if arglist.analysis != 'video':
                analyze.run_analysis(arglist, env, trainers)
            return  # should be a single run

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        new_episode = True  # start of a new episode (used for replay buffer)
        start_saving_comm = False

        if arglist.graph:
            print("Setting up graph writer!")
            writer = tf.summary.FileWriter("learning_curves/graph", sess.graph)

        print('Starting iterations...')
        while True:
            if arglist.actor_lstm:
                # get actor input states
                p_in_c_n, p_in_h_n = get_lstm_states('p', trainers)  # num_trainers x 1 x 1 x 64
            if arglist.critic_lstm:
                q_in_c_n, q_in_h_n = get_lstm_states('q', trainers)  # num_trainers x 1 x 1 x 64

            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]

            if arglist.critic_lstm:
                # get critic output states
                p_states = [p_in_c_n, p_in_h_n] if arglist.actor_lstm else []
                update_critic_lstm(trainers, obs_n, action_n, p_states)
                q_out_c_n, q_out_h_n = get_lstm_states('q', trainers)  # num_trainers x 1 x 1 x 64
            if arglist.actor_lstm:
                p_out_c_n, p_out_h_n = get_lstm_states('p', trainers)  # num_trainers x 1 x 1 x 64

            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # collect experience
            for i, agent in enumerate(trainers):
                num_episodes = len(episode_rewards)  # do this every iteration
                if arglist.critic_lstm and arglist.actor_lstm:
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i],  # terminal,
                                     p_in_c_n[i][0], p_in_h_n[i][0],
                                     p_out_c_n[i][0], p_out_h_n[i][0],
                                     q_in_c_n[i][0], q_in_h_n[i][0],
                                     q_out_c_n[i][0], q_out_h_n[i][0],
                                     new_episode)
                elif arglist.critic_lstm:
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i],  # terminal,
                                     q_in_c_n[i][0], q_in_h_n[i][0],
                                     q_out_c_n[i][0], q_out_h_n[i][0],
                                     new_episode)
                elif arglist.actor_lstm:
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i],  # terminal,
                                     p_in_c_n[i][0], p_in_h_n[i][0],
                                     p_out_c_n[i][0], p_out_h_n[i][0],
                                     new_episode)
                else:
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i],  # terminal,
                                     new_episode)
            obs_n = new_obs_n

            # Adding rewards
            if arglist.tracking:
                for i, a in enumerate(trainers):
                    if arglist.num_episodes - len(episode_rewards) <= 1000:
                        a.tracker.record_information(
                            "goal", np.array(env.world.landmarks[0].state.p_pos))
                        a.tracker.record_information(
                            "position", np.array(env.world.agents[i].state.p_pos))
                        a.tracker.record_information("ag_reward", rew_n[i])
                        a.tracker.record_information("team_dist_reward",
                                                     info_n["team_dist"][i])
                        a.tracker.record_information("team_diff_reward",
                                                     info_n["team_diff"][i])

            # Closing graph writer
            if arglist.graph:
                writer.close()

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                new_episode = True
                num_episodes = len(episode_rewards)
                obs_n = env.reset()
                # reset trainers
                if arglist.actor_lstm or arglist.critic_lstm:
                    for agent in trainers:
                        agent.reset_lstm()
                if arglist.tracking:
                    for agent in trainers:
                        agent.tracker.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
            else:
                new_episode = False

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            # get same episode sampling
            if arglist.sync_sampling:
                inds = [random.randint(0, len(trainers[0].replay_buffer._storage) - 1)
                        for i in range(arglist.batch_size)]
            else:
                inds = None
            for agent in trainers:
                # if arglist.lstm:
                #     agent.preupdate(inds=inds)
                # else:
                agent.preupdate(inds)
            for agent in trainers:
                loss = agent.update(trainers, train_step)
            if loss is None:
                continue

            # for displaying learned policies
            if arglist.display:
                env.render()
                # continue

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                # U.save_state(arglist.save_dir, saver=saver)
                if arglist.tracking:
                    for agent in trainers:
                        agent.tracker.save()
                rew_file_name = "rewards/" + arglist.commit_num + "_rewards.pkl"
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = "rewards/" + arglist.commit_num + "_agrewards.pkl"
                # agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
def train(arglist):
    with U.single_threaded_session():
        # [Initialization]
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize (TensorFlow initialization procedure)
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        # Parameters initialization
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs)
                        for agent, obs in zip(trainers, obs_n)]  # Get actions from the policies.
            # environment step according to actions
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)  # Receive the observation, the reward, the done flags, and the info from the simulation environment.
            episode_step += 1
            done = all(done_n)  # Check if all tasks have been done.
            terminal = (episode_step >= arglist.max_episode_len)  # Check the timeout.

            # record experience to agents
            for i, agent in enumerate(trainers):  # The "done" flags may refer to actions executed in the past.
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)  # Record for the experience replay.
            obs_n = new_obs_n  # Reset the current observation

            for i, rew in enumerate(rew_n):  # Update the total rewards and each agent's rewards
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:  # Task finished or timeout; restart the simulation environment.
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:  # Save the agents' information.
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)  # Delay.
                env.render()  # Display the environment if necessary.
                continue

            # update all trainers, if not in display or benchmark mode [Important]
            loss = None
            for agent in trainers:
                agent.preupdate()  # Clear the index randomly chosen by 'make_index' --> 'agent.replay_sample_index = None'
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
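# get_trainers() is called by most of the loops above. The canonical MADDPG
# version is sketched below, assuming mlp_model and MADDPGAgentTrainer are in
# scope; the local_q_func switch makes an agent fall back to independent DDPG
# when its side's policy name is 'ddpg'.
def get_trainers(env, num_adversaries, obs_shape_n, arglist):
    trainers = []
    # adversaries first, then the remaining "good" agents
    for i in range(num_adversaries):
        trainers.append(MADDPGAgentTrainer(
            "agent_%d" % i, mlp_model, obs_shape_n, env.action_space, i,
            arglist, local_q_func=(arglist.adv_policy == 'ddpg')))
    for i in range(num_adversaries, env.n):
        trainers.append(MADDPGAgentTrainer(
            "agent_%d" % i, mlp_model, obs_shape_n, env.action_space, i,
            arglist, local_q_func=(arglist.good_policy == 'ddpg')))
    return trainers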
def train(arglist):
    with U.make_session(8):
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [[29] for i in range(env.n)]
        obs_map_shape_n = [[56 * 86] for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n,
                                obs_map_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env._reset()
        episode_step = 13000
        train_step = 0
        t_start = time.time()
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = StarCraft2Env(map_name=arglist.scenario,
                            reward_only_positive=False,
                            obs_last_action=True,
                            obs_timestep_number=True,
                            reward_scale_rate=200)
        # Create agent trainers
        env_info = env.get_env_info()
        num_agents = env_info["n_agents"]
        num_adversaries = num_agents
        obs_shape_n = [(env_info["obs_shape"], ) for i in range(num_adversaries)]
        action_space_n = [env_info["n_actions"] for i in range(num_adversaries)]
        buffer_size = arglist.buffer_size
        trainers = get_trainers(num_adversaries, obs_shape_n, action_space_n,
                                arglist, buffer_size)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        logdir = "./tensorboard/"
        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)])

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(num_agents)]  # individual agent reward
        saver = tf.train.Saver(max_to_keep=100000000)
        n_actions_no_attack = 6

        env.reset()
        obs_n = []
        reward_hl_own_old = []
        reward_hl_en_old = []
        for agent_id in range(num_agents):
            # This first loop gathers the initial state/observation/health information
            obs = env.get_obs_agent(agent_id)
            obs_n.append(obs)
            reward_hl_own_old.append(env.get_agent_health(agent_id))
            reward_hl_en_old.append(env.get_enemy_health(agent_id))
        episode_step = 0
        step = 0

        print('Starting iterations...')
        while True:
            # get action
            action_set_actual = []
            action_set_execute = []
            action_n = []
            dead_unit = []
            for agent_id in range(num_agents):
                action_output = trainers[agent_id].action(obs_n[agent_id])
                action_n.append(action_output)
                action_prob = action_output
                action_to_choose = np.argmax(action_prob)
                action_set_actual.append(action_to_choose)
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                if action_to_choose in avail_actions_ind:
                    action_set_execute.append(action_to_choose)
                elif (avail_actions[0] == 1):
                    # If the chosen action cannot be executed and the agent is
                    # already dead, substitute NO_OP for the current action
                    action_set_execute.append(0)
                else:
                    # If the chosen action cannot be executed, substitute STOP
                    action_set_execute.append(1)
                if (len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0):
                    # Check whether this agent has already died
                    dead_unit.append(agent_id)

            rew_base, done, _ = env.step(action_set_execute)
            episode_rewards[-1] += rew_base

            new_obs_n = []
            reward_hl_own_new = []
            reward_hl_en_new = []
            rew_n = []
            for agent_id in range(num_agents):
                obs_next = env.get_obs_agent(agent_id=agent_id)
                new_obs_n.append(obs_next)
                reward_hl_own_new.append(env.get_agent_health(agent_id))
                reward_hl_en_new.append(env.get_enemy_health(agent_id))
            for agent_id in range(num_agents):
                if (agent_id in dead_unit):
                    reward = 0
                elif (action_set_execute[agent_id] != action_set_actual[agent_id]):
                    # When the output action cannot be executed, a substitute
                    # action runs instead, but the output action is stored and
                    # given a negative reward
                    reward = -2
                elif (action_set_execute[agent_id] > 5):
                    target_id = action_set_execute[agent_id] - n_actions_no_attack
                    health_reduce_en = reward_hl_en_old[target_id] - reward_hl_en_new[target_id]
                    if (health_reduce_en > 0):
                        if (rew_base > 0):
                            reward = 2 + rew_base
                        else:
                            reward = 2
                    else:
                        reward = 1
                else:
                    reward = (reward_hl_own_new[agent_id] -
                              reward_hl_own_old[agent_id]) * 5
                rew_n.append(reward)

            episode_step += 1

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done)
            obs_n = new_obs_n
            reward_hl_own_old = reward_hl_own_new
            reward_hl_en_old = reward_hl_en_new

            for i, rew in enumerate(rew_n):
                agent_rewards[i][-1] += rew

            if done:
                print("steps until now : %s, episode: %s, episode reward: %s"
                      % (step, len(episode_rewards), episode_rewards[-1]))
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("episode reward", episode_rewards[-1])
                for i in range(num_agents):
                    logger.record_tabular("agent" + str(i) + " episode reward",
                                          agent_rewards[i][-1])
                logger.dump_tabular()
                env.reset()
                obs_n = []
                reward_hl_own_old = []
                reward_hl_en_old = []
                for agent_id in range(num_agents):
                    # Gather the initial state/observation/health information again
                    obs = env.get_obs_agent(agent_id)
                    obs_n.append(obs)
                    reward_hl_own_old.append(env.get_agent_health(agent_id))
                    reward_hl_en_old.append(env.get_enemy_health(agent_id))
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)

            # increment global step counter
            step += 1
            if (step == arglist.buffer_size):
                print("Training starts.")

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, step)

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                save_dir = arglist.save_dir + "/model_" + str(step) + "steps/" + arglist.exp_name
                U.save_state(save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}".format(
                        step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:])))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}".format(
                        step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards) - 1))
                break
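# The substitute-action scheme above (NO_OP for dead units, STOP otherwise)
# can also be expressed by masking the policy output with SMAC's availability
# vector before the argmax, which guarantees an executable action. A hedged
# alternative sketch; the helper name is hypothetical:
import numpy as np

def mask_action(env, agent_id, action_prob):
    # keep only the probabilities of actions SMAC reports as available
    avail = np.array(env.get_avail_agent_actions(agent_id))  # binary mask
    masked = np.where(avail == 1, action_prob, -np.inf)
    return int(np.argmax(masked))  # always an executable action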
def train(arglist, extra_args=None):
    tf_graph = tf.Graph()
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(graph=tf_graph, config=tf_config):
        # Create environment
        env = make_env(arglist.scenario, arglist)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        if arglist.num_adversaries is None:
            arglist.num_adversaries = len([
                agent for agent in env.agents
                if (hasattr(agent, "adversary") and agent.adversary)
            ])
        arglist.num_adversaries = min(env.n, arglist.num_adversaries)
        num_adversaries = arglist.num_adversaries
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()
        if os.environ.get("OUTPUT_GRAPH"):
            tf.summary.FileWriter(os.path.join(logger.get_dir(), "tb"),
                                  U.get_session().graph)

        # Load previous results, if necessary
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # print("[action] " + ", ".join(["agent {i}: {action}".format(i=i, action=list(action_n[i])) for i in range(len(action_n))]))
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                if arglist.save_render_images:
                    input_file_name = os.path.join(
                        arglist.render_dir,
                        "image-episode_{}-step_%d.png".format(len(episode_rewards)))
                    output_file_name = os.path.join(
                        arglist.render_dir,
                        "video-episode_{}.mp4".format(len(episode_rewards)))
                    command = "ffmpeg -y -r 10 -i {} {}".format(
                        input_file_name, output_file_name)
                    os.system(command)
                    print("Saved render video at {}".format(output_file_name))
                    for episode_step_ in range(episode_step):
                        file_name = os.path.join(
                            arglist.render_dir,
                            "image-episode_{}-step_{}.png".format(
                                len(episode_rewards), episode_step_))
                        if os.path.exists(file_name):
                            os.remove(file_name)
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = os.path.join(arglist.benchmark_dir, 'benchmark.pkl')
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                if arglist.save_render_images:
                    images = env.render(mode="rgb_array")
                    image = images[0]
                    file_name = os.path.join(
                        arglist.render_dir,
                        "image-episode_{}-step_{}.png".format(
                            len(episode_rewards), episode_step))
                    plt.imsave(file_name, image)
                    print("Saved render image at {}".format(file_name))
                else:
                    env.render(mode="human")
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(os.path.join(
                    arglist.save_dir,
                    "checkpoint-episode_{}".format(len(episode_rewards))),
                    saver=saver)

            # print training scalars
            if terminal and ((len(episode_rewards) % arglist.print_rate == 0)
                             or (len(episode_rewards) % arglist.save_rate == 0)):
                # print statement depends on whether or not there are adversaries
                logger.log("Time: {}".format(
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
                logger.logkv("steps", train_step)
                logger.logkv("episodes", len(episode_rewards))
                logger.logkv("mean_episode_reward",
                             np.mean(episode_rewards[-arglist.save_rate:]))
                if num_adversaries == 0:
                    # print("[{}] steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()),
                    #     train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                    pass
                else:
                    for agent_index in range(len(agent_rewards)):
                        logger.logkv(
                            "agent_{}_episode_reward".format(agent_index),
                            np.mean(agent_rewards[agent_index][-arglist.save_rate:]))
                    # print("[{}] steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()),
                    #     train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                    #     [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                logger.logkv("time", round(time.time() - t_start, 3))
                logger.dumpkvs()
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = os.path.join(arglist.plots_dir, 'rewards.pkl')
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = os.path.join(arglist.plots_dir, 'average_rewards.pkl')
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
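# make_env() used throughout these loops follows the standard
# multiagent-particle-envs recipe; a sketch, assuming that package layout
# (some variants above pass extra kwargs such as done/logging callbacks):
def make_env(scenario_name, arglist=None, benchmark=False):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios
    # load the scenario module and build its world
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    world = scenario.make_world()
    if benchmark:
        return MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                             scenario.observation, scenario.benchmark_data)
    return MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                         scenario.observation)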
def train(arglist):
    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        saver = tf.train.Saver()

        # Initialize
        U.initialize()
        summary_writer = tf.summary.FileWriter(arglist.summary_dir, sess.graph)
        summary_placeholders, update_ops, summary_op = setup_summary()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        # saver.restore(sess, "/home/sugon/Peixian/maddpg_peixian/maddpg/experiments/tmp/policy/simple_comm_-4166440")
        # print("successfully restored")

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=3)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        adversary_rewards = 0.0
        goodagent_rewards = 0.0

        print('Starting iterations...')
        while True:
            # input('...')
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                # print(i, ":", rew_n[i])
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
                if i < num_adversaries:
                    adversary_rewards += rew
                else:
                    goodagent_rewards += rew

            if done or terminal:
                if done:
                    print("*" * 20)
                    print("done:", episode_step)
                stats = [adversary_rewards, episode_step, goodagent_rewards]
                for i in range(len(stats)):
                    sess.run(update_ops[i],
                             feed_dict={summary_placeholders[i]: float(stats[i])})
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, len(episode_rewards) + 1)
                obs_n = env.reset()
                episode_step = 0
                adversary_rewards = 0.0
                goodagent_rewards = 0.0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if (done or terminal) and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, train_step, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
def train(arglist):
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        obs_n = env.reset()  # so that env.observation_space is initialized and trainers can be created
        # Create agent trainers
        num_adversaries = arglist.num_adversaries
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        print("env.observation_space:", env.observation_space)
        print("num adversaries: ", num_adversaries, ", env.n (num agents): ", env.n)
        # need to ensure that the trainers are in the correct order: pacman in front
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir + ("{}".format(arglist.load_episode))
        if arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        if arglist.display and arglist.load:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = [[] for i in range(env.n)]  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        episode_step = 0
        train_step = 0
        total_win = [0]
        final_win = []
        total_lose = [0]
        final_lose = []
        t_start = time.time()
        loss_list = {}
        for i in range(env.n):
            loss_list[i] = [[] for i in range(6)]

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done, info_n, win, lose = env.step(action_n)
            episode_step += 1
            terminal = (episode_step >= arglist.max_episode_len)
            # print("obs_n", obs_n)
            # print("new_obs_n", new_obs_n)
            # print("action_n", action_n)
            # print("rew_n", episode_step, rew_n)
            # print("done", done)
            # print("terminal", terminal)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done, terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                if arglist.display:
                    env.render()
                obs_n = env.reset()
                episode_step = 0
                if win:
                    total_win[-1] += 1
                if lose:
                    total_lose[-1] += 1
                total_win.append(0)
                total_lose.append(0)
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1
            # if train_step % 1000 == 0:
            #     print(train_step)

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for ind, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if train_step % 10000 == 0 and loss is not None:
                    for i in range(len(loss)):
                        loss_list[ind][i].append(loss[i])

            # save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0):
                saving = arglist.save_dir + ("{}".format(0 + len(episode_rewards)))  # TODO: why append this
                U.save_state(saving, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, number of wins {}, number of losses {}, "
                          "time: {}".format(
                              train_step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                              np.sum(total_win[-arglist.save_rate:]),
                              np.sum(total_lose[-arglist.save_rate:]),
                              round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                final_win.append(np.sum(total_win[-arglist.save_rate:]))
                final_lose.append(np.sum(total_lose[-arglist.save_rate:]))
                ep_reward_df = pd.DataFrame(final_ep_rewards)
                ep_ag_reward_df = pd.DataFrame(final_ep_ag_rewards)
                win_df = pd.DataFrame(final_win)
                lose_df = pd.DataFrame(final_lose)
                for i in range(env.n):
                    trainer_loss_df = pd.DataFrame(loss_list[i]).transpose()
                    trainer_loss_df.to_csv(arglist.plots_dir + arglist.exp_name +
                                           '_trainer_loss_df_{}.csv'.format(i))
                ep_reward_df.to_csv(arglist.plots_dir + arglist.exp_name + '_rewards.csv')
                ep_ag_reward_df.to_csv(arglist.plots_dir + arglist.exp_name + '_agrewards.csv')
                win_df.to_csv(arglist.plots_dir + arglist.exp_name + '_win_df.csv')
                lose_df.to_csv(arglist.plots_dir + arglist.exp_name + '_lose_df.csv')
                for i, rew in enumerate(agent_rewards):
                    final_ep_ag_rewards[i].append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                # rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                # with open(rew_file_name, 'wb') as fp:
                #     pickle.dump(final_ep_rewards, fp)
                # agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                # with open(agrew_file_name, 'wb') as fp:
                #     pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        board_write_path = './board/' + datetime.now().strftime("%Y%m%d_%H%M%S")
        os.makedirs(board_write_path)
        board_writer = tf.summary.FileWriter(board_write_path)
        trainers = get_trainers(env, obs_shape_n, arglist, board_writer)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        evaluate_rewards = []
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            action_n_saved = deepcopy(action_n)
            if arglist.display:
                for idx, (agent, obs) in enumerate(zip(trainers, obs_n)):
                    action_result = agent.p_debug['p_values'](obs[None])[0]
                    print("agent_%d" % idx, action_result)
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            action_n = action_n_saved
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            if arglist.display:
                continue

            # update all trainers, if not in display or benchmark mode
            if train_step % 100 == 0 and len(trainers[0].replay_buffer) >= trainers[0].max_replay_buffer_len:
                loss = None
                replay_sample_index = trainers[0].get_memory_index()
                obs_n_sampled = []
                obs_next_n_sampled = []
                act_n_sampled = []
                for agent in trainers:
                    agent.set_memory_index(replay_sample_index)
                    obs_sampled, act_sampled, _, obs_next_sampled, _ = agent.get_replay_data()
                    obs_n_sampled.append(obs_sampled)
                    obs_next_n_sampled.append(obs_next_sampled)
                    act_n_sampled.append(act_sampled)
                target_act_next_n = []
                for agent in trainers:
                    target_act_next_n.append(agent.get_target_act(obs_next_n_sampled))
                for agent in trainers:
                    loss = agent.update(train_step, obs_n_sampled, act_n_sampled,
                                        obs_next_n_sampled, target_act_next_n)

            import math
            if math.isnan(episode_rewards[-1]):
                print("NaN occurred!")
                break

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                    train_step, len(episode_rewards),
                    np.mean(episode_rewards[-arglist.save_rate:]),
                    [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                    round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
                evaluate_rewards.append(evaluate(arglist, trainers, is_toy=True))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                with open(arglist.plots_dir + arglist.exp_name + "_evaluate_rewards.pkl", 'wb') as fp:
                    pickle.dump(evaluate_rewards, fp)
                break
def train(arglist): with U.single_threaded_session(): # Create environment env = make_env(arglist.scenario, arglist, arglist.benchmark) # Create agent trainers obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] num_adversaries = min(env.n, arglist.num_adversaries) trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist) print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy)) # Initialize U.initialize() # Load previous results, if necessary if arglist.load_dir == "": arglist.load_dir = arglist.save_dir if arglist.display or arglist.restore or arglist.benchmark: print('Loading previous state...') U.load_state(arglist.load_dir) episode_rewards = [0.0] # sum of rewards for all agents agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward final_ep_rewards = [] # sum of rewards for training curve final_ep_ag_rewards = [] # agent rewards for training curve agent_info = [[[]]] # placeholder for benchmarking info saver = tf.train.Saver() obs_n = env.reset() episode_step = 0 train_step = 0 t_start = time.time() current_game_experiences = [] t0 = time.time() print('Starting iterations...') while True: # get action action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)] # environment step new_obs_n, rew_n, done_n, info_n = env.step(action_n) episode_step += 1 done = all(done_n) terminal = (episode_step >= arglist.max_episode_len) # collect experience #for i, agent in enumerate(trainers): #agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal) current_game_experiences.append((obs_n, action_n, rew_n, new_obs_n, done_n, terminal)) obs_n = new_obs_n for i, rew in enumerate(rew_n): episode_rewards[-1] += rew agent_rewards[i][-1] += rew if done or terminal: #U.save_state(arglist.save_dir, saver=saver) #print("SAVED") obs_n = env.reset() episode_step = 0 episode_rewards.append(0) for a in agent_rewards: a.append(0) agent_info.append([[]]) if len(episode_rewards) % 200 == 0 and not arglist.display: fname = datetime.datetime.now().strftime('%Y-%m-%d %H.%M.%S.%f') + ".pkl" with open("../../worker_experiences/" + fname, 'wb') as fp: print("\n[%d] Finished 200 games in %.2f seconds" % (len(episode_rewards), time.time() - t0)) pickle.dump(current_game_experiences, fp) print("Saved experience file " + fname) print('Loading latest networks...') t0 = time.time() try: U.load_state(arglist.load_dir) print("Latest networks loaded in %.2f seconds" % (time.time() - t0)) t0 = time.time() except tf.python.framework.errors_impl.DataLossError: print("Couldn't read latest network, it's probably being written...") current_game_experiences = [] # increment global step counter train_step += 1 # for benchmarking learned policies if arglist.benchmark: for i, info in enumerate(info_n): agent_info[-1][i].append(info_n['n']) if train_step > arglist.benchmark_iters and (done or terminal): file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl' print('Finished benchmarking, now saving...') with open(file_name, 'wb') as fp: pickle.dump(agent_info[:-1], fp) break continue # for displaying learned policies if arglist.display: time.sleep(0.05) env.render() if arglist.video: video_maker.save_frame(episode_step) if terminal and len(episode_rewards) % 5 == 0: if arglist.video: video_maker.combine_frames_to_video("../../videos/test_video.mp4") clear_folder("../../frames/") t0 = time.time() try: U.load_state(arglist.load_dir) print("Latest networks loaded in %.2f seconds" % (time.time() - t0)) t0 = time.time() 
                    except tf.errors.DataLossError:  # public alias; tf.python.* is stripped from the tf namespace
                        print("Couldn't read latest network, it's probably being written...")
                continue

            # update all trainers, if not in display or benchmark mode
            #loss = None
            #for agent in trainers:
            #    agent.preupdate()
            #for agent in trainers:
            #    loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                #U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode abs-reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(np.abs(episode_rewards[-arglist.save_rate:])),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode abs-reward: {}, agent episode abs-reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(np.abs(episode_rewards[-arglist.save_rate:])),
                        [np.mean(np.abs(rew[-arglist.save_rate:])) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
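# This worker variant reloads checkpoints that a learner process may be
# writing concurrently, so the load can fail transiently. The exception is
# reachable as tf.errors.DataLossError; the tf.python.framework spelling
# used elsewhere in this variant raises AttributeError in TF 1.x because
# `python` is deleted from the tf namespace, and the failure only surfaces
# when the except clause is evaluated. A minimal retry sketch, assuming the
# same `U.load_state` helper used above:
import time
import tensorflow as tf

def load_state_with_retry(load_dir, attempts=5, delay=1.0):
    """Retry loading a checkpoint that may be mid-write."""
    for attempt in range(attempts):
        try:
            U.load_state(load_dir)
            return True
        except tf.errors.DataLossError:
            print("Checkpoint busy (attempt %d), retrying..." % (attempt + 1))
            time.sleep(delay)
    return False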
def train(arglist): """ Run MADDPG algorithm using passed in commandline arguments Args: arglist (argparse.Namespace): Parsed commandline arguments object """ tf.reset_default_graph() if arglist.seed is not None: np.random.seed(arglist.seed) tf.set_random_seed(arglist.seed) with tf_util.make_session(config=None, num_cpu=1, make_default=False, graph=None): # with tf_util.single_threaded_session(): ########################################### # Create environment # ########################################### env = make_env(arglist.scenario, arglist=arglist, done=arglist.done_callback, logging=arglist.logging, benchmark=arglist.benchmark) ########################################### # Create agent trainers # ########################################### obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] num_adversaries = min(env.n, arglist.num_adversaries) trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist) print("Number of Adversaries: {}".format(num_adversaries)) print('Experiment: {}. Using good policy {} and adv policy {}'.format( arglist.exp_name, arglist.good_policy, arglist.adv_policy)) ########################################### # Initialize # ########################################### tf_util.initialize() ########################################### # Load previous results, if necessary # ########################################### if arglist.load_dir == "": arglist.load_dir = arglist.save_dir # if arglist.display or arglist.restore or arglist.benchmark or arglist.load_dir is not None: if arglist.restore or arglist.benchmark or arglist.load_dir is not None: print('Loading previous state...') # Set model file if arglist.model_file == "": arglist.model_file = arglist.exp_name print("Model File: " + arglist.load_dir + arglist.model_file) tf_util.load_state(arglist.load_dir + arglist.model_file) ########################################### # Create the save directory # ########################################### if not os.path.exists(arglist.save_dir): os.makedirs(arglist.save_dir, exist_ok=True) if not os.path.exists(arglist.plots_dir): os.makedirs(arglist.plots_dir, exist_ok=True) ########################################### # Set parameters # ########################################### # Sum of rewards for all agents episode_rewards = [0.0] # This was changed so that a reward can be tracked for fixed policy agents as well as learning agents # Individual agent reward # agent_rewards = [[0.0] for _ in range(env.n)] agent_rewards = [[0.0] for _ in range(len(env.world.agents))] # Retrieve previous episode count try: prev_ep_ct = int(arglist.model_file.split("_")[-1]) except ValueError: print("Starting from untrained network...") prev_ep_ct = 0 ep_ct = prev_ep_ct + arglist.num_episodes # Sum of rewards for training curve final_ep_rewards = [] # Agent rewards for training curve final_ep_ag_rewards = [] # Placeholder for benchmarking info agent_info = [[[]]] saver = tf.train.Saver() obs_n = env.reset() episode_step = 0 train_step = 0 t_start = time.time() progress = False # Save more often if you have fewer episodes arglist.save_rate = min(arglist.save_rate, arglist.num_episodes) # Initialize loss file for each agent if arglist.log_loss: for i in range(len(env.world.agents)): log_loss(arglist, ep_ct, "agent_{}".format(i), initialize=True) ########################################### # Start # ########################################### print('Starting iterations...') while True: # TODO: Switch to is isinstance() # if type(env.world.scripted_agents[0].action) == 
type(None): # print("Error") # Get action action_n = [ agent.action(obs) for agent, obs in zip(trainers, obs_n) ] # Environment step new_obs_n, rew_n, done_n, info_n = env.step(action_n) # Logging step if arglist.logging: env.log( len(episode_rewards) + prev_ep_ct, episode_step, new_obs_n, rew_n, done_n, info_n) # Update information episode_step += 1 # Check if all agents are done # done = all(done_n) # Check if any agents are done done = any(done_n) terminal = (episode_step >= arglist.max_episode_len) # Collect experience for i, agent in enumerate(trainers): agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal) obs_n = new_obs_n for i, rew in enumerate(rew_n): episode_rewards[-1] += rew agent_rewards[i][-1] += rew # For displaying learned policies if arglist.display: time.sleep(0.1) env.render() if done or terminal: print('Episode Reward: {}'.format( [rew[-1] for rew in agent_rewards])) time.sleep(0.5) obs_n = env.reset() episode_step = 0 episode_rewards.append(0) for a in agent_rewards: a.append(0) agent_info.append([[]]) continue if done or terminal: obs_n = env.reset() episode_step = 0 episode_rewards.append(0) for a in agent_rewards: a.append(0) agent_info.append([[]]) # Increment global step counter train_step += 1 # For benchmarking learned policies if arglist.benchmark: for i, info in enumerate(info_n): agent_info[-1][i].append(info_n['n']) if train_step > arglist.benchmark_iters and (done or terminal): file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl' print('Finished benchmarking, now saving...') with open(file_name, 'wb') as fp: pickle.dump(agent_info[:-1], fp) break continue # In testing mode, don't perform model updates if arglist.testing: if len(episode_rewards) > arglist.num_episodes: print("episodes: {}, " "mean episode reward: {}, time: {}".format( len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time() - t_start, 3))) env.logger.save("State", arglist.save_dir, filename=arglist.exp_name + '_state' + '_' + str(prev_ep_ct) + arglist.log_append) break continue # Update all trainers, if not in display or benchmark mode loss = None for agent in trainers: agent.preupdate() for i, agent in enumerate(trainers): loss = agent.update(trainers, train_step) if arglist.log_loss and loss is not None: log_loss(arglist, ep_ct, "agent_{}".format(i), loss=loss[1]) if len(episode_rewards) % 100 == 0 and progress: print("Episode {} Reached. Time: {}".format( len(episode_rewards), time.time() - t_start)) progress = False elif len(episode_rewards) % 100 != 0 and not progress: progress = True # Save model, display training output if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0): # TODO: Implement some checks so that we don't overwrite old networks unintentionally? 
# Save model state tf_util.save_state(arglist.save_dir + arglist.exp_name + '_' + str(len(episode_rewards) + prev_ep_ct), saver=saver) # Print statement depends on whether or not there are adversaries if num_adversaries == 0: print( "steps: {}, episodes: {}, mean episode reward: {}, time: {}" .format(train_step, len(episode_rewards) + prev_ep_ct, np.mean(episode_rewards[-arglist.save_rate:]), round(time.time() - t_start, 3))) else: print( "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}" .format(train_step, len(episode_rewards) + prev_ep_ct, np.mean(episode_rewards[-arglist.save_rate:]), [ np.mean(reward[-arglist.save_rate:]) for reward in agent_rewards ], round(time.time() - t_start, 3))) # Reset start time to current time t_start = time.time() # Keep track of final episode reward final_ep_rewards.append( np.mean(episode_rewards[-arglist.save_rate:])) for reward in agent_rewards: final_ep_ag_rewards.append( np.mean(reward[-arglist.save_rate:])) # Saves final episode reward for plotting training curve later if len(episode_rewards) > arglist.num_episodes: rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' with open(rew_file_name, 'wb') as fp: pickle.dump(final_ep_rewards, fp) agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl' with open(agrew_file_name, 'wb') as fp: pickle.dump(final_ep_ag_rewards, fp) # Log agent data for run env.logger.save("State", arglist.save_dir, filename=arglist.exp_name + '_state' + '_' + str(len(episode_rewards) + prev_ep_ct)) print('...Finished total of {} episodes.'.format( len(episode_rewards))) break
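# This variant resumes by encoding the episode count in the checkpoint name
# and parsing it back with `int(model_file.split("_")[-1])`. A sketch of the
# two halves of that naming convention (hypothetical helper names):
def checkpoint_name(exp_name, episode_count):
    """Append the episode count as the final underscore-separated token."""
    return '{}_{}'.format(exp_name, episode_count)

def parse_episode_count(model_file):
    """Recover the episode count from a checkpoint name; 0 if absent."""
    try:
        return int(model_file.split('_')[-1])
    except ValueError:
        return 0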
strargs = ['--benchmark', '--deterministic'] + unknown_args
arglist = parse_args(strargs)
#tf.reset_default_graph()
#tf.InteractiveSession().as_default()
with tf.Session().as_default():
    # Create environment
    env = make_env('simple_spread', arglist, arglist.benchmark)
    # Create agent trainers
    obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
    num_adversaries = min(env.n, arglist.num_adversaries)
    trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
    #print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))
    # Initialize
    U.initialize()
    print('Loading previous state...')
    U.load_state(args.policy_file)
    # Check the restored policies reproduce recorded actions
    # (states1/states2/states3 and actions1/actions2/actions3 are fixtures,
    # presumably unpickled from args.input_file earlier in the script)
    actions = trainers[0].act(states1[0])
    assert np.allclose(actions1[0], actions)
    actions = trainers[1].act(states2[0])
    assert np.allclose(actions2[0], actions)
    actions = trainers[2].act(states3[0])
    assert np.allclose(actions3[0], actions)
    h1_values = trainers[0].p_debug['h1_values']
    h2_values = trainers[0].p_debug['h2_values']
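# The assertions above are a regression test: replay recorded observations
# and require the restored policy to reproduce recorded actions. The same
# check with an explicit tolerance, generalized over the fixtures -- a
# minimal sketch, assuming trainers expose `act()` as used above:
import numpy as np

def assert_policy_reproduces(trainer, states, expected_actions, atol=1e-6):
    """Replay recorded states and compare against recorded actions."""
    for state, expected in zip(states, expected_actions):
        actual = trainer.act(state)
        assert np.allclose(expected, actual, atol=atol), \
            "policy output drifted beyond atol=%g" % atol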
def train(arglist): with U.make_session(8): # Create environment env = make_env(arglist.scenario, arglist, arglist.benchmark) # Create agent trainers obs_shape_n = [[29] for i in range(env.n)] obs_map_shape_n =[[56*86] for i in range(env.n)] num_adversaries = min(env.n, arglist.num_adversaries) trainers = get_trainers(env, num_adversaries, obs_shape_n, obs_map_shape_n,arglist) print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy)) # Initialize U.initialize() # Load previous results, if necessary if arglist.load_dir == "": arglist.load_dir = arglist.save_dir if arglist.display or arglist.restore or arglist.benchmark: print('Loading previous state...') U.load_state(arglist.load_dir) episode_rewards = [0.0] # sum of rewards for all agents agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward final_ep_rewards = [] # sum of rewards for training curve final_ep_ag_rewards = [] # agent rewards for training curve agent_info = [[[]]] # placeholder for benchmarking info saver = tf.train.Saver() obs_n = env._reset() episode_step = 13000 train_step = 0 t_start = time.time() print('Starting iterations...') while True: # get action #action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)] action_n=[] for agent, obs in zip(trainers,obs_n): #print(obs) t=agent.action(obs) d=np.argmax(t) if d%5==4: rt=random.randint(0,20) if rt<4: swap=t[d] t[d]=t[d-rt-1] t[d-rt-1]=swap else: rt=random.randint(0,80) if rt<4: swap=t[d] t[d]=t[d//5*5+rt] t[d//5*5+rt]=swap action_n.append(t) #print(action_n) # environment step new_obs_n, rew_n, done_n, info_n = env._step(action_n) #print(rew_n) episode_step += 1 env.training_episode=episode_step done = all(done_n) terminal = (episode_step >= arglist.max_episode_len) # collect experience for i, agent in enumerate(trainers): agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal) obs_n = new_obs_n for i, rew in enumerate(rew_n): episode_rewards[-1] += rew agent_rewards[i][-1] += rew if done or terminal: obs_n = env._reset() episode_step = 0 episode_rewards.append(0) for a in agent_rewards: a.append(0) agent_info.append([[]]) # increment global step counter train_step += 1 # for benchmarking learned policies if arglist.benchmark: for i, info in enumerate(info_n): agent_info[-1][i].append(info_n['n']) if train_step > arglist.benchmark_iters and (done or terminal): file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl' print('Finished benchmarking, now saving...') with open(file_name, 'wb') as fp: pickle.dump(agent_info[:-1], fp) break continue # for displaying learned policies if arglist.online_display or arglist.display: time.sleep(0.01) #if rew_n[2]>0: pdb.set_trace() env._render(close=False) print(rew_n) # if (rew_n[2]>0) or (rew_n[0]>0) or (rew_n[1]>0): # pdb.set_trace() #pdb.set_trace() if arglist.display: continue # update all trainers, if not in display or benchmark mode loss = None for agent in trainers: agent.preupdate() for agent in trainers: loss = agent.update(trainers, train_step) # save model, display training output if terminal and (len(episode_rewards) % arglist.save_rate == 0): U.save_state(arglist.save_dir, saver=saver) # print statement depends on whether or not there are adversaries if num_adversaries == 0: print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format( train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3))) else: print("steps: {}, episodes: {}, mean episode reward: {}, 
agent episode reward: {}, time: {}".format(
                    train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                    [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                    round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
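# The inline index arithmetic in this variant's action loop is an ad-hoc
# exploration trick: with small probability it swaps the argmax entry of the
# action vector with another entry in its group of five, so a different
# action wins the argmax. A minimal equivalent sketch (not probability-exact;
# the groups-of-five layout is inferred from the //5 and %5 arithmetic above):
import random
import numpy as np

def maybe_swap_argmax(t, swap_prob=0.05):
    """With probability `swap_prob`, swap the argmax entry of `t` with a
    random other entry in its group of five, forcing exploration."""
    d = int(np.argmax(t))
    if random.random() < swap_prob:
        group = d // 5 * 5
        alt = group + random.randrange(5)
        if alt != d:
            t[d], t[alt] = t[alt], t[d]
    return t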
def train(arglist): with U.single_threaded_session(): # Create environment env = make_env(arglist.scenario, arglist, arglist.benchmark) # Create agent trainers obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] num_adversaries = min(env.n, arglist.num_adversaries) trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist) print('Using good policy {} and adv policy {}'.format( arglist.good_policy, arglist.adv_policy)) print("number of adversaries is: ", num_adversaries) # Initialize U.initialize() # Load previous results, if necessary if arglist.load_dir == "": arglist.load_dir = arglist.save_dir if arglist.display or arglist.restore or arglist.benchmark: print('Loading previous state...') print("path is: ", arglist.load_dir) print("restoring checkpoints") # added for selective training. # Make it general for other environments as well later. if arglist.scenario == "simple_tag": print("inside simple tag") if not arglist.train_adversaries: print("loading only positive") print("number of adversaries are: ", num_adversaries) saver = tf.train.Saver(var_list=tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope="agent_" + str(num_adversaries))) print( "var list is: ", tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="agent_" + str(num_adversaries))) if not arglist.train_positive_agent: print("only loading adversaries") var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="agent_0") print("var list is: ", var_list) for l in range(1, arglist.num_adversaries): var_list += tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope="agent_" + str(l)) saver = tf.train.Saver(var_list=var_list) U.load_state(arglist.load_dir, saver=saver) else: U.load_state(arglist.load_dir) episode_rewards = [0.0] # sum of rewards for all agents agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward if arglist.restore: final_ep_rewards = list( np.load(arglist.plots_dir + arglist.exp_name + '_episode_rewards.npy')) final_ep_ag_rewards = list( np.load(arglist.plots_dir + arglist.exp_name + '_agent_rewards.npy')) final_ep_ag_rewards = [list(a) for a in final_ep_ag_rewards] else: final_ep_rewards = [] # sum of rewards for training curve # final_ep_ag_rewards = [] # agent rewards for training curve final_ep_ag_rewards = [[0.0] for _ in range(env.n) ] # agent rewards for training curve agent_info = [[[]]] # placeholder for benchmarking info saver = tf.train.Saver() obs_n = env.reset() episode_step = 0 train_step = 0 t_start = time.time() print("number of agents in the environment are: ", env.n) episode_avg_rewards = [0.0] agent_avg_rewards = [[0.0] for _ in range(env.n)] print('Starting iterations...') while True: # get action action_n = [ agent.action(obs) for agent, obs in zip(trainers, obs_n) ] # environment step new_obs_n, rew_n, done_n, info_n = env.step(action_n) episode_step += 1 done = all(done_n) terminal = (episode_step >= arglist.max_episode_len) # collect experience for i, agent in enumerate(trainers): agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal) obs_n = new_obs_n for i, rew in enumerate(rew_n): episode_rewards[-1] += rew agent_rewards[i][-1] += rew if done or terminal: obs_n = env.reset() episode_step = 0 # this should perhaps be done later. 
episode_rewards.append(0) for a in agent_rewards: a.append(0) agent_info.append([[]]) # increment global step counter train_step += 1 # for benchmarking learned policies if arglist.benchmark: for i, info in enumerate(info_n): agent_info[-1][i].append(info_n['n']) if train_step > arglist.benchmark_iters and (done or terminal): file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl' print('Finished benchmarking, now saving...') with open(file_name, 'wb') as fp: pickle.dump(agent_info[:-1], fp) break continue # for displaying learned policies if arglist.display: time.sleep(0.1) env.render() continue # update all trainers, if not in display or benchmark mode loss = None # for agent in trainers: # agent.preupdate() # for agent in trainers: # loss = agent.update(trainers, train_step) for m in range(0, len(trainers)): agent = trainers[m] if not arglist.train_adversaries and m > num_adversaries: # print("updating positive") agent.preupdate() if not arglist.train_positive_agent and m <= num_adversaries: # print("updating adversary") agent.preupdate() if arglist.train_positive_agent and arglist.train_adversaries: # print("updating both") agent.preupdate() for m in range(0, len(trainers)): agent = trainers[m] if not arglist.train_adversaries and m > num_adversaries: loss = agent.update(trainers, train_step) if not arglist.train_positive_agent and m <= num_adversaries: loss = agent.update(trainers, train_step) if arglist.train_positive_agent and arglist.train_adversaries: loss = agent.update(trainers, train_step) # save model, display training output if terminal and (len(episode_rewards) % arglist.save_rate == 0): U.save_state(arglist.save_dir, saver=saver) # print statement depends on whether or not there are adversaries if num_adversaries == 0: print( "steps: {}, episodes: {}, mean episode reward: {}, time: {}" .format(train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time() - t_start, 3))) else: print( "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}" .format(train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), [ np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards ], round(time.time() - t_start, 3))) t_start = time.time() # Keep track of final episode reward final_ep_rewards.append( np.mean(episode_rewards[-arglist.save_rate:-1])) # for rew in agent_rewards: # final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:])) # for rew in agent_rewards: for j in range(len(agent_rewards)): rew = agent_rewards[j] final_ep_ag_rewards[j].append( np.mean(rew[-arglist.save_rate:-1])) # saves final episode reward for plotting training curve later if len(episode_rewards) > arglist.num_episodes: rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' with open(rew_file_name, 'wb') as fp: pickle.dump(final_ep_rewards, fp) agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl' with open(agrew_file_name, 'wb') as fp: pickle.dump(final_ep_ag_rewards, fp) print('...Finished total of {} episodes.'.format( len(episode_rewards))) agent_rewards = np.array(final_ep_ag_rewards) episode_rewards = np.array(final_ep_rewards) np.save( arglist.plots_dir + arglist.exp_name + '_agent_rewards.npy', agent_rewards) np.save( arglist.plots_dir + arglist.exp_name + '_episode_rewards.npy', episode_rewards) fig, ax = plt.subplots() for k in range(len(agent_rewards)): ax.plot(agent_rewards[k], label="agent_" + str(k)) ax.plot(episode_rewards, label="total") ax.legend() 
                plt.savefig(arglist.plots_dir + arglist.exp_name + '_plot.png')
                plt.show()
                break
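# In the selective-update loops of the variant above, adversaries occupy
# trainer indices 0..num_adversaries-1, so the tests `m > num_adversaries`
# and `m <= num_adversaries` both misclassify the agent at index
# num_adversaries (the first good agent): it is frozen when only positives
# train and updated when only adversaries train. A sketch of an explicit
# predicate that avoids the off-by-one:
def should_update(m, num_adversaries, train_adversaries, train_positive):
    """Return True when trainer index m is enabled for updates."""
    is_adversary = m < num_adversaries
    return train_adversaries if is_adversary else train_positive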
def train(arglist, PID=None, lock=None): start_time = time.time() # global replay_buffer with U.single_threaded_session() as sess: # Create environment env = make_env(arglist.scenario, arglist, arglist.benchmark) # Create agents networks obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] ####changed by yuan li num_adversaries = copy.deepcopy(env.num_adversaries) arglist.num_adversaries = copy.deepcopy(num_adversaries) if comm_rank==0: req = None wait_flag = False data = 0 a = 0 number = 0 actors = get_agents(env, num_adversaries, obs_shape_n, arglist) U.initialize() while True: if not wait_flag: req = comm.irecv(350000, source=(comm_rank - 1 + comm_size) % comm_size, tag=11) wait_flag = True else: if a >= 3: break data_recv = req.test() if data_recv[0]: a += 1 wait_flag = False i = 0 j = 0 for var in tf.trainable_variables(): if 11 < (i % 24) < 24: var.load(data_recv[1][j], sess) j += 1 i += 1 print("rank0 updata param:000000000000000000000, step:", a) else: if number<=2: data = data + number * 100 comm.send(data, dest=(comm_rank + 1) % comm_size, tag=11) number+=1 print("rank:{}, step:{}, send data:{}".format(comm_rank, a, data)) if comm_rank==1: wait_flag = False req = None sample = 0 step = 0 data = 0 while True: if not wait_flag: req = comm.irecv(source=(comm_rank - 1 + comm_size) % comm_size, tag=11) wait_flag = True else: data_recv = req.test() if data_recv[0]: sample += 1 wait_flag=False print("rank:{}, step:{}, recv data:{}".format(comm_rank, step, data_recv[1])) if sample==3: break else: wait_flag = True #if step >= 3: # break if step<=2: data = data + step*10000 comm.send(data, dest=(comm_rank + 1) % comm_size, tag=11) step+=1 print("rank:{}, step:{}, send data:{}".format(comm_rank, step, data)) if comm_rank == 2: step = 0 learners = get_agents(env, num_adversaries, obs_shape_n, arglist) U.initialize() while True: if step >= 3: break else: data_recv = comm.recv(source=(comm_rank - 1) % comm_size, tag=11) print("rank:{}, step:{}, recv data:{}".format(comm_rank, step, data_recv)) param = [] i = 0 for var in tf.trainable_variables(): if 11 < (i % 24) < 24: param.append(sess.run(var)) i += 1 comm.send(param, dest=(comm_rank + 1) % comm_size, tag=11) step += 1 print("rank2 send param:22222222222222222222, step:", step)
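# Every rank handshake in the variant above follows the same mpi4py pattern:
# post one non-blocking receive, poll it with Request.test(), and only do
# local work while the request is pending. A stripped-down sketch of that
# loop (assumes a `comm` communicator set up as in the surrounding code):
def poll_loop(comm, source, rounds=3):
    """Poll a non-blocking receive; do local work until a message lands."""
    req = comm.irecv(source=source, tag=11)
    done = 0
    while done < rounds:
        ok, payload = req.test()
        if ok:
            done += 1
            # ... consume payload here ...
            req = comm.irecv(source=source, tag=11)  # re-arm the receive
        else:
            pass  # local work (e.g. stepping the environment) goes here
    return done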
def train(arglist): with U.single_threaded_session(): # Create environment env = make_env(arglist.scenario, arglist, arglist.benchmark) # Create agent trainers obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] num_adversaries = min(env.n, arglist.num_adversaries) state_shape_n = [(64, ) for i in range(env.n)] trainers = get_trainers(env, num_adversaries, obs_shape_n, state_shape_n, arglist) print('Using good policy {} and adv policy {}'.format( arglist.good_policy, arglist.adv_policy)) # Initialize U.initialize() episode_rewards = [0.0] # sum of rewards for all agents agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward episode_begin_num = 0 # Load previous results, if necessary if arglist.load_dir == "": arglist.load_dir = arglist.save_dir if arglist.display or arglist.restore or arglist.benchmark: print('Loading previous state...') U.load_state(arglist.load_dir) fname = './learning_curves/' + arglist.exp_name + '_rewards.pkl' final_ep_rewards = pickle.load(open(fname, 'rb')) fname = './learning_curves/' + arglist.exp_name + '_agrewards.pkl' final_ep_ag_rewards = pickle.load(open(fname, 'rb')) episode_begin_num = arglist.save_rate * len(final_ep_rewards) final_ep_rewards = [] # sum of rewards for training curve final_ep_ag_rewards = [] # agent rewards for training curve agent_info = [[[]]] # placeholder for benchmarking info saver = tf.train.Saver() obs_n = env.reset() state_n = [agent.p_init_state(1) for agent in trainers] pred_n = [agent.init_pred(1) for agent in trainers] episode_step = 0 train_step = 0 t_start = time.time() print('Starting iterations...') while True: ## get action temp = [ agent.take_action(obs, state, pred) for agent, obs, state, pred in zip( trainers, obs_n, state_n, pred_n) ] action_n = [x[0] for x in temp] new_state_n = [x[1] for x in temp] gru_out_n = [x[2] for x in temp] new_pred_n = [ agent.predict(act[None], gru_out) for agent, act, gru_out in zip(trainers, action_n, gru_out_n) ] # environment step new_obs_n, rew_n, done_n, info_n = env.step(action_n) episode_step += 1 done = all(done_n) terminal = (episode_step >= arglist.max_episode_len) # collect experience ## need to be modified for i, agent in enumerate(trainers): agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal) obs_n = new_obs_n state_n = new_state_n # pred_n = [x.eval() for x in new_pred_n] pred_n = new_pred_n for i, rew in enumerate(rew_n): episode_rewards[-1] += rew agent_rewards[i][-1] += rew if done or terminal: obs_n = env.reset() state_n = [agent.p_init_state(1) for agent in trainers] pred_n = [agent.init_pred(1) for agent in trainers] episode_step = 0 episode_rewards.append(0) for a in agent_rewards: a.append(0) agent_info.append([[]]) # increment global step counter train_step += 1 # for benchmarking learned policies if arglist.benchmark: for i, info in enumerate(info_n): agent_info[-1][i].append(info_n['n']) if train_step > arglist.benchmark_iters and (done or terminal): file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl' print('Finished benchmarking, now saving...') with open(file_name, 'wb') as fp: pickle.dump(agent_info[:-1], fp) break continue # for displaying learned policies if arglist.display: time.sleep(0.05) env.render() continue # update all trainers, if not in display or benchmark mode loss = None for agent in trainers: agent.preupdate() for agent in trainers: loss = agent.update(trainers, train_step, arglist.step_size, arglist.burn_in_step) # save model, display training output episode_num = 
len(episode_rewards) + episode_begin_num
            if terminal and (episode_num % arglist.save_rate == 0):
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, episode_num,
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, episode_num,
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(episode_num))
                U.save_state(arglist.save_dir, saver=saver)

            if episode_num > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                # report the offset-adjusted count so resumed runs log consistently
                print('...Finished total of {} episodes.'.format(episode_num))
                break
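# Note the resume path of this variant: the pickled curves are loaded only
# to compute `episode_begin_num`, then both lists are reset to [], so the
# next dump overwrites the old learning curve with just the new segment.
# If the intent is one continuous curve, keep the loaded history -- a sketch
# (assumes the load and save paths are unified under `plots_dir`; the
# original loads from './learning_curves/' but saves to `plots_dir`):
import pickle

def resume_curves(plots_dir, exp_name, save_rate, restore):
    """Load existing reward curves when resuming so new points append to
    them instead of replacing the pickle with only the new segment."""
    if restore:
        with open(plots_dir + exp_name + '_rewards.pkl', 'rb') as fp:
            final_ep_rewards = pickle.load(fp)
        with open(plots_dir + exp_name + '_agrewards.pkl', 'rb') as fp:
            final_ep_ag_rewards = pickle.load(fp)
    else:
        final_ep_rewards, final_ep_ag_rewards = [], []
    return final_ep_rewards, final_ep_ag_rewards, save_rate * len(final_ep_rewards)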
def train(arglist, PID=None, lock=None): start_time = time.time() # global replay_buffer with U.single_threaded_session() as sess: # Create environment env = make_env(arglist.scenario, arglist, arglist.benchmark) # Create agents networks obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] ####changed by yuan li num_adversaries = copy.deepcopy(env.num_adversaries) arglist.num_adversaries = copy.deepcopy(num_adversaries) if comm_rank != 0 and comm_rank != 1: req = None wait_flag = False actors = get_agents(env, num_adversaries, obs_shape_n, arglist) U.initialize() #var_list = [var for var in tf.trainable_variables()] #加载模型 var_list_n = [] for actor in actors: var_list_n.extend(actor.get_variable_list()) saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20) if arglist.load_dir != "": U.load_state(arglist.load_dir, saver) episode_rewards, agent_rewards, final_ep_rewards, final_ep_ag_rewards, agent_info = initialize_variables( env) obs_n = env.reset() step = 0 episode_step = 0 sample_number = 0 t_start = time.time() updata_time = 0 print('Starting iterations...') invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0 while True: if not wait_flag: #req = comm.irecv(350000, source=(comm_rank - 1 + comm_size) % comm_size, tag=11) req = comm.irecv(350000, source=0, tag=11) wait_flag = True else: data_recv = req.test() if data_recv[0]: wait_flag = False if data_recv[1] == 'finish': #finish = True comm.send('finish', dest=1, tag=11) break else: update_start = time.time() i = 0 j = 0 for var in tf.trainable_variables(): if 11 < (i % 24) < 24: var.load(data_recv[1][j], sess) j += 1 i += 1 #for var in var_list: # var.load(data_recv[1][i], sess) # i += 1 #print("111111111111111111111111,load param") #for i, actor in enumerate(actors): # actor.load_weights(data_recv[1][i], sess) update_end = time.time() #print("step:{}, rank0_update_end_time:{}".format(step, update_end)) updata_time += (update_end - update_start) step += 1 else: wait_flag = True # get action action_n = [ agent.action(obs) for agent, obs in zip(actors, obs_n) ] # environment step new_obs_n, rew_n, done_n, info_n = env.step(action_n) episode_step += 1 # changed by liyuan done = any(done_n) terminal = (episode_step >= arglist.max_episode_len) ###liyuan: compute the arverage win rate if green_leave_screen(env) or adversary_all_die( env) or adversary_leave_screen(env): terminal = True if adversary_all_die(env): green_win += 1 if green_leave_screen(env): invalid_train += 1 green_leave += 1 if adversary_leave_screen(env): red_leave += 1 if episode_step >= arglist.max_episode_len: for i, agent in enumerate(env.agents): if agent.adversary: rew_n[i] -= 50 if adversary_all_die(env): for i, agent in enumerate(env.agents): if agent.adversary: rew_n[i] -= 100 if done: red_win = red_win + 1 for i, agent in enumerate(env.agents): if agent.adversary: rew_n[i] += 200 rew_n[i] += ( arglist.max_episode_len - episode_step) / arglist.max_episode_len #send data data = [obs_n, action_n, rew_n, new_obs_n, done_n] comm.send(data, dest=1, tag=11) sample_number += 1 #replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n) obs_n = new_obs_n for i, rew in enumerate(rew_n): episode_rewards[-1] += rew agent_rewards[i][-1] += rew if done or terminal: obs_n = env.reset() episode_step = 0 episode_rewards.append(0) for a in agent_rewards: a.append(0) agent_info.append([[]]) # save model, display training output if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0): if red_win >= 0.8 * arglist.save_rate: 
temp_dir = arglist.save_dir + "_" + str( len(episode_rewards)) + "_" + str( red_win) + "_{}".format(PID) U.save_state(temp_dir, saver=saver) # print statement depends on whether or not there are adversaries if num_adversaries == 0: print( "Rank {}, steps: {}, episodes: {}, mean episode reward: {}, time: {}" .format( comm_rank, sample_number, len(episode_rewards), np.mean(episode_rewards[-arglist. save_rate:]), round(time.time() - t_start, 3))) else: print( "Rank {}, steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}" .format( comm_rank, sample_number, len(episode_rewards), np.mean(episode_rewards[-arglist. save_rate:]), [ np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards ], round(time.time() - t_start, 3))) print( "Rank {}, red win: {}, green win: {}, red all leave: {}, green all leave: {}" .format(comm_rank, red_win, green_win, red_leave, green_leave)) middle_time = time.time() print( "sample_number:{}, train_step:{}, update_time:{}, total_time:{}" .format(sample_number, step, updata_time, middle_time - start_time)) mydata = [] mydata.append(str(len(episode_rewards))) mydata.append( str( np.mean(episode_rewards[-arglist. save_rate:]))) mydata.append( str( np.mean(agent_rewards[0] [-arglist.save_rate:]))) mydata.append( str( np.mean(agent_rewards[1] [-arglist.save_rate:]))) mydata.append( str( np.mean(agent_rewards[2] [-arglist.save_rate:]))) mydata.append(str(red_win)) mydata.append( str(round(time.time() - t_start, 3))) out = open('1mydata_{}.csv'.format(comm_rank), 'a', newline='') csv_write = csv.writer(out, dialect='excel') csv_write.writerow(mydata) if len(episode_rewards) > 3000: U.save_state(arglist.save_dir, saver=saver) invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0 t_start = time.time() # Keep track of final episode reward final_ep_rewards.append( np.mean(episode_rewards[-arglist.save_rate:])) for rew in agent_rewards: final_ep_ag_rewards.append( np.mean(rew[-arglist.save_rate:])) end_time = time.time() print("rank{}_time:{}".format(comm_rank, end_time - start_time)) print("rank{}_update_time:{}".format(comm_rank, updata_time)) print("rank{}_step:{}".format(comm_rank, step)) if comm_rank == 1: replay_buffer = ReplayBuffer(1e6) wait_flag_1 = False wait_flag_2 = False wait_flag_3 = False req1 = None req2 = None req3 = None sample = 0 step = 0 req_list = [] while True: if not wait_flag_1 or not wait_flag_2 or not wait_flag_3: if not wait_flag_1: req1 = comm.irecv(source=2, tag=11) wait_flag_1 = True if not wait_flag_2: req2 = comm.irecv(source=3, tag=11) wait_flag_2 = True if not wait_flag_3: req3 = comm.irecv(source=4, tag=11) wait_flag_3 = True else: data_recv_1 = req1.test() data_recv_2 = req2.test() data_recv_3 = req3.test() if data_recv_1[0] or data_recv_2[0] or data_recv_3[0]: if data_recv_1[0]: wait_flag_1 = False if data_recv_1[1] == 'finish': break else: obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_1[ 1] replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n) sample += 1 if data_recv_2[0]: wait_flag_2 = False if data_recv_2[1] == 'finish': break else: obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_2[ 1] replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n) sample += 1 if data_recv_3[0]: wait_flag_3 = False if data_recv_3[1] == 'finish': break else: obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_3[ 1] replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n) sample += 1 ''' #计算接收100个样本然后发送样本用的时间 if (sample % 100 == 0) and len(replay_buffer) >= arglist.batch_size * 
arglist.max_episode_len: start = time.time() replay_sample_index = replay_buffer.make_index(arglist.batch_size) send_data = replay_buffer.sample_index(replay_sample_index) #send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a) comm.send(send_data, dest=(comm_rank + 1) % comm_size, tag=11) sample = 0 step += 1 end = time.time() print("rank1 send sample time:", end-start) ''' else: wait_flag_1 = True wait_flag_2 = True wait_flag_3 = True if (sample // 100 > 0) and len( replay_buffer ) >= arglist.batch_size * arglist.max_episode_len: replay_sample_index = replay_buffer.make_index( arglist.batch_size) send_data = replay_buffer.sample_index( replay_sample_index) #send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a) comm.send(send_data, dest=0, tag=11) sample = 0 step += 1 end_time = time.time() print("rank1_time:", end_time - start_time) print("rank1_step", step) if comm_rank == 0: extract_time = 0 step = 0 learners = get_agents(env, num_adversaries, obs_shape_n, arglist) var_list_n = [] for learner in learners: var_list_n.extend(learner.get_variable_list()) U.initialize() #var_list = [var for var in tf.trainable_variables()] # 加载模型 saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20) if arglist.load_dir != "": U.load_state(arglist.load_dir, saver) while True: if step >= STEP: for i in range(comm_size - 2): comm.send('finish', dest=(i + 2), tag=11) break else: start = time.time() data_recv = comm.recv(source=1, tag=11) for i, agent in enumerate(learners): agent.update(learners, data_recv) #dict_list = [] param = [] extract_start = time.time() i = 0 for var in tf.trainable_variables(): if 11 < (i % 24) < 24: param.append(sess.run(var)) i += 1 #print("2222222222222222 load weights") #for var in var_list: # param.append(sess.run(var)) extract_end = time.time() extract_time += (extract_end - extract_start) for i in range(comm_size - 2): comm.send(param, dest=(i + 2), tag=11) #print("222222222222222222222222,send param") step += 1 end = time.time() #print("rank2 train time:{}, extract_time:{}".format(end - start, extract_end - extract_start)) end_time = time.time() print("rank0_time:", end_time - start_time) print("rank0_extract_time:", extract_time) print("rank0_step:", step)
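# The `11 < (i % 24) < 24` filter used by both ranks above selects a fixed
# positional slice of tf.trainable_variables(), which silently breaks if the
# network layout changes. Selecting by name scope is more robust -- a sketch,
# assuming policy variables live under a 'p_func' scope as in the standard
# MADDPG trainer layout (adjust the substring to the actual scopes):
def policy_variables(scope_substring='p_func'):
    """Return trainable variables whose scope path contains the substring."""
    return [v for v in tf.trainable_variables() if scope_substring in v.name]

# usage on the learner side: param = [sess.run(v) for v in policy_variables()]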
def train(arglist): # random.seed(arglist.random_seed) # np.random.seed(arglist.random_seed) # tf.set_random_seed(arglist.random_seed) with U.single_threaded_session(): # Create environment env = make_env(arglist.scenario, arglist, arglist.benchmark) # Create agent trainers obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] num_adversaries = min(env.n, arglist.num_adversaries) trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist) print('Using good policy {} and adv policy {}'.format( arglist.good_policy, arglist.adv_policy)) # Initialize U.initialize() savers = [ tf.train.Saver(U.scope_vars(trainer.name)) for trainer in trainers ] # Load previous results, if necessary if arglist.load_dir == "": arglist.load_dir = arglist.save_dir if arglist.display or arglist.restore or arglist.benchmark: print('Loading previous state...') # U.load_state(arglist.load_dir) [ U.load_state(os.path.join(arglist.load_dir, 'team_{}'.format(i)), saver=saver) for i, saver in enumerate(savers) ] episode_rewards = [0.0] # sum of rewards for all agents agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward final_ep_rewards = [] # sum of rewards for training curve final_ep_ag_rewards = [] # agent rewards for training curve agent_info = [[[]]] # placeholder for benchmarking info saver = tf.train.Saver() obs_n = env.reset() episode_step = 0 train_step = 0 if arglist.trainer == 'tarmac' or arglist.trainer == 'reuse_tarmac' or arglist.trainer == 'ibmac_inter': message_n = np.zeros([len(obs_n), 4]) is_training = True t_start = time.time() writer = tf.summary.FileWriter("graph", U.get_session().graph) writer.close() writer = SummaryWriter(arglist.save_dir) print('Starting iterations...') while True: # get action if arglist.trainer == 'ibmac' or arglist.trainer == 'reuse_ibmac': is_inference = False if arglist.display or arglist.restore or arglist.benchmark: is_inference = False if len(trainers) == 2: action_n1 = trainers[0].action(obs_n[:num_adversaries], is_inference=is_inference) action_n2 = trainers[1].action(obs_n[num_adversaries:], is_inference=is_inference) action_n = [action[0] for action in action_n1 ] + [action[0] for action in action_n2] else: action_n = trainers[0].action(obs_n, is_inference=is_inference) action_n = [action[0] for action in action_n] elif arglist.trainer == 'ibmac_inter': if len(trainers) == 2: action_n1, message_action_n1 = trainers[0].action( obs_n[:num_adversaries], message_n[:num_adversaries]) action_n2, message_action_n2 = trainers[1].action( obs_n[num_adversaries:], message_n[num_adversaries:]) action_n = [action[0] for action in action_n1 ] + [action[0] for action in action_n2] else: action_n, message_action_n = trainers[0].action( obs_n, message_n) action_n = [action[0] for action in action_n] message_n = [ message_action[0] for message_action in message_action_n ] else: action_n = [ agent.action(obs) for agent, obs in zip(trainers, obs_n) ] # environment step new_obs_n, rew_n, done_n, info_n = env.step(action_n) episode_step += 1 done = all(done_n) terminal = (episode_step >= arglist.max_episode_len) # collect experience if arglist.trainer == 'ibmac': if len(trainers) == 2: trainers[0].experience(obs_n[:num_adversaries], action_n[:num_adversaries], rew_n[:num_adversaries], new_obs_n[:num_adversaries], done_n[:num_adversaries], terminal) trainers[1].experience(obs_n[num_adversaries:], action_n[num_adversaries:], rew_n[num_adversaries:], new_obs_n[num_adversaries:], done_n[num_adversaries:], terminal) else: trainers[0].experience(obs_n, 
action_n, rew_n, new_obs_n, done_n, terminal) elif arglist.trainer == 'ibmac_inter': if len(trainers) == 2: trainers[0].experience(obs_n[:num_adversaries], message_n[:num_adversaries], action_n[:num_adversaries], rew_n[:num_adversaries], new_obs_n[:num_adversaries], done_n[:num_adversaries], terminal) trainers[1].experience(obs_n[num_adversaries:], message_n[:num_adversaries], action_n[num_adversaries:], rew_n[num_adversaries:], new_obs_n[num_adversaries:], done_n[num_adversaries:], terminal) else: trainers[0].experience(obs_n, message_n, action_n, rew_n, new_obs_n, done_n, terminal) else: for i, agent in enumerate(trainers): agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal) obs_n = new_obs_n for i, rew in enumerate(rew_n): episode_rewards[-1] += rew agent_rewards[i][-1] += rew if done or terminal: obs_n = env.reset() episode_step = 0 episode_rewards.append(0) for a in agent_rewards: a.append(0) agent_info.append([[]]) # increment global step counter train_step += 1 # for benchmarking learned policies if arglist.benchmark: for i, info in enumerate(info_n): agent_info[-1][i].append(info_n['n']) if train_step > arglist.benchmark_iters and (done or terminal): file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl' print('Finished benchmarking, now saving...') with open(file_name, 'wb') as fp: pickle.dump(agent_info[:-1], fp) break continue # for displaying learned policies if arglist.display: env.render() continue # update all trainers, if not in display or benchmark mode loss = None for agent in trainers: agent.preupdate() for i, agent in enumerate(trainers): loss = agent.update(trainers, train_step) if loss: if isinstance(agent, IBMACAgentTrainer) or isinstance( agent, ReuseIBMACAgentTrainer): q_loss, p_loss, _, _, _, _, kl_loss = loss writer.add_scalar('agent_{}/loss_kl'.format(i), kl_loss, train_step) else: q_loss, p_loss, _, _, _, _ = loss writer.add_scalar('agent_{}/loss_policy'.format(i), p_loss, train_step) writer.add_scalar('agent_{}/loss_critic'.format(i), q_loss, train_step) # save model, display training output if terminal and (len(episode_rewards) % arglist.save_rate == 0): U.save_state(arglist.save_dir, saver=saver) [ U.save_state(os.path.join(arglist.save_dir, 'team_{}'.format(i)), saver=saver) for i, saver in enumerate(savers) ] # print statement depends on whether or not there are adversaries for i in range(len(agent_rewards)): writer.add_scalar( 'agent_{}/mean_episode_reward'.format(i), np.mean(agent_rewards[i][-arglist.save_rate:]), len(episode_rewards)) if num_adversaries == 0: print( "steps: {}, episodes: {}, mean episode reward: {}, time: {}" .format(train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time() - t_start, 3))) else: print( "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}" .format(train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), [ np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards ], round(time.time() - t_start, 3))) t_start = time.time() # Keep track of final episode reward final_ep_rewards.append( np.mean(episode_rewards[-arglist.save_rate:])) for rew in agent_rewards: final_ep_ag_rewards.append( np.mean(rew[-arglist.save_rate:])) # saves final episode reward for plotting training curve later if len(episode_rewards) > arglist.num_episodes: rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' with open(rew_file_name, 'wb') as fp: pickle.dump(final_ep_rewards, fp) agrew_file_name = 
arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                    print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                    break
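# One slice in the ibmac_inter experience call above looks like a copy-paste
# slip: the second team is stored with `message_n[:num_adversaries]` (the
# adversaries' messages) alongside its own observations. If each team should
# store its own messages, every per-agent list would be sliced alike -- a
# sketch with a hypothetical helper:
def team_experience(trainer, sl, obs_n, message_n, action_n, rew_n,
                    new_obs_n, done_n, terminal):
    """Store one team's transition, slicing every per-agent list the same way."""
    trainer.experience(obs_n[sl], message_n[sl], action_n[sl], rew_n[sl],
                       new_obs_n[sl], done_n[sl], terminal)

# usage: team_experience(trainers[1], slice(num_adversaries, None), ...)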
def train(arglist): with U.single_threaded_session(): if not os.path.isdir(arglist.save_dir): os.makedirs(arglist.save_dir) if not os.path.isdir(arglist.benchmark_dir): os.makedirs(arglist.benchmark_dir) if not os.path.isdir(arglist.plots_dir): os.makedirs(arglist.plots_dir) #tensorboard summary_writer = tf.summary.FileWriter( "./" + arglist.exp_name + "_graph/", U.get_session().graph) reward_plot = None reward_summary = tf.Summary() reward_summary.value.add(tag='reward', simple_value=reward_plot) # Create environment env = make_env(arglist.scenario, arglist, arglist.benchmark) # Create agent trainers obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] num_adversaries = min(env.n, arglist.num_adversaries) trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist) print('Using good policy {} and adv policy {}'.format( arglist.good_policy, arglist.adv_policy)) # Initialize U.initialize() # Load previous results, if necessary if arglist.load_dir == "": arglist.load_dir = arglist.save_dir if arglist.display or arglist.restore or arglist.benchmark: print('Loading previous state...') U.load_state(arglist.load_dir) episode_rewards = [0.0] # sum of rewards for all agents agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward final_ep_rewards = [] # sum of rewards for training curve final_ep_ag_rewards = [] # agent rewards for training curve agent_info = [[[]]] # placeholder for benchmarking info saver = tf.train.Saver() obs_n = env.reset() episode_step = 0 train_step = 0 """ #### USE RVO """ use_rvo_range = -1 # if want to use rvo, set 0.28 t_start = time.time() print('Starting iterations...') while True: # get action action_n = [ agent.action(obs) for agent, obs in zip(trainers, obs_n) ] if use_rvo_range < 0: new_obs_n, rew_n, done_n, info_n = env.step(action_n, use_rvo=None) else: # use_rvo list total_rvo_list = [] for obs in obs_n: agent_pos = obs[-2 * (env.world.num_agents - 1)::] obst_pos = obs[-2 * (env.world.num_agents + env.world.num_obstacles)::] agent_rvo_list = [] for i in range(0, len(agent_pos), 2): if np.sqrt(np.sum(np.square( agent_pos[i:i + 2]))) < use_rvo_range: agent_rvo_list.append(True) else: agent_rvo_list.append(False) for i in range(0, len(obst_pos), 2): if np.sqrt(np.sum(np.square( obst_pos[i:i + 2]))) < use_rvo_range: agent_rvo_list.append(True) else: agent_rvo_list.append(False) if any(agent_rvo_list): total_rvo_list.append(True) else: total_rvo_list.append(False) # environment step new_obs_n, rew_n, done_n, info_n = env.step( action_n, use_rvo=total_rvo_list) episode_step += 1 done = all(done_n) terminal = (episode_step >= arglist.max_episode_len) # collect experience for i, agent in enumerate(trainers): agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal) obs_n = new_obs_n for i, rew in enumerate(rew_n): episode_rewards[-1] += rew agent_rewards[i][-1] += rew if done or terminal: obs_n = env.reset() episode_step = 0 episode_rewards.append(0) for a in agent_rewards: a.append(0) agent_info.append([[]]) # increment global step counter train_step += 1 # for benchmarking learned policies if arglist.benchmark: for i, info in enumerate(info_n): agent_info[-1][i].append(info_n['n']) if train_step > arglist.benchmark_iters and (done or terminal): file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl' print('Finished benchmarking, now saving...') with open(file_name, 'wb') as fp: pickle.dump(agent_info[:-1], fp) break continue # for displaying learned policies if arglist.display: time.sleep(0.1) 
env.render() continue # update all trainers, if not in display or benchmark mode loss = None for agent in trainers: agent.preupdate() for agent in trainers: loss = agent.update(trainers, train_step) # add reward to tensorboard reward_summary.value[0].simple_value = np.mean( episode_rewards[-arglist.save_rate:]) summary_writer.add_summary(reward_summary, len(episode_rewards)) # save model, display training output if terminal and (len(episode_rewards) % arglist.save_rate == 0): U.save_state(arglist.save_dir, saver=saver) # print statement depends on whether or not there are adversaries if num_adversaries == 0: print( "steps: {}, episodes: {}, mean episode reward: {}, time: {}" .format(train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time() - t_start, 3))) else: print( "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}" .format(train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), [ np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards ], round(time.time() - t_start, 3))) t_start = time.time() if terminal: # Keep track of final episode reward final_ep_rewards.append( np.mean(episode_rewards[-arglist.save_rate:])) for rew in agent_rewards: final_ep_ag_rewards.append( np.mean(rew[-arglist.save_rate:])) # saves final episode reward for plotting training curve later if len(episode_rewards) % 1000 == 0: rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' + str( len(episode_rewards)) with open(rew_file_name, 'wb') as fp: pickle.dump(final_ep_rewards, fp) agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl' with open(agrew_file_name, 'wb') as fp: pickle.dump(final_ep_ag_rewards, fp) print('saved') if len(episode_rewards) > arglist.num_episodes: print('...Finished total of {} episodes.'.format( len(episode_rewards))) break
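# The RVO gating above computes sqrt(sum(square(...))) by hand for every
# relative-position pair; np.linalg.norm over a reshaped (k, 2) array says
# the same thing in one call. A sketch (note also that the `obst_pos` suffix
# slice, -2*(num_agents+num_obstacles), is long enough to re-include the
# agent block; harmless under any(), but worth checking against the actual
# observation layout):
import numpy as np

def any_within_range(flat_xy, radius):
    """True if any (x, y) pair in a flat [x0, y0, x1, y1, ...] vector lies
    closer than `radius` to the origin."""
    pairs = np.asarray(flat_xy).reshape(-1, 2)
    return bool((np.linalg.norm(pairs, axis=1) < radius).any())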
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        obs_n = env.reset()
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Pretrain the safety layer and attach it to trainer[0]
        safety_layer = None
        if arglist.use_safety_layer:
            safety_layer = SafetyLayer(env,
                                       len(env.world.landmarks) - 1,
                                       mlp_model_safety_layer,
                                       env.observation_space[0].shape,
                                       env.action_space,
                                       trainers[0].action)
            trainers[0].set_safety_layer(safety_layer)
        if arglist.use_mpc_layer:
            safety_layer = MpcLayer(env)
            trainers[0].set_safety_layer(safety_layer)

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        episode_step = 0
        train_step = 0
        cumulative_constraint_violations = 0
        t_start = time.time()
        data_save = []
        num_done = 0

        # Optionally restore a pickled world layout for repeatable evaluation:
        '''import pickle
        file_path = open('env.pkl', 'rb')
        for i in range(len(env.world.landmarks)):
            env.world.landmarks[i] = pickle.load(file_path)
        for i in range(len(env.world.agents)):
            env.world.agents[i] = pickle.load(file_path)
        obs_n = [env._get_obs(agent) for agent in env.world.agents]'''

        print('Starting iterations...')
        while True:
            # get constraint values and count violations
            c_n = env.get_constraint_values()
            is_any_collision = env.is_any_collision()
            if is_any_collision[0]:
                cumulative_constraint_violations += 1

            # get action; action_real() returns the raw action, the (possibly
            # corrected) action to execute, and a flag saying whether the
            # safety layer was invoked
            action_n = [
                agent.action_real(obs, c, env)
                for agent, obs, c in zip(trainers, obs_n, c_n)
            ]
            action_real = [action_n[0][0]]
            if_call = [action_n[0][2]]
            action_n = [action_n[0][1]]
            # (the second action_n[0] here is possibly intended to be action_real[0])
            data_save.append(
                np.concatenate([obs_n[0], action_n[0], action_n[0]]))

            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n, if_call=if_call)

            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len) or \
                (env.agents[0].state.p_pos[0] -
                 env.world.landmarks[-1].state.p_pos[0]) > 1.5
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                if done:
                    num_done += 1
                data_save.append(
                    np.concatenate([obs_n[0], action_n[0], action_n[0]]))
                data_save = np.array(data_save)
                '''np.savetxt("data_save.txt", data_save)'''  # saves in '%.18e' format by default, space-separated

                # plot the trajectory: x, y, v, theta
                arr = data_save
                V = arr[:, 1]
                x = arr[:, 2]
                y = arr[:, 3]
                theta = arr[:, 4]
                omega = arr[:, 5]
                # action_n = arr[:, 26] - arr[:, 27]
                # action_real = arr[:, 31] - arr[:, 32]
                fig, ax0 = plt.subplots()
                for landmark in env.world.landmarks[:-1]:
                    circle = mpathes.Circle(landmark.state.p_pos, landmark.size,
                                            facecolor='w',
                                            edgecolor='forestgreen',
                                            linestyle='-.')
                    ax0.add_patch(circle)
                for landmark in env.world.landmarks:
                    r = (landmark.size - 0.09) \
                        if landmark is not env.world.landmarks[-1] else landmark.size
                    circle = mpathes.Circle(landmark.state.p_pos, r,
                                            facecolor='forestgreen')
                    ax0.add_patch(circle)
                for i in range(len(x)):
                    circle = mpathes.Circle(np.array([x[i], y[i]]),
                                            env.world.agents[0].size,
                                            facecolor='darkgreen')
                    ax0.add_patch(circle)
                ax0.set_xlim((-1, 40))
                ax0.set_ylim((-10, 10))
                ax0.axis('equal')
                ax0.set_title("x-y")
                x1 = [-1, 40]
                y1 = [10, 10]
                y2 = [-10, -10]
                ax0.plot(x1, y1, color='forestgreen', linestyle='-.')
                ax0.plot(x1, y2, color='forestgreen', linestyle='-.')
                plt.show()

                # Alternative multi-panel plot (x-y, theta, omega), kept disabled:
                '''fig, ax = plt.subplots(ncols=2, nrows=2)
                for landmark in env.world.landmarks:
                    circle = mpathes.Circle(landmark.state.p_pos, landmark.size)
                    ax[0, 0].add_patch(circle)
                for i in range(len(x)):
                    circle = mpathes.Circle(np.array([x[i], y[i]]),
                                            env.world.agents[0].size)
                    ax[0, 0].add_patch(circle)
                ax[0, 0].set_xlim((-1, 20))
                ax[0, 0].set_ylim((-10.3, 10.3))
                ax[0, 0].set_title("x-y")
                ax[0, 0].axis('equal')
                ax[0, 1].plot(theta)
                ax[0, 1].set_title("theta")
                ax[1, 0].plot(omega)
                ax[1, 0].set_title("omega")
                plt.show()'''

                # reset and continue
                data_save = []
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            # (policy updates are left disabled in this evaluation variant)
            loss = None
            for agent in trainers:
                agent.preupdate()
            '''for agent in trainers:
                loss = agent.update(trainers, train_step)'''

            # save model, display training output
            if (done or terminal) and \
                    ((len(episode_rewards) - 1) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # the same statistics are printed whether or not there are adversaries
                print("steps: {}, episodes: {}, mean episode reward: {}, "
                      "num_cumulative_constraints: {}, num_done: {}, time: {}".format(
                          train_step, len(episode_rewards) - 1,
                          np.mean(episode_rewards[-arglist.save_rate:]),
                          cumulative_constraint_violations, num_done,
                          round(time.time() - t_start, 3)))
                # print(trainers[0].safety_layer.num_call)
                t_start = time.time()
                num_done = 0
                cumulative_constraint_violations = 0
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
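# ---------------------------------------------------------------------------
# For reference: the SafetyLayer consumed above is defined elsewhere, so the
# following is only a minimal sketch of the interface the loop relies on.
# Names (SimpleSafetyLayer, constraint_model, get_safe_action) are hypothetical;
# the projection is the closed-form correction from Dalal et al., "Safe
# Exploration in Continuous Action Spaces", assuming each constraint is locally
# linearized as c_next ~ c + g(obs)·a, with g predicted by a learned model.
# ---------------------------------------------------------------------------
import numpy as np

class SimpleSafetyLayer:
    def __init__(self, constraint_model, num_constraints):
        self.constraint_model = constraint_model  # obs -> (num_constraints, act_dim)
        self.num_constraints = num_constraints
        self.num_call = 0  # how often an action had to be corrected

    def get_safe_action(self, obs, action, c):
        """Project `action` onto the most-violated linearized constraint."""
        g = self.constraint_model(obs)      # constraint sensitivities
        residuals = c + g @ action          # predicted next-step constraint values
        worst = int(np.argmax(residuals))
        if residuals[worst] <= 0:
            return action, False            # already safe, return unchanged
        self.num_call += 1
        g_w = g[worst]
        lam = residuals[worst] / (g_w @ g_w + 1e-8)  # Lagrange multiplier
        return action - lam * g_w, True     # closed-form projection

# Under these assumptions, action_real() would return
# (raw_action, safe_action, was_corrected), matching how the loop above
# unpacks action_n[0] into action_real, action_n, and if_call.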
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark or arglist.plot:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        plot_data = []

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            plot_d = env.get_plot_data()
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
            plot_data.append(plot_d)

            if done or terminal:
                if arglist.plot:
                    if arglist.scenario in ("simple_spread", "simple_spread_obstacles"):
                        plot_spread(plot_data)
                    if arglist.scenario in ("simple_formation", "simple_formation_obstacles"):
                        plot_formation(plot_data)
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
                plot_data = []

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:])
                               for rew in agent_rewards],
                              round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves episode rewards for plotting the training curve later
            # (note: this variant dumps the raw per-episode lists, not the
            # save_rate-averaged final_ep_rewards)
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(episode_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(agent_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
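# ---------------------------------------------------------------------------
# Since the variant above pickles the raw episode_rewards list rather than the
# save_rate-averaged final_ep_rewards, the training curve has to be smoothed
# offline. A minimal sketch (plot_training_curve is a hypothetical helper; it
# assumes only the plots_dir/exp_name file-naming convention used in train()):
# ---------------------------------------------------------------------------
import pickle
import numpy as np
import matplotlib.pyplot as plt

def plot_training_curve(plots_dir, exp_name, window=1000):
    with open(plots_dir + exp_name + '_rewards.pkl', 'rb') as fp:
        episode_rewards = pickle.load(fp)
    rewards = np.asarray(episode_rewards, dtype=np.float64)
    window = min(window, len(rewards))  # guard against short runs
    # moving average, mirroring the window the training printout uses
    smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
    plt.plot(smoothed)
    plt.xlabel('episode')
    plt.ylabel('mean episode reward (window={})'.format(window))
    plt.title(exp_name)
    plt.show()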
def train(arglist):
    with U.single_threaded_session():
        # create world
        world = World()
        # Create environment
        env = MultiAgentTorcsEnv(world, 0, world.reset_world, world.reward,
                                 world.observation, done_callback=world.done)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = env.adv  # min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()

        # TODO: call a reset function here instead of the manual restart below
        os.system("pkill torcs")
        os.system("cd ~/vtorcs3 && ./torcs &")  # use the location of the TORCS installation on your system
        time.sleep(0.5)
        os.system('sh autostart.sh')
        time.sleep(1)

        obs_n = []
        world.initialize_agents()
        for agent in env.agents:
            obs_n.append(world.observation(agent))
        # obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        episode_count = 0
        epsilon = 1
        EXPLORE = 100000.
        train_indicator = 1

        print('Starting iterations...')
        while True:
            print("Episode number: " + str(episode_count))
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n, epsilon,
                                                        train_indicator)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                epsilon -= 1.0 / EXPLORE  # linear exploration decay per episode
                episode_step = 0
                episode_rewards.append(0)
                episode_count += 1
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1
            world.step = train_step

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # displaying learned policies is not applicable to the TORCS env
            '''if arglist.display:
                time.sleep(0.1)
                env.render()
                continue'''

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
            l2 = "Loss is " + str(loss) + "\n"
            with open("log2.txt", "a") as f:
                f.write(l2)
            print(l2)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:])
                               for rew in agent_rewards],
                              round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
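# ---------------------------------------------------------------------------
# The TORCS restart sequence hard-coded at the top of train() is easy to factor
# into a helper so it can be re-run if the simulator dies mid-run. This is a
# sketch under the same assumptions (restart_torcs is a hypothetical name; the
# paths, autostart.sh script, and sleep times are the ones used above and may
# need adjusting per machine):
# ---------------------------------------------------------------------------
import os
import time

def restart_torcs(torcs_dir="~/vtorcs3", settle=0.5, configure=1.0):
    os.system("pkill torcs")                           # kill any stale instance
    os.system("cd {} && ./torcs &".format(torcs_dir))  # launch in the background
    time.sleep(settle)                                 # let the binary come up
    os.system('sh autostart.sh')                       # click through the GUI menus
    time.sleep(configure)                              # wait for the race to load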