def run_sequential(args, logger):
    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    args.obs_shape = env_info["obs_shape"]

    # Default/Base scheme
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (1,), "group": "agents", "dtype": th.long},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "role_avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "reward": {"vshape": (1,)},
        "terminated": {"vshape": (1,), "dtype": th.uint8},
        "roles": {"vshape": (1,), "group": "agents", "dtype": th.long},
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(scheme, groups, args.buffer_size,
                          env_info["episode_limit"] + 1,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time
        episode_batch = runner.run(test_mode=False)
        buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate batch to only filled timesteps
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max),
                time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval
                                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
def run_sequential(args, logger):
    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()

    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    # args.action_space = env_info["action_space"]
    args.action_spaces = env_info["action_spaces"]
    args.actions_dtype = env_info["actions_dtype"]
    args.normalise_actions = env_info.get("normalise_actions", False)  # if true, action vectors need to sum to one

    # create function scaling agent action tensors to and from range [0,1]
    ttype = th.FloatTensor if not args.use_cuda else th.cuda.FloatTensor
    mult_coef_tensor = ttype(args.n_agents, args.n_actions)
    action_min_tensor = ttype(args.n_agents, args.n_actions)
    if all([isinstance(act_space, spaces.Box) for act_space in args.action_spaces]):
        for _aid in range(args.n_agents):
            for _actid in range(args.action_spaces[_aid].shape[0]):
                _action_min = args.action_spaces[_aid].low[_actid]
                _action_max = args.action_spaces[_aid].high[_actid]
                mult_coef_tensor[_aid, _actid] = float(_action_max - _action_min)
                action_min_tensor[_aid, _actid] = float(_action_min)
    elif all([isinstance(act_space, spaces.Tuple) for act_space in args.action_spaces]):
        # NOTE: This was added to handle scenarios like simple_reference since the action space is Tuple
        for _aid in range(args.n_agents):
            for _actid in range(args.action_spaces[_aid].spaces[0].shape[0]):
                _action_min = args.action_spaces[_aid].spaces[0].low[_actid]
                _action_max = args.action_spaces[_aid].spaces[0].high[_actid]
                mult_coef_tensor[_aid, _actid] = float(_action_max - _action_min)
                action_min_tensor[_aid, _actid] = float(_action_min)
            for _actid in range(args.action_spaces[_aid].spaces[1].shape[0]):
                _action_min = args.action_spaces[_aid].spaces[1].low[_actid]
                _action_max = args.action_spaces[_aid].spaces[1].high[_actid]
                tmp_idx = _actid + args.action_spaces[_aid].spaces[0].shape[0]
                mult_coef_tensor[_aid, tmp_idx] = float(_action_max - _action_min)
                action_min_tensor[_aid, tmp_idx] = float(_action_min)

    args.actions2unit_coef = mult_coef_tensor
    args.actions2unit_coef_cpu = mult_coef_tensor.cpu()
    args.actions2unit_coef_numpy = mult_coef_tensor.cpu().numpy()
    args.actions_min = action_min_tensor
    args.actions_min_cpu = action_min_tensor.cpu()
    args.actions_min_numpy = action_min_tensor.cpu().numpy()

    def actions_to_unit_box(actions):
        if isinstance(actions, np.ndarray):
            return args.actions2unit_coef_numpy * actions + args.actions_min_numpy
        elif actions.is_cuda:
            return args.actions2unit_coef * actions + args.actions_min
        else:
            return args.actions2unit_coef_cpu * actions + args.actions_min_cpu

    def actions_from_unit_box(actions):
        if isinstance(actions, np.ndarray):
            return th.div((actions - args.actions_min_numpy), args.actions2unit_coef_numpy)
        elif actions.is_cuda:
            return th.div((actions - args.actions_min), args.actions2unit_coef)
        else:
            return th.div((actions - args.actions_min_cpu), args.actions2unit_coef_cpu)

    # make conversion functions globally available
    args.actions2unit = actions_to_unit_box
    args.unit2actions = actions_from_unit_box

    action_dtype = th.long if not args.actions_dtype == np.float32 else th.float
    if all([isinstance(act_space, spaces.Box) for act_space in args.action_spaces]):
        actions_vshape = 1 if not args.actions_dtype == np.float32 else \
            max([i.shape[0] for i in args.action_spaces])
    elif all([isinstance(act_space, spaces.Tuple) for act_space in args.action_spaces]):
        actions_vshape = 1 if not args.actions_dtype == np.float32 else \
            max([i.spaces[0].shape[0] + i.spaces[1].shape[0] for i in args.action_spaces])

    # Default/Base scheme
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (actions_vshape,), "group": "agents", "dtype": action_dtype},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "reward": {"vshape": (1,)},
        "terminated": {"vshape": (1,), "dtype": th.uint8},
    }
    groups = {
        "agents": args.n_agents
    }

    if not args.actions_dtype == np.float32:
        preprocess = {
            "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
        }
    else:
        preprocess = {}

    buffer = ReplayBuffer(scheme, groups, args.buffer_size,
                          env_info["episode_limit"] + 1 if args.runner_scope == "episodic" else 2,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time
        if getattr(args, "runner_scope", "episodic") == "episodic":
            episode_batch = runner.run(test_mode=False, learner=learner)
            buffer.insert_episode_batch(episode_batch)

            if buffer.can_sample(args.batch_size) and (buffer.episodes_in_buffer > getattr(args, "buffer_warmup", 0)):
                episode_sample = buffer.sample(args.batch_size)

                # Truncate batch to only filled timesteps
                max_ep_t = episode_sample.max_t_filled()
                episode_sample = episode_sample[:, :max_ep_t]

                if episode_sample.device != args.device:
                    episode_sample.to(args.device)

                learner.train(episode_sample, runner.t_env, episode)
        elif getattr(args, "runner_scope", "episode") == "transition":
            runner.run(test_mode=False, buffer=buffer, learner=learner, episode=episode)
        else:
            raise Exception("Undefined runner scope!")

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max),
                time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            if getattr(args, "testing_on", True):
                for _ in range(n_test_runs):
                    if getattr(args, "runner_scope", "episodic") == "episodic":
                        runner.run(test_mode=True, learner=learner)
                    elif getattr(args, "runner_scope", "episode") == "transition":
                        runner.run(test_mode=True, buffer=buffer, learner=learner, episode=episode)
                    else:
                        raise Exception("Undefined runner scope!")

        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval
                                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            # learner.save_models(save_path, args.unique_token, model_save_time)
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
def run_sequential(args, logger): # Init runner so we can get env info runner = r_REGISTRY[args.runner](args=args, logger=logger) # Set up schemes and groups here env_info = runner.get_env_info() args.episode_limit = env_info["episode_limit"] args.n_agents = env_info["n_agents"] args.n_actions = env_info["n_actions"] args.state_shape = env_info["state_shape"] args.unit_dim = env_info["unit_dim"] # Default/Base scheme scheme = { "state": { "vshape": env_info["state_shape"] }, "obs": { "vshape": env_info["obs_shape"], "group": "agents" }, "actions": { "vshape": (1, ), "group": "agents", "dtype": th.long }, "avail_actions": { "vshape": (env_info["n_actions"], ), "group": "agents", "dtype": th.int }, "reward": { "vshape": (1, ) }, "terminated": { "vshape": (1, ), "dtype": th.uint8 }, } groups = {"agents": args.n_agents} preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } env_name = args.env if env_name == 'sc2': env_name += '/' + args.env_args['map_name'] buffer = ReplayBuffer( scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, args.burn_in_period, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) if args.is_save_buffer: save_buffer = ReplayBuffer( scheme, groups, args.save_buffer_size, env_info["episode_limit"] + 1, args.burn_in_period, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) if args.is_batch_rl: assert (args.is_save_buffer == False) x_env_name = env_name if args.is_from_start: x_env_name += '_from_start/' path_name = '../../buffer/' + x_env_name + '/buffer_' + str( args.load_buffer_id) + '/' assert (os.path.exists(path_name) == True) buffer.load(path_name) # Setup multiagent controller here mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # Give runner the scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) if args.use_cuda: learner.cuda() if args.checkpoint_path != "": timesteps = [] timestep_to_load = 0 if not os.path.isdir(args.checkpoint_path): logger.console_logger.info( "Checkpoint directiory {} doesn't exist".format( args.checkpoint_path)) return # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info("Loading model from {}".format(model_path)) learner.load_models(model_path) runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner) return # start training episode = 0 last_test_T = -args.test_interval - 1 last_log_T = 0 model_save_time = 0 start_time = time.time() last_time = start_time logger.console_logger.info("Beginning training for {} timesteps".format( args.t_max)) if args.env == 'matrix_game_1' or args.env == 'matrix_game_2' or args.env == 'matrix_game_3' \ or args.env == 'mmdp_game_1': last_demo_T = -args.demo_interval - 1 while runner.t_env <= args.t_max: if not args.is_batch_rl: # Run for a whole episode at a time episode_batch = runner.run(test_mode=False) 
buffer.insert_episode_batch(episode_batch) if args.is_save_buffer: save_buffer.insert_episode_batch(episode_batch) if save_buffer.is_from_start and save_buffer.episodes_in_buffer == save_buffer.buffer_size: save_buffer.is_from_start = False save_one_buffer(args, save_buffer, env_name, from_start=True) if save_buffer.buffer_index % args.save_buffer_interval == 0: print('current episodes_in_buffer: ', save_buffer.episodes_in_buffer) for _ in range(args.num_circle): if buffer.can_sample(args.batch_size): episode_sample = buffer.sample(args.batch_size) if args.is_batch_rl: runner.t_env += int( th.sum(episode_sample['filled']).cpu().clone().detach( ).numpy()) // args.batch_size # Truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != args.device: episode_sample.to(args.device) learner.train(episode_sample, runner.t_env, episode) if args.env == 'mmdp_game_1' and args.learner == "q_learner_exp": for i in range(int(learner.target_gap) - 1): episode_sample = buffer.sample(args.batch_size) # Truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != args.device: episode_sample.to(args.device) learner.train(episode_sample, runner.t_env, episode) # Execute test runs once in a while n_test_runs = max(1, args.test_nepisode // runner.batch_size) if (runner.t_env - last_test_T) / args.test_interval >= 1.0: logger.console_logger.info("t_env: {} / {}".format( runner.t_env, args.t_max)) logger.console_logger.info( "Estimated time left: {}. Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env for _ in range(n_test_runs): runner.run(test_mode=True) if args.env == 'mmdp_game_1' and \ (runner.t_env - last_demo_T) / args.demo_interval >= 1.0 and buffer.can_sample(args.batch_size): ### demo episode_sample = cp.deepcopy(buffer.sample(1)) for i in range(args.n_actions): for j in range(args.n_actions): new_actions = th.Tensor([i, j]).unsqueeze(0).repeat( args.episode_limit + 1, 1) if i == 0 and j == 0: rew = th.Tensor([ 1, ]) else: rew = th.Tensor([ 0, ]) if i == 1 and j == 1: new_obs = th.Tensor( [1, 0]).unsqueeze(0).unsqueeze(0).repeat( args.episode_limit, args.n_agents, 1) else: new_obs = th.Tensor( [0, 1]).unsqueeze(0).unsqueeze(0).repeat( args.episode_limit, args.n_agents, 1) # Truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] episode_sample['actions'][0, :, :, 0] = new_actions episode_sample['obs'][0, 1:, :, :] = new_obs episode_sample['reward'][0, 0, 0] = rew new_actions_onehot = th.zeros( episode_sample['actions'].squeeze(3).shape + (args.n_actions, )) new_actions_onehot = new_actions_onehot.scatter_( 3, episode_sample['actions'].cpu(), 1) episode_sample['actions_onehot'][:] = new_actions_onehot if episode_sample.device != args.device: episode_sample.to(args.device) #print("action pair: %d, %d" % (i, j)) learner.train(episode_sample, runner.t_env, episode, show_demo=True, save_data=(i, j)) last_demo_T = runner.t_env #time.sleep(1) if (args.env == 'matrix_game_1' or args.env == 'matrix_game_2' or args.env == 'matrix_game_3') and \ (runner.t_env - last_demo_T) / args.demo_interval >= 1.0 and buffer.can_sample(args.batch_size): ### demo episode_sample = cp.deepcopy(buffer.sample(1)) for i in range(args.n_actions): for j in 
range(args.n_actions): new_actions = th.Tensor([i, j]).unsqueeze(0).repeat( args.episode_limit + 1, 1) # Truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] episode_sample['actions'][0, :, :, 0] = new_actions new_actions_onehot = th.zeros( episode_sample['actions'].squeeze(3).shape + (args.n_actions, )).cuda() new_actions_onehot = new_actions_onehot.scatter_( 3, episode_sample['actions'].cuda(), 1) episode_sample['actions_onehot'][:] = new_actions_onehot if i == 0 and j == 0: rew = th.Tensor([ 8, ]) elif i == 0 or j == 0: rew = th.Tensor([ -12, ]) else: rew = th.Tensor([ 0, ]) if args.env == 'matrix_game_3': if i == 1 and j == 1 or i == 2 and j == 2: rew = th.Tensor([ 6, ]) episode_sample['reward'][0, 0, 0] = rew if episode_sample.device != args.device: episode_sample.to(args.device) #print("action pair: %d, %d" % (i, j)) learner.train(episode_sample, runner.t_env, episode, show_demo=True, save_data=(i, j)) last_demo_T = runner.t_env #time.sleep(1) if args.save_model and ( runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env)) #"results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) if args.double_q: os.makedirs(save_path + '_x', exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) episode += args.batch_size_run * args.num_circle if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env if args.is_save_buffer and save_buffer.is_from_start: save_buffer.is_from_start = False save_one_buffer(args, save_buffer, env_name, from_start=True) runner.close_env() logger.console_logger.info("Finished Training")
def run_sequential(args, logger): # Init runner so we can get env info runner = r_REGISTRY[args.runner](args=args, logger=logger) # Set up schemes and groups here env_info = runner.get_env_info() args.n_agents = env_info["n_agents"] args.n_actions = env_info["n_actions"] args.state_shape = env_info["state_shape"] args.episode_limit = env_info["episode_limit"] # Default/Base scheme scheme = { "state": {"vshape": env_info["state_shape"]}, "obs": {"vshape": env_info["obs_shape"], "group": "agents"}, "actions": {"vshape": (1,), "group": "agents", "dtype": th.long}, "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int}, "reward": {"vshape": (1,)}, "terminated": {"vshape": (1,), "dtype": th.uint8}, "battle_won": {"vshape": (1,), "dtype": th.uint8}, } groups = { "agents": args.n_agents } preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device, save_episodes=True if args.save_episodes else False, episode_dir=args.episode_dir, clear_existing_episodes=args.clear_existing_episodes) # TODO maybe just pass args # Setup multiagent controller here mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # Give runner the scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) # Model learner model_learner = None model_buffer = None if args.model_learner: model_learner = le_REGISTRY[args.model_learner](mac, scheme, logger, args) model_buffer = ReplayBuffer(scheme, groups, args.model_buffer_size, buffer.max_seq_length, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device, save_episodes=False) if args.use_cuda: learner.cuda() if model_learner: model_learner.cuda() if args.checkpoint_path != "": if not os.path.isdir(args.checkpoint_path): logger.console_logger.info("Checkpoint directiory {} doesn't exist".format(args.checkpoint_path)) return timestep_to_load = 0 if args.rl_checkpoint: rl_timesteps = [] # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers name = name.replace('rl_', '') if os.path.isdir(full_name) and name.isdigit(): rl_timesteps.append(int(name)) load_step = int(args.load_step.replace('rl_', '')) if isinstance(args.load_step, str) else args.load_step if load_step == 0: # choose the max timestep timestep_to_load = max(rl_timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(rl_timesteps, key=lambda x: abs(x - load_step)) model_path = os.path.join(args.checkpoint_path, f"rl_{timestep_to_load}") else: timesteps = [] # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info("Loading model from {}".format(model_path)) learner.load_models(model_path) 
runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner, buffer) return # TODO checkpoints for model_learner # start training episode = 0 last_test_T = -args.test_interval - 1 last_log_T = 0 model_save_time = 0 start_time = time.time() last_time = start_time # new stuff collect_episodes = True collected_episodes = 0 train_rl = False rl_iterations = 0 model_trained = False n_model_trained = 0 last_rl_T = 0 rl_model_save_time = 0 logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max)) while runner.t_env <= args.t_max: if model_learner: if collect_episodes: episode_batch = runner.run(test_mode=False) # collect real episode to progress t_env print(f"Collecting {args.batch_size_run} episodes from REAL ENV using epsilon: {runner.mac.env_action_selector.epsilon:.2f}, t_env: {runner.t_env}, collected episodes: {collected_episodes}") buffer.insert_episode_batch(episode_batch) collected_episodes += args.batch_size_run n_collect = args.model_n_collect_episodes if model_trained else args.model_n_collect_episodes_initial if collected_episodes >= n_collect: print(f"Collected {collected_episodes} REAL episodes, training ENV model") # stop collection and train model collect_episodes = False collected_episodes = 0 model_learner.train(buffer, runner.t_env, plot_test_results=False) model_trained = True n_model_trained += 1 train_rl = True if args.model_rollout_before_rl: print(f"Generating {args.model_rollouts} MODEL episodes") rollouts = 0 rollout_batch_size = min(buffer.episodes_in_buffer, args.model_rollout_batch_size) while rollouts < args.model_rollouts: model_batch = model_learner.generate_batch(buffer, rollout_batch_size, rl_iterations) model_buffer.insert_episode_batch(model_batch) rollouts += rollout_batch_size if train_rl: # and model_buffer.can_sample(args.batch_size): # generate synthetic episodes under current policy if not args.model_rollout_before_rl: print(f"Generating {args.model_rollouts} MODEL episodes") rollout_batch_size = min(buffer.episodes_in_buffer, args.model_rollout_batch_size) model_batch = model_learner.generate_batch(buffer, rollout_batch_size, rl_iterations) model_buffer.insert_episode_batch(model_batch) if model_buffer.can_sample(args.batch_size): for _ in range(args.model_rl_iterations_per_generated_sample): episode_sample = model_buffer.sample(args.batch_size) # truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != args.device: episode_sample.to(args.device) # train RL agent learner.train(episode_sample, runner.t_env, rl_iterations) rl_iterations += 1 print(f"Model RL iteration {rl_iterations}, t_env: {runner.t_env}") if not collect_episodes and rl_iterations > 0 and rl_iterations % args.model_update_interval == 0: if args.max_model_trained == 0 or args.max_model_trained and n_model_trained < args.max_model_trained: print(f"Time to update model") collect_episodes = True train_rl = False # update stats model_learner.log_stats(runner.t_env) if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("model_rl_iterations", rl_iterations, runner.t_env) if (rl_iterations > 0 and (rl_iterations - last_rl_T) /args.rl_test_interval >= 1.0): print(f"Logging rl stats") model_learner.log_rl_stats(rl_iterations) else: episode_batch = runner.run(test_mode=False) buffer.insert_episode_batch(episode_batch) if args.save_episodes and args.save_policy_outputs and args.runner == "episode": 
mac.save_policy_outputs() if buffer.can_sample(args.batch_size): for _ in range(args.batch_size_run): episode_sample = buffer.sample(args.batch_size) # Truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != args.device: episode_sample.to(args.device) learner.train(episode_sample, runner.t_env, episode) rl_iterations += 1 print(f"RL iteration {rl_iterations}, t_env: {runner.t_env}") # Execute test runs once in a while n_test_runs = max(1, args.test_nepisode // runner.batch_size) if ((runner.t_env - last_test_T) / args.test_interval >= 1.0) or (rl_iterations > 0 and (rl_iterations - last_rl_T) /args.rl_test_interval >= 1.0): print("Running test cases") logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max)) logger.console_logger.info("Estimated time left: {}. Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env last_rl_T = rl_iterations runner.t_rl = rl_iterations for _ in range(n_test_runs): runner.run(test_mode=True) logger.print_recent_stats() if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env)) # "results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) if args.save_model and model_trained and (rl_iterations == 0 or (rl_iterations - rl_model_save_time)/args.rl_save_model_interval >= 1.0): print(f"Saving at RL model iteration {rl_iterations}") rl_model_save_time = rl_iterations save_path = os.path.join(args.local_results_path, "models", args.unique_token, f"rl_{rl_iterations}") # "results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) episode += args.batch_size_run if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("rl_iterations", rl_iterations, runner.t_env) logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env runner.close_env() logger.console_logger.info("Finished Training")
def run_sequential(args, logger): # Init runner so we can get env info runner = r_REGISTRY[args.runner](args=args, logger=logger) to_index_flag = False if hasattr(args, 'to_index_flag'): if args.to_index_flag: to_index_flag = True # Set up schemes and groups here env_info = runner.get_env_info() # if args.disc_state: # if args.env_args["map_name"] == '3m': # state_num = 1077 # if to_index_flag: # pass # else: # state_shape = state_num # elif args.env_args["map_name"] == 'corridor': # state_num = 5280 # if to_index_flag: # pass # else: # state_shape = state_num # elif args.env_args["map_name"] == '6h_vs_8z': # state_num = 2884 # if to_index_flag: # state_shape = 62 # else: # state_shape = state_num # elif args.env_args["map_name"] == '2s3z': # state_num = 2325 # # state_num = 165 # if to_index_flag: # state_shape = 20 # else: # state_shape = state_num # else: # raise NotImplementedError # else: state_shape = env_info["state_shape"] state_num = env_info.get("state_num", None) # TEST # if args.env_args["map_name"] == '2s3z': # state_shape = 120 # state_num = state_shape # state_shape = env_info["state_shape"] # state_num = env_info.get("state_num", None) # state_num = state_shape args.n_agents = env_info["n_agents"] args.n_actions = env_info["n_actions"] args.state_shape = state_shape args.state_num = state_num args.all_obs = env_info.get("all_obs", None) # Default/Base scheme scheme = { "state": { "vshape": state_shape }, # "state": {"vshape": state_num}, # TEST "obs": { "vshape": env_info["obs_shape"], "group": "agents" }, "actions": { "vshape": (1, ), "group": "agents", "dtype": th.long }, "avail_actions": { "vshape": (env_info["n_actions"], ), "group": "agents", "dtype": th.int }, "reward": { "vshape": (1, ) }, "terminated": { "vshape": (1, ), "dtype": th.uint8 }, "noise": { "vshape": (args.noise_dim, ) } } groups = {"agents": args.n_agents} preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } buffer = ReplayBuffer( scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) # Setup multiagent controller here mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # Give runner the scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) if args.use_cuda: learner.cuda() runner.cuda() if args.checkpoint_path != "": timesteps = [] timestep_to_load = 0 if not os.path.isdir(args.checkpoint_path): logger.console_logger.info( "Checkpoint directiory {} doesn't exist".format( args.checkpoint_path)) return # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info("Loading model from {}".format(model_path)) learner.load_models(model_path) runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner) return # start training episode = 0 last_test_T = -args.test_interval - 1 last_log_T = 0 model_save_time = 0 start_time = 
time.time() last_time = start_time logger.console_logger.info("Beginning training for {} timesteps".format( args.t_max)) # min_training_interval # training_interval_count = 0.0 # episode_limit = env_info["episode_limit"] last_train_T = -env_info["episode_limit"] - 1 # args.env_args.episode_limit # train_intervel_step = 0 training_times = 0 while runner.t_env <= args.t_max: # Run for a whole episode at a time time_stamp = time.time() episode_batch = runner.run(test_mode=False) buffer.insert_episode_batch(episode_batch) time_stamp = time_spent(time_stamp, 'Sampling') if buffer.can_sample(args.batch_size): if (runner.t_env - last_train_T) / env_info["episode_limit"] >= 0.9: episode_sample = buffer.sample(args.batch_size) # Truncate batch to only filled timesteps training_times += 1 logger.console_logger.info( "t_env: {} / training_times {}".format( runner.t_env, training_times)) # print('training_times', training_times) max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != args.device: episode_sample.to(args.device) time_stamp = time.time() learner.train(episode_sample, runner.t_env, episode) last_train_T = runner.t_env time_stamp = time_spent(time_stamp, 'Training') # Execute test runs once in a while n_test_runs = max(1, args.test_nepisode // runner.batch_size) if (runner.t_env - last_test_T) / args.test_interval >= 1.0: logger.console_logger.info("t_env: {} / {}".format( runner.t_env, args.t_max)) logger.console_logger.info( "Estimated time left: {}. Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env for _ in range(n_test_runs): runner.run(test_mode=True) if args.noise_bandit: for _ in range(n_test_runs): runner.run(test_mode=True, test_uniform=True) if args.save_model and ( runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.results_path, "models", args.unique_token, str(runner.t_env)) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) runner.save_models(save_path) episode += args.batch_size_run if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env runner.close_env() logger.console_logger.info("Finished Training")
def run_sequential(args, logger):
    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    args.save_model = True  # needs to be set externally

    # Default/Base scheme
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (1,), "group": "agents", "dtype": th.long},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "reward": {"vshape": (1,)},
        "terminated": {"vshape": (1,), "dtype": th.uint8},
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(scheme, groups, args.buffer_size,
                          env_info["episode_limit"] + 1,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    # ---------------------------------
    # Set up the multi-agent controller
    # ---------------------------------
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    # -------------------------
    # If checkpoint_path is not empty, load the model from checkpoint_path first
    # -------------------------
    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        # ----------------------------
        # Load the model from disk
        # 1. Build the model path: args.checkpoint_path corresponds to the checkpoint_path
        #    entry in config/default.yaml
        # 2. Load the model
        # ----------------------------
        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        # ------------------------------
        # If cal_max_expectation_tasks is true in default.yaml, the already trained optimal
        # model is used to compute the maximum expected number of tasks instead of training
        # a new model.
        # ------------------------------
        if args.cal_max_expectation_tasks:
            cal_max_expectation_tasks(args, mac, learner, runner)
            return

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))

    global_reward = []
    global_state = []
    file_path = os.path.join(os.path.dirname(__file__), "envs", "ec", "output", "train_reward.txt")
    state_path = os.path.join(os.path.dirname(__file__), "envs", "ec", "output", "train_state.txt")

    test_state = []
    test_reward = []
    test_state_path = os.path.join(os.path.dirname(__file__), "envs", "ec", "output", "test_state.txt")
    test_reward_path = os.path.join(os.path.dirname(__file__), "envs", "ec", "output", "test_reward.txt")

    while runner.t_env <= args.t_max:  # t_env ?

        # Run for a whole episode at a time
        episode_batch = runner.run(test_mode=False)  # runner.run() returns the data of one episode
        global_reward += get_episode_reward(episode_batch.data.transition_data)  # record the reward of every step
        global_state += get_episode_state(episode_batch.data.transition_data)    # record the state of every step

        # Save the state/reward data collected in test mode. Tests run every args.reward_period
        # steps, and args.reward_period states are tested.
        if runner.t_env % args.reward_period == 0:
            print("--------------------------------- running in test mode -----------------------------------------")
            for i in range(int(args.reward_period / 20)):
                episode_data = runner.run(test_mode=True)  # run in test mode
                test_state += get_episode_state(episode_data.data.transition_data)
                test_reward += get_episode_reward(episode_data.data.transition_data)

        buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate batch to only filled timesteps
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max),
                time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval
                                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()

    save_state_reward(state_path, global_state)
    save_state_reward(file_path, global_reward)
    save_state_reward(test_state_path, test_state)
    save_state_reward(test_reward_path, test_reward)

    logger.console_logger.info("Finished Training")
def run_sequential(args, logger):
    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Setup schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]

    # Default/base scheme
    reward_dict = {"vshape": (1,), "group": "agents", "dtype": th.float32} \
        if args.env_args["reward_local"] else {"vshape": (1,)}
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (1,), "group": "agents", "dtype": th.long},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "reward": reward_dict,
        "terminated": {"vshape": (1,), "dtype": th.uint8},
    }
    # TODO: what is groups controlling?
    groups = {
        "agents": args.n_agents
    }
    # TODO: where/how is preprocessing applied?
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    # TODO: why create replaybuffer with episode limit + 1?
    # (the extra slot holds the data recorded after the last action, e.g. the final state
    #  and avail_actions, which the learner needs for bootstrapping)
    # Setup replaybuffer
    buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multi-agent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Setup runner with created scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Setup learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    # Activate CUDA
    if args.use_cuda:
        learner.cuda()

    # Load checkpoint if necessary
    if args.checkpoint_path != "":

        timesteps = []
        timestep_to_load = 0

        # Check checkpoint path integrity -> exist or else no model can be loaded later
        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        # TODO: enforce learner loading correct model?
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    #
    # Start training
    #
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time -> runner returns an episode batch
        episode_batch = runner.run(test_mode=False)

        # Save episode in replay buffer
        buffer.insert_episode_batch(episode_batch)

        # If enough episodes saved -> sample
        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate batch to only filled timesteps
            # TODO: explain max_t_filled
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            # TODO: when is device differing?!
            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            # Train on sampled episodes
            learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max),
                time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        # Save model after certain time
        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval
                                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        # Increase total episode counter by batch size of episodes currently run
        # TODO: follow batch_size_run!
        episode += args.batch_size_run

        # Log stats in interval
        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
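
# --- Illustrative sketch (not part of the original run.py above) ---
# Why the sampled batch is sliced to max_t_filled() before training: episode batches are
# padded to a fixed max_seq_length and a "filled" mask marks real timesteps, so truncating
# to the longest filled episode in the sample avoids running the learner on all-padding
# steps. Toy stand-in with a hypothetical filled mask; the helper name is an assumption.
def _truncate_sketch():
    import torch as th
    filled = th.tensor([[1, 1, 1, 0, 0],
                        [1, 1, 0, 0, 0]])        # (batch, time) padding mask
    max_t = filled.sum(1).max().item()           # longest filled episode -> 3
    data = th.arange(10).view(2, 5)              # pretend per-timestep data
    return data[:, :max_t]                       # drop the padded tail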
def run_sequential(args, logger):
    """
    The actual training run.
    :param args:
    :type args:
    :param logger:
    :type logger:
    :return:
    :rtype:
    """
    # Init the runner so we can get env info; which runner is used is either ParallelRunner
    # in src/runners/parallel_runner.py or the one in episode_runner.py
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    # number of agents, e.g. 8
    args.n_agents = env_info["n_agents"]
    # number of actions, e.g. 6
    args.n_actions = env_info["n_actions"]
    # dimension of the global state, e.g. 300
    args.state_shape = env_info["state_shape"]

    if getattr(args, 'agent_own_state_size', False):
        args.agent_own_state_size = get_agent_own_state_size(args.env_args)

    # Custom scheme
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (1,), "group": "agents", "dtype": th.long},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "probs": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.float},
        "reward": {"vshape": (1,)},
        "terminated": {"vshape": (1,), "dtype": th.uint8},
    }
    groups = {
        "agents": args.n_agents
    }
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    # Replay buffer
    buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)

    # Set up the multi-agent controller here, e.g. NMAC in src/controllers/n_controller.py
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give the runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner, e.g. NQLearner in src/learners/nq_learner.py; different algorithms
    # initialise different learners
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    if args.checkpoint_path != "":
        # Load a checkpoint and resume training

        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs whose names are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time
        with th.no_grad():
            episode_batch = runner.run(test_mode=False)
            buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate batch to only filled timesteps
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            learner.train(episode_sample, runner.t_env, episode)
            del episode_sample

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max),
                time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval
                                or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
def run_sequential(args, logger): # Init runner so we can get env info runner = r_REGISTRY[args.runner](args=args, logger=logger) th.autograd.set_detect_anomaly(True) # Set up schemes and groups here env_info = runner.get_env_info() args.n_agents = env_info["n_agents"] args.n_actions = env_info["n_actions"] args.state_shape = env_info["state_shape"] args.obs_shape = env_info["obs_shape"] # args.own_feature_size = env_info["own_feature_size"] #unit_type_bits+shield_bits_ally #if args.obs_last_action: # args.own_feature_size+=args.n_actions #if args.obs_agent_id: # args.own_feature_size+=args.n_agents # Default/Base scheme scheme = { "state": { "vshape": env_info["state_shape"] }, "obs": { "vshape": env_info["obs_shape"], "group": "agents" }, "actions": { "vshape": (1, ), "group": "agents", "dtype": th.long }, "avail_actions": { "vshape": (env_info["n_actions"], ), "group": "agents", "dtype": th.int }, "reward": { "vshape": (1, ) }, "terminated": { "vshape": (1, ), "dtype": th.uint8 }, } if args.learner == "hierarchical_rode_learner": scheme.update({ "role_avail_actions": { "vshape": (env_info["n_actions"], ), "group": "agents", "dtype": th.int }, "roles": { "vshape": (1, ), "group": "agents", "dtype": th.long } }) if args.learner == "hierarchical_noise_q_learner": scheme.update({"noise": {"vshape": (args.noise_dim, )}}) groups = {"agents": args.n_agents} preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } buffer = ReplayBuffer( scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) # Setup multiagent controller here if args.q_net_ensemble: mac = [ mac_REGISTRY[args.mac](buffer.scheme, groups, args) for _ in range(args.ensemble_num) ] else: mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # Give runner the scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) if args.use_cuda: learner.cuda() if args.runner == "meta_noise": runner.cuda() if args.checkpoint_path != "": timesteps = [] timestep_to_load = 0 if not os.path.isdir(args.checkpoint_path): logger.console_logger.info( "Checkpoint directiory {} doesn't exist".format( args.checkpoint_path)) return # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info("Loading model from {}".format(model_path)) learner.load_models(model_path) runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner) return # start training episode = 0 last_test_T = -args.test_interval - 1 if args.meta_h: last_meta_T = -args.meta_h_interval - 1 meta_buffer = ReplayBuffer( scheme, groups, args.batch_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) last_log_T = 0 model_save_time = 0 start_time = time.time() last_time = start_time logger.console_logger.info("Beginning training for {} timesteps".format( args.t_max)) 
use_rode = True if args.learner == "hierarchical_rode_learner" else False meta_start_t = 0 if args.learner == "hierarchical_rode_learner": meta_start_t = args.role_action_spaces_update_start if args.save_batch_interval > 0: last_save_batch = -args.save_batch_interval - 1 whole_q_list = [] if args.save_q_all: q_list_ind = 0 while runner.t_env <= args.t_max: # Run for a whole episode at a time # if args.meta_h: # episode_batch, batch_log_p, mean_step_returns = runner.run(test_mode=False, meta_mode=True) # else: # episode_batch, _ = runner.run(test_mode=False) #[8,181,10,1] for actions episode_batch, _ = runner.run( test_mode=False, use_rode=use_rode) #[8,181,10,1] for actions buffer.insert_episode_batch(episode_batch) if buffer.can_sample(args.batch_size) and args.meta_h and \ (runner.t_env - last_meta_T) / args.meta_h_interval >= 1.0 and runner.t_env >= meta_start_t: repeat_times = args.batch_size // runner.batch_size # meta_buffer.insert_episode_batch(episode_batch) batch_log_p_all = [] mean_step_returns_all = [] for _ in range(repeat_times): #[8] # episode_batch, batch_log_p, mean_step_returns = runner.run_meta(test_mode=False, meta_mode=True) # batch_log_p_all.append(batch_log_p) episode_batch, _, mean_step_returns = runner.run_meta( test_mode=False, meta_mode=True, use_rode=use_rode) mean_step_returns_all += mean_step_returns buffer.insert_episode_batch(episode_batch[0]) meta_buffer.insert_episode_batch(episode_batch) #[32] # batch_log_p_all = th.cat(batch_log_p_all, dim=0) for _ in range(repeat_times): episode = prep_ep_and_train(meta_buffer, args, learner, episode, runner.t_env, whole_q_list) mean_step_returns_new_all = [] for _ in range(repeat_times): episode_batch_new, mean_step_returns_new = runner.run_meta( test_mode=False, use_rode=use_rode) buffer.insert_episode_batch(episode_batch_new[0]) mean_step_returns_new_all += mean_step_returns_new #need to get batch_log_p_here batch_log_p_all = runner.get_log_p(meta_buffer) learner.train_meta(batch_log_p_all, mean_step_returns_all, mean_step_returns_new_all, runner.t_env) for _ in range(repeat_times): episode = prep_ep_and_train(buffer, args, learner, episode, runner.t_env, whole_q_list) last_meta_T = runner.t_env elif buffer.can_sample(args.batch_size): prep_ep_and_train(buffer, args, learner, episode, runner.t_env, whole_q_list) # episode_sample = buffer.sample(args.batch_size) #[32,181,10,1] for actions # # Truncate batch to only filled timesteps # max_ep_t = episode_sample.max_t_filled() # episode_sample = episode_sample[:, :max_ep_t] # if episode_sample.device != args.device: # episode_sample.to(args.device) # learner.train(episode_sample, runner.t_env, episode) # Execute test runs once in a while n_test_runs = max(1, args.test_nepisode // runner.batch_size) if (runner.t_env - last_test_T) / args.test_interval >= 1.0: logger.console_logger.info("t_env: {} / {}".format( runner.t_env, args.t_max)) logger.console_logger.info( "Estimated time left: {}. 
Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env save_batch_flag = False discount = 1.0 if args.t_max // 5 <= runner.t_env else 10.0 if args.save_batch_interval > 0 and ( runner.t_env - last_save_batch) / ( args.save_batch_interval // discount) >= 1.0: save_batch_flag = True last_save_batch = runner.t_env for i in range(n_test_runs): if args.runner == "meta" or args.runner == "meta_noise": runner.run_meta(test_mode=True, use_rode=use_rode) else: runner.run(test_mode=True, use_rode=use_rode) if save_batch_flag: save_batch(runner.batch, osp.join(args.tb_logs, "batch"), runner.t_env, i) if args.noise_bandit: for _ in range(n_test_runs): runner.run_meta(test_mode=True, test_uniform=True) if args.save_model and ( runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env)) #"results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) episode += args.batch_size_run if args.runner != "meta" and args.runner != "meta_noise" else 1 if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env if args.save_q_all and len(whole_q_list) >= 4000: save_q(whole_q_list, osp.join(args.tb_logs, "q"), q_list_ind) whole_q_list.clear() q_list_ind += 1 if args.save_q_all and len(whole_q_list) > 0: save_q(whole_q_list, osp.join(args.tb_logs, "q"), q_list_ind) runner.close_env() logger.console_logger.info("Finished Training")
def run_sequential(args, logger): # Init runner (episode runner or parallel runner) so we can get env info runner = r_REGISTRY[args.runner](args=args, logger=logger) # Set up schemes and groups here env_info = runner.get_env_info() args.n_agents = env_info["n_agents"] # from smac maps args.n_actions = env_info["n_actions"] args.state_shape = env_info["state_shape"] # args.unit_type_bits = env_info["unit_type_bits"] # args.shield_bits_ally = env_info["shield_bits_ally"] # args.shield_bits_enemy = env_info["shield_bits_enemy"] # args.n_enemies = env_info["n_enemies"] # Default/Base scheme scheme = { "state": { "vshape": env_info["state_shape"] }, "obs": { "vshape": env_info["obs_shape"], "group": "agents" }, "actions": { "vshape": (1, ), "group": "agents", "dtype": th.long }, "avail_actions": { "vshape": (env_info["n_actions"], ), "group": "agents", "dtype": th.int }, "reward": { "vshape": (1, ) }, "terminated": { "vshape": (1, ), "dtype": th.uint8 }, #"policy": {"vshape": (env_info["n_agents"],)} } groups = {"agents": args.n_agents} preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } buffer = ReplayBuffer( scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) off_buffer = ReplayBuffer( scheme, groups, args.off_buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) # Setup multiagent controller here mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # Give runner the scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) if args.use_cuda: learner.cuda() runner.set_learner(learner) ###### If checkpoint_path is given and args.evaluate == True or args.save_replay == True, ###### then this function returns without training.
if args.checkpoint_path != "": timesteps = [] timestep_to_load = 0 if not os.path.isdir(args.checkpoint_path): logger.console_logger.info( f"Checkpoint directiory {args.checkpoint_path} doesn't exist") return # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info(f"Loading model from {model_path}") learner.load_models(model_path) runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner) return ######################################################################################################## ######## start training episode = 0 last_test_T = -args.test_interval - 1 last_log_T = 0 model_save_time = 0 start_time = time.time() last_time = start_time logger.console_logger.info( f"Beginning training for {args.t_max} timesteps") while runner.t_env <= args.t_max: # critic running log running_log = { "critic_loss": [], "critic_grad_norm": [], "td_error_abs": [], "target_mean": [], "q_taken_mean": [], "q_max_mean": [], "q_min_mean": [], "q_max_var": [], "q_min_var": [] } # Run for a whole episode at a time episode_batch = runner.run(test_mode=False) buffer.insert_episode_batch(episode_batch) off_buffer.insert_episode_batch(episode_batch) if buffer.can_sample(args.batch_size) and off_buffer.can_sample( args.off_batch_size): #train critic normall uni_episode_sample = buffer.uni_sample(args.batch_size) off_episode_sample = off_buffer.uni_sample(args.off_batch_size) max_ep_t = max(uni_episode_sample.max_t_filled(), off_episode_sample.max_t_filled()) uni_episode_sample = process_batch( uni_episode_sample[:, :max_ep_t], args) off_episode_sample = process_batch( off_episode_sample[:, :max_ep_t], args) learner.train_critic(uni_episode_sample, best_batch=off_episode_sample, log=running_log) #train actor episode_sample = buffer.sample_latest(args.batch_size) max_ep_t = episode_sample.max_t_filled() episode_sample = process_batch(episode_sample[:, :max_ep_t], args) learner.train(episode_sample, runner.t_env, running_log) # Execute test runs once in a while n_test_runs = max(1, args.test_nepisode // runner.batch_size) if (runner.t_env - last_test_T) / args.test_interval >= 1.0: logger.console_logger.info(f"t_env: {runner.t_env} / {args.t_max}") logger.console_logger.info( "Estimated time left: {}. 
Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env for _ in range(n_test_runs): runner.run(test_mode=True) # Save model every {save_model_interval} timesteps if args.save_model and ( runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env)) #"results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) logger.console_logger.info(f"Saving models to {save_path}") # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) episode += args.batch_size_run if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env runner.close_env() logger.console_logger.info("Finished Training")