def main(_config, _run): config = convert(_config) _id = _run._id # Logging stuff logger = logging.getLogger("Main") if config.mongo: logging.disable(logging.WARNING) configure_stats_logging( str(_id) + "_" + config.name, log_interval=config.log_interval, sacred_info=_run.info, use_tb=config.tb, ) stats = get_stats() logger.critical("ID: {}".format(_id)) # Update config with environment specific information env = gym.make(config.env) num_actions = env.action_space.n config = config._replace(num_actions=num_actions) state_shape = env.observation_space.shape config = config._replace(state_shape=state_shape) # Wrap env env = EnvWrapper(env, debug=True, args=config) # Log the config config_str = "Config:\n\n" for k, v in sorted(config._asdict().items()): config_str += " {}: {}\n".format(k, v) logger.critical(config_str) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") logger.critical("Device: {}".format(device.type)) # Make agent and target agent agent = get_model(config.agent)(config) target_agent = get_model(config.agent)(config) target_agent.load_state_dict(agent.state_dict()) agent.to(device) target_agent.to(device) # Pseudocount stuff count_model = None if config.count_rewards: if config.atari_count: count_model = AtariCount(config) elif config.rnd_net_count: # assert config.count_state_only_rewards count_model = RndNetworkDistill(config, device) elif config.dora_count: count_model = DoraCount(config, device) else: count_model = PseudoCount(config) # Make action selector action_selector = None if config.action_selector == "eps_greedy": action_selector = eps_greedy.EpsGreedy(config) elif config.action_selector == "optimistic_action": action_selector = optimistic_action.OptimisticAction( count_model, config) elif config.action_selector == "bsp": action_selector = bsp_action.BSPAction(config) else: raise Exception("{} is not an Action Selector!".format( config.action_selector)) # Make replay buffer # Check if the obs dtype of the environment is an int obs_dtype = getattr(env.wrapped_env, "obs_dtype", np.float32) obs_scaling = getattr(env.wrapped_env, "obs_scaling", 1) replay_buffer = ReplayBuffer(size=config.buffer_size, frame_history_len=config.past_frames_input, obs_dtype=obs_dtype, obs_scaling=obs_scaling, args=config) if config.dora_count: dora_buffer = ReplayBuffer(size=config.batch_size * 4, frame_history_len=config.past_frames_input, obs_dtype=obs_dtype, obs_scaling=obs_scaling, args=config) # Make trainer trainer = None if config.trainer == "DQN": trainer = DQNTrainer(agent=agent, target_agent=target_agent, args=config, count_model=count_model, buffer=replay_buffer) else: raise Exception testing_buffer = ReplayBuffer(size=(config.past_frames_input + 1), frame_history_len=config.past_frames_input, args=config) # Testing stuff testing_env = EnvWrapper(env=gym.make(config.env), debug=True, args=config) if config.test_augmented: assert config.action_selector == "optimistic_action" # Player Positions positions = set() action_positions = set() T = 0 start_time = time.time() last_time = start_time # Lots of code duplication :( logging.critical("Filling buffer with {:,} random experiences.".format( config.buffer_burn_in)) state = env.reset() assert config.buffer_burn_in == 0 for t in range(config.buffer_burn_in): buffer_idx = replay_buffer.store_frame(state) stacked_states = replay_buffer.encode_recent_observation() tensor_state = torch.tensor(stacked_states, device=device).unsqueeze(0) action = np.random.randint(config.num_actions) next_state, reward, terminated, info = 
env.step(action) terminal_to_store = terminated if "Steps_Termination" in info and info["Steps_Termination"]: terminal_to_store = False intrinsic_reward = 0 pseudo_count = 0 if config.count_rewards: pseudo_count = count_model.visit(tensor_state, action) if getattr(count_model, "reward_directly", False): intrinsic_reward = pseudo_count else: count_bonus = config.count_beta / sqrt(pseudo_count) intrinsic_reward = count_bonus replay_buffer.store_effect(buffer_idx, action, reward - config.reward_baseline, intrinsic_reward, terminal_to_store, pseudo_count) state = next_state if terminated: state = env.reset() logger.warning("Random action burn in t: {:,}".format(t)) state = env.reset() episode = 0 episode_reward = 0 intrinsic_episode_reward = 0 episode_length = 0 env_positive_reward = 0 max_episode_reward = 0 if config.bsp: bsp_k = np.random.randint(config.bsp_k) action_selector.update_k(bsp_k) logging.critical("Beginning training.") while T < config.t_max: # Store the current state buffer_idx = replay_buffer.store_frame(state) if config.dora_count: dora_idx = dora_buffer.store_frame(state) # Get the stacked input vector stacked_states = replay_buffer.encode_recent_observation() # Get output from agent with torch.no_grad(): tensor_state = torch.tensor(stacked_states, device=device).unsqueeze(0) agent_output = agent(tensor_state) # agent_output = agent(torch.Tensor(stacked_states).unsqueeze(0)) # Select action action, action_info = action_selector.select_actions( agent_output, T, info={"state": tensor_state}) # Take an environment step next_state, reward, terminated, info = env.step(action) T += 1 stats.update_t(T) episode_reward += reward episode_length += 1 terminal_to_store = terminated if "Steps_Termination" in info and info["Steps_Termination"]: logger.warning("Terminating because of episode limit") terminal_to_store = False # Log if a positive reward was ever received from environment. 
~Finding goal if reward > 0.1: env_positive_reward = 1 stats.update_stats("Positive_Reward", env_positive_reward) # Calculate count based intrinsic motivation intrinsic_reward = 0 pseudo_count = 0 if config.count_rewards: pseudo_count = count_model.visit(tensor_state, action) if getattr(count_model, "reward_directly", False): # The count-model is giving us the intrinsic reward directly intrinsic_reward = pseudo_count[0] else: # Count-model is giving us the pseudo-count count_bonus = config.count_beta / sqrt(pseudo_count) intrinsic_reward = count_bonus intrinsic_episode_reward += intrinsic_reward # Render training if config.render_train_env: debug_info = {} debug_info.update(action_info) env.render(debug_info=debug_info) # Add what happened to the buffer replay_buffer.store_effect(buffer_idx, action, reward - config.reward_baseline, intrinsic_reward, terminal_to_store, pseudo_count) if config.dora_count: dora_buffer.store_effect(dora_idx, action, reward - config.reward_baseline, intrinsic_reward, terminal_to_store, pseudo_count) # Update state state = next_state # If terminated if terminated: # If we terminated due to episode limit, we need to add the current state in if "Steps_Termination" in info and info["Steps_Termination"]: buffer_idx = replay_buffer.store_frame(state) replay_buffer.store_effect(buffer_idx, 0, 0, 0, True, 0, dont_sample=True) if config.dora_count: dora_idx = dora_buffer.store_frame(state) dora_buffer.store_effect(dora_idx, 0, 0, 0, True, 0, dont_sample=True) logger.warning("T: {:,}, Episode Reward: {:.2f}".format( T, episode_reward)) state = env.reset() max_episode_reward = max(max_episode_reward, episode_reward) stats.update_stats("Episode Reward", episode_reward) stats.update_stats("Max Episode Reward", max_episode_reward) stats.update_stats("Episode Length", episode_length) stats.update_stats("Intrin Eps Reward", intrinsic_episode_reward) episode_reward = 0 episode_length = 0 intrinsic_episode_reward = 0 episode += 1 stats.update_stats("Episode", episode) if config.bsp: bsp_k = np.random.randint(config.bsp_k) action_selector.update_k(bsp_k) # Train if possible for _ in range(config.training_iters): sampled_batch = None if T % config.update_freq != 0: # Only train every update_freq timesteps continue if replay_buffer.can_sample(config.batch_size): sampled_batch = replay_buffer.sample(config.batch_size, nstep=config.n_step) if sampled_batch is not None: trainer.train(sampled_batch) if config.dora_count: if dora_buffer.can_sample(config.batch_size): sampled_batch = replay_buffer.sample(config.batch_size, nstep=config.n_step) if sampled_batch is not None: count_model.train(sampled_batch) # Update target networks if necessary if T % config.target_update_interval == 0: trainer.update_target_agent() if config.dora_count: count_model.update_target_agent() # Logging if config.bsp: agent_output = agent_output[:, :, bsp_k] q_vals_numpy = agent_output.detach().cpu()[0].numpy() if num_actions < 20: for action_id in range(config.num_actions): stats.update_stats("Q-Value_{}".format(action_id), q_vals_numpy[action_id]) else: stats.update_stats("Q-Value_Mean", np.mean(q_vals_numpy)) player_pos = env.log_visitation() positions.add(player_pos) action_positions.add((player_pos, action)) stats.update_stats("States Visited", len(positions)) stats.update_stats("State_Actions Visited", len(action_positions)) stats.update_stats("Player Position", player_pos) # Log all env stats returned for k, v in info.items(): if k != "Steps_Termination": stats.update_stats(k, v) if config.save_count_gifs 
> 0 and T % config.save_count_gifs == 0: if count_model is not None: state_action_counts, count_nums = env.count_state_action_space( count_model) if state_action_counts is not None: save_image(state_action_counts, image_name="SA_Counts__{}_Size__{}_T".format( config.count_size, T), direc_name="State_Action_Counts") save_sa_count_vals(count_nums, name="SA_PCounts__{}_Size__{}_T".format( config.count_size, T), direc_name="Sa_Count_Estimates") actual_counts = env.state_counts() if actual_counts is not None: save_actual_counts(actual_counts, name="Counts__{}_T".format(T), direc_name="Actual_Counts") q_val_img, q_vals = env.q_value_estimates(count_model, agent) if q_val_img is not None: save_image(q_val_img, image_name="Q_Vals__{}_Size__{}_T".format( config.count_size, T), direc_name="Q_Value_Estimates") if q_vals is not None: save_q_vals(q_vals, name="Q_Vals__{}_Size__{}_T".format( config.count_size, T), direc_name="Q_Value_Estimates") # Testing with torch.no_grad(): if T % config.testing_interval == 0: prefixes = [""] if config.test_augmented: prefixes += ["Aug_"] for prefix in prefixes: total_test_reward = 0 total_test_length = 0 for _ in range(config.test_episodes): test_episode_reward = 0 test_episode_length = 0 test_state = testing_env.reset() test_env_terminated = False while not test_env_terminated: test_buffer_idx = testing_buffer.store_frame( test_state) stacked_test_states = testing_buffer.encode_recent_observation( ) test_tensor_state = torch.tensor( stacked_test_states, device=device).unsqueeze(0) testing_agent_output = agent(test_tensor_state) if prefix == "Aug_" or config.bsp: test_action, _ = action_selector.select_actions( testing_agent_output, T, info={"state": test_tensor_state}, testing=True) else: test_action = get_test_action( testing_agent_output, config) next_test_state, test_reward, test_env_terminated, _ = testing_env.step( test_action) if config.render_test_env: testing_env.render() test_episode_length += 1 test_episode_reward += test_reward testing_buffer.store_effect( test_buffer_idx, test_action, test_reward, 0, test_env_terminated, 0) test_state = next_test_state total_test_length += test_episode_length total_test_reward += test_episode_reward mean_test_reward = total_test_reward / config.test_episodes mean_test_length = total_test_length / config.test_episodes logger.error( "{}Testing -- T: {:,}/{:,}, Test Reward: {:.2f}, Test Length: {:,}" .format(prefix, T, config.t_max, mean_test_reward, mean_test_length)) stats.update_stats("{}Test Reward".format(prefix), mean_test_reward, always_log=True) stats.update_stats("{}Test Episode Length".format(prefix), mean_test_length, always_log=True) logger.error("Estimated time left: {}. Time passed: {}".format( time_left(last_time, T - config.testing_interval, T, config.t_max), time_str(time.time() - start_time))) last_time = time.time() if T % (config.log_interval * 4) == 0: stats.print_stats() logger.critical("Closing envs") env.close() testing_env.close() logger.critical("Finished training.") if client is not None: logger.critical("Attempting to close pymongo client") client.close() logger.critical("Pymongo client closed") logger.critical("Exiting")
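The intrinsic-reward computation in the training loop above reduces to a simple count bonus. The snippet below is a minimal, self-contained sketch of that formula only; count_bonus is an illustrative helper, not a function from this codebase, and the small epsilon floor is an added safeguard.

from math import sqrt

def count_bonus(pseudo_count, beta):
    # Exploration bonus beta / sqrt(N(s, a)), as used when the count model
    # returns a pseudo-count rather than an intrinsic reward directly.
    return beta / sqrt(max(pseudo_count, 1e-8))

# e.g. with beta = 0.1: a state-action seen once gets 0.1, one seen 100 times gets 0.01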
def run_sequential(args, logger): # Init runner so we can get env info runner = r_REGISTRY[args.runner](args=args, logger=logger) # Set up schemes and groups here env_info = runner.get_env_info() args.n_agents = env_info["n_agents"] args.n_actions = env_info["n_actions"] args.state_shape = env_info["state_shape"] args.obs_shape = env_info["obs_shape"] # Default/Base scheme scheme = { "state": { "vshape": env_info["state_shape"] }, "obs": { "vshape": env_info["obs_shape"], "group": "agents" }, "actions": { "vshape": (1, ), "group": "agents", "dtype": th.long }, "avail_actions": { "vshape": (env_info["n_actions"], ), "group": "agents", "dtype": th.int }, "role_avail_actions": { "vshape": (env_info["n_actions"], ), "group": "agents", "dtype": th.int }, "reward": { "vshape": (1, ) }, "terminated": { "vshape": (1, ), "dtype": th.uint8 }, "roles": { "vshape": (1, ), "group": "agents", "dtype": th.long } } groups = {"agents": args.n_agents} preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } buffer = ReplayBuffer( scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) # Setup multiagent controller here mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # Give runner the scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) if args.use_cuda: learner.cuda() if args.checkpoint_path != "": timesteps = [] timestep_to_load = 0 if not os.path.isdir(args.checkpoint_path): logger.console_logger.info( "Checkpoint directiory {} doesn't exist".format( args.checkpoint_path)) return # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info("Loading model from {}".format(model_path)) learner.load_models(model_path) runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner) return # start training episode = 0 last_test_T = -args.test_interval - 1 last_log_T = 0 model_save_time = 0 start_time = time.time() last_time = start_time logger.console_logger.info("Beginning training for {} timesteps".format( args.t_max)) while runner.t_env <= args.t_max: # Run for a whole episode at a time episode_batch = runner.run(test_mode=False) buffer.insert_episode_batch(episode_batch) if buffer.can_sample(args.batch_size): episode_sample = buffer.sample(args.batch_size) # Truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != args.device: episode_sample.to(args.device) learner.train(episode_sample, runner.t_env, episode) # Execute test runs once in a while n_test_runs = max(1, args.test_nepisode // runner.batch_size) if (runner.t_env - last_test_T) / args.test_interval >= 1.0: logger.console_logger.info("t_env: {} / {}".format( runner.t_env, args.t_max)) logger.console_logger.info( "Estimated time left: {}. 
Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env for _ in range(n_test_runs): runner.run(test_mode=True) if args.save_model and ( runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env)) # "results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) episode += args.batch_size_run if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env runner.close_env() logger.console_logger.info("Finished Training")
def run_sequential(args, logger):
    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    # args.action_space = env_info["action_space"]
    args.action_spaces = env_info["action_spaces"]
    args.actions_dtype = env_info["actions_dtype"]
    args.normalise_actions = env_info.get("normalise_actions", False)  # if true, action vectors need to sum to one

    # create function scaling agent action tensors to and from range [0,1]
    ttype = th.FloatTensor if not args.use_cuda else th.cuda.FloatTensor
    mult_coef_tensor = ttype(args.n_agents, args.n_actions)
    action_min_tensor = ttype(args.n_agents, args.n_actions)
    if all([isinstance(act_space, spaces.Box) for act_space in args.action_spaces]):
        for _aid in range(args.n_agents):
            for _actid in range(args.action_spaces[_aid].shape[0]):
                _action_min = args.action_spaces[_aid].low[_actid]
                _action_max = args.action_spaces[_aid].high[_actid]
                # np.asscalar is deprecated in recent NumPy; float(...) is the equivalent
                mult_coef_tensor[_aid, _actid] = np.asscalar(_action_max - _action_min)
                action_min_tensor[_aid, _actid] = np.asscalar(_action_min)
    elif all([isinstance(act_space, spaces.Tuple) for act_space in args.action_spaces]):
        # NOTE: This was added to handle scenarios like simple_reference since the action space is Tuple
        for _aid in range(args.n_agents):
            for _actid in range(args.action_spaces[_aid].spaces[0].shape[0]):
                _action_min = args.action_spaces[_aid].spaces[0].low[_actid]
                _action_max = args.action_spaces[_aid].spaces[0].high[_actid]
                mult_coef_tensor[_aid, _actid] = np.asscalar(_action_max - _action_min)
                action_min_tensor[_aid, _actid] = np.asscalar(_action_min)
            for _actid in range(args.action_spaces[_aid].spaces[1].shape[0]):
                _action_min = args.action_spaces[_aid].spaces[1].low[_actid]
                _action_max = args.action_spaces[_aid].spaces[1].high[_actid]
                tmp_idx = _actid + args.action_spaces[_aid].spaces[0].shape[0]
                mult_coef_tensor[_aid, tmp_idx] = np.asscalar(_action_max - _action_min)
                action_min_tensor[_aid, tmp_idx] = np.asscalar(_action_min)

    args.actions2unit_coef = mult_coef_tensor
    args.actions2unit_coef_cpu = mult_coef_tensor.cpu()
    args.actions2unit_coef_numpy = mult_coef_tensor.cpu().numpy()
    args.actions_min = action_min_tensor
    args.actions_min_cpu = action_min_tensor.cpu()
    args.actions_min_numpy = action_min_tensor.cpu().numpy()

    def actions_to_unit_box(actions):
        if isinstance(actions, np.ndarray):
            return args.actions2unit_coef_numpy * actions + args.actions_min_numpy
        elif actions.is_cuda:
            return args.actions2unit_coef * actions + args.actions_min
        else:
            return args.actions2unit_coef_cpu * actions + args.actions_min_cpu

    def actions_from_unit_box(actions):
        if isinstance(actions, np.ndarray):
            return th.div((actions - args.actions_min_numpy), args.actions2unit_coef_numpy)
        elif actions.is_cuda:
            return th.div((actions - args.actions_min), args.actions2unit_coef)
        else:
            return th.div((actions - args.actions_min_cpu), args.actions2unit_coef_cpu)

    # make conversion functions globally available
    args.actions2unit = actions_to_unit_box
    args.unit2actions = actions_from_unit_box

    action_dtype = th.long if not args.actions_dtype == np.float32 else th.float
    if all([isinstance(act_space, spaces.Box) for act_space in args.action_spaces]):
        actions_vshape = 1 if not args.actions_dtype == np.float32 else max([i.shape[0] for i in args.action_spaces])
    elif all([isinstance(act_space, spaces.Tuple) for act_space in args.action_spaces]):
        actions_vshape = 1 if not args.actions_dtype == np.float32 else \
            max([i.spaces[0].shape[0] + i.spaces[1].shape[0] for i in args.action_spaces])

    # Default/Base scheme
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (actions_vshape,), "group": "agents", "dtype": action_dtype},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "reward": {"vshape": (1,)},
        "terminated": {"vshape": (1,), "dtype": th.uint8},
    }
    groups = {
        "agents": args.n_agents
    }

    if not args.actions_dtype == np.float32:
        preprocess = {
            "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
        }
    else:
        preprocess = {}

    buffer = ReplayBuffer(scheme, groups, args.buffer_size,
                          env_info["episode_limit"] + 1 if args.runner_scope == "episodic" else 2,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)

    # Setup multiagent controller here
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    if args.checkpoint_path != "":
        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))

    while runner.t_env <= args.t_max:

        # Run for a whole episode at a time
        if getattr(args, "runner_scope", "episodic") == "episodic":
            episode_batch = runner.run(test_mode=False, learner=learner)
            buffer.insert_episode_batch(episode_batch)

            if buffer.can_sample(args.batch_size) and (buffer.episodes_in_buffer > getattr(args, "buffer_warmup", 0)):
                episode_sample = buffer.sample(args.batch_size)

                # Truncate batch to only filled timesteps
                max_ep_t = episode_sample.max_t_filled()
                episode_sample = episode_sample[:, :max_ep_t]

                if episode_sample.device != args.device:
                    episode_sample.to(args.device)

                learner.train(episode_sample, runner.t_env, episode)
        elif getattr(args, "runner_scope", "episode") == "transition":
            runner.run(test_mode=False, buffer=buffer, learner=learner, episode=episode)
        else:
            raise Exception("Undefined runner scope!")

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
logger.console_logger.info("Estimated time left: {}. Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env if getattr(args, "testing_on", True): for _ in range(n_test_runs): if getattr(args, "runner_scope", "episodic") == "episodic": runner.run(test_mode=True, learner=learner) elif getattr(args, "runner_scope", "episode") == "transition": runner.run(test_mode=True, buffer = buffer, learner = learner, episode = episode) else: raise Exception("Undefined runner scope!") if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env)) #"results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states # learner.save_models(save_path, args.unique_token, model_save_time) learner.save_models(save_path) episode += args.batch_size_run if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env runner.close_env() logger.console_logger.info("Finished Training")
def run_sequential(args, logger): # Init runner so we can get env info runner = r_REGISTRY[args.runner](args=args, logger=logger) # Set up schemes and groups here env_info = runner.get_env_info() args.episode_limit = env_info["episode_limit"] args.n_agents = env_info["n_agents"] args.n_actions = env_info["n_actions"] args.state_shape = env_info["state_shape"] args.unit_dim = env_info["unit_dim"] # Default/Base scheme scheme = { "state": { "vshape": env_info["state_shape"] }, "obs": { "vshape": env_info["obs_shape"], "group": "agents" }, "actions": { "vshape": (1, ), "group": "agents", "dtype": th.long }, "avail_actions": { "vshape": (env_info["n_actions"], ), "group": "agents", "dtype": th.int }, "reward": { "vshape": (1, ) }, "terminated": { "vshape": (1, ), "dtype": th.uint8 }, } groups = {"agents": args.n_agents} preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } env_name = args.env if env_name == 'sc2': env_name += '/' + args.env_args['map_name'] buffer = ReplayBuffer( scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, args.burn_in_period, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) if args.is_save_buffer: save_buffer = ReplayBuffer( scheme, groups, args.save_buffer_size, env_info["episode_limit"] + 1, args.burn_in_period, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) if args.is_batch_rl: assert (args.is_save_buffer == False) x_env_name = env_name if args.is_from_start: x_env_name += '_from_start/' path_name = '../../buffer/' + x_env_name + '/buffer_' + str( args.load_buffer_id) + '/' assert (os.path.exists(path_name) == True) buffer.load(path_name) # Setup multiagent controller here mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # Give runner the scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) if args.use_cuda: learner.cuda() if args.checkpoint_path != "": timesteps = [] timestep_to_load = 0 if not os.path.isdir(args.checkpoint_path): logger.console_logger.info( "Checkpoint directiory {} doesn't exist".format( args.checkpoint_path)) return # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info("Loading model from {}".format(model_path)) learner.load_models(model_path) runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner) return # start training episode = 0 last_test_T = -args.test_interval - 1 last_log_T = 0 model_save_time = 0 start_time = time.time() last_time = start_time logger.console_logger.info("Beginning training for {} timesteps".format( args.t_max)) if args.env == 'matrix_game_1' or args.env == 'matrix_game_2' or args.env == 'matrix_game_3' \ or args.env == 'mmdp_game_1': last_demo_T = -args.demo_interval - 1 while runner.t_env <= args.t_max: if not args.is_batch_rl: # Run for a whole episode at a time episode_batch = runner.run(test_mode=False) 
buffer.insert_episode_batch(episode_batch) if args.is_save_buffer: save_buffer.insert_episode_batch(episode_batch) if save_buffer.is_from_start and save_buffer.episodes_in_buffer == save_buffer.buffer_size: save_buffer.is_from_start = False save_one_buffer(args, save_buffer, env_name, from_start=True) if save_buffer.buffer_index % args.save_buffer_interval == 0: print('current episodes_in_buffer: ', save_buffer.episodes_in_buffer) for _ in range(args.num_circle): if buffer.can_sample(args.batch_size): episode_sample = buffer.sample(args.batch_size) if args.is_batch_rl: runner.t_env += int( th.sum(episode_sample['filled']).cpu().clone().detach( ).numpy()) // args.batch_size # Truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != args.device: episode_sample.to(args.device) learner.train(episode_sample, runner.t_env, episode) if args.env == 'mmdp_game_1' and args.learner == "q_learner_exp": for i in range(int(learner.target_gap) - 1): episode_sample = buffer.sample(args.batch_size) # Truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != args.device: episode_sample.to(args.device) learner.train(episode_sample, runner.t_env, episode) # Execute test runs once in a while n_test_runs = max(1, args.test_nepisode // runner.batch_size) if (runner.t_env - last_test_T) / args.test_interval >= 1.0: logger.console_logger.info("t_env: {} / {}".format( runner.t_env, args.t_max)) logger.console_logger.info( "Estimated time left: {}. Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env for _ in range(n_test_runs): runner.run(test_mode=True) if args.env == 'mmdp_game_1' and \ (runner.t_env - last_demo_T) / args.demo_interval >= 1.0 and buffer.can_sample(args.batch_size): ### demo episode_sample = cp.deepcopy(buffer.sample(1)) for i in range(args.n_actions): for j in range(args.n_actions): new_actions = th.Tensor([i, j]).unsqueeze(0).repeat( args.episode_limit + 1, 1) if i == 0 and j == 0: rew = th.Tensor([ 1, ]) else: rew = th.Tensor([ 0, ]) if i == 1 and j == 1: new_obs = th.Tensor( [1, 0]).unsqueeze(0).unsqueeze(0).repeat( args.episode_limit, args.n_agents, 1) else: new_obs = th.Tensor( [0, 1]).unsqueeze(0).unsqueeze(0).repeat( args.episode_limit, args.n_agents, 1) # Truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] episode_sample['actions'][0, :, :, 0] = new_actions episode_sample['obs'][0, 1:, :, :] = new_obs episode_sample['reward'][0, 0, 0] = rew new_actions_onehot = th.zeros( episode_sample['actions'].squeeze(3).shape + (args.n_actions, )) new_actions_onehot = new_actions_onehot.scatter_( 3, episode_sample['actions'].cpu(), 1) episode_sample['actions_onehot'][:] = new_actions_onehot if episode_sample.device != args.device: episode_sample.to(args.device) #print("action pair: %d, %d" % (i, j)) learner.train(episode_sample, runner.t_env, episode, show_demo=True, save_data=(i, j)) last_demo_T = runner.t_env #time.sleep(1) if (args.env == 'matrix_game_1' or args.env == 'matrix_game_2' or args.env == 'matrix_game_3') and \ (runner.t_env - last_demo_T) / args.demo_interval >= 1.0 and buffer.can_sample(args.batch_size): ### demo episode_sample = cp.deepcopy(buffer.sample(1)) for i in range(args.n_actions): for j in 
range(args.n_actions): new_actions = th.Tensor([i, j]).unsqueeze(0).repeat( args.episode_limit + 1, 1) # Truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] episode_sample['actions'][0, :, :, 0] = new_actions new_actions_onehot = th.zeros( episode_sample['actions'].squeeze(3).shape + (args.n_actions, )).cuda() new_actions_onehot = new_actions_onehot.scatter_( 3, episode_sample['actions'].cuda(), 1) episode_sample['actions_onehot'][:] = new_actions_onehot if i == 0 and j == 0: rew = th.Tensor([ 8, ]) elif i == 0 or j == 0: rew = th.Tensor([ -12, ]) else: rew = th.Tensor([ 0, ]) if args.env == 'matrix_game_3': if i == 1 and j == 1 or i == 2 and j == 2: rew = th.Tensor([ 6, ]) episode_sample['reward'][0, 0, 0] = rew if episode_sample.device != args.device: episode_sample.to(args.device) #print("action pair: %d, %d" % (i, j)) learner.train(episode_sample, runner.t_env, episode, show_demo=True, save_data=(i, j)) last_demo_T = runner.t_env #time.sleep(1) if args.save_model and ( runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env)) #"results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) if args.double_q: os.makedirs(save_path + '_x', exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) episode += args.batch_size_run * args.num_circle if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env if args.is_save_buffer and save_buffer.is_from_start: save_buffer.is_from_start = False save_one_buffer(args, save_buffer, env_name, from_start=True) runner.close_env() logger.console_logger.info("Finished Training")
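The demo blocks above rebuild actions_onehot by hand with scatter_. A minimal example of that pattern with small, made-up shapes, shown only to make the indexing concrete.

import torch as th

actions = th.tensor([[[[0], [2]]]])                          # (batch=1, t=1, n_agents=2, 1)
n_actions = 3
onehot = th.zeros(actions.squeeze(3).shape + (n_actions,))   # (1, 1, 2, 3)
onehot.scatter_(3, actions, 1)                               # one-hot along the last dimension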
def train(args, logger, learner, runner, buffer, engine_configuration_channel):
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))

    while runner.t_env <= args.t_max:
        engine_configuration_channel.set_configuration_parameters(time_scale=args.learning_time_scale)

        episode_batch = runner.run(test_mode=False)
        buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            # Train the coordinator.
            learner.train(episode_sample, runner.t_env, episode)

        n_test_runs = max(1, args.test_nepisode // runner.batch_size)

        # Run tests at a regular interval.
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:
            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max),
                time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            engine_configuration_channel.set_configuration_parameters(time_scale=args.test_time_scale)
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        # Save the trained weights at a regular interval.
        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()
    logger.console_logger.info("Finished Training")
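All of these loops gate testing, logging, and model saving on the same elapsed-steps pattern. A tiny sketch of that check; interval_elapsed is a hypothetical helper name, not used in the repo.

def interval_elapsed(t_env, last_t, interval):
    # True once at least one full interval of environment steps has passed.
    return (t_env - last_t) / interval >= 1.0

# e.g. run tests when interval_elapsed(runner.t_env, last_test_T, args.test_interval)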
def run_sequential(args, logger): # Init runner so we can get env info runner = r_REGISTRY[args.runner](args=args, logger=logger) to_index_flag = False if hasattr(args, 'to_index_flag'): if args.to_index_flag: to_index_flag = True # Set up schemes and groups here env_info = runner.get_env_info() # if args.disc_state: # if args.env_args["map_name"] == '3m': # state_num = 1077 # if to_index_flag: # pass # else: # state_shape = state_num # elif args.env_args["map_name"] == 'corridor': # state_num = 5280 # if to_index_flag: # pass # else: # state_shape = state_num # elif args.env_args["map_name"] == '6h_vs_8z': # state_num = 2884 # if to_index_flag: # state_shape = 62 # else: # state_shape = state_num # elif args.env_args["map_name"] == '2s3z': # state_num = 2325 # # state_num = 165 # if to_index_flag: # state_shape = 20 # else: # state_shape = state_num # else: # raise NotImplementedError # else: state_shape = env_info["state_shape"] state_num = env_info.get("state_num", None) # TEST # if args.env_args["map_name"] == '2s3z': # state_shape = 120 # state_num = state_shape # state_shape = env_info["state_shape"] # state_num = env_info.get("state_num", None) # state_num = state_shape args.n_agents = env_info["n_agents"] args.n_actions = env_info["n_actions"] args.state_shape = state_shape args.state_num = state_num args.all_obs = env_info.get("all_obs", None) # Default/Base scheme scheme = { "state": { "vshape": state_shape }, # "state": {"vshape": state_num}, # TEST "obs": { "vshape": env_info["obs_shape"], "group": "agents" }, "actions": { "vshape": (1, ), "group": "agents", "dtype": th.long }, "avail_actions": { "vshape": (env_info["n_actions"], ), "group": "agents", "dtype": th.int }, "reward": { "vshape": (1, ) }, "terminated": { "vshape": (1, ), "dtype": th.uint8 }, "noise": { "vshape": (args.noise_dim, ) } } groups = {"agents": args.n_agents} preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } buffer = ReplayBuffer( scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) # Setup multiagent controller here mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # Give runner the scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) if args.use_cuda: learner.cuda() runner.cuda() if args.checkpoint_path != "": timesteps = [] timestep_to_load = 0 if not os.path.isdir(args.checkpoint_path): logger.console_logger.info( "Checkpoint directiory {} doesn't exist".format( args.checkpoint_path)) return # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info("Loading model from {}".format(model_path)) learner.load_models(model_path) runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner) return # start training episode = 0 last_test_T = -args.test_interval - 1 last_log_T = 0 model_save_time = 0 start_time = 
time.time() last_time = start_time logger.console_logger.info("Beginning training for {} timesteps".format( args.t_max)) # min_training_interval # training_interval_count = 0.0 # episode_limit = env_info["episode_limit"] last_train_T = -env_info["episode_limit"] - 1 # args.env_args.episode_limit # train_intervel_step = 0 training_times = 0 while runner.t_env <= args.t_max: # Run for a whole episode at a time time_stamp = time.time() episode_batch = runner.run(test_mode=False) buffer.insert_episode_batch(episode_batch) time_stamp = time_spent(time_stamp, 'Sampling') if buffer.can_sample(args.batch_size): if (runner.t_env - last_train_T) / env_info["episode_limit"] >= 0.9: episode_sample = buffer.sample(args.batch_size) # Truncate batch to only filled timesteps training_times += 1 logger.console_logger.info( "t_env: {} / training_times {}".format( runner.t_env, training_times)) # print('training_times', training_times) max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != args.device: episode_sample.to(args.device) time_stamp = time.time() learner.train(episode_sample, runner.t_env, episode) last_train_T = runner.t_env time_stamp = time_spent(time_stamp, 'Training') # Execute test runs once in a while n_test_runs = max(1, args.test_nepisode // runner.batch_size) if (runner.t_env - last_test_T) / args.test_interval >= 1.0: logger.console_logger.info("t_env: {} / {}".format( runner.t_env, args.t_max)) logger.console_logger.info( "Estimated time left: {}. Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env for _ in range(n_test_runs): runner.run(test_mode=True) if args.noise_bandit: for _ in range(n_test_runs): runner.run(test_mode=True, test_uniform=True) if args.save_model and ( runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.results_path, "models", args.unique_token, str(runner.t_env)) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) runner.save_models(save_path) episode += args.batch_size_run if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env runner.close_env() logger.console_logger.info("Finished Training")
def run_sequential(args, logger):
    # Init runner so we can get env info
    runner = r_REGISTRY[args.runner](args=args, logger=logger)

    # Set up schemes and groups here
    env_info = runner.get_env_info()
    args.n_agents = env_info["n_agents"]
    args.n_actions = env_info["n_actions"]
    args.state_shape = env_info["state_shape"]
    args.save_model = True  # needs to be set externally

    # Default/Base scheme
    scheme = {
        "state": {"vshape": env_info["state_shape"]},
        "obs": {"vshape": env_info["obs_shape"], "group": "agents"},
        "actions": {"vshape": (1,), "group": "agents", "dtype": th.long},
        "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int},
        "reward": {"vshape": (1,)},
        "terminated": {"vshape": (1,), "dtype": th.uint8},
    }
    groups = {"agents": args.n_agents}
    preprocess = {
        "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)])
    }

    buffer = ReplayBuffer(scheme, groups, args.buffer_size,
                          env_info["episode_limit"] + 1,
                          preprocess=preprocess,
                          device="cpu" if args.buffer_cpu_only else args.device)

    # ---------------------------------
    # Set up the multi-agent controller
    # ---------------------------------
    mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args)

    # Give runner the scheme
    runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac)

    # Learner
    learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args)

    if args.use_cuda:
        learner.cuda()

    # -------------------------
    # If checkpoint_path is not empty, the model must first be loaded from checkpoint_path
    # -------------------------
    if args.checkpoint_path != "":
        timesteps = []
        timestep_to_load = 0

        if not os.path.isdir(args.checkpoint_path):
            logger.console_logger.info("Checkpoint directory {} doesn't exist".format(args.checkpoint_path))
            return

        # Go through all files in args.checkpoint_path
        for name in os.listdir(args.checkpoint_path):
            full_name = os.path.join(args.checkpoint_path, name)
            # Check if they are dirs the names of which are numbers
            if os.path.isdir(full_name) and name.isdigit():
                timesteps.append(int(name))

        if args.load_step == 0:
            # choose the max timestep
            timestep_to_load = max(timesteps)
        else:
            # choose the timestep closest to load_step
            timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step))

        # ----------------------------
        # Load the model from disk
        # 1. Set the model path: args.checkpoint_path corresponds to the checkpoint_path entry in config/default.yaml
        # 2. Load the model
        # ----------------------------
        model_path = os.path.join(args.checkpoint_path, str(timestep_to_load))

        logger.console_logger.info("Loading model from {}".format(model_path))
        learner.load_models(model_path)
        runner.t_env = timestep_to_load

        # ------------------------------
        # If cal_max_expectation_tasks is true in default.yaml, the already-trained best model is used to
        # compute the maximum expected number of tasks, and no training is performed.
        # ------------------------------
        if args.cal_max_expectation_tasks:
            cal_max_expectation_tasks(args, mac, learner, runner)
            return

        if args.evaluate or args.save_replay:
            evaluate_sequential(args, runner)
            return

    # start training
    episode = 0
    last_test_T = -args.test_interval - 1
    last_log_T = 0
    model_save_time = 0

    start_time = time.time()
    last_time = start_time

    logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max))

    global_reward = []
    global_state = []
    file_path = os.path.join(os.path.dirname(__file__), "envs", "ec", "output", "train_reward.txt")
    state_path = os.path.join(os.path.dirname(__file__), "envs", "ec", "output", "train_state.txt")
    test_state = []
    test_reward = []
    test_state_path = os.path.join(os.path.dirname(__file__), "envs", "ec", "output", "test_state.txt")
    test_reward_path = os.path.join(os.path.dirname(__file__), "envs", "ec", "output", "test_reward.txt")

    while runner.t_env <= args.t_max:  # t_env ?

        # Run for a whole episode at a time
        episode_batch = runner.run(test_mode=False)  # runner.run() returns one episode of data
        global_reward += get_episode_reward(episode_batch.data.transition_data)  # record the reward of every step
        global_state += get_episode_state(episode_batch.data.transition_data)  # record the state of every step

        # Save the state/reward data from test mode. A test is run every args.reward_period steps,
        # and args.reward_period states are tested.
        if runner.t_env % args.reward_period == 0:
            print("--------------------------------- Test mode -----------------------------------------")
            for i in range(int(args.reward_period / 20)):
                episode_data = runner.run(test_mode=True)  # run in test mode
                test_state += get_episode_state(episode_data.data.transition_data)
                test_reward += get_episode_reward(episode_data.data.transition_data)

        buffer.insert_episode_batch(episode_batch)

        if buffer.can_sample(args.batch_size):
            episode_sample = buffer.sample(args.batch_size)

            # Truncate batch to only filled timesteps
            max_ep_t = episode_sample.max_t_filled()
            episode_sample = episode_sample[:, :max_ep_t]

            if episode_sample.device != args.device:
                episode_sample.to(args.device)

            learner.train(episode_sample, runner.t_env, episode)

        # Execute test runs once in a while
        n_test_runs = max(1, args.test_nepisode // runner.batch_size)
        if (runner.t_env - last_test_T) / args.test_interval >= 1.0:

            logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max))
            logger.console_logger.info("Estimated time left: {}. Time passed: {}".format(
                time_left(last_time, last_test_T, runner.t_env, args.t_max),
                time_str(time.time() - start_time)))
            last_time = time.time()

            last_test_T = runner.t_env
            for _ in range(n_test_runs):
                runner.run(test_mode=True)

        if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0):
            model_save_time = runner.t_env
            save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env))
            # "results/models/{}".format(unique_token)
            os.makedirs(save_path, exist_ok=True)
            logger.console_logger.info("Saving models to {}".format(save_path))

            # learner should handle saving/loading -- delegate actor save/load to mac,
            # use appropriate filenames to do critics, optimizer states
            learner.save_models(save_path)

        episode += args.batch_size_run

        if (runner.t_env - last_log_T) >= args.log_interval:
            logger.log_stat("episode", episode, runner.t_env)
            logger.print_recent_stats()
            last_log_T = runner.t_env

    runner.close_env()

    save_state_reward(state_path, global_state)
    save_state_reward(file_path, global_reward)
    save_state_reward(test_state_path, test_state)
    save_state_reward(test_reward_path, test_reward)

    logger.console_logger.info("Finished Training")
def run_sequential(args, logger): # Init runner so we can get env info runner = r_REGISTRY[args.runner](args=args, logger=logger) # Set up schemes and groups here env_info = runner.get_env_info() args.n_agents = env_info["n_agents"] args.n_actions = env_info["n_actions"] args.state_shape = env_info["state_shape"] args.episode_limit = env_info["episode_limit"] # Default/Base scheme scheme = { "state": {"vshape": env_info["state_shape"]}, "obs": {"vshape": env_info["obs_shape"], "group": "agents"}, "actions": {"vshape": (1,), "group": "agents", "dtype": th.long}, "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int}, "reward": {"vshape": (1,)}, "terminated": {"vshape": (1,), "dtype": th.uint8}, "battle_won": {"vshape": (1,), "dtype": th.uint8}, } groups = { "agents": args.n_agents } preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device, save_episodes=True if args.save_episodes else False, episode_dir=args.episode_dir, clear_existing_episodes=args.clear_existing_episodes) # TODO maybe just pass args # Setup multiagent controller here mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # Give runner the scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) # Model learner model_learner = None model_buffer = None if args.model_learner: model_learner = le_REGISTRY[args.model_learner](mac, scheme, logger, args) model_buffer = ReplayBuffer(scheme, groups, args.model_buffer_size, buffer.max_seq_length, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device, save_episodes=False) if args.use_cuda: learner.cuda() if model_learner: model_learner.cuda() if args.checkpoint_path != "": if not os.path.isdir(args.checkpoint_path): logger.console_logger.info("Checkpoint directiory {} doesn't exist".format(args.checkpoint_path)) return timestep_to_load = 0 if args.rl_checkpoint: rl_timesteps = [] # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers name = name.replace('rl_', '') if os.path.isdir(full_name) and name.isdigit(): rl_timesteps.append(int(name)) load_step = int(args.load_step.replace('rl_', '')) if isinstance(args.load_step, str) else args.load_step if load_step == 0: # choose the max timestep timestep_to_load = max(rl_timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(rl_timesteps, key=lambda x: abs(x - load_step)) model_path = os.path.join(args.checkpoint_path, f"rl_{timestep_to_load}") else: timesteps = [] # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info("Loading model from {}".format(model_path)) learner.load_models(model_path) 
runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner, buffer) return # TODO checkpoints for model_learner # start training episode = 0 last_test_T = -args.test_interval - 1 last_log_T = 0 model_save_time = 0 start_time = time.time() last_time = start_time # new stuff collect_episodes = True collected_episodes = 0 train_rl = False rl_iterations = 0 model_trained = False n_model_trained = 0 last_rl_T = 0 rl_model_save_time = 0 logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max)) while runner.t_env <= args.t_max: if model_learner: if collect_episodes: episode_batch = runner.run(test_mode=False) # collect real episode to progress t_env print(f"Collecting {args.batch_size_run} episodes from REAL ENV using epsilon: {runner.mac.env_action_selector.epsilon:.2f}, t_env: {runner.t_env}, collected episodes: {collected_episodes}") buffer.insert_episode_batch(episode_batch) collected_episodes += args.batch_size_run n_collect = args.model_n_collect_episodes if model_trained else args.model_n_collect_episodes_initial if collected_episodes >= n_collect: print(f"Collected {collected_episodes} REAL episodes, training ENV model") # stop collection and train model collect_episodes = False collected_episodes = 0 model_learner.train(buffer, runner.t_env, plot_test_results=False) model_trained = True n_model_trained += 1 train_rl = True if args.model_rollout_before_rl: print(f"Generating {args.model_rollouts} MODEL episodes") rollouts = 0 rollout_batch_size = min(buffer.episodes_in_buffer, args.model_rollout_batch_size) while rollouts < args.model_rollouts: model_batch = model_learner.generate_batch(buffer, rollout_batch_size, rl_iterations) model_buffer.insert_episode_batch(model_batch) rollouts += rollout_batch_size if train_rl: # and model_buffer.can_sample(args.batch_size): # generate synthetic episodes under current policy if not args.model_rollout_before_rl: print(f"Generating {args.model_rollouts} MODEL episodes") rollout_batch_size = min(buffer.episodes_in_buffer, args.model_rollout_batch_size) model_batch = model_learner.generate_batch(buffer, rollout_batch_size, rl_iterations) model_buffer.insert_episode_batch(model_batch) if model_buffer.can_sample(args.batch_size): for _ in range(args.model_rl_iterations_per_generated_sample): episode_sample = model_buffer.sample(args.batch_size) # truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != args.device: episode_sample.to(args.device) # train RL agent learner.train(episode_sample, runner.t_env, rl_iterations) rl_iterations += 1 print(f"Model RL iteration {rl_iterations}, t_env: {runner.t_env}") if not collect_episodes and rl_iterations > 0 and rl_iterations % args.model_update_interval == 0: if args.max_model_trained == 0 or args.max_model_trained and n_model_trained < args.max_model_trained: print(f"Time to update model") collect_episodes = True train_rl = False # update stats model_learner.log_stats(runner.t_env) if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("model_rl_iterations", rl_iterations, runner.t_env) if (rl_iterations > 0 and (rl_iterations - last_rl_T) /args.rl_test_interval >= 1.0): print(f"Logging rl stats") model_learner.log_rl_stats(rl_iterations) else: episode_batch = runner.run(test_mode=False) buffer.insert_episode_batch(episode_batch) if args.save_episodes and args.save_policy_outputs and args.runner == "episode": 
mac.save_policy_outputs() if buffer.can_sample(args.batch_size): for _ in range(args.batch_size_run): episode_sample = buffer.sample(args.batch_size) # Truncate batch to only filled timesteps max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != args.device: episode_sample.to(args.device) learner.train(episode_sample, runner.t_env, episode) rl_iterations += 1 print(f"RL iteration {rl_iterations}, t_env: {runner.t_env}") # Execute test runs once in a while n_test_runs = max(1, args.test_nepisode // runner.batch_size) if ((runner.t_env - last_test_T) / args.test_interval >= 1.0) or (rl_iterations > 0 and (rl_iterations - last_rl_T) /args.rl_test_interval >= 1.0): print("Running test cases") logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max)) logger.console_logger.info("Estimated time left: {}. Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env last_rl_T = rl_iterations runner.t_rl = rl_iterations for _ in range(n_test_runs): runner.run(test_mode=True) logger.print_recent_stats() if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env)) # "results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) if args.save_model and model_trained and (rl_iterations == 0 or (rl_iterations - rl_model_save_time)/args.rl_save_model_interval >= 1.0): print(f"Saving at RL model iteration {rl_iterations}") rl_model_save_time = rl_iterations save_path = os.path.join(args.local_results_path, "models", args.unique_token, f"rl_{rl_iterations}") # "results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) episode += args.batch_size_run if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("rl_iterations", rl_iterations, runner.t_env) logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env runner.close_env() logger.console_logger.info("Finished Training")
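The model-based branch above alternates between collecting real episodes, fitting an environment model, and training the RL learner on synthetic rollouts. Below is a condensed, hypothetical sketch of one such cycle that uses only calls appearing in the code above and omits the collection/update scheduling flags; it is not the repo's actual control flow.

def dyna_cycle(runner, buffer, model_buffer, model_learner, learner, args, rl_iterations):
    # 1. Collect real experience.
    buffer.insert_episode_batch(runner.run(test_mode=False))
    # 2. Fit the environment model on the real buffer.
    model_learner.train(buffer, runner.t_env, plot_test_results=False)
    # 3. Generate synthetic episodes under the current policy.
    rollout_batch_size = min(buffer.episodes_in_buffer, args.model_rollout_batch_size)
    model_buffer.insert_episode_batch(model_learner.generate_batch(buffer, rollout_batch_size, rl_iterations))
    # 4. Train the RL learner on synthetic data only.
    if model_buffer.can_sample(args.batch_size):
        episode_sample = model_buffer.sample(args.batch_size)
        episode_sample = episode_sample[:, :episode_sample.max_t_filled()]
        learner.train(episode_sample, runner.t_env, rl_iterations)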
def run_sequential(args, logger): # Init runner so we can get env info runner = r_REGISTRY[args.runner](args=args, logger=logger) # Setup schemes and groups here env_info = runner.get_env_info() args.n_agents = env_info["n_agents"] args.n_actions = env_info["n_actions"] args.state_shape = env_info["state_shape"] # Default/base scheme reward_dict = {"vshape": (1,), "group": "agents", "dtype": th.float32} if args.env_args["reward_local"] else {"vshape": (1,)} scheme = { "state": {"vshape": env_info["state_shape"]}, "obs": {"vshape": env_info["obs_shape"], "group": "agents"}, "actions": {"vshape": (1,), "group": "agents", "dtype": th.long}, "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int}, "reward": reward_dict, "terminated": {"vshape": (1,), "dtype": th.uint8}, } # TODO: what is groups controlling? groups = { "agents": args.n_agents } # TODO: where/how is pre processing applied? preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } # TODO: why create replaybuffer with episode limit + 1? # Setup replaybuffer buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) # Setup multi-agent controller here mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # Setup runner with created scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Setup learner learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) # Activate CUDA if args.use_cuda: learner.cuda() # Load checkpoint if necessary if args.checkpoint_path != "": timesteps = [] timestep_to_load = 0 # Check checkpoint path integrity -> exist or else no model can be loaded later if not os.path.isdir(args.checkpoint_path): logger.console_logger.info("Checkpoint directory {} doesn't exist".format(args.checkpoint_path)) return # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info("Loading model from {}".format(model_path)) # TODO: enforce learner loading correct model? learner.load_models(model_path) runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner) return # # Start training # episode = 0 last_test_T = -args.test_interval - 1 last_log_T = 0 model_save_time = 0 start_time = time.time() last_time = start_time logger.console_logger.info("Beginning training for {} timesteps".format(args.t_max)) while runner.t_env <= args.t_max: # Run for a whole episode at a time -> runner returns a episode batch episode_batch = runner.run(test_mode=False) # Save episode in replay buffer buffer.insert_episode_batch(episode_batch) # If enough episodes saved -> sample if buffer.can_sample(args.batch_size): episode_sample = buffer.sample(args.batch_size) # Truncate batch to only filled timesteps # TODO: explain max_t_filled max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] # TODO: when is device differing?! 
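# (Re: the TODO above -- the sampled batch's device differs from args.device when
# buffer_cpu_only is set: the replay buffer then lives on the CPU while the learner
# trains on args.device, so sampled episodes have to be moved before training.)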
if episode_sample.device != args.device: episode_sample.to(args.device) # Train on sampled episodes learner.train(episode_sample, runner.t_env, episode) # Execute test runs once in a while n_test_runs = max(1, args.test_nepisode // runner.batch_size) if (runner.t_env - last_test_T) / args.test_interval >= 1.0: logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max)) logger.console_logger.info("Estimated time left: {}. Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env for _ in range(n_test_runs): runner.run(test_mode=True) # Save model after certain time if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env)) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) # Increase total episode counter by batch size of episodes currently run # TODO: follow batch_size_run! episode += args.batch_size_run # Log stats in interval if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env runner.close_env() logger.console_logger.info("Finished Training")
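# Every variant above truncates sampled episodes to max_t_filled before training, which
# drops the zero-padded tail left by episodes shorter than episode_limit so the learner
# never trains on padding. A minimal sketch of the idea, assuming a per-timestep
# "filled" mask like the one the batch scheme carries; the real EpisodeBatch hides this
# behind max_t_filled() and slicing.
import torch as th


def truncate_to_filled(batch_tensor, filled_mask):
    """batch_tensor: [batch, time, ...]; filled_mask: [batch, time] of 0/1."""
    max_t_filled = int(filled_mask.sum(dim=1).max().item())  # longest filled prefix in the batch
    return batch_tensor[:, :max_t_filled]


if __name__ == "__main__":
    obs = th.randn(2, 6, 3)                        # 2 episodes padded to 6 steps
    filled = th.tensor([[1, 1, 1, 0, 0, 0],
                        [1, 1, 1, 1, 0, 0]])       # true lengths 3 and 4
    print(truncate_to_filled(obs, filled).shape)   # torch.Size([2, 4, 3])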
def run_reptile(args, logger, _log, _run): loggers = {} runners = {} macs = {} learners = {} buffers = {} agent_state_dict = None import yaml #from .main import _get_config # compile all the relevant task configs task_configs = {} class Bunch(object): def __init__(self, adict): self.__dict__.update(adict) r = np.random.RandomState(args.seed) for k, v in sorted(args.tasks.items()): # important for reproducibility of seeds! # Get the defaults from default.yaml with open(os.path.join(os.path.dirname(__file__), "config", "default.yaml"), "r") as f: try: config_dict = yaml.load(f) except yaml.YAMLError as exc: assert False, "default.yaml error: {}".format(exc) # Load algorithm and env base configs params = ["", "--config={}".format(v.pop("config")), "--env-config={}".format(v.pop("env-config"))] alg_config = _get_config(params, "--config", "algs") env_config = _get_config(params, "--env-config", "envs") # config_dict = {**config_dict, **env_config, **alg_config} config_dict = recursive_dict_update(config_dict, env_config) config_dict = recursive_dict_update(config_dict, alg_config) config_dict = recursive_dict_update(config_dict, v) # from src.utils.dict2namedtuple import convert config_dict.pop("no-mongo") config_dict["seed"] = r.randint(0, 2**31-1) # have to set manually config_dict["env_args"]["seed"] = r.randint(0, 2**31-1) config_dict["device"] = args.device config_dict["unique_token"] = "{}__{}".format(args.unique_token, k) task_configs[k] = Bunch(config_dict) def setup_components(logger, agent_state_dict): task_names = [] for task_name, _ in task_configs.items(): task_names.append(task_name) # set up tasks based on the configs for task_name, task_config in task_configs.items(): task_args = task_config from copy import deepcopy logger = Logger(_log) # sacred is on by default logger.setup_sacred(_run) # logger = deepcopy(meta_logger) logger.prefix = task_name loggers[task_name] = logger # Init runner so we can get env info runner = r_REGISTRY[task_args.runner](args=task_args, logger=logger) runners[task_name] = runner # Set up schemes and groups here env_info = runner.get_env_info() task_args.n_agents = env_info["n_agents"] task_args.n_actions = env_info["n_actions"] task_args.obs_decoder = dill.loads(env_info["obs_decoder"]) if env_info["obs_decoder"] is not None else None task_args.avail_actions_encoder = dill.loads(env_info["avail_actions_encoder_grid"]) if env_info[ "avail_actions_encoder_grid"] is not None else None task_args.db_url = args.db_url task_args.db_name = args.db_name task_args.state_shape = env_info["state_shape"] task_args.state_decoder = dill.loads(env_info["state_decoder"]) if env_info["state_decoder"] is not None else None task_args.obs_decoder = dill.loads(env_info["obs_decoder"]) if env_info["obs_decoder"] is not None else None # Default/Base scheme scheme = { "state": {"vshape": env_info["state_shape"]}, "obs": {"vshape": env_info["obs_shape"], "group": "agents", "vshape_decoded": env_info.get("obs_shape_decoded", env_info["obs_shape"])}, "actions": {"vshape": (1,), "group": "agents", "dtype": th.long}, "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int}, "reward": {"vshape": (1,)}, "terminated": {"vshape": (1,), "dtype": th.uint8}, } groups = { "agents": task_args.n_agents } preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=task_args.n_actions)]) } buffer = ReplayBuffer(scheme, groups, task_args.buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if task_args.buffer_cpu_only else 
args.device) buffers[task_name] = buffer # Setup multiagent controller here mac = mac_REGISTRY[task_args.mac](buffer.scheme, groups, task_args) #point model to same object macs[task_name] = mac mac.agent = macs[task_names[0]].agent # Give runner the scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner learner = le_REGISTRY[task_args.learner](mac, buffer.scheme, logger, task_args) learners[task_name] = learner if task_args.use_cuda: learner.cuda() #if agent_state_dict is None: # agent_state_dict = mac.agent.state_dict() # else: # # copy all weights that have same dimensions # sd = mac.agent.state_dict() # for k, v in agent_state_dict.items(): # if (k in sd) and (v.shape == sd[k].shape): # setattr(mac.agent, k, v) if task_args.checkpoint_path != "": timesteps = [] timestep_to_load = 0 if not os.path.isdir(task_args.checkpoint_path): logger.console_logger.info("Checkpoint directory {} doesn't exist".format(task_args.checkpoint_path)) return # Go through all files in args.checkpoint_path for name in os.listdir(task_args.checkpoint_path): full_name = os.path.join(task_args.checkpoint_path, name) # Check if they are dirs the names of which are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if task_args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - task_args.load_step)) model_path = os.path.join(task_args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info("Loading model from {}".format(model_path)) learner.load_models(model_path) runner.t_env = timestep_to_load if task_args.evaluate or task_args.save_replay: evaluate_sequential(task_args, runner) return return from copy import deepcopy # agent_state_dict = setup_components(logger, agent_state_dict) setup_components(logger, agent_state_dict) # start reptile training episode_ctrs = {k:0 for k, _ in sorted(task_configs.items())} last_test_Ts = {k:-v.test_interval - 1 for k, v in sorted(task_configs.items())} last_times = {k:time.time() for k, v in sorted(task_configs.items())} model_save_times = {k:0 for k, _ in sorted(task_configs.items())} start_time = time.time() logger.console_logger.info("Beginning REPTILE training ...") previous_task_id = None unfinished_tasks = {k for k, v in task_configs.items() if episode_ctrs[k] <= v.t_max} while len(unfinished_tasks): # INNER LOOP unfinished_tasks = {k for k, v in task_configs.items() if episode_ctrs[k] <=v.t_max} # pick task from random import randint task_id = sorted(list(unfinished_tasks))[randint(0, len(unfinished_tasks)-1)] logger.console_logger.info("Chose task {} at global counter {}".format(task_id, sum(episode_ctrs.values()))) # roll out task a couple of times for t in range(args.n_task_rollouts[task_id]): episode_batch = runners[task_id].run(test_mode=False) buffers[task_id].insert_episode_batch(episode_batch) # train on task episode_ctrs[task_id] += 1 if episode_ctrs[task_id] >= task_configs[task_id].t_max: break # reset mac weights # copy all weights that have same dimensions from last chosen task (not sure whether this is not redundant) if previous_task_id is not None: sd = macs[task_id].agent.state_dict() for k, v in macs[previous_task_id].agent.state_dict().items(): if (k in sd) and (v.shape == sd[k].shape): setattr(macs[task_id].agent, k, v) # train for t in range(args.n_task_trains[task_id]): if buffers[task_id].can_sample(task_configs[task_id].batch_size): 
episode_sample = buffers[task_id].sample(task_configs[task_id].batch_size) max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != task_configs[task_id].device: episode_sample.to(task_configs[task_id].device) learners[task_id].train(episode_sample, runners[task_id].t_env, episode_ctrs[task_id]) # update weights of same dimensions using simple rule (otherwise: formulate as a gradient procedure) import operator for _task_id, _ in sorted(task_configs.items()): mac_state_dict = macs[task_id].agent.state_dict() if _task_id != task_id: _mac_state_dict = macs[_task_id].agent.state_dict() for k, v in _mac_state_dict.items(): if (k in mac_state_dict) and (v.shape == mac_state_dict[k].shape): new_weights = operator.attrgetter(k)(macs[_task_id].agent) + args.reptile_epsilon * (mac_state_dict[k] - v) setattr(macs[_task_id].agent, k, new_weights) # agent_state_dict[k] += args.reptile_epsilon * (mac_state_dict[k] - macs[_task_id].agent.state_dict()[k]) for task_id, task_config in task_configs.items(): # Execute test runs once in a while n_test_runs = max(1, task_configs[task_id].test_nepisode // runners[task_id].batch_size) if (runners[task_id].t_env - last_test_Ts[task_id]) / task_configs[task_id].test_interval >= 1.0: loggers[task_id].console_logger.info("Now testing: {}".format(task_id)) loggers[task_id].console_logger.info("t_env: {} / {}".format(runners[task_id].t_env, task_configs[task_id].t_max)) loggers[task_id].console_logger.info("Estimated time left: {}. Time passed: {}".format( time_left(last_times[task_id], last_test_Ts[task_id], runners[task_id].t_env, task_configs[task_id].t_max), time_str(time.time() - start_time))) last_times[task_id] = time.time() last_test_Ts[task_id] = runners[task_id].t_env for _ in range(n_test_runs): runners[task_id].run(test_mode=True) previous_task_id = task_id for task_id, task_config in task_configs.items(): if task_config.save_model and \ (runners[task_id].t_env - model_save_times[task_id] >= task_config.save_model_interval or model_save_times[task_id] == 0): model_save_times[task_id] = runners[task_id].t_env save_path = os.path.join(task_config.local_results_path, "models", task_config.unique_token, str(runners[task_id].t_env)) #"results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learners[task_id].save_models(save_path)
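# The weight-mixing loop above is the Reptile rule theta <- theta + epsilon * (theta_task - theta):
# after the chosen task trains, every other task's agent is nudged toward the adapted weights for
# all parameters whose shapes match. A minimal sketch of that update using state_dicts and
# in-place copy_ (so nested parameter names such as "fc1.weight" are handled); an illustration
# under those assumptions, not the repo's exact implementation.
import torch as th
import torch.nn as nn


@th.no_grad()
def reptile_update(meta_net, adapted_net, epsilon=0.1):
    meta_sd = meta_net.state_dict()
    for name, adapted_param in adapted_net.state_dict().items():
        if name in meta_sd and adapted_param.shape == meta_sd[name].shape:
            # theta <- theta + epsilon * (theta_task - theta), in place
            meta_sd[name].copy_(meta_sd[name] + epsilon * (adapted_param - meta_sd[name]))


if __name__ == "__main__":
    meta_agent, task_agent = nn.Linear(4, 2), nn.Linear(4, 2)
    reptile_update(meta_agent, task_agent, epsilon=0.5)
    print(meta_agent.weight)                       # moved halfway toward task_agent's weights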
def run_sequential(args, logger): """ 真正运行函数 :param args: :type args: :param logger: :type logger: :return: :rtype: """ # init runner所以我们可以得到env info, 运行哪个runner,是src/runners/parallel_runner.py中的ParallelRunner 还是episode_runner.py runner = r_REGISTRY[args.runner](args=args, logger=logger) # 在此设置schemes和groups env_info = runner.get_env_info() # agent的数量 eg: 8 args.n_agents = env_info["n_agents"] # 动作的数量 eg: 6 args.n_actions = env_info["n_actions"] # agent状态的维度: 300 args.state_shape = env_info["state_shape"] if getattr(args, 'agent_own_state_size', False): args.agent_own_state_size = get_agent_own_state_size(args.env_args) # 自定义schema scheme = { "state": {"vshape": env_info["state_shape"]}, "obs": {"vshape": env_info["obs_shape"], "group": "agents"}, "actions": {"vshape": (1,), "group": "agents", "dtype": th.long}, "avail_actions": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.int}, "probs": {"vshape": (env_info["n_actions"],), "group": "agents", "dtype": th.float}, "reward": {"vshape": (1,)}, "terminated": {"vshape": (1,), "dtype": th.uint8}, } groups = { "agents": args.n_agents } preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } # 重放buffer buffer = ReplayBuffer(scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) # 在此设置多agent控制器,调用src/controllers/n_controller.py中的NMAC函数 mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # 给runner这个schema runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner, 调用src/learners/nq_learner.py下的NQLearner初始化, 不同的算法初始化 learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) if args.use_cuda: learner.cuda() if args.checkpoint_path != "": # 加载checkpoint,继续训练 timesteps = [] timestep_to_load = 0 if not os.path.isdir(args.checkpoint_path): logger.console_logger.info("Checkpoint directiory {} doesn't exist".format(args.checkpoint_path)) return # 遍历args.checkpoint_path中的所有文件 for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # 检查它们是否是Dirs的名称是数字 if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info("Loading model from {}".format(model_path)) learner.load_models(model_path) runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner) return # 开始训练 episode = 0 last_test_T = -args.test_interval - 1 last_log_T = 0 model_save_time = 0 start_time = time.time() last_time = start_time logger.console_logger.info("开始训练,训练的 {} 个时间步".format(args.t_max)) while runner.t_env <= args.t_max: # 一个时间步运行一个episode with th.no_grad(): episode_batch = runner.run(test_mode=False) buffer.insert_episode_batch(episode_batch) if buffer.can_sample(args.batch_size): episode_sample = buffer.sample(args.batch_size) # 截断批次只保留有时间步的 max_ep_t = episode_sample.max_t_filled() episode_sample = episode_sample[:, :max_ep_t] if episode_sample.device != args.device: episode_sample.to(args.device) learner.train(episode_sample, runner.t_env, episode) del episode_sample # 执行测试运行一次 n_test_runs = max(1, args.test_nepisode // runner.batch_size) if (runner.t_env - last_test_T) / args.test_interval >= 1.0: 
logger.console_logger.info("t_env: {} / {}".format(runner.t_env, args.t_max)) logger.console_logger.info("Estimated time left: {}. Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env for _ in range(n_test_runs): runner.run(test_mode=True) if args.save_model and (runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env)) #"results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) episode += args.batch_size_run if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env runner.close_env() logger.console_logger.info("完成训练")
def run_sequential(args, logger): # Init runner so we can get env info runner = r_REGISTRY[args.runner](args=args, logger=logger) th.autograd.set_detect_anomaly(True) # Set up schemes and groups here env_info = runner.get_env_info() args.n_agents = env_info["n_agents"] args.n_actions = env_info["n_actions"] args.state_shape = env_info["state_shape"] args.obs_shape = env_info["obs_shape"] # args.own_feature_size = env_info["own_feature_size"] #unit_type_bits+shield_bits_ally #if args.obs_last_action: # args.own_feature_size+=args.n_actions #if args.obs_agent_id: # args.own_feature_size+=args.n_agents # Default/Base scheme scheme = { "state": { "vshape": env_info["state_shape"] }, "obs": { "vshape": env_info["obs_shape"], "group": "agents" }, "actions": { "vshape": (1, ), "group": "agents", "dtype": th.long }, "avail_actions": { "vshape": (env_info["n_actions"], ), "group": "agents", "dtype": th.int }, "reward": { "vshape": (1, ) }, "terminated": { "vshape": (1, ), "dtype": th.uint8 }, } if args.learner == "hierarchical_rode_learner": scheme.update({ "role_avail_actions": { "vshape": (env_info["n_actions"], ), "group": "agents", "dtype": th.int }, "roles": { "vshape": (1, ), "group": "agents", "dtype": th.long } }) if args.learner == "hierarchical_noise_q_learner": scheme.update({"noise": {"vshape": (args.noise_dim, )}}) groups = {"agents": args.n_agents} preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } buffer = ReplayBuffer( scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) # Setup multiagent controller here if args.q_net_ensemble: mac = [ mac_REGISTRY[args.mac](buffer.scheme, groups, args) for _ in range(args.ensemble_num) ] else: mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # Give runner the scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) if args.use_cuda: learner.cuda() if args.runner == "meta_noise": runner.cuda() if args.checkpoint_path != "": timesteps = [] timestep_to_load = 0 if not os.path.isdir(args.checkpoint_path): logger.console_logger.info( "Checkpoint directory {} doesn't exist".format( args.checkpoint_path)) return # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs whose names are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info("Loading model from {}".format(model_path)) learner.load_models(model_path) runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner) return # start training episode = 0 last_test_T = -args.test_interval - 1 if args.meta_h: last_meta_T = -args.meta_h_interval - 1 meta_buffer = ReplayBuffer( scheme, groups, args.batch_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) last_log_T = 0 model_save_time = 0 start_time = time.time() last_time = start_time logger.console_logger.info("Beginning training for {} timesteps".format( args.t_max)) 
use_rode = True if args.learner == "hierarchical_rode_learner" else False meta_start_t = 0 if args.learner == "hierarchical_rode_learner": meta_start_t = args.role_action_spaces_update_start if args.save_batch_interval > 0: last_save_batch = -args.save_batch_interval - 1 whole_q_list = [] if args.save_q_all: q_list_ind = 0 while runner.t_env <= args.t_max: # Run for a whole episode at a time # if args.meta_h: # episode_batch, batch_log_p, mean_step_returns = runner.run(test_mode=False, meta_mode=True) # else: # episode_batch, _ = runner.run(test_mode=False) #[8,181,10,1] for actions episode_batch, _ = runner.run( test_mode=False, use_rode=use_rode) #[8,181,10,1] for actions buffer.insert_episode_batch(episode_batch) if buffer.can_sample(args.batch_size) and args.meta_h and \ (runner.t_env - last_meta_T) / args.meta_h_interval >= 1.0 and runner.t_env >= meta_start_t: repeat_times = args.batch_size // runner.batch_size # meta_buffer.insert_episode_batch(episode_batch) batch_log_p_all = [] mean_step_returns_all = [] for _ in range(repeat_times): #[8] # episode_batch, batch_log_p, mean_step_returns = runner.run_meta(test_mode=False, meta_mode=True) # batch_log_p_all.append(batch_log_p) episode_batch, _, mean_step_returns = runner.run_meta( test_mode=False, meta_mode=True, use_rode=use_rode) mean_step_returns_all += mean_step_returns buffer.insert_episode_batch(episode_batch[0]) meta_buffer.insert_episode_batch(episode_batch) #[32] # batch_log_p_all = th.cat(batch_log_p_all, dim=0) for _ in range(repeat_times): episode = prep_ep_and_train(meta_buffer, args, learner, episode, runner.t_env, whole_q_list) mean_step_returns_new_all = [] for _ in range(repeat_times): episode_batch_new, mean_step_returns_new = runner.run_meta( test_mode=False, use_rode=use_rode) buffer.insert_episode_batch(episode_batch_new[0]) mean_step_returns_new_all += mean_step_returns_new #need to get batch_log_p_here batch_log_p_all = runner.get_log_p(meta_buffer) learner.train_meta(batch_log_p_all, mean_step_returns_all, mean_step_returns_new_all, runner.t_env) for _ in range(repeat_times): episode = prep_ep_and_train(buffer, args, learner, episode, runner.t_env, whole_q_list) last_meta_T = runner.t_env elif buffer.can_sample(args.batch_size): prep_ep_and_train(buffer, args, learner, episode, runner.t_env, whole_q_list) # episode_sample = buffer.sample(args.batch_size) #[32,181,10,1] for actions # # Truncate batch to only filled timesteps # max_ep_t = episode_sample.max_t_filled() # episode_sample = episode_sample[:, :max_ep_t] # if episode_sample.device != args.device: # episode_sample.to(args.device) # learner.train(episode_sample, runner.t_env, episode) # Execute test runs once in a while n_test_runs = max(1, args.test_nepisode // runner.batch_size) if (runner.t_env - last_test_T) / args.test_interval >= 1.0: logger.console_logger.info("t_env: {} / {}".format( runner.t_env, args.t_max)) logger.console_logger.info( "Estimated time left: {}. 
Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env save_batch_flag = False discount = 1.0 if args.t_max // 5 <= runner.t_env else 10.0 if args.save_batch_interval > 0 and ( runner.t_env - last_save_batch) / ( args.save_batch_interval // discount) >= 1.0: save_batch_flag = True last_save_batch = runner.t_env for i in range(n_test_runs): if args.runner == "meta" or args.runner == "meta_noise": runner.run_meta(test_mode=True, use_rode=use_rode) else: runner.run(test_mode=True, use_rode=use_rode) if save_batch_flag: save_batch(runner.batch, osp.join(args.tb_logs, "batch"), runner.t_env, i) if args.noise_bandit: for _ in range(n_test_runs): runner.run_meta(test_mode=True, test_uniform=True) if args.save_model and ( runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env)) #"results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) logger.console_logger.info("Saving models to {}".format(save_path)) # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) episode += args.batch_size_run if args.runner != "meta" and args.runner != "meta_noise" else 1 if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env if args.save_q_all and len(whole_q_list) >= 4000: save_q(whole_q_list, osp.join(args.tb_logs, "q"), q_list_ind) whole_q_list.clear() q_list_ind += 1 if args.save_q_all and len(whole_q_list) > 0: save_q(whole_q_list, osp.join(args.tb_logs, "q"), q_list_ind) runner.close_env() logger.console_logger.info("Finished Training")
def run_sequential(args, logger): # Init runner(episode runner or parallel runner) so we can get env info runner = r_REGISTRY[args.runner](args=args, logger=logger) # Set up schemes and groups here env_info = runner.get_env_info() args.n_agents = env_info["n_agents"] # from smac maps args.n_actions = env_info["n_actions"] args.state_shape = env_info["state_shape"] # args.unit_type_bits = env_info["unit_type_bits"] # args.shield_bits_ally = env_info["shield_bits_ally"] # args.shield_bits_enemy = env_info["shield_bits_enemy"] # args.n_enemies = env_info["n_enemies"] # Default/Base scheme scheme = { "state": { "vshape": env_info["state_shape"] }, "obs": { "vshape": env_info["obs_shape"], "group": "agents" }, "actions": { "vshape": (1, ), "group": "agents", "dtype": th.long }, "avail_actions": { "vshape": (env_info["n_actions"], ), "group": "agents", "dtype": th.int }, "reward": { "vshape": (1, ) }, "terminated": { "vshape": (1, ), "dtype": th.uint8 }, #"policy": {"vshape": (env_info["n_agents"],)} } groups = {"agents": args.n_agents} preprocess = { "actions": ("actions_onehot", [OneHot(out_dim=args.n_actions)]) } buffer = ReplayBuffer( scheme, groups, args.buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) off_buffer = ReplayBuffer( scheme, groups, args.off_buffer_size, env_info["episode_limit"] + 1, preprocess=preprocess, device="cpu" if args.buffer_cpu_only else args.device) # Setup multiagent controller here mac = mac_REGISTRY[args.mac](buffer.scheme, groups, args) # Give runner the scheme runner.setup(scheme=scheme, groups=groups, preprocess=preprocess, mac=mac) # Learner learner = le_REGISTRY[args.learner](mac, buffer.scheme, logger, args) if args.use_cuda: learner.cuda() runner.set_learner(learner) ###### If checkpoint_path is given, and if args.evaluate == True or args.save_replay == True, ###### then, this function is returned without training. 
if args.checkpoint_path != "": timesteps = [] timestep_to_load = 0 if not os.path.isdir(args.checkpoint_path): logger.console_logger.info( f"Checkpoint directiory {args.checkpoint_path} doesn't exist") return # Go through all files in args.checkpoint_path for name in os.listdir(args.checkpoint_path): full_name = os.path.join(args.checkpoint_path, name) # Check if they are dirs the names of which are numbers if os.path.isdir(full_name) and name.isdigit(): timesteps.append(int(name)) if args.load_step == 0: # choose the max timestep timestep_to_load = max(timesteps) else: # choose the timestep closest to load_step timestep_to_load = min(timesteps, key=lambda x: abs(x - args.load_step)) model_path = os.path.join(args.checkpoint_path, str(timestep_to_load)) logger.console_logger.info(f"Loading model from {model_path}") learner.load_models(model_path) runner.t_env = timestep_to_load if args.evaluate or args.save_replay: evaluate_sequential(args, runner) return ######################################################################################################## ######## start training episode = 0 last_test_T = -args.test_interval - 1 last_log_T = 0 model_save_time = 0 start_time = time.time() last_time = start_time logger.console_logger.info( f"Beginning training for {args.t_max} timesteps") while runner.t_env <= args.t_max: # critic running log running_log = { "critic_loss": [], "critic_grad_norm": [], "td_error_abs": [], "target_mean": [], "q_taken_mean": [], "q_max_mean": [], "q_min_mean": [], "q_max_var": [], "q_min_var": [] } # Run for a whole episode at a time episode_batch = runner.run(test_mode=False) buffer.insert_episode_batch(episode_batch) off_buffer.insert_episode_batch(episode_batch) if buffer.can_sample(args.batch_size) and off_buffer.can_sample( args.off_batch_size): #train critic normall uni_episode_sample = buffer.uni_sample(args.batch_size) off_episode_sample = off_buffer.uni_sample(args.off_batch_size) max_ep_t = max(uni_episode_sample.max_t_filled(), off_episode_sample.max_t_filled()) uni_episode_sample = process_batch( uni_episode_sample[:, :max_ep_t], args) off_episode_sample = process_batch( off_episode_sample[:, :max_ep_t], args) learner.train_critic(uni_episode_sample, best_batch=off_episode_sample, log=running_log) #train actor episode_sample = buffer.sample_latest(args.batch_size) max_ep_t = episode_sample.max_t_filled() episode_sample = process_batch(episode_sample[:, :max_ep_t], args) learner.train(episode_sample, runner.t_env, running_log) # Execute test runs once in a while n_test_runs = max(1, args.test_nepisode // runner.batch_size) if (runner.t_env - last_test_T) / args.test_interval >= 1.0: logger.console_logger.info(f"t_env: {runner.t_env} / {args.t_max}") logger.console_logger.info( "Estimated time left: {}. 
Time passed: {}".format( time_left(last_time, last_test_T, runner.t_env, args.t_max), time_str(time.time() - start_time))) last_time = time.time() last_test_T = runner.t_env for _ in range(n_test_runs): runner.run(test_mode=True) # Save model every {save_model_interval} timesteps if args.save_model and ( runner.t_env - model_save_time >= args.save_model_interval or model_save_time == 0): model_save_time = runner.t_env save_path = os.path.join(args.local_results_path, "models", args.unique_token, str(runner.t_env)) #"results/models/{}".format(unique_token) os.makedirs(save_path, exist_ok=True) logger.console_logger.info(f"Saving models to {save_path}") # learner should handle saving/loading -- delegate actor save/load to mac, # use appropriate filenames to do critics, optimizer states learner.save_models(save_path) episode += args.batch_size_run if (runner.t_env - last_log_T) >= args.log_interval: logger.log_stat("episode", episode, runner.t_env) logger.print_recent_stats() last_log_T = runner.t_env runner.close_env() logger.console_logger.info("Finished Training")
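# Every variant registers the same preprocess entry: integer "actions" stored with vshape (1,)
# per agent are expanded into an "actions_onehot" field of width n_actions as episodes enter the
# buffer. A minimal sketch of that transform using the standard scatter-based one-hot encoding;
# the repo's OneHot transform wraps the same idea behind its own interface.
import torch as th


def one_hot_actions(actions, n_actions):
    """actions: [batch, time, n_agents, 1] long tensor -> one-hot float tensor."""
    shape = actions.shape[:-1] + (n_actions,)
    onehot = th.zeros(shape, dtype=th.float32)
    onehot.scatter_(-1, actions.long(), 1.0)       # put a 1 at each chosen action index
    return onehot


if __name__ == "__main__":
    acts = th.tensor([[[[2], [0]]]])               # 1 episode, 1 timestep, 2 agents
    print(one_hot_actions(acts, n_actions=4))
    # tensor([[[[0., 0., 1., 0.], [1., 0., 0., 0.]]]])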