logs_path = Path(args.experiment_path, "logs")
logs_path.mkdir(parents=True, exist_ok=True)
logs_path = str(logs_path)

save_path = Path(args.experiment_path, "models")
save_path.mkdir(parents=True, exist_ok=True)
save_path = str(save_path)

with open(Path(args.experiment_path, "config.yml"), "w") as config_file:
    yaml.dump(vars(args), config_file)

# Initialize the environment
args.vectorized = False

env_builder = partial(gym.make, args.env)
env_builder = compose(env_builder, GymEnv)
env = env_builder()

# The algorithm logger
algo_logger = (
    None if logs_path is None else TensorboardLogger(logs_path + "/algo")
)

# Initialize IQN
activation_fn = nn.ReLU
optim = partial(torch.optim.Adam, lr=args.lr)

if args.recurrent:
    num_lin_before = 1 if args.num_layers > 1 else 0
    num_lin_after = max(args.num_layers - 2, 1)

    qfunc = LSTMPolicy(
        env.state_space[0], env.action_space[0],
def setup_model(
    args: Namespace
) -> Tuple[SpeedRunnersEnv, Callable[[Logger], TorchRLAlgo], TorchRLAgent]:
    """
    Returns the setup environment, algorithm and a builder for an agent using
    the arguments given.

    Args:
        args (Namespace): The input arguments to setup the model, environment
            and agent.
    """
    logs_path = None

    if args.experiment_path is not None:
        logs_path = Path(args.experiment_path, "logs")
        logs_path.mkdir(parents=True, exist_ok=True)
        logs_path = str(logs_path)

    # Environment
    env = SpeedRunnersEnv(
        args.episode_length, args.state_size, args.action_delay, not args.rgb,
        args.stacked_frames, args.window_size, args.device, args.read_memory
    )

    # The algorithm logger
    algo_logger = (
        None if logs_path is None else TensorboardLogger(logs_path + "/algo")
    )

    # Initialize Rainbow IQN
    activation_fn = nn.ELU
    optim = partial(torch.optim.Adam, lr=args.lr)

    # Setup networks
    qfunc = LinearPolicy(
        1024, env.action_space[0], args.hidden_size, args.num_layers,
        activation_fn
    )

    autoencoder = Autoencoder(
        (env.state_space[-1], 32, 64, 64), (8, 4, 3), (4, 2, 1), (0, 0, 0),
        activation_fn
    )

    algo = RainbowIQN(
        args.hidden_size, autoencoder, qfunc, args.discount, args.polyak,
        args.n_quantiles, args.embedding_dim, args.huber_threshold,
        args.target_update_interval, optim, optim, device=args.device,
        logger=algo_logger
    )

    if args.exploration == "rnd":
        rnd_network_autoencoder = deepcopy(autoencoder)
        rnd_network_linear = LinearPolicy(
            env.state_space[0], args.hidden_size, args.hidden_size,
            args.num_layers + 2, activation_fn
        )
        rnd_network = nn.Sequential(rnd_network_autoencoder, rnd_network_linear)

        rnd_target_autoencoder = deepcopy(autoencoder)
        rnd_target_linear = LinearPolicy(
            env.state_space[0], args.hidden_size, args.hidden_size,
            args.num_layers, activation_fn
        )
        rnd_target = nn.Sequential(rnd_target_autoencoder, rnd_target_linear)

        algo = RND(algo, rnd_network, rnd_target, optim)

    algo = algo.to(args.device)

    if args.load_path is not None:
        algo.load(args.load_path)

    agent_builder = partial(OffPolicyAgent, env, algo, silent=args.silent)
    agent_builder = compose(agent_builder, Agent)

    return env, algo, agent_builder
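# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original script): how the values
# returned by setup_model are typically wired together. The function name
# `_example_play` and the `args.episodes` field are assumptions for
# illustration; the agent_builder call follows the `agent_builder(logger=...)`
# pattern used in the training code below, with env and algo already bound by
# the partial inside setup_model.
# ---------------------------------------------------------------------------
def _example_play(args: Namespace) -> None:
    """Sketch: build the model from parsed arguments and play a few episodes."""
    env, algo, agent_builder = setup_model(args)

    # Evaluation mode, mirroring the play path of the trainer below
    algo.eval()

    agent = agent_builder(logger=None)
    agent.play(args.episodes)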
if args.experiment_path is not None:
    logs_path = Path(args.experiment_path, "logs")
    logs_path.mkdir(parents=True, exist_ok=True)
    logs_path = str(logs_path)

    save_path = Path(args.experiment_path, "models")
    save_path.mkdir(parents=True, exist_ok=True)
    save_path = str(save_path)

    with open(Path(args.experiment_path, "config.yml"), "w") as config_file:
        yaml.dump(vars(args), config_file)

# Initialize the environment, and rescale for Tanh policy
args.vectorized = False

env_builder = partial(gym.make, args.env)
env_builder = compose(env_builder, partial(RescaleAction, a=-1, b=1))
env_builder = compose(env_builder, GymEnv)
env = env_builder()

# Action masking
if args.action_mask and not args.default_action:
    args.default_action = (0,) * len(args.action_mask)
    args.masked = True

    # Resize the action space to accommodate the mask
    env.action_space = (sum(args.action_mask),) + env.action_space[1:]

# The algorithm logger
algo_logger = (
    None if logs_path is None else TensorboardLogger(logs_path + "/algo")
)
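# ---------------------------------------------------------------------------
# Hedged sketch (an assumption, not the library's actual definition): a
# `compose` consistent with how it is used above, i.e. left-to-right chaining
# where compose(gym.make, GymEnv)(env_id) behaves like GymEnv(gym.make(env_id))
# and only the first callable receives the original arguments. Named
# `compose_sketch` to avoid shadowing the real helper.
# ---------------------------------------------------------------------------
from typing import Any, Callable


def compose_sketch(*funcs: Callable[..., Any]) -> Callable[..., Any]:
    """Chains callables so each result feeds the next, left to right."""
    def composed(*args: Any, **kwargs: Any) -> Any:
        result = funcs[0](*args, **kwargs)

        for func in funcs[1:]:
            result = func(result)

        return result

    return composed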
def train(
    self,
    args: Namespace,
    env_builder: Callable[[], Env],
    algo: RLAlgo
) -> None:
    """
    Trains the algorithm on the environment given using the argument namespace
    as parameters.

    "args" must have the following attributes:
    {
        experiment_path (str): The path to save experiment results and models.
        render (bool): Render the environment.
        steps_per_episode (Optional[int]): The number of steps in each episode.
        silent (bool): Will run without standard output from agents.
        action_mask (Optional[Tuple[bool, ...]]): The action mask to mask or
            unmask.
        masked (Optional[bool]): If an action mask is given, should be True if
            the returned agent actions are already masked.
        default_action (Optional[Tuple[float, ...]]): If an action mask is
            given and going from masked -> unmasked, this should be the
            default values for the actions.
        decay (float): The gamma decay for the target Q-values.
        n_steps (int): The number of decay steps.
        num_agents (int): The number of agents to run concurrently, 0 is
            single process.
        model_sync_interval (int): The number of training steps between agent
            model syncs, if 0, all processes will share the same model.
        num_prefetch_batches (int): The number of batches to prefetch to the
            learner in distributed learning.
        local_batch_size (int): The number of experiences the agent sends at
            once in distributed learning.
        vectorized (bool): If the environment is vectorized.
        recurrent (bool): Make the network recurrent (using LSTM).
        play (bool): Runs the environment using the model instead of training.
        exploration (str, ["rnd", "munchausen"]): The type of exploration to
            use.
        episodes (int): The number of episodes to play for if playing.
        er_capacity (int): The capacity of the experience replay buffer.
        batch_size (int): The batch size of the training set.
        training_steps (int): The number of training steps to train for.
        start_size (int): The size of the replay buffer before training.
        er_alpha (float): The alpha value for PER.
        er_beta (float): The beta value for PER.
        er_beta_increment (float): The increment of the beta value on each
            sample for PER.
        er_epsilon (float): The epsilon value for PER.
        burn_in_length (int): If recurrent, the number of burn in samples for
            R2D2.
        sequence_length (int): If recurrent, the length of the sequence to
            train on.
        max_factor (int): If recurrent, factor of max priority to mean
            priority for R2D2.
    }

    Args:
        args: The namespace of arguments for training.
        env_builder: The nullary function to create the environment.
        algo: The algorithm to train.
""" logs_path = None save_path = None if args.experiment_path is not None: logs_path = Path(args.experiment_path, "logs") logs_path.mkdir(parents=True, exist_ok=True) logs_path = str(logs_path) save_path = Path(args.experiment_path, "models") save_path.mkdir(parents=True, exist_ok=True) save_path = str(save_path) # Create agent class agent_builder = partial( OffPolicyAgent, algo=algo, render=args.render, silent=args.silent ) steps_per_episode = ( args.steps_per_episode if "steps_per_episode" in args else None ) agent_builder = compose( agent_builder, partial(TimeLimitAgent, max_steps=steps_per_episode) ) if not args.play: # Experience replay # Won't increment in multiple processes to keep it consistent # across actors er_beta_increment = ( args.er_beta_increment if args.num_agents == 0 else 0 ) if args.recurrent: experience_replay_func = partial( TorchR2D2, alpha=args.er_alpha, beta=args.er_beta, beta_increment=er_beta_increment, epsilon=args.er_epsilon, max_factor=args.max_factor ) else: experience_replay_func = partial( TorchPER, alpha=args.er_alpha, beta=args.er_beta, beta_increment=er_beta_increment, epsilon=args.er_epsilon ) if args.num_agents > 0: recv_pipes = [] send_pipes = [] prestart_func = None if args.model_sync_interval == 0: self._start_training(algo, args) algo.share_memory() recv_pipes = [None] * args.num_agents else: prestart_func = partial( self._start_training, algo=algo, args=args ) # Force CPU for now to avoid re-instantiating cuda in # subprocesses algo.device = torch.device("cpu") algo = algo.to(algo.device) for i in range(args.num_agents): param_pipe = mp.Pipe(duplex=False) recv_pipes.append(param_pipe[0]) send_pipes.append(param_pipe[1]) # Just needed to get the error/priority calculations dummy_experience_replay = experience_replay_func(capacity=1) # Must come before the other wrapper since there are infinite # recursion errors # TODO come up with a better way to implement wrappers agent_builder = compose( agent_builder, partial_iterator( QueueAgent, agent_id=(iter(range(args.num_agents)), True), experience_replay=(dummy_experience_replay, False), param_pipe=(iter(recv_pipes), True) ) ) agent_builder = compose( agent_builder, partial(TorchRLAgent, batch_state=not args.vectorized) ) if "action_mask" in args and args.action_mask: # TODO: Will have to add an action mask wrapper later if args.masked: agent_builder = compose( agent_builder, partial( UnmaskedActionAgent, action_mask=args.action_mask, default_action=args.default_action ) ) agent_builder = compose(agent_builder, TorchOffPolicyAgent) if args.recurrent: agent_builder = compose( agent_builder, SequenceInputAgent, TorchRecurrentAgent ) if args.play: algo = algo.to(args.device) algo.eval() agent_logger = ( None if logs_path is None else TensorboardLogger(logs_path + "/play-agent") ) agent = agent_builder(env=env_builder(), logger=agent_logger) agent.play(args.episodes) else: if args.exploration == "rnd": agent_builder = compose(agent_builder, IntrinsicRewardAgent) elif args.exploration == "munchausen": agent_builder = compose( agent_builder, partial(MunchausenAgent, alpha=0.9) ) algo.train() if args.recurrent: agent_builder = compose( agent_builder, partial( ExperienceSequenceAgent, sequence_length=( args.burn_in_length + args.sequence_length ), overlap=args.burn_in_length ) ) experience_replay = experience_replay_func( capacity=args.er_capacity ) base_agent_logs_path = None if logs_path is not None: base_agent_logs_path = logs_path + "/train-agent" # Single process if args.num_agents == 0: 
            self._start_training(algo, args)

            agent_logger = None
            if base_agent_logs_path is not None:
                agent_logger = TensorboardLogger(base_agent_logs_path)

            agent = agent_builder(env=env_builder(), logger=agent_logger)
            agent.train(
                args.episodes, 1, args.discount, args.n_steps,
                experience_replay, args.batch_size, args.start_size, save_path,
                args.save_interval
            )
        # Multiple processes
        else:
            done_event = mp.Event()

            # Number of agents + worker + learner
            queue_barrier = mp.Barrier(args.num_agents + 2)

            agent_queue = mp.Queue(
                maxsize=args.num_prefetch_batches * args.num_agents * 4
            )
            sample_queue = mp.Queue(maxsize=args.num_prefetch_batches)
            priority_queue = mp.Queue(maxsize=args.num_prefetch_batches)

            learner_args = (dummy_experience_replay,)
            learner_train_args = (
                algo, done_event, queue_barrier, args.training_steps,
                sample_queue, priority_queue, send_pipes,
                args.model_sync_interval, save_path, args.save_interval
            )

            worker = TorchApexWorker()
            worker_args = (
                experience_replay, done_event, queue_barrier, agent_queue,
                sample_queue, priority_queue, args.batch_size, args.start_size
            )

            agent_builders = []
            agent_train_args = []
            agent_train_kwargs = []

            for i in range(args.num_agents):
                agent_logger = None
                if base_agent_logs_path is not None:
                    agent_logs_path = base_agent_logs_path + "-" + str(i + 1)
                    agent_logger = TensorboardLogger(agent_logs_path)

                agent_builders.append(
                    partial(agent_builder, logger=agent_logger)
                )
                agent_train_args.append((
                    1, args.local_batch_size, args.discount, args.n_steps,
                    agent_queue, queue_barrier
                ))
                agent_train_kwargs.append({
                    "exit_condition": done_event.is_set
                })

            runner = ApexRunner(done_event)
            runner.start(
                learner_args, learner_train_args, worker, worker_args,
                env_builder, agent_builders, agent_train_args,
                agent_train_kwargs, prestart_func
            )
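# ---------------------------------------------------------------------------
# Hedged illustration (not part of the original file): a minimal,
# single-process call to train() covering the fields documented in the
# docstring above plus the discount/save_interval fields the body reads. All
# values are placeholders, and "OffPolicyTrainer" is a hypothetical name for
# the class that defines train(); the real experiments build `args` with
# argparse.
# ---------------------------------------------------------------------------
def _example_single_process_run(
    trainer: "OffPolicyTrainer",
    env_builder: Callable[[], Env],
    algo: RLAlgo
) -> None:
    """Sketch: invoke train() with placeholder, non-recurrent settings."""
    example_args = Namespace(
        experiment_path=None, render=False, steps_per_episode=None,
        silent=False, action_mask=None, masked=False, default_action=None,
        discount=0.99, n_steps=1, num_agents=0, vectorized=True,
        recurrent=False, play=False, exploration=None, episodes=100,
        er_capacity=50000, batch_size=32, training_steps=10000,
        start_size=1000, er_alpha=0.6, er_beta=0.4, er_beta_increment=1e-3,
        er_epsilon=1e-4, save_interval=1000, device="cpu"
    )

    trainer.train(example_args, env_builder, algo)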
logs_path = str(logs_path)

save_path = Path(args.experiment_path, "models")
save_path.mkdir(parents=True, exist_ok=True)
save_path = str(save_path)

env, algo, agent_builder = setup_model(args)

algo.create_optimizers()

algo.train()
algo.share_memory()

# Finish setting up agent
if args.exploration == "rnd":
    agent_builder = compose(agent_builder, IntrinsicRewardAgent)

experience_replay = TorchPER(
    args.er_capacity, args.er_alpha, args.er_beta, args.er_beta_increment,
    args.er_epsilon
)

base_agents_logs_path = None
if logs_path is not None:
    base_agents_logs_path = logs_path + "/train-agent-"

# Can't reinitialize CUDA on Windows, so no parallelization in this format
# Also some pickle problems on Windows, need to investigate
"""
if str(args.device) == "cpu":
    done_event = mp.Event()

    agent_queue = mp.Queue()