Example #1
        save_path = Path(args.experiment_path, "models")
        save_path.mkdir(parents=True, exist_ok=True)
        save_path = str(save_path)

        with open(Path(args.experiment_path, "config.yml"),
                  "w") as config_file:
            yaml.dump(vars(args), config_file)

    # Initialize the environment
    args.vectorized = False
    env_builder = partial(gym.make, args.env)
    env_builder = compose(env_builder, GymEnv)
    env = env_builder()

    # The algorithm logger
    algo_logger = (
        None if logs_path is None
        else TensorboardLogger(logs_path + "/algo")
    )

    # Initialize IQN
    activation_fn = nn.ReLU
    optim = partial(torch.optim.Adam, lr=args.lr)

    if args.recurrent:
        num_lin_before = 1 if args.num_layers > 1 else 0
        num_lin_after = max(args.num_layers - 2, 1)

        qfunc = LSTMPolicy(env.state_space[0], env.action_space[0],
                           args.hidden_size, num_lin_before, args.hidden_size,
                           1, args.hidden_size, num_lin_after, activation_fn)

        algo = DQNRecurrent(qfunc, args.discount**args.n_steps, args.polyak,
                            args.target_update_interval, optim, args.device,
Example #2
    import torch.multiprocessing as mp

    from pathlib import Path

    from speedrunnersai.setup_model import setup_model
    from speedrunnersai.config.argument_parser import get_args
    from hlrl.core.logger import TensorboardLogger

    args = get_args()

    logs_path = None

    if args.experiment_path is not None:
        logs_path = Path(args.experiment_path, "logs")
        logs_path.mkdir(parents=True, exist_ok=True)
        logs_path = str(logs_path)

    env, algo, agent_builder = setup_model(args)

    algo.eval()

    agent_logger = (
        None if logs_path is None
        else TensorboardLogger(logs_path + "/play-agent")
    )

    agent = agent_builder(logger=agent_logger)

    env.start()
    agent.play(args.episodes)
    env.stop()
Example #3
    def train(
            self,
            args: Namespace,
            env_builder: Callable[[], Env],
            algo: RLAlgo
        ) -> None:
        """
        Trains the algorithm on the given environment, using the argument
        namespace for parameters.
        
        "args" must have the following attributes:
        {
            experiment_path (str): The path to save experiment results and
                models.
            render (bool): Render the environment.
            steps_per_episode (Optional[int]): The number of steps in each
                episode.
            silent (bool): Will run without standard output from agents.
            action_mask (Optional[Tuple[bool, ...]]): The action mask to mask or
                unmask.
            masked (Optional[bool]): If an action mask is given, should be True
                if the returned agent actions are already masked.
            default_action (Optional[Tuple[float, ...]]): If an action mask is
                given and going from masked -> unmasked, this should be the
                default values for the actions.
            decay (float): The gamma decay for the target Q-values.
            n_steps (int): The number of decay steps.
            num_agents (int): The number of agents to run concurrently, 0 is
                single process.
            model_sync_interval (int): The number of training steps between
                agent model syncs, if 0, all processes will share the same
                model.
            num_prefetch_batches (int): The number of batches to prefetch to the
                learner in distributed learning.
            local_batch_size (int): The number of experiences the agent sends at
                once in distributed learning.
            vectorized (bool): If the environment is vectorized.
            recurrent (bool): Make the network recurrent (using LSTM).
            play (bool): Runs the environment using the model instead of
                training.
            exploration (str, ["rnd", "munchausen"]): The type of exploration to
                use.
            episodes (int): The number of episodes to play for if playing.
            er_capacity (int): The capacity of the experience replay buffer.
            batch_size (int): The batch size of the training set.
            training_steps (int): The number of training steps to train for.
            start_size (int): The size of the replay buffer before training.
            er_alpha (float): The alpha value for PER.
            er_beta (float): The beta value for PER.
            er_beta_increment (float): The increment of the beta value on each
                sample for PER.
            er_epsilon (float): The epsilon value for PER.
            burn_in_length (int): If recurrent, the number of burn in samples
                for R2D2.
            sequence_length (int): If recurrent, the length of the sequence to
                train on.
            max_factor (int): If recurrent, factor of max priority to mean
                priority for R2D2.
        }
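
        A hypothetical sketch of building such an argument namespace is given
        after this example.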

        Args:
            args: The namespace of arguments for training.
            env_builder: The nullary function to create the environment.
            algo: The algorithm to train.
        """
        logs_path = None
        save_path = None

        if args.experiment_path is not None:
            logs_path = Path(args.experiment_path, "logs")
            logs_path.mkdir(parents=True, exist_ok=True)
            logs_path = str(logs_path)

            save_path = Path(args.experiment_path, "models")
            save_path.mkdir(parents=True, exist_ok=True)
            save_path = str(save_path)

        # Create agent class
        agent_builder = partial(
            OffPolicyAgent, algo=algo, render=args.render, silent=args.silent
        )

        steps_per_episode = (
            args.steps_per_episode if "steps_per_episode" in args else None
        )

        agent_builder = compose(
            agent_builder,
            partial(TimeLimitAgent, max_steps=steps_per_episode)
        )

        if not args.play:
            # Experience replay
            # The beta increment is disabled when running multiple processes to
            # keep beta consistent across actors
            er_beta_increment = (
                args.er_beta_increment if args.num_agents == 0 else 0
            )

            if args.recurrent:
                experience_replay_func = partial(
                    TorchR2D2, alpha=args.er_alpha, beta=args.er_beta,
                    beta_increment=er_beta_increment, epsilon=args.er_epsilon,
                    max_factor=args.max_factor
                )
            else:
                experience_replay_func = partial(
                    TorchPER, alpha=args.er_alpha, beta=args.er_beta,
                    beta_increment=er_beta_increment, epsilon=args.er_epsilon
                )

            if args.num_agents > 0:
                recv_pipes = []
                send_pipes = []

                prestart_func = None

                if args.model_sync_interval == 0:
                    self._start_training(algo, args)
                    algo.share_memory()

                    recv_pipes = [None] * args.num_agents
                else:
                    prestart_func = partial(
                        self._start_training, algo=algo, args=args
                    )

                    # Force CPU for now to avoid re-initializing CUDA in
                    # subprocesses
                    algo.device = torch.device("cpu")
                    algo = algo.to(algo.device)

                    for i in range(args.num_agents):
                        param_pipe = mp.Pipe(duplex=False)

                        recv_pipes.append(param_pipe[0])
                        send_pipes.append(param_pipe[1])

                # Just needed to get the error/priority calculations
                dummy_experience_replay = experience_replay_func(capacity=1)

                # Must come before the other wrappers, otherwise there are
                # infinite recursion errors
                # TODO: come up with a better way to implement wrappers
                agent_builder = compose(
                    agent_builder,
                    partial_iterator(
                        QueueAgent,
                        agent_id=(iter(range(args.num_agents)), True),
                        experience_replay=(dummy_experience_replay, False),
                        param_pipe=(iter(recv_pipes), True)
                    )
                )

        agent_builder = compose(
            agent_builder,
            partial(TorchRLAgent, batch_state=not args.vectorized)
        )
        
        if "action_mask" in args and args.action_mask:
            # TODO: Will have to add an action mask wrapper later
            if args.masked:
                agent_builder = compose(
                    agent_builder,
                    partial(
                        UnmaskedActionAgent, action_mask=args.action_mask,
                        default_action=args.default_action
                    )
                )

        agent_builder = compose(agent_builder, TorchOffPolicyAgent)

        if args.recurrent:
            agent_builder = compose(
                agent_builder, SequenceInputAgent, TorchRecurrentAgent
            )

        if args.play:
            algo = algo.to(args.device)
            algo.eval()

            agent_logger = (
                None if logs_path is None
                else TensorboardLogger(logs_path + "/play-agent")
            )

            agent = agent_builder(env=env_builder(), logger=agent_logger)
            agent.play(args.episodes)
        else:
            if args.exploration == "rnd":
                agent_builder = compose(agent_builder, IntrinsicRewardAgent)
            elif args.exploration == "munchausen":
                agent_builder = compose(
                    agent_builder, partial(MunchausenAgent, alpha=0.9)
                )

            algo.train()

            if args.recurrent:
                agent_builder = compose(
                    agent_builder,
                    partial(
                        ExperienceSequenceAgent,
                        sequence_length=(
                            args.burn_in_length + args.sequence_length
                        ),
                        overlap=args.burn_in_length
                    )
                )

            experience_replay = experience_replay_func(
                capacity=args.er_capacity
            )

            base_agent_logs_path = None
            if logs_path is not None:
                base_agent_logs_path = logs_path + "/train-agent"

            # Single process
            if args.num_agents == 0:
                self._start_training(algo, args)

                agent_logger = None
                if base_agent_logs_path is not None:
                    agent_logger = TensorboardLogger(base_agent_logs_path)

                agent = agent_builder(env=env_builder(), logger=agent_logger)

                agent.train(
                    args.episodes, 1, args.discount, args.n_steps,
                    experience_replay, args.batch_size, args.start_size,
                    save_path, args.save_interval
                )

            # Multiple processes
            else:
                done_event = mp.Event()

                # Number of agents + worker + learner
                queue_barrier = mp.Barrier(args.num_agents + 2)

                agent_queue = mp.Queue(
                    maxsize=args.num_prefetch_batches * args.num_agents * 4
                )
                sample_queue = mp.Queue(maxsize=args.num_prefetch_batches)
                priority_queue = mp.Queue(maxsize=args.num_prefetch_batches)

                learner_args = (dummy_experience_replay,)
                learner_train_args = (
                    algo, done_event, queue_barrier, args.training_steps,
                    sample_queue, priority_queue, send_pipes,
                    args.model_sync_interval, save_path, args.save_interval
                )

                worker = TorchApexWorker()
                worker_args = (
                    experience_replay, done_event, queue_barrier, agent_queue,
                    sample_queue, priority_queue, args.batch_size,
                    args.start_size
                )

                agent_builders = []
                agent_train_args = []
                agent_train_kwargs = []

                for i in range(args.num_agents):
                    agent_logger = None
                    if base_agent_logs_path is not None:
                        agent_logs_path = (
                            base_agent_logs_path + "-" + str(i + 1)
                        )
                        agent_logger = TensorboardLogger(agent_logs_path)

                    agent_builders.append(
                        partial(agent_builder, logger=agent_logger)
                    )

                    agent_train_args.append((
                        1, args.local_batch_size, args.discount, args.n_steps,
                        agent_queue, queue_barrier
                    ))
                    agent_train_kwargs.append({
                        "exit_condition": done_event.is_set
                    })

                runner = ApexRunner(done_event)
                runner.start(
                    learner_args, learner_train_args, worker, worker_args,
                    env_builder, agent_builders, agent_train_args,
                    agent_train_kwargs, prestart_func
                )
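
The train() method above documents the attributes its args namespace must
carry. As a purely hypothetical sketch (not the project's actual entry point),
the snippet below builds such a namespace by hand; every value, the environment
id, and the trainer/algo names in the commented call are illustrative
assumptions.

# Hypothetical sketch: every value below is an illustrative assumption, not a
# project default.
from argparse import Namespace
from functools import partial

import gym

args = Namespace(
    experiment_path=None, render=False, steps_per_episode=None, silent=True,
    action_mask=None, masked=None, default_action=None,
    decay=0.99, n_steps=3, num_agents=0, model_sync_interval=0,
    num_prefetch_batches=4, local_batch_size=32, vectorized=False,
    recurrent=False, play=False, exploration="rnd", episodes=500,
    er_capacity=100_000, batch_size=64, training_steps=100_000,
    start_size=1_000, er_alpha=0.6, er_beta=0.4, er_beta_increment=1e-3,
    er_epsilon=1e-2, burn_in_length=40, sequence_length=40, max_factor=0.9,
    # The body of train() also reads a few attributes not listed in the
    # docstring, e.g. discount, device and save_interval.
    discount=0.99, device="cpu", save_interval=5000,
)

# train() expects a nullary environment builder.
env_builder = partial(gym.make, "CartPole-v1")

# trainer.train(args, env_builder, algo)  # trainer and algo are assumed to be
#                                         # built elsewhere (cf. setup_model)
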
Example #4
def setup_model(
    args: Namespace
) -> Tuple[SpeedRunnersEnv, TorchRLAlgo, Callable[[Logger], TorchRLAgent]]:
    """
    Returns the environment, the algorithm, and an agent builder, set up using
    the arguments given.

    Args:
        args (Namespace): The input arguments to setup the model, environment
            and agent.
    """
    logs_path = None

    if args.experiment_path is not None:
        logs_path = Path(args.experiment_path, "logs")
        logs_path.mkdir(parents=True, exist_ok=True)
        logs_path = str(logs_path)

    # Environment
    env = SpeedRunnersEnv(args.episode_length, args.state_size,
                          args.action_delay, not args.rgb, args.stacked_frames,
                          args.window_size, args.device, args.read_memory)

    # The algorithm logger
    algo_logger = (
        None if logs_path is None
        else TensorboardLogger(logs_path + "/algo")
    )

    # Initialize SAC
    activation_fn = nn.ELU
    optim = partial(torch.optim.Adam, lr=args.lr)

    # Setup networks
    qfunc = LinearPolicy(1024, env.action_space[0], args.hidden_size,
                         args.num_layers, activation_fn)

    autoencoder = Autoencoder((env.state_space[-1], 32, 64, 64), (8, 4, 3),
                              (4, 2, 1), (0, 0, 0), activation_fn)

    algo = RainbowIQN(args.hidden_size,
                      autoencoder,
                      qfunc,
                      args.discount,
                      args.polyak,
                      args.n_quantiles,
                      args.embedding_dim,
                      args.huber_threshold,
                      args.target_update_interval,
                      optim,
                      optim,
                      device=args.device,
                      logger=algo_logger)

    if args.exploration == "rnd":
        rnd_network_autoencoder = deepcopy(autoencoder)
        rnd_network_linear = LinearPolicy(env.state_space[0], args.hidden_size,
                                          args.hidden_size,
                                          args.num_layers + 2, activation_fn)
        rnd_network = nn.Sequential(rnd_network_autoencoder,
                                    rnd_network_linear)

        rnd_target_autoencoder = deepcopy(autoencoder)
        rnd_target_linear = LinearPolicy(env.state_space[0], args.hidden_size,
                                         args.hidden_size, args.num_layers,
                                         activation_fn)
        rnd_target = nn.Sequential(rnd_target_autoencoder, rnd_target_linear)

        algo = RND(algo, rnd_network, rnd_target, optim)

    algo = algo.to(args.device)

    if args.load_path is not None:
        algo.load(args.load_path)

    agent_builder = partial(OffPolicyAgent, env, algo, silent=args.silent)
    agent_builder = compose(agent_builder, Agent)

    return env, algo, agent_builder
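
These examples stack agent wrappers onto a base builder with compose() and
functools.partial (e.g. compose(agent_builder, Agent) above, or the longer
chains in Example #3). The helper below shows one plausible way such a
compose() could behave; it is an assumption for illustration and may differ
from hlrl's actual implementation.

from functools import partial


def compose(builder, *wrappers):
    """Assumed semantics: call the base builder with the given arguments, then
    feed the result through each wrapper in order."""
    def build(*args, **kwargs):
        obj = builder(*args, **kwargs)
        for wrapper in wrappers:
            obj = wrapper(obj)
        return obj

    return build


# Toy stand-ins mirroring the usage pattern above.
class BaseAgent:
    def __init__(self, env, algo, silent=False, logger=None):
        self.env, self.algo, self.silent, self.logger = env, algo, silent, logger


class AgentWrapper:
    def __init__(self, agent):
        self.agent = agent


agent_builder = partial(BaseAgent, "env", "algo", silent=True)
agent_builder = compose(agent_builder, AgentWrapper)
agent = agent_builder(logger=None)  # an AgentWrapper around a BaseAgent

Under these assumed semantics, wrappers composed later end up outermost, which
would explain why the ordering of the compose() calls in Example #3 matters.
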
Example #5
            if base_agents_logs_path is not None:
                agent_logs_path = base_agents_logs_path + str(i + 1)
                agent_logger = TensorboardLogger(agent_logs_path)

            agents.append(agent_builder(logger=agent_logger))
            agent_train_args.append((
                done_event, args.decay, args.n_steps, agent_queue
            ))

        runner = ApexRunner(done_event)

        env.start()
        runner.start(learner_args, worker_args, agents, agent_train_args)
        env.stop()
    else:
    """
    agent_logger = None
    if base_agents_logs_path is not None:
        agent_logs_path = base_agents_logs_path + "0"
        agent_logger = TensorboardLogger(agent_logs_path)

    agent = agent_builder(logger=agent_logger)

    env.start()

    agent.train(args.training_steps + args.start_size, args.decay,
                args.n_steps, experience_replay, algo, args.batch_size,
                args.start_size, save_path, args.save_interval)

    env.stop()