Ejemplo n.º 1
0
class Agent:
    """Inference-time wrapper around a trained actor-critic model.

    The agent can:
    - pick actions for a batch of observations (or for a single one),
    - consume the feedback (reward and done flag) of those actions, which
      clears the recurrent memory of every environment whose episode ended.
    """

    def __init__(self, obs_space, action_space, model_dir, device=None,
                 argmax=False, num_envs=1, use_rim=False):
        """Build the model, restore its saved weights and switch it to eval mode.

        :param obs_space: observation space handed to the preprocessor factory
        :param action_space: action space handed to ``ACModel``
        :param model_dir: directory the model state (and vocabulary) is read from
        :param device: device the model and the memories are moved to
        :param argmax: take the most probable action instead of sampling
        :param num_envs: number of rows allocated in the recurrent memory
        :param use_rim: forwarded to ``ACModel`` as ``use_rim``
        """
        obs_space, preprocessor = utils.get_obss_preprocessor(obs_space)
        self.preprocess_obss = preprocessor
        self.acmodel = ACModel(obs_space, action_space, use_rim=use_rim)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        # One memory row per environment, only for recurrent models.
        if self.acmodel.recurrent:
            blank = torch.zeros(self.num_envs, self.acmodel.memory_size)
            self.memories = blank.to(device)

        saved_state = utils.get_model_state(model_dir)
        self.acmodel.load_state_dict(saved_state)
        self.acmodel.to(self.device)
        self.acmodel.eval()

        if hasattr(self.preprocess_obss, "vocab"):
            saved_vocab = utils.get_vocab(model_dir)
            self.preprocess_obss.vocab.load_vocab(saved_vocab)

    def get_actions(self, obss):
        """Return the chosen action for every observation in ``obss`` as a NumPy array."""
        batch = self.preprocess_obss(obss, device=self.device)

        with torch.no_grad():
            if not self.acmodel.recurrent:
                dist, _ = self.acmodel(batch)
            else:
                # Recurrent models also return the updated memories.
                dist, _, self.memories = self.acmodel(batch, self.memories)

        if not self.argmax:
            chosen = dist.sample()
        else:
            # Index of the highest probability along dim 1, kept as a column.
            _, chosen = dist.probs.max(1, keepdim=True)

        return chosen.cpu().numpy()

    def get_action(self, obs):
        """Return the action for a single observation."""
        batch_of_one = [obs]
        return self.get_actions(batch_of_one)[0]

    def analyze_feedbacks(self, rewards, dones):
        """Zero the memory rows of the environments whose ``dones`` flag is set.

        ``rewards`` is accepted but not used. Nothing happens for
        non-recurrent models.
        """
        if not self.acmodel.recurrent:
            return

        done_flags = torch.tensor(dones, dtype=torch.float).to(self.device)
        keep = 1 - done_flags.unsqueeze(1)
        self.memories *= keep

    def analyze_feedback(self, reward, done):
        """Single-environment variant of :meth:`analyze_feedbacks`."""
        return self.analyze_feedbacks([reward], [done])
Ejemplo n.º 2
0
def create_model(obs_space, envs):
    """Build an ``ACModel`` for the first environment's action space.

    The model is moved to CUDA when ``torch.cuda.is_available()`` reports it,
    otherwise to the CPU, and the result of that move is returned.
    """
    target = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    return ACModel(obs_space, envs[0].action_space).to(target)
Ejemplo n.º 3
0
class Agent:
    """Inference-time wrapper around a trained (non-recurrent) actor-critic model.

    The agent can:
    - pick actions for a batch of observations (or for a single one),
    - receive the feedback (reward and done flag) of those actions, which
      this implementation ignores.
    """

    def __init__(self, obs_space, action_space, model_dir, device=None,
                 argmax=False, num_envs=1):
        """Build the model, restore its saved weights and switch it to eval mode.

        :param obs_space: observation space handed to the preprocessor factory
        :param action_space: action space handed to ``ACModel``
        :param model_dir: directory the model state is read from
        :param device: device the model is moved to
        :param argmax: take the most probable action instead of sampling
        :param num_envs: stored on the instance; not used by this class itself
        """
        obs_space, preprocessor = utils.get_obss_preprocessor(obs_space)
        self.preprocess_obss = preprocessor
        self.acmodel = ACModel(obs_space, action_space)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        saved_state = utils.get_model_state(model_dir)
        self.acmodel.load_state_dict(saved_state)
        self.acmodel.to(self.device)
        self.acmodel.eval()

    def get_actions(self, obss):
        """Return the chosen action for every observation in ``obss`` as a NumPy array."""
        batch = self.preprocess_obss(obss, device=self.device)

        with torch.no_grad():
            dist, _ = self.acmodel(batch)

        if not self.argmax:
            chosen = dist.sample()
        else:
            # Index of the highest probability along dim 1, kept as a column.
            _, chosen = dist.probs.max(1, keepdim=True)

        return chosen.cpu().numpy()

    def get_action(self, obs):
        """Return the action for a single observation."""
        batch_of_one = [obs]
        return self.get_actions(batch_of_one)[0]

    def analyze_feedbacks(self, rewards, dones):
        """Accept feedback for a batch of actions; intentionally a no-op."""
        return None

    def analyze_feedback(self, reward, done):
        """Single-environment variant of :meth:`analyze_feedbacks`."""
        return self.analyze_feedbacks([reward], [done])
Ejemplo n.º 4
0
        status = {"num_frames": 0, "update": 0}

    # Load observations preprocessor: build it from the first environment's
    # observation space and, when the loaded status carries a vocabulary,
    # restore that vocabulary into the preprocessor.

    obs_space, preprocess_obs_goals = utils.get_obs_goals_preprocessor(envs[0].observation_space)
    if "vocab" in status:
        preprocess_obs_goals.vocab.load_vocab(status["vocab"])
    txt_logger.info("observations preprocessor loaded")

    # Load model: create the actor-critic model, restore its weights when the
    # status holds a "model_state" entry, then move it to the target device.
    # NOTE: "Model loaded" is only logged when saved weights were restored.

    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
        txt_logger.info("Model loaded\n")
    acmodel.to(device)
    txt_logger.info("{}\n".format(acmodel))

    # Load algo: instantiate the training algorithm selected by args.algo.
    # A2C receives the RMSprop alpha; PPO receives the clipping epsilon,
    # epoch count and batch size instead. Any other name is rejected.

    if args.algo == "a2c":
        algo = a2c.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps, preprocess_obs_goals)
    elif args.algo == "ppo":
        algo = ppo.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obs_goals)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))
Ejemplo n.º 5
0
def main():
    """Train an actor-critic agent from the command line.

    Parses the CLI arguments, builds the environments, the model and the
    algorithm selected by ``--algo`` (resuming from the status stored in the
    model directory when it can be loaded), then alternates experience
    collection and parameter updates until ``--frames`` frames have been
    processed. Progress is written to the text, CSV and TensorBoard loggers
    and the training status is saved every ``--save-interval`` updates.

    :raises ValueError: if ``--algo`` is not one of the supported algorithms
    """
    # Parse arguments

    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument(
        "--algo",
        required=True,
        help="algorithm to use: a2c | ppo | ppo_intrinsic | a2c_intrinsic (REQUIRED)")
    parser.add_argument("--env",
                        required=True,
                        help="name of the environment to train on (REQUIRED)")
    parser.add_argument(
        "--model",
        default=None,
        help="name of the model (default: {ENV}_{ALGO}_{TIME})")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval",
                        type=int,
                        default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument(
        "--save-interval",
        type=int,
        default=10,
        help=
        "number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs",
                        type=int,
                        default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames",
                        type=int,
                        default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs",
                        type=int,
                        default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size",
                        type=int,
                        default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument(
        "--frames-per-proc",
        type=int,
        default=None,
        help=
        "number of frames per process before update (default: 5 for A2C and 128 for PPO)"
    )
    parser.add_argument("--discount",
                        type=float,
                        default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr",
                        type=float,
                        default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument(
        "--gae-lambda",
        type=float,
        default=0.95,
        help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)"
    )
    parser.add_argument("--entropy-coef",
                        type=float,
                        default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef",
                        type=float,
                        default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm",
                        type=float,
                        default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument(
        "--optim-eps",
        type=float,
        default=1e-8,
        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha",
                        type=float,
                        default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps",
                        type=float,
                        default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument(
        "--recurrence",
        type=int,
        default=1,
        help=
        "number of time-steps gradient is backpropagated (default: 1). If > 1, a LSTM is added to the model to have memory."
    )
    parser.add_argument("--text",
                        action="store_true",
                        default=False,
                        help="add a GRU to the model to handle text input")
    # NOTE(review): this option takes a value and has no type converter, so
    # any non-empty string (including "False") enables the visualization.
    # Left unchanged to keep existing command lines working.
    parser.add_argument("--visualize",
                        default=False,
                        help="show real time CNN layer weight changes")

    args = parser.parse_args()

    # The model gets a memory only when gradients span more than one step.
    args.mem = args.recurrence > 1

    # Set run dir

    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"

    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer

    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments

    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources

    utils.seed(args.seed)

    # Set device

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments (each one gets its own seed, 10000 apart)

    envs = [
        utils.make_env(args.env, args.seed + 10000 * i)
        for i in range(args.procs)
    ]
    txt_logger.info("Environments loaded\n")

    # Load training status; start from scratch when it cannot be read

    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model

    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo

    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps,
                                preprocess_obss)
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)
    elif args.algo == "ppo_intrinsic":
        algo = torch_ac.PPOAlgoIntrinsic(
            envs, acmodel, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.epochs, args.batch_size, preprocess_obss)
    elif args.algo == "a2c_intrinsic":
        algo = torch_ac.A2CAlgoIntrinsic(
            envs, acmodel, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_alpha,
            args.optim_eps, preprocess_obss)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    # The CSV header is written once, and only for a run starting from
    # scratch. Checking status["num_frames"] inside the loop would repeat the
    # header on every log until the first save, because ``status`` is only
    # refreshed when the training status is saved.
    write_csv_header = status["num_frames"] == 0

    print_visual = args.visualize
    if print_visual:
        fig, axs = plt.subplots(1, 3)
        fig.suptitle('Convolution Layer Weights Normalized Difference')

    while num_frames < args.frames:

        # Snapshot the parameters before the update. This is only needed for
        # the plot, so it is skipped otherwise. ``.cpu()`` is required because
        # ``Tensor.numpy()`` raises for tensors that live on a CUDA device.
        if print_visual:
            old_parameters = {
                name: param.detach().cpu().numpy().copy()
                for name, param in acmodel.named_parameters()
            }

        # Update model parameters
        update_start_time = time.time()
        exps, logs1 = algo.collect_experiences()
        logs2 = algo.update_parameters(exps)
        logs = {**logs1, **logs2}
        update_end_time = time.time()

        if print_visual:
            # Snapshot the parameters after the update
            new_parameters = {
                name: param.detach().cpu().numpy().copy()
                for name, param in acmodel.named_parameters()
            }

            # Plot the normalized absolute change of parameters 0, 2 and 4
            # (presumably the three convolution weight tensors -- confirm
            # against ACModel); they are indexed as 4-D arrays below.
            for index, key in enumerate(old_parameters):
                if index not in (0, 2, 4):
                    continue
                diff_matrix = abs(new_parameters[key] - old_parameters[key])
                diff_matrix[:, :, 0, 0] = normalize(diff_matrix[:, :, 0, 0],
                                                    norm='max',
                                                    axis=0)
                axs[index // 2].imshow(diff_matrix[:, :, 0, 0],
                                       cmap='Greens',
                                       interpolation='nearest')

            # This allows the plots to update as the model trains
            plt.ion()
            plt.show()
            plt.pause(0.001)

        num_frames += logs["num_frames"]
        update += 1

        # Print logs

        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += [
                "entropy", "value", "policy_loss", "value_loss", "grad_norm"
            ]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"], logs["grad_norm"]
            ]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            # The raw returns go to the CSV/TensorBoard only, after the text
            # line has been formatted from the fields above.
            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if write_csv_header:
                csv_logger.writerow(header)
                write_csv_header = False
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status

        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {
                "num_frames": num_frames,
                "update": update,
                "model_state": acmodel.state_dict(),
                "optimizer_state": algo.optimizer.state_dict()
            }
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
Ejemplo n.º 6
0
# Build the observation preprocessor from the first environment's observation
# space and restore its vocabulary when the loaded status carries one.

obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
if "vocab" in status:
    preprocess_obss.vocab.load_vocab(status["vocab"])
txt_logger.info("Observations preprocessor loaded")

# Create the actor-critic model, restore saved weights if the status has any,
# and move the model to the target device.

acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
# historical_models = [acmodel]
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
acmodel.to(device)
txt_logger.info("Model loaded\n")
txt_logger.info(f"{acmodel}\n")

# Algorithm construction is currently disabled; the previous selection logic
# is kept below for reference.

# if args.algo == "a2c":
#     algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
#                             args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
#                             args.optim_alpha, args.optim_eps, preprocess_obss)
# elif args.algo == "ppo":
#     algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
#                             args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
#                             args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss)
# else:
#     raise ValueError("Incorrect algorithm name: {}".format(args.algo))
Ejemplo n.º 7
0
class Agent:
    def __init__(self,
                 env,
                 model_dir,
                 model_type='PPO2',
                 logger=None,
                 argmax=False,
                 use_memory=False,
                 use_text=False,
                 num_cpu=1,
                 frames_per_proc=None,
                 discount=0.99,
                 lr=0.001,
                 gae_lambda=0.95,
                 entropy_coef=0.01,
                 value_loss_coef=0.5,
                 max_grad_norm=0.5,
                 recurrence=1,
                 optim_eps=1e-8,
                 optim_alpha=None,
                 clip_eps=0.2,
                 epochs=4,
                 batch_size=256):
        """
        Initialize the Agent object.

        This primarily stores the configuration parameters, then loads the saved training status (if any), builds
        the observation preprocessor and the actor-critic model, and restores saved weights and vocabulary.

        :param env: the environment for training
        :param model_dir: the save directory (appended with the goal id when the environment has a goal)
        :param model_type: the type of model {'PPO2', 'A2C'}
        :param logger: existing text logger; optional, nothing is logged when it is None
        :param argmax: if we use deterministic or probabilistic action selection
        :param use_memory: if we are using an LSTM
        :param use_text: if we are using NLP to parse the goal
        :param num_cpu: the number of parallel instances for training
        :param frames_per_proc: max time_steps per process (versus constant)
        :param discount: the discount factor (gamma)
        :param lr: the learning rate
        :param gae_lambda: the generalized advantage estimator lambda parameter (training smoothing parameter)
        :param entropy_coef: relative weight for entropy loss
        :param value_loss_coef: relative weight for value function loss
        :param max_grad_norm: max scaling factor for the gradient
        :param recurrence: number of recurrent steps
        :param optim_eps: minimum value to prevent numerical instability
        :param optim_alpha: RMSprop decay parameter (A2C only)
        :param clip_eps: clipping parameter for the advantage and value function (PPO2 only)
        :param epochs: number of epochs in the parameter update (PPO2 only)
        :param batch_size: number of samples for the parameter update (PPO2 only)
        """
        # if the environment has a goal, save into a per-goal sub-folder
        if hasattr(env, 'goal') and env.goal:
            self.model_dir = model_dir + env.goal.goalId + '/'
        else:  # otherwise just use the model_dir as is
            self.model_dir = model_dir

        # store all of the input parameters
        self.model_type = model_type
        self.num_cpu = num_cpu
        self.frames_per_proc = frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.optim_eps = optim_eps
        self.optim_alpha = optim_alpha
        self.clip_eps = clip_eps
        self.epochs = epochs
        self.batch_size = batch_size

        # use the existing logger and create two new ones
        self.txt_logger = logger
        self.csv_file, self.csv_logger = utils.get_csv_logger(self.model_dir)
        self.tb_writer = tensorboardX.SummaryWriter(self.model_dir)

        # set the environment (type-checked) and reset the training_envs
        self.set_env(env)

        # the algorithm is not created until init_training_algo() is called
        self.algo = None

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # the logger is optional (defaults to None), so every use is guarded
        if self.txt_logger:
            self.txt_logger.info(f"Device: {device}\n")

        try:  # if we have a saved model, load it
            self.status = utils.get_status(self.model_dir)
        except OSError:  # otherwise initialize the status
            print('error loading saved model.  initializing empty model...')
            self.status = {"num_frames": 0, "update": 0}
        if self.txt_logger:
            self.txt_logger.info("Training status loaded\n")

        # get the obs_space and the observation pre-processor
        # (for manipulating gym observations into a torch-friendly format)
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            self.env.observation_space)

        # restore the saved vocabulary; this has to happen after the
        # preprocessor exists (it used to reference an undefined name)
        if "vocab" in self.status:
            self.preprocess_obss.vocab.load_vocab(self.status["vocab"])
        if self.txt_logger:
            self.txt_logger.info("Observations preprocessor loaded")

        self.acmodel = ACModel(obs_space,
                               self.env.action_space,
                               use_memory=use_memory,
                               use_text=use_text)
        self.device = device  # store the device {'cpu', 'cuda:N'}
        self.argmax = argmax  # greedy (True) or probabilistic (False) action selection

        if self.acmodel.recurrent:  # initialize the memories
            self.memories = torch.zeros(num_cpu,
                                        self.acmodel.memory_size,
                                        device=self.device)

        # if we have a saved model ('model_state') in the status,
        # load that into the initialized model
        if "model_state" in self.status:
            self.acmodel.load_state_dict(self.status["model_state"])
        self.acmodel.to(device)  # make sure the model is located on the correct device
        if self.txt_logger:
            self.txt_logger.info("Model loaded\n")
            self.txt_logger.info("{}\n".format(self.acmodel))

        self.acmodel.eval()

        # NOTE(review): this reads the vocabulary from the raw ``model_dir``
        # argument rather than ``self.model_dir`` (which may carry the goal
        # sub-folder) -- confirm which directory is intended.
        if hasattr(self.preprocess_obss, "vocab"):
            self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))

    def init_training_algo(self, num_envs=None):
        """
        Initialize the training algorithm.

        This primarily calls the object creation functions for the A2C or PPO2 and the optimizer, but this also spawns
        a number of parallel environments, based on the self.num_cpu or num_envs input (if provided).

        Note, the spawning of parallel environments is VERY slow due to deepcopying the termination sets.  I tried some
        work arounds, but nothing worked properly, so we are stuck with it for now.

        :param num_envs: an override for the default number of environments to spawn (in self.num_cpu)
        """
        if not num_envs:
            num_envs = self.num_cpu

        if self.model_type == "A2C":
            # check to make sure that the A2C parameters are set
            assert self.optim_alpha
            self.training_envs = [deepcopy(self.env) for i in range(num_envs)
                                  ]  # spawn parallel environments

            if self.acmodel.recurrent:
                self.memories = torch.zeros(num_envs,
                                            self.acmodel.memory_size,
                                            device=self.device)

            self.algo = torch_ac.A2CAlgo(
                self.training_envs, self.acmodel, self.device,
                self.frames_per_proc, self.discount, self.lr, self.gae_lambda,
                self.entropy_coef, self.value_loss_coef, self.max_grad_norm,
                self.recurrence, self.optim_alpha, self.optim_eps,
                self.preprocess_obss)
        elif self.model_type == "PPO2":
            # check to see if the PPO2 parameters are set
            assert self.clip_eps and self.epochs and self.batch_size
            self.training_envs = [deepcopy(self.env) for i in range(num_envs)
                                  ]  # spawn parallel environments

            if self.acmodel.recurrent:
                self.memories = torch.zeros(num_envs,
                                            self.acmodel.memory_size,
                                            device=self.device)

            self.algo = torch_ac.PPOAlgo(
                self.training_envs, self.acmodel, self.device,
                self.frames_per_proc, self.discount, self.lr, self.gae_lambda,
                self.entropy_coef, self.value_loss_coef, self.max_grad_norm,
                self.recurrence, self.optim_eps, self.clip_eps, self.epochs,
                self.batch_size, self.preprocess_obss)
        else:
            raise ValueError("Incorrect algorithm name: {}".format(algo_type))

        # load the optimizer state, if it exists
        if "optimizer_state" in self.status:
            self.algo.optimizer.load_state_dict(self.status["optimizer_state"])
        self.txt_logger.info("Optimizer loaded\n")

    def learn(self,
              total_timesteps,
              log_interval=1,
              save_interval=10,
              save_env_info=False,
              save_loc=None):
        """
        The primary training loop.

        :param total_timesteps: the total number of timesteps
        :param log_interval: the period between logging/printing updates
        :param save_interval: the number of updates between model saving (0 or less disables saving)
        :param save_env_info: if we save the environment info (termination set) VERY SLOW
        :param save_loc: currently ignored; a notice is printed when it is provided
        :return: True, if training is successful
        """
        # initialize the training algo/environment list/optimizer
        self.init_training_algo()

        if save_loc:
            print(
                'ignoring save_loc override.  if this is not intended, fix me')

        # initialize parameters
        self.num_frames = self.status["num_frames"]
        self.update = self.status["update"]
        start_time = time.time()

        # The CSV header is written once, and only when training starts from
        # scratch. Checking self.status inside the loop would repeat the
        # header on every log until the first save, because self.status is
        # only refreshed by _save_training_info().
        write_csv_header = self.status["num_frames"] == 0

        # loop until we reach the desired number of timesteps
        while self.num_frames < total_timesteps:
            # Update model parameters

            update_start_time = time.time()  # for the fps calculation
            # collect a number of data points for training
            exps, logs1 = self.algo.collect_experiences()
            # update the parameters based on the experiences
            logs2 = self.algo.update_parameters(exps)
            logs = {**logs1, **logs2}  # merge the logs for printing
            update_end_time = time.time()

            self.num_frames += logs["num_frames"]
            self.update += 1

            # store and print the log info

            if self.update % log_interval == 0:
                fps = logs["num_frames"] / (update_end_time -
                                            update_start_time)
                duration = int(time.time() - start_time)
                return_per_episode = utils.synthesize(
                    logs["return_per_episode"])
                rreturn_per_episode = utils.synthesize(
                    logs["reshaped_return_per_episode"])
                num_frames_per_episode = utils.synthesize(
                    logs["num_frames_per_episode"])

                header = ["update", "frames", "FPS", "duration"]
                data = [self.update, self.num_frames, fps, duration]
                header += [
                    "rreturn_" + key for key in rreturn_per_episode.keys()
                ]
                data += rreturn_per_episode.values()
                header += [
                    "num_frames_" + key
                    for key in num_frames_per_episode.keys()
                ]
                data += num_frames_per_episode.values()
                header += [
                    "entropy", "value", "policy_loss", "value_loss",
                    "grad_norm"
                ]
                data += [
                    logs["entropy"], logs["value"], logs["policy_loss"],
                    logs["value_loss"], logs["grad_norm"]
                ]

                # the logger is optional (defaults to None in __init__)
                if self.txt_logger:
                    self.txt_logger.info(
                        "U {} | F {:06} | FPS {:04.0f} | D {} | rR:usmM {:.2f} {:.2f} {:.2f} {:.2f} | F:usmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | D {:.3f}"
                        .format(*data))

                # the raw returns only go to the CSV/TensorBoard outputs
                header += [
                    "return_" + key for key in return_per_episode.keys()
                ]
                data += return_per_episode.values()

                if write_csv_header:
                    self.csv_logger.writerow(header)
                    write_csv_header = False
                self.csv_logger.writerow(data)
                self.csv_file.flush()

                for field, value in zip(header, data):
                    self.tb_writer.add_scalar(field, value, self.num_frames)

            # Save status

            if save_interval > 0 and self.update % save_interval == 0:
                self._save_training_info()
                if save_env_info:
                    for e in self.training_envs:
                        if hasattr(e, 'save_env_info'):
                            e.save_env_info()

        self._clear_training_envs()

        return True

    def _save_training_info(self):
        """
        Snapshot the current training progress and write it to the model directory.

        ``self.status`` is rebuilt from the frame/update counters and the state
        dicts of the model and the optimizer.  When the observation preprocessor
        exposes a ``vocab`` attribute, that vocabulary is stored as well.
        """

        # collect the fresh status entries
        snapshot = {}
        snapshot["num_frames"] = self.num_frames
        snapshot["update"] = self.update
        snapshot["model_state"] = self.acmodel.state_dict()
        snapshot["optimizer_state"] = self.algo.optimizer.state_dict()
        self.status = snapshot

        # text observations come with a vocabulary that has to be kept too
        preprocessor = self.preprocess_obss
        if hasattr(preprocessor, "vocab"):
            snapshot["vocab"] = preprocessor.vocab.vocab

        # persist the snapshot into model_dir and report it
        utils.save_status(self.status, self.model_dir)
        self.txt_logger.info("Status saved")

    def _clear_training_envs(self):
        """
        Drop the references to the training environments to free up memory.

        If ``self.env`` has a ``termination_set`` attribute, it is first replaced
        by the concatenation of the termination sets of all training environments,
        because that information would otherwise be lost with them.
        """

        # carry the termination sets over to the main env before letting go
        if hasattr(self.env, 'termination_set'):
            collected = []
            for training_env in self.training_envs:
                collected.extend(training_env.termination_set)
            self.env.termination_set = collected

        # release the algorithm's env and the training envs
        self.algo.env = None
        self.training_envs = None

    def save(self, f):
        """
        Legacy entry point for saving the model; nothing is written yet.

        A notice is printed so callers can see that saving is not implemented.

        TODO: place the saving logic for the model here
        :param f: save target (currently ignored)
        """
        notice = 'self.save() - currently not implemented'
        print(notice)

    def set_env(self, env):
        """
        Install a new environment and forget any existing training environments.

        :param env: environment for training/acting; must be a ``gym.Env``
        """
        # only gym environments are accepted
        assert isinstance(env, gym.Env)
        # store the env first, then drop the training envs
        self.env, self.training_envs = env, None

    def predict(self, obs, state=None, deterministic=False):
        """
        Compatibility wrapper for training code: delegates to get_action() and
        returns its result together with a placeholder state.

        :param obs: observation for predicting the action
        :param state: state of the LSTM (unused)
        :param deterministic: whether to use deterministic or probabilistic actions (unused)
        :return: tuple of (action, None); the second slot stands in for the LSTM state
        """
        # state/deterministic are accepted but not honoured yet
        chosen_action = self.get_action(obs)
        return chosen_action, None

    def get_actions(self, obss):
        """
        Choose one action for every observation in a batch.

        :param obss: list of observations for predicting actions
        :return: numpy array holding the action chosen for each observation
        """
        # preprocess the observations to put them in a torch-friendly format
        batch = self.preprocess_obss(obss, device=self.device)

        # inference only, so gradient tracking is switched off
        with torch.no_grad():
            if not self.acmodel.recurrent:
                dist, _ = self.acmodel(batch)
            else:
                # recurrent models take the stored memory and return the new one
                dist, _, self.memories = self.acmodel(batch, self.memories)

        # the acmodel returns a probability distribution over actions
        if self.argmax:
            # deterministic: index of the most probable action in each row
            _, actions = dist.probs.max(1, keepdim=True)
        else:
            # stochastic: draw the action from the distribution
            actions = dist.sample()

        # hand back a numpy array rather than a tensor
        return actions.cpu().numpy()

    def get_action(self, obs):
        """
        Single-observation convenience wrapper around get_actions().

        :param obs: single observation
        :return: the action chosen for that observation
        """
        # wrap the observation in a one-element batch and unwrap the result
        batch_result = self.get_actions([obs])
        return batch_result[0]

    def analyze_feedbacks(self, rewards, dones):
        """
        Zero the stored memory rows that belong to finished episodes.

        Does nothing when the model is not recurrent.

        :param rewards: rewards of the last step (not used here)
        :param dones: one flag per memory row, truthy where the episode ended
        """
        if not self.acmodel.recurrent:
            return

        finished = torch.tensor(dones, dtype=torch.float, device=self.device)
        # 0 where the episode ended, 1 elsewhere, shaped as a column
        keep = 1 - finished.unsqueeze(1)
        self.memories *= keep

    def analyze_feedback(self, reward, done):
        """
        Single-environment convenience wrapper around analyze_feedbacks().

        :param reward: reward of the last step
        :param done: whether the episode ended with the last step
        :return: whatever analyze_feedbacks() returns
        """
        # wrap both values in one-element lists for the batched method
        rewards, dones = [reward], [done]
        return self.analyze_feedbacks(rewards, dones)
Ejemplo n.º 8
0
class Agent:
    """An agent that evaluates every stored goal and acts for the best one.

    It is able:
    - to choose an action given an observation,
    - to analyze the feedback (i.e. reward and done state) of its action."""
    def __init__(self,
                 env,
                 obs_space,
                 action_space,
                 model_dir,
                 device=None,
                 argmax=False,
                 num_envs=1,
                 use_memory=False,
                 use_text=False):
        """Load the model, the goals and (if present) the vocabulary from model_dir.

        :param env: environment (currently unused, kept for interface compatibility)
        :param obs_space: observation space handed to the preprocessor factory
        :param action_space: action space handed to the model
        :param model_dir: directory the saved status is read from
        :param device: torch device for the model and the memories
        :param argmax: pick the most probable action instead of sampling
        :param num_envs: number of memory rows to allocate
        :param use_memory: forwarded to the model constructor
        :param use_text: forwarded to the model constructor
        """
        obs_space, self.preprocess_obs_goals = utils.get_obs_goals_preprocessor(
            obs_space)
        self.acmodel = ACModel(obs_space,
                               action_space,
                               use_memory=use_memory,
                               use_text=use_text)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        status = utils.get_status(model_dir)

        # candidate goals the agent chooses between at every step
        self.goals = list(status['agent_goals'].values())

        # allocates self.memories when the model is recurrent
        self.reset()

        self.acmodel.load_state_dict(status["model_state"])
        self.acmodel.to(self.device)
        self.acmodel.eval()
        if hasattr(self.preprocess_obs_goals, "vocab"):
            self.preprocess_obs_goals.vocab.load_vocab(status["vocab"])

    def concat_obs_goal(self, obs):
        """Pair one observation with every stored goal.

        :param obs: either a mapping with "image" and "mission" entries or a
            bare array
        :return: list with one entry per goal; the goal is concatenated to the
            image (or to the bare array) along axis 2
        """
        # skip the membership test on arrays: numpy would turn it into an
        # element-wise comparison against the string
        if not isinstance(obs, np.ndarray) and 'image' in obs:
            return [{
                "image": np.concatenate((obs["image"], goal), axis=2),
                "mission": obs['mission']
            } for goal in self.goals]
        return [np.concatenate((obs, goal), axis=2) for goal in self.goals]

    def get_actions(self, obss):
        """Choose one action per observation.

        For each observation the model is run once per goal; the goal with the
        highest value estimate is selected and its action distribution decides
        the action.

        :param obss: sequence of observations
        :return: integer numpy array with one action per observation
        """
        actions = np.zeros(len(obss), dtype=int)
        recurrent = self.acmodel.recurrent
        num_goals = len(self.goals)

        for i, obs in enumerate(obss):
            obs_goals = self.concat_obs_goal(obs)
            preprocessed_obs_goals = self.preprocess_obs_goals(
                obs_goals, device=self.device)

            with torch.no_grad():
                if recurrent:
                    # memories only exist for recurrent models, so they must
                    # not be touched outside this branch
                    memory = torch.stack([self.memories[i]] * num_goals, 0)
                    dists, values, memory = self.acmodel(
                        preprocessed_obs_goals, memory)
                else:
                    dists, values = self.acmodel(preprocessed_obs_goals)

            # index of the goal with the highest value estimate
            g = int(values.max(0)[1].item())
            if self.argmax:
                actions[i] = dists.probs.max(1)[1][g].item()
            else:
                actions[i] = dists.sample()[g].item()

            if recurrent:
                # keep only the memory that belongs to the selected goal
                self.memories[i] = memory[g]

        return actions

    def reset(self):
        """Zero the memories (no effect for non-recurrent models)."""
        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs,
                                        self.acmodel.memory_size,
                                        device=self.device)

    def get_action(self, obs):
        """Choose the action for a single observation."""
        return self.get_actions([obs])[0]

    def analyze_feedbacks(self, rewards, dones):
        """Zero the memory rows of the environments whose episode ended.

        :param rewards: rewards of the last step (not used here)
        :param dones: one flag per environment, truthy where the episode ended
        """
        if self.acmodel.recurrent:
            masks = 1 - torch.tensor(
                dones, dtype=torch.float, device=self.device).unsqueeze(1)
            self.memories *= masks

    def analyze_feedback(self, reward, done):
        """Single-environment wrapper around analyze_feedbacks()."""
        return self.analyze_feedbacks([reward], [done])
Ejemplo n.º 9
0
class Agent:
    """An agent.

    It is able:
    - to choose an action given an observation,
    - to analyze the feedback (i.e. reward and done state) of its action."""
    def __init__(self,
                 env,
                 obs_space,
                 action_space,
                 model_dir,
                 ignoreLTL,
                 progression_mode,
                 gnn,
                 recurrence=1,
                 dumb_ac=False,
                 device=None,
                 argmax=False,
                 num_envs=1):
        """Build the model for the given configuration and load its weights.

        :param env: environment, forwarded to the preprocessor factory and the model
        :param obs_space: replaced by the space the preprocessor factory returns
        :param action_space: action space handed to the model
        :param model_dir: directory the status and the model state are read from
        :param ignoreLTL: forwarded to the model constructor
        :param progression_mode: forwarded to the preprocessor factory
        :param gnn: model variant; anything other than "GRU"/"LSTM" counts as a GNN
        :param recurrence: values above 1 select the recurrent model
        :param dumb_ac: forwarded to the model constructor
        :param device: torch device for the model and the memories
        :param argmax: pick the most probable action instead of sampling
        :param num_envs: number of memory rows to allocate
        """
        try:
            status = utils.get_status(model_dir)
        except OSError:
            # no saved status: fall back to an empty one (no vocabulary to load)
            status = {"num_frames": 0, "update": 0}

        using_gnn = gnn not in ("GRU", "LSTM")
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            env, using_gnn, progression_mode)
        if "vocab" in status and self.preprocess_obss.vocab is not None:
            self.preprocess_obss.vocab.load_vocab(status["vocab"])

        if recurrence > 1:
            self.acmodel = RecurrentACModel(env, obs_space, action_space,
                                            ignoreLTL, gnn, dumb_ac, True)
            self.memories = torch.zeros(num_envs,
                                        self.acmodel.memory_size,
                                        device=device)
        else:
            self.acmodel = ACModel(env, obs_space, action_space, ignoreLTL,
                                   gnn, dumb_ac, True)

        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()

    def get_actions(self, obss):
        """Choose one action per observation.

        :param obss: list of observations
        :return: numpy array with the chosen actions
        """
        preprocessed_obss = self.preprocess_obss(obss, device=self.device)

        # inference only, so no gradients are needed
        with torch.no_grad():
            if self.acmodel.recurrent:
                dist, _, self.memories = self.acmodel(preprocessed_obss,
                                                      self.memories)
            else:
                dist, _ = self.acmodel(preprocessed_obss)

        if self.argmax:
            # index of the most probable action in each row
            actions = dist.probs.max(1, keepdim=True)[1]
        else:
            actions = dist.sample()

        return actions.cpu().numpy()

    def get_action(self, obs):
        """Choose the action for a single observation."""
        return self.get_actions([obs])[0]

    def analyze_feedbacks(self, rewards, dones):
        """Zero the memory rows of the environments whose episode ended.

        :param rewards: rewards of the last step (not used here)
        :param dones: one flag per environment, truthy where the episode ended
        """
        if self.acmodel.recurrent:
            # the mask has to live on the same device as the memories,
            # otherwise the in-place multiplication fails off-CPU
            masks = 1 - torch.tensor(
                dones, dtype=torch.float, device=self.device).unsqueeze(1)
            self.memories *= masks

    def analyze_feedback(self, reward, done):
        """Single-environment wrapper around analyze_feedbacks()."""
        return self.analyze_feedbacks([reward], [done])
def tuner(icm_lr, reward_weighting, normalise_rewards, args):
    """Run one training job for a given hyper-parameter setting.

    Builds 16 environments, the actor-critic model, an
    ``AutoencoderWithUncertainty`` with its Adam optimizer and the selected
    algorithm, then alternates experience collection and parameter updates
    until ``args.frames`` frames have been seen.  Progress is logged through
    the text logger and ``log_to_wandb``; the training status is saved every
    ``args.save_interval`` updates.

    :param icm_lr: learning rate of the autoencoder's Adam optimizer
    :param reward_weighting: forwarded to the algorithm constructor
    :param normalise_rewards: forwarded to the algorithm constructor
    :param args: namespace with the run configuration (``env``, ``algo``,
        ``seed``, ``model``, ``frames``, ``recurrence``, ``log_interval``,
        ``save_interval`` and the algorithm options read below); ``args.mem``
        is set here
    :return: None
    :raises ValueError: if ``args.algo`` is neither "a2c" nor "ppo"
    """
    import argparse
    import datetime
    import time  # used for all timing below; imported here like the other modules
    import torch
    import torch_ac
    import tensorboardX
    import sys
    import numpy as np
    from model import ACModel
    from .a2c import A2CAlgo

    # NOTE(review): `from .ppo import PPOAlgo` was disabled here, so the "ppo"
    # branch below only works if PPOAlgo is provided at module level - confirm.

    frames_to_visualise = 200
    # Parse arguments

    args.mem = args.recurrence > 1

    def make_exploration_heatmap(args, plot_title):
        """Plot the log of the saved visitation counts and store it as a PNG.

        NOTE(review): the ``plot_title`` argument is overwritten before use and
        this helper is not called anywhere in this function.
        """
        import numpy as np
        import matplotlib.pyplot as plt

        visitation_counts = np.load(
            f"{args.model}_visitation_counts.npy", allow_pickle=True
        )
        plot_title = str(np.count_nonzero(visitation_counts)) + args.model
        plt.imshow(np.log(visitation_counts))
        plt.colorbar()
        plt.title(plot_title)
        plt.savefig(f"{plot_title}_visitation_counts.png")

    # Set run dir

    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"
    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer

    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments

    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources

    utils.seed(args.seed)

    # Set device

    device = "cpu"  # torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")
    # Load environments

    envs = []

    for _ in range(16):
        an_env = utils.make_env(
            args.env, int(args.frames_before_reset), int(args.environment_seed)
        )
        envs.append(an_env)
    txt_logger.info("Environments loaded\n")

    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        # nothing saved yet: start from scratch
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model

    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo

    # adapted from impact driven RL
    from .models import AutoencoderWithUncertainty

    autoencoder = AutoencoderWithUncertainty(observation_shape=(7, 7, 3)).to(device)

    autoencoder_opt = torch.optim.Adam(
        autoencoder.parameters(), lr=icm_lr, weight_decay=0
    )
    if args.algo == "a2c":
        algo = A2CAlgo(
            envs,
            acmodel,
            autoencoder,
            autoencoder_opt,
            args.uncertainty,
            args.noisy_tv,
            args.curiosity,
            args.randomise_env,
            args.uncertainty_budget,
            args.environment_seed,
            reward_weighting,
            normalise_rewards,
            args.frames_before_reset,
            device,
            args.frames_per_proc,
            args.discount,
            args.lr,
            args.gae_lambda,
            args.entropy_coef,
            args.value_loss_coef,
            args.max_grad_norm,
            args.recurrence,
            args.optim_alpha,
            args.optim_eps,
            preprocess_obss,
            None,
            args.random_action,
        )
    elif args.algo == "ppo":
        algo = PPOAlgo(
            envs,
            acmodel,
            autoencoder,
            autoencoder_opt,
            args.uncertainty,
            args.noisy_tv,
            args.curiosity,
            args.randomise_env,
            args.uncertainty_budget,
            args.environment_seed,
            reward_weighting,
            normalise_rewards,
            device,
            args.frames_per_proc,
            args.discount,
            args.lr,
            args.gae_lambda,
            args.entropy_coef,
            args.value_loss_coef,
            args.max_grad_norm,
            args.recurrence,
            args.optim_eps,
            args.clip_eps,
            args.epochs,
            args.batch_size,
            preprocess_obss,
        )

    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters

        update_start_time = time.time()
        exps, logs1 = algo.collect_experiences()
        logs2 = algo.update_parameters(exps)
        logs = {**logs1, **logs2}
        update_end_time = time.time()

        num_frames += logs["num_frames"]
        update += 1

        log_to_wandb(logs, start_time, update_start_time, update_end_time)

        # Print logs

        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])
            # NOTE(review): header mirrors data but is not written anywhere in
            # this function (the CSV logger and TensorBoard writer stay unused).
            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
            data += num_frames_per_episode.values()
            header += [
                "intrinsic_rewards",
                "uncertainties",
                "novel_states_visited",
                "entropy",
                "value",
                "policy_loss",
                "value_loss",
                "grad_norm",
            ]
            data += [
                logs["intrinsic_rewards"].mean().item(),
                logs["uncertainties"].mean().item(),
                logs["novel_states_visited"].mean().item(),
                logs["entropy"],
                logs["value"],
                logs["policy_loss"],
                logs["value_loss"],
                logs["grad_norm"],
            ]
            # one placeholder per data entry, in data order, so every label
            # sits next to the value it names
            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | iR {:.3f} | unc {:.3f} | nov {:.3f} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}".format(
                    *data
                )
            )
        # Save status
        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {
                "num_frames": num_frames,
                "update": update,
                "model_state": acmodel.state_dict(),
                "optimizer_state": algo.optimizer.state_dict(),
            }
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
    return