def __init__(self, obs_space, action_space, model_dir, device=None,
             argmax=False, num_envs=1, use_memory=False, use_text=False):
    obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)
    self.acmodel = ACModel(obs_space, action_space,
                           use_memory=use_memory, use_text=use_text)
    self.device = device
    self.argmax = argmax
    self.num_envs = num_envs

    if self.acmodel.recurrent:
        self.memories = torch.zeros(self.num_envs, self.acmodel.memory_size)

    self.acmodel.load_state_dict(utils.get_model_state(model_dir))
    self.acmodel.to(self.device)
    self.acmodel.eval()
    if hasattr(self.preprocess_obss, "vocab"):
        self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))
def create_model(obs_space, envs):
    """Helper function to create a new model quickly."""
    cuda = torch.cuda.is_available()
    device = torch.device("cuda" if cuda else "cpu")
    model = ACModel(obs_space, envs[0].action_space)
    model = model.to(device)
    return model
def __init__(self, learning_rate, discount, action_space, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.learning_rate = learning_rate
    self.discount = discount
    self.action = None
    self.ac_model = ACModel(action_space)
    self.ac_model.compile(optimizer=Adam(learning_rate=learning_rate))
def __init__(self, talker):
    super(ACBrain, self).__init__()
    self.model = ACModel()
    self.model.build((None, IMG_H, IMG_W, k))
    self.talker = talker
    self.i = 1
    self.optimizer = optim.Adam(learning_rate=CustomSchedule(lr))
    self.states_list = self.talker.states_list
    self.memory = []
    self.one_episode_reward_index = 0
def __init__(self, obs_space, action_space, model_dir, device=None,
             argmax=False, num_envs=1):
    obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)
    self.acmodel = ACModel(obs_space, action_space)
    self.device = device
    self.argmax = argmax
    self.num_envs = num_envs

    self.acmodel.load_state_dict(utils.get_model_state(model_dir))
    self.acmodel.to(self.device)
    self.acmodel.eval()
def __init__(self, obs_space, action_space, model_dir, device=None,
             argmax=False, num_envs=1):
    obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)
    self.acmodel = ACModel(obs_space, action_space)
    self.device = device
    self.argmax = argmax
    self.num_envs = num_envs

    if self.acmodel.recurrent:
        self.memories = torch.zeros(self.num_envs, self.acmodel.memory_size)

    self.acmodel.load_state_dict(utils.get_model_state(model_dir))
    self.acmodel.to(self.device)
    self.acmodel.eval()
def __init__(self, env, obs_space, action_space, model_dir, ignoreLTL,
             progression_mode, gnn, recurrence=1, dumb_ac=False,
             device=None, argmax=False, num_envs=1):
    try:
        print(model_dir)
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}

    using_gnn = (gnn != "GRU" and gnn != "LSTM")
    obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
        env, using_gnn, progression_mode)
    if "vocab" in status and self.preprocess_obss.vocab is not None:
        self.preprocess_obss.vocab.load_vocab(status["vocab"])

    if recurrence > 1:
        self.acmodel = RecurrentACModel(env, obs_space, action_space,
                                        ignoreLTL, gnn, dumb_ac, True)
        self.memories = torch.zeros(num_envs, self.acmodel.memory_size,
                                    device=device)
    else:
        self.acmodel = ACModel(env, obs_space, action_space, ignoreLTL,
                               gnn, dumb_ac, True)

    self.device = device
    self.argmax = argmax
    self.num_envs = num_envs

    self.acmodel.load_state_dict(utils.get_model_state(model_dir))
    self.acmodel.to(self.device)
    self.acmodel.eval()
class Agent:
    """An agent. It is able:
    - to choose an action given an observation,
    - to analyze the feedback (i.e. reward and done state) of its action."""

    def __init__(self, obs_space, action_space, model_dir, device=None,
                 argmax=False, num_envs=1, use_rim=False):
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)
        self.acmodel = ACModel(obs_space, action_space, use_rim=use_rim)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs,
                                        self.acmodel.memory_size).to(device)

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()
        if hasattr(self.preprocess_obss, "vocab"):
            self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))

    def get_actions(self, obss):
        preprocessed_obss = self.preprocess_obss(obss, device=self.device)

        with torch.no_grad():
            if self.acmodel.recurrent:
                dist, _, self.memories = self.acmodel(preprocessed_obss,
                                                      self.memories)
            else:
                dist, _ = self.acmodel(preprocessed_obss)

        if self.argmax:
            actions = dist.probs.max(1, keepdim=True)[1]
        else:
            actions = dist.sample()

        return actions.cpu().numpy()

    def get_action(self, obs):
        return self.get_actions([obs])[0]

    def analyze_feedbacks(self, rewards, dones):
        if self.acmodel.recurrent:
            masks = 1 - torch.tensor(dones, dtype=torch.float).to(
                self.device).unsqueeze(1)
            self.memories *= masks

    def analyze_feedback(self, reward, done):
        return self.analyze_feedbacks([reward], [done])
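
# A minimal usage sketch of the Agent above (illustrative: assumes a Gym-style
# `env` with the classic 4-tuple step API and a trained model saved under
# "storage/demo"; the directory name is an assumption).
agent = Agent(env.observation_space, env.action_space, "storage/demo",
              device=torch.device("cpu"))
obs = env.reset()
done = False
while not done:
    action = agent.get_action(obs)        # forward pass only, no gradients
    obs, reward, done, _ = env.step(action)
    agent.analyze_feedback(reward, done)  # zeroes recurrent memory when done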
def __init__(self, env, obs_space, action_space, model_dir, device=None,
             argmax=False, num_envs=1, use_memory=False, use_text=False):
    obs_space, self.preprocess_obs_goals = utils.get_obs_goals_preprocessor(
        obs_space)
    self.acmodel = ACModel(obs_space, action_space,
                           use_memory=use_memory, use_text=use_text)
    self.device = device
    self.argmax = argmax
    self.num_envs = num_envs

    status = utils.get_status(model_dir)
    self.goals = list(status['agent_goals'].values())
    # for goal in self.goals:
    #     goal = env.unwrapped.get_obs_render(goal, tile_size=32)
    #     plt.imshow(goal)
    #     plt.show()

    if self.acmodel.recurrent:
        self.memories = torch.zeros(self.num_envs, self.acmodel.memory_size,
                                    device=self.device)

    self.acmodel.load_state_dict(status["model_state"])
    self.acmodel.to(self.device)
    self.acmodel.eval()
    if hasattr(self.preprocess_obs_goals, "vocab"):
        self.preprocess_obs_goals.vocab.load_vocab(status["vocab"])
class ACAgent(AgentBase):
    def __init__(self, learning_rate, discount, action_space, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.learning_rate = learning_rate
        self.discount = discount
        self.action = None
        self.ac_model = ACModel(action_space)
        self.ac_model.compile(optimizer=Adam(learning_rate=learning_rate))

    def before(self, *args, **kwargs):
        pass

    def after(self, *args, **kwargs):
        pass

    def act(self, state) -> int:
        state = tf.convert_to_tensor([state])
        _, probs = self.ac_model(state)
        action_probs = tfp.distributions.Categorical(probs=probs)
        self.action = action_probs.sample()
        return self.action.numpy().item()

    def learn(self, *args, **kwargs):
        state = tf.convert_to_tensor([kwargs['state']], dtype=tf.float32)
        next_state = tf.convert_to_tensor([kwargs['next_state']],
                                          dtype=tf.float32)
        reward = tf.convert_to_tensor([kwargs['reward']], dtype=tf.float32)
        done = kwargs['done']

        with tf.GradientTape(persistent=False) as tape:
            state_val, probs = self.ac_model(state)
            next_state_val, _ = self.ac_model(next_state)
            state_val = tf.squeeze(state_val)
            next_state_val = tf.squeeze(next_state_val)

            action_probs = tfp.distributions.Categorical(probs=probs)
            log_prob = action_probs.log_prob(self.action)

            # one-step TD error: r + gamma * V(s') * (1 - done) - V(s)
            exp_val = reward + self.discount * next_state_val * (
                1 - int(done)) - state_val
            actor_loss = -log_prob * exp_val
            critic_loss = exp_val**2
            total_loss = actor_loss + critic_loss

        gradient = tape.gradient(total_loss, self.ac_model.trainable_variables)
        self.ac_model.optimizer.apply_gradients(
            zip(gradient, self.ac_model.trainable_variables))

    def save_model(self):
        self.ac_model.save_weights("path/to/file")

    def load_model(self):
        self.ac_model.load_weights("path/to/file")
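
# Illustrative training loop for ACAgent (a sketch, assuming a Gym-style `env`
# with the 4-tuple step API and an AgentBase that needs no extra constructor
# arguments; the episode count is arbitrary).
agent = ACAgent(learning_rate=1e-3, discount=0.99,
                action_space=env.action_space.n)
for episode in range(500):
    state, done = env.reset(), False
    while not done:
        action = agent.act(state)  # samples from the policy head
        next_state, reward, done, _ = env.step(action)
        # one-step actor-critic update on the freshest transition
        agent.learn(state=state, next_state=next_state,
                    reward=reward, done=done)
        state = next_state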
def train(model_type, batch_size, sequence_length, frame_shape):
    model = ACModel(model_type, input_shape=(20, 120, 120, 3))
    data = DataSet(sequence_length, frame_shape)
    checkpoint = ModelCheckpoint(
        filepath=os.path.join(
            'CheckPoints', (model_type + '-.{epoch:03d}-{val_loss:.3f}.hdf5')),
        verbose=1,
        save_best_only=True)
    tensorBoard = TensorBoard(
        log_dir=os.path.join('CheckPoints', 'logs', model_type))

    if 'parallel' not in model_type:
        tri_generator = data.generator('train', 'fn', batch_size)
        val_generator = data.generator('test', 'fn', batch_size)
    else:
        tri_generator = data.parallel_generator('train', batch_size)
        val_generator = data.parallel_generator('test', batch_size)

    model.model.fit_generator(generator=tri_generator,
                              steps_per_epoch=data.size('train') // batch_size,
                              epochs=epochs,
                              verbose=1,
                              callbacks=[tensorBoard, checkpoint],
                              validation_data=val_generator,
                              validation_steps=4,
                              workers=1)
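
# Illustrative call of train() (a sketch: the model_type string and shapes are
# assumptions, and `epochs` is read from module scope by train() as written).
epochs = 50
train('lstm', batch_size=16, sequence_length=20, frame_shape=(120, 120, 3))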
class Agent:
    """An agent. It is able:
    - to choose an action given an observation,
    - to analyze the feedback (i.e. reward and done state) of its action."""

    def __init__(self, obs_space, action_space, model_dir, device=None,
                 argmax=False, num_envs=1):
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(obs_space)
        self.acmodel = ACModel(obs_space, action_space)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()

    def get_actions(self, obss):
        preprocessed_obss = self.preprocess_obss(obss, device=self.device)

        with torch.no_grad():
            dist, _ = self.acmodel(preprocessed_obss)

        if self.argmax:
            actions = dist.probs.max(1, keepdim=True)[1]
        else:
            actions = dist.sample()

        return actions.cpu().numpy()

    def get_action(self, obs):
        return self.get_actions([obs])[0]

    def analyze_feedbacks(self, rewards, dones):
        pass

    def analyze_feedback(self, reward, done):
        return self.analyze_feedbacks([reward], [done])
def __init__(self, env, model_dir, model_type='PPO2', logger=None,
             argmax=False, use_memory=False, use_text=False, num_cpu=1,
             frames_per_proc=None, discount=0.99, lr=0.001, gae_lambda=0.95,
             entropy_coef=0.01, value_loss_coef=0.5, max_grad_norm=0.5,
             recurrence=1, optim_eps=1e-8, optim_alpha=None, clip_eps=0.2,
             epochs=4, batch_size=256):
    """
    Initialize the Agent object. This primarily includes storing of the
    configuration parameters, but there is some other logic for correctly
    initializing the agent.

    :param env: the environment for training
    :param model_dir: the save directory (appended with the goal_id in initialization)
    :param model_type: the type of model {'PPO2', 'A2C'}
    :param logger: existing text logger
    :param argmax: if we use deterministic or probabilistic action selection
    :param use_memory: if we are using an LSTM
    :param use_text: if we are using NLP to parse the goal
    :param num_cpu: the number of parallel instances for training
    :param frames_per_proc: max time_steps per process (versus constant)
    :param discount: the discount factor (gamma)
    :param lr: the learning rate
    :param gae_lambda: the generalized advantage estimator lambda parameter
        (training smoothing parameter)
    :param entropy_coef: relative weight for entropy loss
    :param value_loss_coef: relative weight for value function loss
    :param max_grad_norm: max scaling factor for the gradient
    :param recurrence: number of recurrent steps
    :param optim_eps: minimum value to prevent numerical instability
    :param optim_alpha: RMSprop decay parameter (A2C only)
    :param clip_eps: clipping parameter for the advantage and value function (PPO2 only)
    :param epochs: number of epochs in the parameter update (PPO2 only)
    :param batch_size: number of samples for the parameter update (PPO2 only)
    """
    if hasattr(env, 'goal') and env.goal:
        # if the environment has a goal, set the model_dir to the goal folder
        self.model_dir = model_dir + env.goal.goalId + '/'
    else:
        # otherwise just use the model_dir as is
        self.model_dir = model_dir

    # store all of the input parameters
    self.model_type = model_type
    self.num_cpu = num_cpu
    self.frames_per_proc = frames_per_proc
    self.discount = discount
    self.lr = lr
    self.gae_lambda = gae_lambda
    self.entropy_coef = entropy_coef
    self.value_loss_coef = value_loss_coef
    self.max_grad_norm = max_grad_norm
    self.recurrence = recurrence
    self.optim_eps = optim_eps
    self.optim_alpha = optim_alpha
    self.clip_eps = clip_eps
    self.epochs = epochs
    self.batch_size = batch_size

    # use the existing logger and create two new ones
    self.txt_logger = logger
    self.csv_file, self.csv_logger = utils.get_csv_logger(self.model_dir)
    self.tb_writer = tensorboardX.SummaryWriter(self.model_dir)

    # set the environment, with some additional checks and init of training_envs
    self.set_env(env)
    # we don't initialize the algorithm until we call init_training_algo()
    self.algo = None

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.txt_logger.info(f"Device: {device}\n")

    try:
        # if we have a saved model, load it
        self.status = utils.get_status(self.model_dir)
    except OSError:
        # otherwise initialize the status
        print('error loading saved model. initializing empty model...')
        self.status = {"num_frames": 0, "update": 0}
    if self.txt_logger:
        self.txt_logger.info("Training status loaded\n")

    # get the obs_space and the observation pre-processor
    # (for manipulating gym observations into a torch-friendly format)
    obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
        self.env.observation_space)
    # note: the vocab can only be restored once the preprocessor exists
    if "vocab" in self.status:
        self.preprocess_obss.vocab.load_vocab(self.status["vocab"])
    if self.txt_logger:
        self.txt_logger.info("Observations preprocessor loaded")

    self.acmodel = ACModel(obs_space, self.env.action_space,
                           use_memory=use_memory, use_text=use_text)
    self.device = device  # store the device {'cpu', 'cuda:N'}
    # if we are using greedy action selection
    # or are we using probabilistic action selection
    self.argmax = argmax

    if self.acmodel.recurrent:
        # initialize the memories
        self.memories = torch.zeros(num_cpu, self.acmodel.memory_size,
                                    device=self.device)

    if "model_state" in self.status:
        # if we have a saved model ('model_state') in the status,
        # load that into the initialized model
        self.acmodel.load_state_dict(self.status["model_state"])
    # make sure the model is located on the correct device
    self.acmodel.to(device)
    self.txt_logger.info("Model loaded\n")
    self.txt_logger.info("{}\n".format(self.acmodel))

    # some redundant code. uncomment if there are issues and delete after enough testing
    # if 'model_state' in self.status:
    #     self.acmodel.load_state_dict(self.status['model_state'])
    # self.acmodel.to(self.device)

    self.acmodel.eval()
    if hasattr(self.preprocess_obss, "vocab"):
        self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))
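
# Illustrative construction of this trainer-style Agent (a sketch, assuming a
# MiniGrid-like `env` and the utils.get_txt_logger helper used elsewhere in
# this codebase; directory name and hyperparameter values are assumptions).
logger = utils.get_txt_logger("storage/goal-agent/")
agent = Agent(env, "storage/goal-agent/", model_type="PPO2", logger=logger,
              num_cpu=4, frames_per_proc=128, recurrence=1)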
try:
    status = utils.get_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}
txt_logger.info("Training status loaded\n")

# Load observations preprocessor
obs_space, preprocess_obss = utils.get_obss_preprocessor(
    envs[0].observation_space)  # TODO
if "vocab" in status:
    preprocess_obss.vocab.load_vocab(status["vocab"])
txt_logger.info("Observations preprocessor loaded")

# Load model
acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
acmodel.to(device)
txt_logger.info("Model loaded\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo
if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                            args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef,
                            args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps, preprocess_obss)
elif args.algo == "ppo":
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument("--algo", required=True,
                        help="algorithm to use: a2c | ppo | ppo_intrinsic (REQUIRED)")
    parser.add_argument("--env", required=True,
                        help="name of the environment to train on (REQUIRED)")
    parser.add_argument("--model", default=None,
                        help="name of the model (default: {ENV}_{ALGO}_{TIME})")
    parser.add_argument("--seed", type=int, default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval", type=int, default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument("--save-interval", type=int, default=10,
                        help="number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs", type=int, default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames", type=int, default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs", type=int, default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size", type=int, default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument("--frames-per-proc", type=int, default=None,
                        help="number of frames per process before update "
                             "(default: 5 for A2C and 128 for PPO)")
    parser.add_argument("--discount", type=float, default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr", type=float, default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument("--gae-lambda", type=float, default=0.95,
                        help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)")
    parser.add_argument("--entropy-coef", type=float, default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef", type=float, default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument("--optim-eps", type=float, default=1e-8,
                        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha", type=float, default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps", type=float, default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument("--recurrence", type=int, default=1,
                        help="number of time-steps gradient is backpropagated (default: 1). "
                             "If > 1, a LSTM is added to the model to have memory.")
    parser.add_argument("--text", action="store_true", default=False,
                        help="add a GRU to the model to handle text input")
    parser.add_argument("--visualize", default=False,
                        help="show real time CNN layer weight changes")

    args = parser.parse_args()
    args.mem = args.recurrence > 1

    # Set run dir
    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"
    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer
    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments
    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources
    utils.seed(args.seed)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments
    envs = []
    for i in range(args.procs):
        envs.append(utils.make_env(args.env, args.seed + 10000 * i))
    txt_logger.info("Environments loaded\n")

    # Load training status
    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model
    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo
    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps,
                                preprocess_obss)
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)
    elif args.algo == "ppo_intrinsic":
        algo = torch_ac.PPOAlgoIntrinsic(
            envs, acmodel, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.epochs, args.batch_size, preprocess_obss)
    elif args.algo == "a2c_intrinsic":
        algo = torch_ac.A2CAlgoIntrinsic(
            envs, acmodel, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_alpha,
            args.optim_eps, preprocess_obss)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model
    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    print_visual = args.visualize
    if print_visual:
        fig, axs = plt.subplots(1, 3)
        fig.suptitle('Convolution Layer Weights Normalized Difference')

    while num_frames < args.frames:
        # Store copies of s_t model params
        old_parameters = {}
        for name, param in acmodel.named_parameters():
            old_parameters[name] = param.detach().numpy().copy()

        # Update model parameters
        update_start_time = time.time()
        exps, logs1 = algo.collect_experiences()
        logs2 = algo.update_parameters(exps)
        logs = {**logs1, **logs2}
        update_end_time = time.time()

        # Store copies of s_t+1 model params
        new_parameters = {}
        for name, param in acmodel.named_parameters():
            new_parameters[name] = param.detach().numpy().copy()

        # Compute L2 norm of model state differences
        # Print model weight change visualization
        for index in range(len(old_parameters.keys())):
            if index == 0 or index == 2 or index == 4:
                key = list(old_parameters.keys())[index]
                old_weights = old_parameters[key]
                new_weights = new_parameters[key]
                norm_diff = numpy.linalg.norm(new_weights - old_weights)
                diff_matrix = abs(new_weights - old_weights)
                diff_matrix[:, :, 0, 0] = normalize(diff_matrix[:, :, 0, 0],
                                                    norm='max', axis=0)
                if print_visual:
                    axs[int(index / 2)].imshow(diff_matrix[:, :, 0, 0],
                                               cmap='Greens',
                                               interpolation='nearest')

        # This allows the plots to update as the model trains
        if print_visual:
            plt.ion()
            plt.show()
            plt.pause(0.001)

        num_frames += logs["num_frames"]
        update += 1

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += ["num_frames_" + key
                       for key in num_frames_per_episode.keys()]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss",
                       "grad_norm"]
            data += [logs["entropy"], logs["value"], logs["policy_loss"],
                     logs["value_loss"], logs["grad_norm"]]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} "
                "| rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} "
                "| F:μσmM {:.1f} {:.1f} {} {} "
                "| H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if status["num_frames"] == 0:
                csv_logger.writerow(header)
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status
        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {"num_frames": num_frames, "update": update,
                      "model_state": acmodel.state_dict(),
                      "optimizer_state": algo.optimizer.state_dict()}
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
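
# Example invocation of the training script above (illustrative: the script
# path and environment id are assumptions):
#
#   python -m scripts.train --algo ppo --env MiniGrid-DoorKey-5x5-v0 \
#       --frames 80000 --procs 16 --save-interval 10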
def main(raw_args=None):
    # Parse arguments
    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument("--algo", required=True,
                        help="algorithm to use: a2c | ppo | ipo (REQUIRED)")
    parser.add_argument("--domain1", required=True,
                        help="name of the first domain to train on (REQUIRED)")
    parser.add_argument("--domain2", required=True,
                        help="name of the second domain to train on (REQUIRED)")
    parser.add_argument("--p1", required=True, type=float,
                        help="Proportion of training environments from first domain (REQUIRED)")
    parser.add_argument("--model", required=True, help="name of the model")
    parser.add_argument("--seed", type=int, default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval", type=int, default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument("--save-interval", type=int, default=10,
                        help="number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs", type=int, default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames", type=int, default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs", type=int, default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size", type=int, default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument("--frames-per-proc", type=int, default=None,
                        help="number of frames per process before update "
                             "(default: 5 for A2C and 128 for PPO)")
    parser.add_argument("--discount", type=float, default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr", type=float, default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument("--gae-lambda", type=float, default=0.95,
                        help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)")
    parser.add_argument("--entropy-coef", type=float, default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef", type=float, default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument("--optim-eps", type=float, default=1e-8,
                        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha", type=float, default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps", type=float, default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument("--recurrence", type=int, default=1,
                        help="number of time-steps gradient is backpropagated (default: 1). "
                             "If > 1, a LSTM is added to the model to have memory.")
    parser.add_argument("--text", action="store_true", default=False,
                        help="add a GRU to the model to handle text input")

    args = parser.parse_args(raw_args)
    args.mem = args.recurrence > 1

    # Check PyTorch version
    if torch.__version__ != '1.2.0':
        raise ValueError(
            "PyTorch version must be 1.2.0 (see README). Your version is {}."
            .format(torch.__version__))

    if args.mem:
        raise ValueError("Policies with memory not supported.")

    # Set run dir
    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = args.model
    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer
    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments
    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources
    torch.backends.cudnn.deterministic = True
    utils.seed(args.seed)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments from different domains
    domain1 = args.domain1  # e.g., 'MiniGrid-ColoredKeysRed-v0'
    domain2 = args.domain2  # e.g., 'MiniGrid-ColoredKeysYellow-v0'
    p1 = args.p1  # Proportion of environments from domain1

    num_envs_total = args.procs  # Total number of environments
    num_domain1 = math.ceil(p1 * num_envs_total)  # Number of environments in domain1
    num_domain2 = num_envs_total - num_domain1  # Number of environments in domain2

    # Environments from domain1
    envs1 = []
    for i in range(num_domain1):
        envs1.append(utils.make_env(domain1, args.seed + 10000 * i))

    # Environments from domain2
    envs2 = []
    for i in range(num_domain2):
        envs2.append(utils.make_env(domain2, args.seed + 10000 * i))

    # All environments
    envs = envs1 + envs2
    txt_logger.info("Environments loaded\n")

    # Load training status
    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    if args.algo == "ipo":
        # Load model for IPO game
        acmodel = ACModel_average(obs_space, envs[0].action_space,
                                  args.mem, args.text)
        if "model_state" in status:
            acmodel.load_state_dict(status["model_state"])
        acmodel.to(device)
        txt_logger.info("Model loaded\n")
        txt_logger.info("{}\n".format(acmodel))
    else:
        # Load model (for standard PPO or A2C)
        acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
        if "model_state" in status:
            acmodel.load_state_dict(status["model_state"])
        acmodel.to(device)
        txt_logger.info("Model loaded\n")
        txt_logger.info("{}\n".format(acmodel))

    # Load algo
    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps,
                                preprocess_obss)
        if "optimizer_state" in status:
            algo.optimizer.load_state_dict(status["optimizer_state"])
            txt_logger.info("Optimizer loaded\n")
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)
        if "optimizer_state" in status:
            algo.optimizer.load_state_dict(status["optimizer_state"])
            txt_logger.info("Optimizer loaded\n")
    elif args.algo == "ipo":
        # One algo per domain. These have different environments, but a shared acmodel.
        algo1 = torch_ac.IPOAlgo(envs1, acmodel, 1, device,
                                 args.frames_per_proc, args.discount, args.lr,
                                 args.gae_lambda, args.entropy_coef,
                                 args.value_loss_coef, args.max_grad_norm,
                                 args.recurrence, args.optim_eps,
                                 args.clip_eps, args.epochs, args.batch_size,
                                 preprocess_obss)
        algo2 = torch_ac.IPOAlgo(envs2, acmodel, 2, device,
                                 args.frames_per_proc, args.discount, args.lr,
                                 args.gae_lambda, args.entropy_coef,
                                 args.value_loss_coef, args.max_grad_norm,
                                 args.recurrence, args.optim_eps,
                                 args.clip_eps, args.epochs, args.batch_size,
                                 preprocess_obss)
        if "optimizer_state1" in status:
            algo1.optimizer.load_state_dict(status["optimizer_state1"])
            txt_logger.info("Optimizer 1 loaded\n")
        if "optimizer_state2" in status:
            algo2.optimizer.load_state_dict(status["optimizer_state2"])
            txt_logger.info("Optimizer 2 loaded\n")
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # Train model
    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters
        update_start_time = time.time()

        if args.algo == "ipo":
            # Standard method
            # Collect experiences on first domain
            exps1, logs_exps1 = algo1.collect_experiences()
            # Update params of model corresponding to first domain
            logs_algo1 = algo1.update_parameters(exps1)
            # Collect experiences on second domain
            exps2, logs_exps2 = algo2.collect_experiences()
            # Update params of model corresponding to second domain
            logs_algo2 = algo2.update_parameters(exps2)
            # Update end time
            update_end_time = time.time()

            # Combine logs
            logs_exps = {
                'return_per_episode':
                    logs_exps1["return_per_episode"]
                    + logs_exps2["return_per_episode"],
                'reshaped_return_per_episode':
                    logs_exps1["reshaped_return_per_episode"]
                    + logs_exps2["reshaped_return_per_episode"],
                'num_frames_per_episode':
                    logs_exps1["num_frames_per_episode"]
                    + logs_exps2["num_frames_per_episode"],
                'num_frames':
                    logs_exps1["num_frames"] + logs_exps2["num_frames"]
            }
            # Per-domain losses are averaged, weighted by the number of
            # environments in each domain
            logs_algo = {
                'entropy': (num_domain1 * logs_algo1["entropy"]
                            + num_domain2 * logs_algo2["entropy"]) / num_envs_total,
                'value': (num_domain1 * logs_algo1["value"]
                          + num_domain2 * logs_algo2["value"]) / num_envs_total,
                'policy_loss': (num_domain1 * logs_algo1["policy_loss"]
                                + num_domain2 * logs_algo2["policy_loss"]) / num_envs_total,
                'value_loss': (num_domain1 * logs_algo1["value_loss"]
                               + num_domain2 * logs_algo2["value_loss"]) / num_envs_total,
                'grad_norm': (num_domain1 * logs_algo1["grad_norm"]
                              + num_domain2 * logs_algo2["grad_norm"]) / num_envs_total
            }
            logs = {**logs_exps, **logs_algo}
            num_frames += logs["num_frames"]
        else:
            exps, logs1 = algo.collect_experiences()
            logs2 = algo.update_parameters(exps)
            logs = {**logs1, **logs2}
            update_end_time = time.time()
            num_frames += logs["num_frames"]

        update += 1

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += ["num_frames_" + key
                       for key in num_frames_per_episode.keys()]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss",
                       "grad_norm"]
            data += [logs["entropy"], logs["value"], logs["policy_loss"],
                     logs["value_loss"], logs["grad_norm"]]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} "
                "| rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} "
                "| F:μσmM {:.1f} {:.1f} {} {} "
                "| H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            # header += ["debug_last_env_reward"]
            # data += [logs["debug_last_env_reward"]]

            header += ["total_loss"]
            data += [logs["policy_loss"]
                     - args.entropy_coef * logs["entropy"]
                     + args.value_loss_coef * logs["value_loss"]]

            if status["num_frames"] == 0:
                csv_logger.writerow(header)
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status
        if args.save_interval > 0 and update % args.save_interval == 0:
            if args.algo == "ipo":
                status = {"num_frames": num_frames, "update": update,
                          "model_state": acmodel.state_dict(),
                          "optimizer_state1": algo1.optimizer.state_dict(),
                          "optimizer_state2": algo2.optimizer.state_dict()}
            else:
                status = {"num_frames": num_frames, "update": update,
                          "model_state": acmodel.state_dict(),
                          "optimizer_state": algo.optimizer.state_dict()}
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
class Agent:
    """An agent. It is able:
    - to choose an action given an observation,
    - to analyze the feedback (i.e. reward and done state) of its action."""

    def __init__(self, env, obs_space, action_space, model_dir, ignoreLTL,
                 progression_mode, gnn, recurrence=1, dumb_ac=False,
                 device=None, argmax=False, num_envs=1):
        try:
            print(model_dir)
            status = utils.get_status(model_dir)
        except OSError:
            status = {"num_frames": 0, "update": 0}

        using_gnn = (gnn != "GRU" and gnn != "LSTM")
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            env, using_gnn, progression_mode)
        if "vocab" in status and self.preprocess_obss.vocab is not None:
            self.preprocess_obss.vocab.load_vocab(status["vocab"])

        if recurrence > 1:
            self.acmodel = RecurrentACModel(env, obs_space, action_space,
                                            ignoreLTL, gnn, dumb_ac, True)
            self.memories = torch.zeros(num_envs, self.acmodel.memory_size,
                                        device=device)
        else:
            self.acmodel = ACModel(env, obs_space, action_space, ignoreLTL,
                                   gnn, dumb_ac, True)

        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()

    def get_actions(self, obss):
        preprocessed_obss = self.preprocess_obss(obss, device=self.device)

        with torch.no_grad():
            if self.acmodel.recurrent:
                dist, _, self.memories = self.acmodel(preprocessed_obss,
                                                      self.memories)
            else:
                dist, _ = self.acmodel(preprocessed_obss)

        if self.argmax:
            actions = dist.probs.max(1, keepdim=True)[1]
        else:
            actions = dist.sample()

        return actions.cpu().numpy()

    def get_action(self, obs):
        return self.get_actions([obs])[0]

    def analyze_feedbacks(self, rewards, dones):
        if self.acmodel.recurrent:
            masks = 1 - torch.tensor(dones, dtype=torch.float).unsqueeze(1)
            self.memories *= masks

    def analyze_feedback(self, reward, done):
        return self.analyze_feedbacks([reward], [done])
# Load training status
try:
    status = utils.get_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}
txt_logger.info("Training status loaded\n")

# Load observations preprocessor
obs_space, preprocess_obss = utils.get_obss_preprocessor(
    envs[0].observation_space)
txt_logger.info("Observations preprocessor loaded")

# Load model
acmodel = ACModel(envs[0].observation_space, envs[0].action_space,
                  memory, False)
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
acmodel.to(device)
txt_logger.info("Model loaded\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo
if algorithm == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, 5, discount, lr,
                            gae_lambda, entropy_coef, value_loss_coef,
                            max_grad_norm, recurrence, optim_alpha,
                            optim_eps, preprocess_obss)
elif algorithm == "ppo":
    algo = torch_ac.PPOAlgo(envs, acmodel, device, 128, discount, lr,
                            gae_lambda, entropy_coef, value_loss_coef,
                            max_grad_norm, recurrence,
try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model
try:
    acmodel = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    acmodel = ACModel(obs_space, envs[0].action_space, args.model_type,
                      use_bottleneck=args.use_bottleneck,
                      dropout=args.use_dropout,
                      use_l2a=args.use_l2a,
                      use_bn=args.use_bn,
                      sni_type=args.sni_type)
    logger.info("Model successfully created\n")
logger.info("{}\n".format(acmodel))

if torch.cuda.is_available():
    acmodel.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Define actor-critic algo
# a2c does not yet support the bottleneck
assert args.algo == "ppo"
try:
    status = utils.get_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}
txt_logger.info("Training status loaded\n")

# Load observations preprocessor
obs_space, preprocess_obss = utils.get_obss_preprocessor(
    envs[0].observation_space)
if "vocab" in status:
    preprocess_obss.vocab.load_vocab(status["vocab"])
txt_logger.info("Observations preprocessor loaded")

# Load model
acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text,
                  args.use_rim)
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
acmodel.to(device)
txt_logger.info("Model loaded\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo
if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                            args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef,
                            args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps, preprocess_obss)
elif args.algo == "ppo":
    algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                            args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef,
                            args.max_grad_norm, args.recurrence,
try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model
try:
    acmodel = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    acmodel = ACModel(obs_space, envs[0].action_space, args.model_type,
                      use_bottleneck=args.use_bottleneck,
                      dropout=args.use_dropout,
                      use_l2a=args.use_l2a,
                      use_bn=args.use_bn,
                      sni_type=args.sni_type,
                      flow=args.flow,
                      n_flows=args.n_flows,
                      num_latent_channels=args.num_latent_channels)
    logger.info("Model successfully created\n")
logger.info("{}\n".format(acmodel))

if torch.cuda.is_available():
    acmodel.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Define actor-critic algo
try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model
try:
    base_model = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    if args.algo == "dqn":
        base_model = DQNModel(obs_space, envs[0].action_space,
                              args.mem, args.text)
    else:
        base_model = ACModel(obs_space, envs[0].action_space,
                             args.mem, args.text)
    logger.info("Model successfully created\n")
logger.info("{}\n".format(base_model))

if torch.cuda.is_available():
    base_model.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Train model
num_frames = status["num_frames"]
total_start_time = time.time()
update = status["update"]
best_val = 0

if args.algo == "a2c":
    env = gym.make(args.env)
    env.seed(args.seed + 10000 * i)
    envs.append(env)

# Define obss preprocessor
preprocess_obss = utils.ObssPreprocessor(save_dir, envs[0].observation_space)

# Define actor-critic model
if utils.model_exists(save_dir):
    acmodel = utils.load_model(save_dir)
    status = utils.load_status(save_dir)
    logger.info("Model successfully loaded\n")
else:
    acmodel = ACModel(preprocess_obss.obs_space, envs[0].action_space,
                      not args.no_instr, not args.no_mem)
    status = {"num_frames": 0, "update": 0}
    logger.info("Model successfully created\n")
logger.info("{}\n".format(acmodel))

if torch.cuda.is_available():
    acmodel.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Define actor-critic algo
if args.algo == "a2c":
    algo = torch_rl.A2CAlgo(envs, acmodel, args.frames_per_proc,
                            args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef,
                            args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps,
obs_space, preprocess_obss = utils.get_obss_preprocessor(
    args.env, envs[0].observation_space, model_dir)

# Load training status
try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model
try:
    acmodel = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    logger.info("Model successfully created\n")
logger.info("{}\n".format(acmodel))

if torch.cuda.is_available():
    acmodel.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Define actor-critic algo
if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, args.frames_per_proc,
                            args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef,
                            args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps, preprocess_obss)
elif args.algo == "ppo":
    algo = torch_ac.PPOAlgo(envs, acmodel, args.frames_per_proc,
                            args.discount, args.lr, args.gae_lambda,
class Agent:
    def __init__(self, env, model_dir, model_type='PPO2', logger=None,
                 argmax=False, use_memory=False, use_text=False, num_cpu=1,
                 frames_per_proc=None, discount=0.99, lr=0.001,
                 gae_lambda=0.95, entropy_coef=0.01, value_loss_coef=0.5,
                 max_grad_norm=0.5, recurrence=1, optim_eps=1e-8,
                 optim_alpha=None, clip_eps=0.2, epochs=4, batch_size=256):
        """
        Initialize the Agent object. This primarily includes storing of the
        configuration parameters, but there is some other logic for correctly
        initializing the agent.

        :param env: the environment for training
        :param model_dir: the save directory (appended with the goal_id in initialization)
        :param model_type: the type of model {'PPO2', 'A2C'}
        :param logger: existing text logger
        :param argmax: if we use deterministic or probabilistic action selection
        :param use_memory: if we are using an LSTM
        :param use_text: if we are using NLP to parse the goal
        :param num_cpu: the number of parallel instances for training
        :param frames_per_proc: max time_steps per process (versus constant)
        :param discount: the discount factor (gamma)
        :param lr: the learning rate
        :param gae_lambda: the generalized advantage estimator lambda parameter
            (training smoothing parameter)
        :param entropy_coef: relative weight for entropy loss
        :param value_loss_coef: relative weight for value function loss
        :param max_grad_norm: max scaling factor for the gradient
        :param recurrence: number of recurrent steps
        :param optim_eps: minimum value to prevent numerical instability
        :param optim_alpha: RMSprop decay parameter (A2C only)
        :param clip_eps: clipping parameter for the advantage and value function (PPO2 only)
        :param epochs: number of epochs in the parameter update (PPO2 only)
        :param batch_size: number of samples for the parameter update (PPO2 only)
        """
        if hasattr(env, 'goal') and env.goal:
            # if the environment has a goal, set the model_dir to the goal folder
            self.model_dir = model_dir + env.goal.goalId + '/'
        else:
            # otherwise just use the model_dir as is
            self.model_dir = model_dir

        # store all of the input parameters
        self.model_type = model_type
        self.num_cpu = num_cpu
        self.frames_per_proc = frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.optim_eps = optim_eps
        self.optim_alpha = optim_alpha
        self.clip_eps = clip_eps
        self.epochs = epochs
        self.batch_size = batch_size

        # use the existing logger and create two new ones
        self.txt_logger = logger
        self.csv_file, self.csv_logger = utils.get_csv_logger(self.model_dir)
        self.tb_writer = tensorboardX.SummaryWriter(self.model_dir)

        # set the environment, with some additional checks and init of training_envs
        self.set_env(env)
        # we don't initialize the algorithm until we call init_training_algo()
        self.algo = None

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.txt_logger.info(f"Device: {device}\n")

        try:
            # if we have a saved model, load it
            self.status = utils.get_status(self.model_dir)
        except OSError:
            # otherwise initialize the status
            print('error loading saved model. initializing empty model...')
            self.status = {"num_frames": 0, "update": 0}
        if self.txt_logger:
            self.txt_logger.info("Training status loaded\n")

        # get the obs_space and the observation pre-processor
        # (for manipulating gym observations into a torch-friendly format)
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            self.env.observation_space)
        # note: the vocab can only be restored once the preprocessor exists
        if "vocab" in self.status:
            self.preprocess_obss.vocab.load_vocab(self.status["vocab"])
        if self.txt_logger:
            self.txt_logger.info("Observations preprocessor loaded")

        self.acmodel = ACModel(obs_space, self.env.action_space,
                               use_memory=use_memory, use_text=use_text)
        self.device = device  # store the device {'cpu', 'cuda:N'}
        # if we are using greedy action selection
        # or are we using probabilistic action selection
        self.argmax = argmax

        if self.acmodel.recurrent:
            # initialize the memories
            self.memories = torch.zeros(num_cpu, self.acmodel.memory_size,
                                        device=self.device)

        if "model_state" in self.status:
            # if we have a saved model ('model_state') in the status,
            # load that into the initialized model
            self.acmodel.load_state_dict(self.status["model_state"])
        # make sure the model is located on the correct device
        self.acmodel.to(device)
        self.txt_logger.info("Model loaded\n")
        self.txt_logger.info("{}\n".format(self.acmodel))

        # some redundant code. uncomment if there are issues and delete after enough testing
        # if 'model_state' in self.status:
        #     self.acmodel.load_state_dict(self.status['model_state'])
        # self.acmodel.to(self.device)

        self.acmodel.eval()
        if hasattr(self.preprocess_obss, "vocab"):
            self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))

    def init_training_algo(self, num_envs=None):
        """
        Initialize the training algorithm. This primarily calls the object
        creation functions for the A2C or PPO2 and the optimizer, but this
        also spawns a number of parallel environments, based on the
        self.num_cpu or num_envs input (if provided).

        Note, the spawning of parallel environments is VERY slow due to
        deepcopying the termination sets. I tried some workarounds, but
        nothing worked properly, so we are stuck with it for now.

        :param num_envs: an override for the default number of environments
            to spawn (in self.num_cpu)
        """
        if not num_envs:
            num_envs = self.num_cpu

        if self.model_type == "A2C":
            # check to make sure that the A2C parameters are set
            assert self.optim_alpha
            # spawn parallel environments
            self.training_envs = [deepcopy(self.env) for i in range(num_envs)]
            if self.acmodel.recurrent:
                self.memories = torch.zeros(num_envs,
                                            self.acmodel.memory_size,
                                            device=self.device)
            self.algo = torch_ac.A2CAlgo(
                self.training_envs, self.acmodel, self.device,
                self.frames_per_proc, self.discount, self.lr,
                self.gae_lambda, self.entropy_coef, self.value_loss_coef,
                self.max_grad_norm, self.recurrence, self.optim_alpha,
                self.optim_eps, self.preprocess_obss)
        elif self.model_type == "PPO2":
            # check to see if the PPO2 parameters are set
            assert self.clip_eps and self.epochs and self.batch_size
            # spawn parallel environments
            self.training_envs = [deepcopy(self.env) for i in range(num_envs)]
            if self.acmodel.recurrent:
                self.memories = torch.zeros(num_envs,
                                            self.acmodel.memory_size,
                                            device=self.device)
            self.algo = torch_ac.PPOAlgo(
                self.training_envs, self.acmodel, self.device,
                self.frames_per_proc, self.discount, self.lr,
                self.gae_lambda, self.entropy_coef, self.value_loss_coef,
                self.max_grad_norm, self.recurrence, self.optim_eps,
                self.clip_eps, self.epochs, self.batch_size,
                self.preprocess_obss)
        else:
            # note: fixed from the original `algo_type`, which was undefined here
            raise ValueError(
                "Incorrect algorithm name: {}".format(self.model_type))

        # load the optimizer state, if it exists
        if "optimizer_state" in self.status:
            self.algo.optimizer.load_state_dict(self.status["optimizer_state"])
            self.txt_logger.info("Optimizer loaded\n")

    def learn(self, total_timesteps, log_interval=1, save_interval=10,
              save_env_info=False, save_loc=None):
        """
        The primary training loop.

        :param total_timesteps: the total number of timesteps
        :param log_interval: the period between logging/printing updates
        :param save_interval: the number of updates between model saving
        :param save_env_info: if we save the environment info (termination set) VERY SLOW
        :return: True, if training is successful
        """
        # initialize the training algo/environment list/optimizer
        self.init_training_algo()
        if save_loc:
            print('ignoring save_loc override. if this is not intended, fix me')

        # initialize parameters
        self.num_frames = self.status["num_frames"]
        self.update = self.status["update"]
        start_time = time.time()

        # loop until we reach the desired number of timesteps
        while self.num_frames < total_timesteps:
            # Update model parameters
            update_start_time = time.time()  # store the time (for fps calculations)
            # collect a number of data points for training
            exps, logs1 = self.algo.collect_experiences()
            # update the parameters based on the experiences
            logs2 = self.algo.update_parameters(exps)
            logs = {**logs1, **logs2}  # merge the logs for printing
            update_end_time = time.time()

            self.num_frames += logs["num_frames"]
            self.update += 1

            # all of this messy stuff is just storing and printing the log info
            if self.update % log_interval == 0:
                fps = logs["num_frames"] / (update_end_time - update_start_time)
                duration = int(time.time() - start_time)
                return_per_episode = utils.synthesize(
                    logs["return_per_episode"])
                rreturn_per_episode = utils.synthesize(
                    logs["reshaped_return_per_episode"])
                num_frames_per_episode = utils.synthesize(
                    logs["num_frames_per_episode"])

                header = ["update", "frames", "FPS", "duration"]
                data = [self.update, self.num_frames, fps, duration]
                header += ["rreturn_" + key
                           for key in rreturn_per_episode.keys()]
                data += rreturn_per_episode.values()
                header += ["num_frames_" + key
                           for key in num_frames_per_episode.keys()]
                data += num_frames_per_episode.values()
                header += ["entropy", "value", "policy_loss", "value_loss",
                           "grad_norm"]
                data += [logs["entropy"], logs["value"], logs["policy_loss"],
                         logs["value_loss"], logs["grad_norm"]]

                self.txt_logger.info(
                    "U {} | F {:06} | FPS {:04.0f} | D {} "
                    "| rR:usmM {:.2f} {:.2f} {:.2f} {:.2f} "
                    "| F:usmM {:.1f} {:.1f} {} {} "
                    "| H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | D {:.3f}"
                    .format(*data))

                header += ["return_" + key
                           for key in return_per_episode.keys()]
                data += return_per_episode.values()

                if self.status["num_frames"] == 0:
                    self.csv_logger.writerow(header)
                self.csv_logger.writerow(data)
                self.csv_file.flush()

                for field, value in zip(header, data):
                    self.tb_writer.add_scalar(field, value, self.num_frames)

            # Save status
            if save_interval > 0 and self.update % save_interval == 0:
                self._save_training_info()
                if save_env_info:
                    for e in self.training_envs:
                        if hasattr(e, 'save_env_info'):
                            e.save_env_info()

        self._clear_training_envs()
        return True

    def _save_training_info(self):
        """
        Function to save the training info.
        """
        # update the status dictionary
        self.status = {"num_frames": self.num_frames,
                       "update": self.update,
                       "model_state": self.acmodel.state_dict(),
                       "optimizer_state": self.algo.optimizer.state_dict()}
        if hasattr(self.preprocess_obss, "vocab"):
            # if we are using NLP, save the NLP info
            self.status["vocab"] = self.preprocess_obss.vocab.vocab
        # save the status info to model_dir
        utils.save_status(self.status, self.model_dir)
        self.txt_logger.info("Status saved")

    def _clear_training_envs(self):
        """
        Clear the training environments to free up memory.
        """
        # the termination set gets lost, so we need to store it again
        if hasattr(self.env, 'termination_set'):
            self.env.termination_set = [s for e in self.training_envs
                                        for s in e.termination_set]
        # clear the env and the training envs
        self.algo.env = None
        self.training_envs = None

    def save(self, f):
        """
        Legacy function for saving the model.
        TODO: place the saving logic for the model here

        :param f:
        """
        print('self.save() - currently not implemented')

    def set_env(self, env):
        """
        Set the environment and clear the training environments.

        :param env: environment for training/acting
        """
        # check to make sure the environment is the correct type
        assert isinstance(env, gym.Env)
        self.env = env
        self.training_envs = None

    def predict(self, obs, state=None, deterministic=False):
        """
        Wrapper for training code compatibility. Calls get_action() to predict
        the action to take based on the current observation.

        :param obs: observation for predicting the action
        :param state: state of the LSTM (unused)
        :param deterministic: whether to use deterministic or probabilistic actions (unused)
        :return: action and LSTM state
        """
        # assert (state == None) and (deterministic == False)  # still need to reimplement
        # return action, states - states is unused at the moment
        return self.get_action(obs), None

    def get_actions(self, obss):
        """
        Get a list of actions for a list of observations.

        :param obss: list of observations for predicting actions
        :return: list of actions for the associated observations
        """
        # preprocess the observations to put them in a torch-friendly format
        preprocessed_obss = self.preprocess_obss(obss, device=self.device)

        # don't calculate the gradients, since we are doing a forward pass
        with torch.no_grad():
            if self.acmodel.recurrent:  # if we are using a recurrent model
                dist, _, self.memories = self.acmodel(preprocessed_obss,
                                                      self.memories)
            else:  # otherwise
                dist, _ = self.acmodel(preprocessed_obss)

        # the acmodel returns a probability distribution
        if self.argmax:
            # if we are deterministic, take the action with the highest probability
            actions = dist.probs.max(1, keepdim=True)[1]
        else:
            # otherwise sample the distribution to select the action
            actions = dist.sample()

        return actions.cpu().numpy()  # return a numpy array, not a tensor

    def get_action(self, obs):
        """
        Wrapper for get_actions() to produce just a single action (rather than
        a list of actions) for acting.

        :param obs: single observation
        :return: single action
        """
        return self.get_actions([obs])[0]

    def analyze_feedbacks(self, rewards, dones):
        """
        rl-starter-files code. Resets the recurrent memories of the
        environments whose episodes have just ended.

        :param rewards:
        :param dones:
        """
        if self.acmodel.recurrent:
            masks = 1 - torch.tensor(dones, dtype=torch.float,
                                     device=self.device).unsqueeze(1)
            self.memories *= masks

    def analyze_feedback(self, reward, done):
        """
        rl-starter-files code. Wraps analyze_feedbacks() for a single
        reward/done pair.

        :param reward:
        :param done:
        :return:
        """
        return self.analyze_feedbacks([reward], [done])
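
# Illustrative end-to-end use of the Agent wrapper above (a sketch, assuming a
# gym.Env instance `env`; the directory name, timestep budget, and process
# count are assumptions).
agent = Agent(env, "storage/demo-agent/", model_type="PPO2",
              logger=utils.get_txt_logger("storage/demo-agent/"), num_cpu=4)
agent.learn(total_timesteps=100_000, log_interval=1, save_interval=10)
action, _ = agent.predict(env.reset())  # greedy vs. sampled per `argmax`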
class Agent:
    """An agent. It is able:
    - to choose an action given an observation,
    - to analyze the feedback (i.e. reward and done state) of its action."""

    def __init__(self, env, obs_space, action_space, model_dir, device=None,
                 argmax=False, num_envs=1, use_memory=False, use_text=False):
        obs_space, self.preprocess_obs_goals = utils.get_obs_goals_preprocessor(
            obs_space)
        self.acmodel = ACModel(obs_space, action_space,
                               use_memory=use_memory, use_text=use_text)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        status = utils.get_status(model_dir)
        self.goals = list(status['agent_goals'].values())
        # for goal in self.goals:
        #     goal = env.unwrapped.get_obs_render(goal, tile_size=32)
        #     plt.imshow(goal)
        #     plt.show()

        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs,
                                        self.acmodel.memory_size,
                                        device=self.device)

        self.acmodel.load_state_dict(status["model_state"])
        self.acmodel.to(self.device)
        self.acmodel.eval()
        if hasattr(self.preprocess_obs_goals, "vocab"):
            self.preprocess_obs_goals.vocab.load_vocab(status["vocab"])

    def concat_obs_goal(self, obs):
        if 'image' in obs:
            obs_goals = [{"image": np.concatenate((obs["image"],
                                                   self.goals[i]), axis=2),
                          "mission": obs['mission']}
                         for i in range(len(self.goals))]
        else:
            obs_goals = [np.concatenate((obs, self.goals[i]), axis=2)
                         for i in range(len(self.goals))]
        return obs_goals

    def get_actions(self, obss):
        actions = np.zeros(len(obss), dtype=int)
        for i in range(len(obss)):
            # memories only exist for recurrent models (guard added; the
            # original read self.memories[i] unconditionally)
            memory = self.memories[i] if self.acmodel.recurrent else None
            obs_goals = self.concat_obs_goal(obss[i])
            preprocessed_obs_goals = self.preprocess_obs_goals(
                obs_goals, device=self.device)

            with torch.no_grad():
                if self.acmodel.recurrent:
                    memory = torch.stack([memory] * len(self.goals), 0)
                    dists, values, memory = self.acmodel(
                        preprocessed_obs_goals, memory)
                else:
                    dists, values = self.acmodel(preprocessed_obs_goals)

            g = values.data.max(0)[1]
            print(values.data, g)  # debug output

            if self.argmax:
                actions[i] = dists.probs.max(1, keepdim=True)[1][g].cpu().numpy()
            else:
                actions[i] = dists.sample()[g].cpu().numpy()

            if self.acmodel.recurrent:
                self.memories[i] = memory[g]
        return actions

    def reset(self):
        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs,
                                        self.acmodel.memory_size,
                                        device=self.device)

    def get_action(self, obs):
        return self.get_actions([obs])[0]

    def analyze_feedbacks(self, rewards, dones):
        if self.acmodel.recurrent:
            masks = 1 - torch.tensor(dones, dtype=torch.float,
                                     device=self.device).unsqueeze(1)
            self.memories *= masks

    def analyze_feedback(self, reward, done):
        return self.analyze_feedbacks([reward], [done])
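
# Sketch of the goal-selection step in get_actions above (illustrative, for the
# non-recurrent case): the observation is paired with every stored goal, the
# critic scores each pair, and the action comes from the policy head of the
# highest-valued pair.
obs_goals = agent.concat_obs_goal(obs)               # one (obs, goal) pair per goal
batch = agent.preprocess_obs_goals(obs_goals, device=agent.device)
with torch.no_grad():
    dists, values = agent.acmodel(batch)
g = values.max(0)[1]                                 # index of the best-valued goal
action = dists.sample()[g]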
except:
    txt_logger.info("Failed to load pretrained model.\n")
    exit(1)

# Load observations preprocessor
using_gnn = (args.gnn != "GRU" and args.gnn != "LSTM")
obs_space, preprocess_obss = utils.get_obss_preprocessor(
    envs[0], using_gnn, progression_mode)
if "vocab" in status and preprocess_obss.vocab is not None:
    preprocess_obss.vocab.load_vocab(status["vocab"])
txt_logger.info("Observations preprocessor loaded.\n")

# Load model
if use_mem:
    acmodel = RecurrentACModel(envs[0].env, obs_space, envs[0].action_space,
                               args.ignoreLTL, args.gnn, args.dumb_ac,
                               args.freeze_ltl)
else:
    acmodel = ACModel(envs[0].env, obs_space, envs[0].action_space,
                      args.ignoreLTL, args.gnn, args.dumb_ac,
                      args.freeze_ltl)

if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
    txt_logger.info("Loading model from existing run.\n")
elif args.pretrained_gnn:
    acmodel.load_pretrained_gnn(pretrained_status["model_state"])
    txt_logger.info("Pretrained model loaded.\n")
acmodel.to(device)
txt_logger.info("Model loaded.\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo
if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                            args.discount, args.lr, args.gae_lambda,
# Load observations preprocessor
obs_space, preprocess_obss = utils.get_obss_preprocessor(
    envs[0].observation_space)
if "vocab" in status:
    preprocess_obss.vocab.load_vocab(status["vocab"])
txt_logger.info("Observations preprocessor loaded")

# Load model
if args.model == "ACMLP":
    acmodel = ACMLPModel(obs_space, envs[0].action_space)
elif args.model == "ACNAC":
    acmodel = ACNACModel(obs_space, envs[0].action_space)
else:
    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text,
                      args.use_nac)
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
acmodel.to(device)
acmodel.eval()
txt_logger.info("Model loaded\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo
if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                            args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef,
                            args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps, preprocess_obss)
elif args.algo == "ppo":
    else:
        status = utils.get_status(model_dir)
    txt_logger.info("Training status loaded\n")
except OSError:
    status = {"num_frames": 0, "update": 0}

# Load observations preprocessor
obs_space, preprocess_obs_goals = utils.get_obs_goals_preprocessor(
    envs[0].observation_space)
if "vocab" in status:
    preprocess_obs_goals.vocab.load_vocab(status["vocab"])
txt_logger.info("Observations preprocessor loaded")

# Load model
acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
    txt_logger.info("Model loaded\n")
acmodel.to(device)
txt_logger.info("{}\n".format(acmodel))

# Load algo
if args.algo == "a2c":
    algo = a2c.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                       args.discount, args.lr, args.gae_lambda,
                       args.entropy_coef, args.value_loss_coef,
                       args.max_grad_norm, args.recurrence,
                       args.optim_alpha, args.optim_eps,
                       preprocess_obs_goals)
elif args.algo == "ppo":
    algo = ppo.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                       args.discount, args.lr, args.gae_lambda,
                       args.entropy_coef, args.value_loss_coef,
                       args.max_grad_norm, args.recurrence,
# Load training status
try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model
try:
    acmodel = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    if args.model_type == 'standard':
        acmodel = ACModel(obs_space, envs[0].action_space, args.mem,
                          args.text, args.prev_action, args.manual_memory,
                          args.manual_memory_size)
    elif args.model_type == 'aux':
        acmodel = ACAuxModel(obs_space, envs[0].action_space, args.mem,
                             args.text, args.prev_action, args.manual_memory,
                             args.manual_memory_size, args.aux_context)
    elif args.model_type == 'aux_empower':
        acmodel = ACAuxEmpowerModel(obs_space, envs[0].action_space,
                                    args.mem, args.text, args.prev_action,
                                    args.manual_memory,
                                    args.manual_memory_size,
                                    args.aux_context)
    logger.info("Model successfully created\n")
logger.info("{}\n".format(acmodel))

if torch.cuda.is_available():
    acmodel.cuda()