def train(log_dir):
    """Performs the agent training.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create environment
    env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim, log_dir=log_dir)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    tb_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Create tensorboard variables
    tb_lr_a = tf.Variable(lr_a, dtype=tf.float32)
    tb_lr_l = tf.Variable(lr_l, dtype=tf.float32)
    tb_lr_lag = tf.Variable(lr_a, dtype=tf.float32)
    tb_ret = tf.Variable(0, dtype=tf.float32)
    tb_len = tf.Variable(0, dtype=tf.float32)
    tb_a_loss = tf.Variable(0, dtype=tf.float32)
    tb_lyapunov_error = tf.Variable(0, dtype=tf.float32)
    tb_entropy = tf.Variable(0, dtype=tf.float32)

    # Initialize tensorboard variables and create summaries
    if USE_TB:
        policy.sess.run(
            [
                tb_lr_a.initializer,
                tb_lr_l.initializer,
                tb_lr_lag.initializer,
                tb_ret.initializer,
                tb_len.initializer,
                tb_a_loss.initializer,
                tb_lyapunov_error.initializer,
                tb_entropy.initializer,
            ]
        )

        # Add tensorboard summaries
        main_sum = tf.compat.v1.summary.merge(
            [
                tf.compat.v1.summary.scalar("lr_a", tb_lr_a),
                tf.compat.v1.summary.scalar("lr_l", tb_lr_l),
                tf.compat.v1.summary.scalar("lr_lag", tb_lr_lag),
                tf.compat.v1.summary.scalar("alpha", policy.alpha),
                tf.compat.v1.summary.scalar("lambda", policy.labda),
            ]
        )
        other_sum = tf.compat.v1.summary.merge(
            [
                tf.compat.v1.summary.scalar("ep_ret", tb_ret),
                tf.compat.v1.summary.scalar("ep_length", tb_len),
                tf.compat.v1.summary.scalar("a_loss", tb_a_loss),
                tf.compat.v1.summary.scalar("lyapunov_error", tb_lyapunov_error),
                tf.compat.v1.summary.scalar("entropy", tb_entropy),
            ]
        )
        policy.tb_writer.add_summary(
            policy.sess.run(main_sum), policy.sess.run(policy.step)
        )
        if WRITE_W_B:
            policy.tb_writer.add_summary(
                policy.sess.run(policy.w_b_sum),
                policy.sess.run(policy.step),
            )
        policy.tb_writer.flush()  # Above summaries are known from the start

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            # a = np.squeeze(np.random.uniform(low=-1.0, high=1.0, size=(1, 2)))  # DEBUG
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Increment tensorboard step counter
            # NOTE: This was done differently from the global_step counter since
            # otherwise there were inconsistencies in the tb log.
            if USE_TB:
                tb_step += 1

            # Optimize weights and parameters using STG
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                    training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"]
                            )
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend(
                            [key, ":", str(round(training_diagnostics[key], 2)), "|"]
                        )
                        for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Decay learning rate
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                    # Get current model performance for tb
                    if USE_TB:
                        training_diagnostics = evaluate_training_rollouts(
                            last_training_paths
                        )

                # Log tb variables
                if USE_TB:
                    if i % TB_FREQ == 0:

                        # Update and log learning rate tb vars
                        policy.sess.run(policy.step.assign(tb_step))
                        policy.sess.run(tb_lr_a.assign(lr_a_now))
                        policy.sess.run(tb_lr_l.assign(lr_l_now))
                        policy.sess.run(tb_lr_lag.assign(lr_a))
                        policy.tb_writer.add_summary(
                            policy.sess.run(main_sum), policy.sess.run(policy.step)
                        )

                        # Update and log other training vars to tensorboard
                        if training_started:

                            # Update and log training vars
                            policy.sess.run(
                                tb_ret.assign(training_diagnostics["return"])
                            )
                            policy.sess.run(
                                tb_len.assign(training_diagnostics["length"])
                            )
                            policy.sess.run(
                                tb_a_loss.assign(training_diagnostics["a_loss"])
                            )
                            policy.sess.run(
                                tb_lyapunov_error.assign(
                                    training_diagnostics["lyapunov_error"]
                                )
                            )
                            policy.sess.run(
                                tb_entropy.assign(training_diagnostics["entropy"])
                            )
                            policy.tb_writer.add_summary(
                                policy.sess.run(other_sum), policy.sess.run(policy.step)
                            )

                        # Log network weights
                        if WRITE_W_B:
                            policy.tb_writer.add_summary(
                                policy.sess.run(policy.w_b_sum),
                                policy.sess.run(policy.step),
                            )
                        policy.tb_writer.flush()

                # Decay learning rates
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    # policy.tb_writer.close()
    print("Running time: ", time.time() - t1)
    return
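# NOTE: The `action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2`
# step in the episode loop above maps the squashed policy output `a` in [-1, 1] onto
# the environment's action bounds. A minimal NumPy sketch of that transformation; the
# bounds and sample action below are illustrative values, not taken from any of the
# environments used here.
import numpy as np

a_lowerbound = np.array([-2.0, 0.0])  # illustrative env.action_space.low
a_upperbound = np.array([2.0, 1.0])  # illustrative env.action_space.high
a = np.array([-1.0, 0.5])  # squashed policy output in [-1, 1]

# Affine map from [-1, 1] to [a_lowerbound, a_upperbound], as used in train()
action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2
print(action)  # -> [-2., 0.75]: -1 hits the lower bound, +1 would hit the upper bound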
def train(log_dir):
    """Performs the agent training.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create environment
    print(f"You are training in the {ENV_NAME} environment.\n")
    env = get_env_from_name(ENV_NAME, ENV_SEED)
    test_env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim)

    # Load model if retraining is selected
    if TRAIN_PARAMS["continue_training"]:

        # Create retrain path
        retrain_model_folder = TRAIN_PARAMS["continue_model_folder"]
        retrain_model_path = os.path.abspath(
            os.path.join(log_dir, "../../" + TRAIN_PARAMS["continue_model_folder"])
        )

        # Check if retrain model exists if not throw error
        if not os.path.exists(retrain_model_path):
            print(
                "Shutting down training since the model you specified in the "
                f"`continue_model_folder` `{retrain_model_folder}` "
                f"argument was not found for the `{ENV_NAME}` environment."
            )
            sys.exit(0)

        # Load retrain model
        print(f"Restoring model `{retrain_model_path}`")
        result = policy.restore(os.path.abspath(retrain_model_path + "/policy"))
        if not result:
            print(
                "Shutting down training as something went wrong while loading "
                f"model `{retrain_model_folder}`."
            )
            sys.exit(0)

        # Create new storage folder
        log_dir_split = log_dir.split("/")
        log_dir_split[-2] = (
            "_".join(TRAIN_PARAMS["continue_model_folder"].split("/")) + "_finetune"
            # + "_retrained_"
            # + log_dir_split[-2]
        )
        log_dir = "/".join(log_dir_split)

        # Reset Lagrange multipliers if requested
        if ALG_PARAMS["reset_lagrance_multipliers"]:
            policy.sess.run(policy.log_alpha.assign(tf.math.log(ALG_PARAMS["alpha"])))
            policy.sess.run(policy.log_labda.assign(tf.math.log(ALG_PARAMS["labda"])))
    else:
        print(f"Train new model `{log_dir}`")

    # Print logging folder
    print(f"Logging results to `{log_dir}`.")

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "lyapunov_error": [],
            "alpha": [],
            "lambda": [],
            "entropy": [],
            "a_loss": [],
        }

        # Break out of loop if global steps have been reached
        if global_step > ENV_PARAMS["max_global_steps"]:

            # Print step count, save model and stop the program
            print(f"Training stopped after {global_step} steps.")
            print("Running time: ", time.time() - t1)
            print("Saving Model")
            policy.save_result(log_dir)
            print("Running time: ", time.time() - t1)
            return

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Save intermediate checkpoints if requested
            if TRAIN_PARAMS["save_checkpoints"]:
                if (
                    global_step % TRAIN_PARAMS["checkpoint_save_freq"] == 0
                    and global_step != 0
                ):

                    # Create intermediate result checkpoint folder
                    checkpoint_save_path = os.path.abspath(
                        os.path.join(log_dir, "checkpoints", "step_" + str(j))
                    )
                    os.makedirs(checkpoint_save_path, exist_ok=True)

                    # Save intermediate checkpoint
                    policy.save_result(checkpoint_save_path)

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize weights and parameters using STG
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(test_env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                    training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"]
                            )
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend(
                            [key, ":", str(round(training_diagnostics[key], 2)), "|"]
                        )
                        for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Decay learning rate
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                # Decay learning rates
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break
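# NOTE: When `continue_training` is enabled above, the new storage folder is derived by
# replacing the second-to-last component of `log_dir` with the flattened
# `continue_model_folder` name plus a `_finetune` suffix. A small sketch of that path
# rewrite; the folder names below are made up for illustration and are not real paths
# from the project.
log_dir = "logs/oscillator/LAC20210101_1200/0/logs"
continue_model_folder = "oscillator/LAC20201231_0900/0"

log_dir_split = log_dir.split("/")
log_dir_split[-2] = "_".join(continue_model_folder.split("/")) + "_finetune"
print("/".join(log_dir_split))
# -> logs/oscillator/LAC20210101_1200/oscillator_LAC20201231_0900_0_finetune/logs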
def train(log_dir):
    """Performs the agent training.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create environment
    env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize weights and parameters using STG
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                    training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"]
                            )
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend(
                            [key, ":", str(round(training_diagnostics[key], 2)), "|"]
                        )
                        for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Decay learning rate
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    print("Running time: ", time.time() - t1)
    return
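# NOTE: At the end of every episode the learning rates are decayed linearly in the
# number of global steps, shrinking from their initial values toward (almost) zero at
# `max_global_steps`. A small sketch of that schedule; the initial rates and the step
# budget below are illustrative, the real values come from ALG_PARAMS and ENV_PARAMS.
lr_a, lr_l = 1e-4, 3e-4  # illustrative initial learning rates
max_global_steps = 100000  # illustrative step budget

for global_step in (1, 25000, 50000, 100000):
    frac = 1.0 - (global_step - 1.0) / max_global_steps
    lr_a_now = lr_a * frac  # ~1.0e-4, ~7.5e-5, ~5.0e-5, ~1e-9
    lr_l_now = lr_l * frac  # ~3.0e-4, ~2.25e-4, ~1.5e-4, ~3e-9
    print(global_step, lr_a_now, lr_l_now)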
def train(log_dir):
    """Performs the agent training.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create environment
    env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim, log_dir=log_dir)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Log initial values to tensorboard
    if DEBUG_PARAMS["use_tb"]:

        # Trace learn method (Used for debugging)
        if DEBUG_PARAMS["debug"]:
            if DEBUG_PARAMS["trace_net"]:

                # Create dummy input
                batch = {
                    "s": tf.random.uniform((ALG_PARAMS["batch_size"], policy.s_dim)),
                    "a": tf.random.uniform((ALG_PARAMS["batch_size"], policy.a_dim)),
                    "r": tf.random.uniform((ALG_PARAMS["batch_size"], 1)),
                    "terminal": tf.zeros((ALG_PARAMS["batch_size"], 1)),
                    "s_": tf.random.uniform((ALG_PARAMS["batch_size"], policy.s_dim)),
                }

                # Trace learn method and log to tensorboard
                tf.summary.trace_on(graph=True, profiler=True)
                policy.learn(lr_a_now, lr_l_now, lr_a, batch)
                with policy.tb_writer.as_default():
                    tf.summary.trace_export(
                        name="learn", step=0, profiler_outdir=log_dir
                    )

            # Shut down as we are in debug mode
            if DEBUG_PARAMS["trace_net"] or DEBUG_PARAMS["trace_learn"]:
                print(
                    "Shutting down training as a trace was requested in debug mode. "
                    "This was done since during the trace a backward pass was performed "
                    "on dummy data. Please disable the trace to continue training "
                    "while being in debug mode."
                )
                sys.exit(0)

        # Log initial values
        with policy.tb_writer.as_default():
            tf.summary.scalar("lr_a", lr_a_now, step=0)
            tf.summary.scalar("lr_l", lr_l_now, step=0)
            tf.summary.scalar("lr_lag", lr_a, step=0)
            tf.summary.scalar("alpha", policy.alpha, step=0)
            tf.summary.scalar("lambda", policy.labda, step=0)

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            # a = np.squeeze(np.random.uniform(low=-1.0, high=1.0, size=(1, 2)))  # DEBUG
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Increment tensorboard step counter
            # NOTE: This was done differently from the global_step counter since
            # otherwise there were inconsistencies in the tb log.
            if DEBUG_PARAMS["use_tb"]:
                policy.step += 1

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize weights and parameters using STG
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                    training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"]
                            )
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend(
                            [key, ":", str(round(training_diagnostics[key], 2)), "|"]
                        )
                        for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Decay learning rate
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                    # Get current model performance for tb
                    if DEBUG_PARAMS["use_tb"]:
                        training_diagnostics = evaluate_training_rollouts(
                            last_training_paths
                        )

                # Log tb variables
                if DEBUG_PARAMS["use_tb"]:
                    if i % DEBUG_PARAMS["tb_freq"] == 0:

                        # Log learning rate to tb
                        with policy.tb_writer.as_default():
                            tf.summary.scalar("lr_a", lr_a_now, step=policy.step)
                            tf.summary.scalar("lr_l", lr_l_now, step=policy.step)
                            tf.summary.scalar("lr_lag", lr_a, step=policy.step)
                            tf.summary.scalar("alpha", policy.alpha, step=policy.step)
                            tf.summary.scalar("lambda", policy.labda, step=policy.step)

                        # Update and log other training vars to tensorboard
                        if training_started:
                            with policy.tb_writer.as_default():
                                tf.summary.scalar(
                                    "ep_ret", training_diagnostics["return"], step=policy.step
                                )
                                tf.summary.scalar(
                                    "ep_length", training_diagnostics["length"], step=policy.step
                                )
                                tf.summary.scalar(
                                    "a_loss", training_diagnostics["a_loss"], step=policy.step
                                )
                                tf.summary.scalar(
                                    "lyapunov_error",
                                    training_diagnostics["lyapunov_error"],
                                    step=policy.step,
                                )
                                tf.summary.scalar(
                                    "entropy", training_diagnostics["entropy"], step=policy.step
                                )

                        # Log network weights
                        if DEBUG_PARAMS["write_w_b"]:
                            with policy.tb_writer.as_default():

                                # GaussianActor weights/biases
                                tf.summary.histogram(
                                    "Ga/l1/weights", policy.ga.net_0.weights[0], step=policy.step
                                )
                                tf.summary.histogram(
                                    "Ga/l1/bias", policy.ga.net_0.weights[1], step=policy.step
                                )
                                tf.summary.histogram(
                                    "Ga/l2/weights", policy.ga.net_1.weights[0], step=policy.step
                                )
                                tf.summary.histogram(
                                    "Ga/l2/bias", policy.ga.net_1.weights[1], step=policy.step
                                )
                                tf.summary.histogram(
                                    "Ga/mu/weights", policy.ga.mu.weights[0], step=policy.step
                                )
                                tf.summary.histogram(
                                    "Ga/mu/bias", policy.ga.mu.weights[1], step=policy.step
                                )
                                tf.summary.histogram(
                                    "Ga/log_sigma/weights",
                                    policy.ga.log_sigma.weights[0],
                                    step=policy.step,
                                )
                                tf.summary.histogram(
                                    "Ga/log_sigma/bias",
                                    policy.ga.log_sigma.weights[1],
                                    step=policy.step,
                                )

                                # Target GaussianActor weights/biases
                                tf.summary.histogram(
                                    "Ga_/l1/weights", policy.ga_.net_0.weights[0], step=policy.step
                                )
                                tf.summary.histogram(
                                    "Ga_/l1/bias", policy.ga_.net_0.weights[1], step=policy.step
                                )
                                tf.summary.histogram(
                                    "Ga_/l2/weights", policy.ga_.net_1.weights[0], step=policy.step
                                )
                                tf.summary.histogram(
                                    "Ga_/l2/bias", policy.ga_.net_1.weights[1], step=policy.step
                                )
                                tf.summary.histogram(
                                    "Ga_/mu/weights", policy.ga_.mu.weights[0], step=policy.step
                                )
                                tf.summary.histogram(
                                    "Ga_/mu/bias", policy.ga_.mu.weights[1], step=policy.step
                                )
                                tf.summary.histogram(
                                    "Ga_/log_sigma/weights",
                                    policy.ga_.log_sigma.weights[0],
                                    step=policy.step,
                                )
                                tf.summary.histogram(
                                    "Ga_/log_sigma/bias",
                                    policy.ga_.log_sigma.weights[1],
                                    step=policy.step,
                                )

                                # Lyapunov critic weights/biases
                                tf.summary.histogram(
                                    "Lc/w1_s", policy.lc.w1_s, step=policy.step
                                )
                                tf.summary.histogram(
                                    "Lc/w1_a", policy.lc.w1_a, step=policy.step
                                )
                                tf.summary.histogram(
                                    "Lc/b1", policy.lc.b1, step=policy.step
                                )
                                tf.summary.histogram(
                                    "Lc/net/l2/weights",
                                    policy.lc.net.layers[0].weights[0],
                                    step=policy.step,
                                )
                                tf.summary.histogram(
                                    "Lc/net/l2/bias",
                                    policy.lc.net.layers[0].weights[1],
                                    step=policy.step,
                                )

                                # Target Lyapunov critic weights/biases
                                tf.summary.histogram(
                                    "Lc_/w1_s", policy.lc_.w1_s, step=policy.step
                                )
                                tf.summary.histogram(
                                    "Lc_/w1_a", policy.lc_.w1_a, step=policy.step
                                )
                                tf.summary.histogram(
                                    "Lc_/b1", policy.lc_.b1, step=policy.step
                                )
                                tf.summary.histogram(
                                    "Lc_/net/l2/weights",
                                    policy.lc_.net.layers[0].weights[0],
                                    step=policy.step,
                                )
                                tf.summary.histogram(
                                    "Lc_/net/l2/bias",
                                    policy.lc_.net.layers[0].weights[1],
                                    step=policy.step,
                                )

                # Decay learning rates
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    print("Running time: ", time.time() - t1)
    return
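# NOTE: The training loop only relies on a small part of the replay buffer interface:
# `store()`, `sample()` and the `memory_pointer` counter, with batches keyed like the
# dummy batch used in the trace block above ("s", "a", "r", "terminal", "s_"). The
# class below is a minimal NumPy stand-in written as an assumption for illustration;
# it is not the project's actual `Pool` implementation.
import numpy as np


class MinimalPool:
    """Minimal replay buffer exposing the interface that train() relies on."""

    def __init__(
        self, s_dim, a_dim, memory_capacity, min_memory_size, store_last_n_paths=None
    ):
        self.memory_capacity = memory_capacity
        self.min_memory_size = min_memory_size
        self.memory = {
            "s": np.zeros((memory_capacity, s_dim)),
            "a": np.zeros((memory_capacity, a_dim)),
            "r": np.zeros((memory_capacity, 1)),
            "terminal": np.zeros((memory_capacity, 1)),
            "s_": np.zeros((memory_capacity, s_dim)),
        }
        self.memory_pointer = 0  # total number of transitions stored so far

    def store(self, s, a, r, terminal, s_):
        """Store one transition, overwriting the oldest entry when full."""
        idx = self.memory_pointer % self.memory_capacity
        for key, val in zip(("s", "a", "r", "terminal", "s_"), (s, a, r, terminal, s_)):
            self.memory[key][idx] = val
        self.memory_pointer += 1

    def sample(self, batch_size):
        """Sample a random batch as a dict of arrays keyed like the trace batch."""
        high = min(self.memory_pointer, self.memory_capacity)
        idxs = np.random.randint(0, high, size=batch_size)
        return {key: buf[idxs] for key, buf in self.memory.items()}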
def train(log_dir):
    """Performs the agent training.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create train and test environments
    print(
        colorize(
            f"INFO: You are training in the {ENV_NAME} environment.",
            "cyan",
            bold=True,
        )
    )
    env = get_env_from_name(ENV_NAME, ENV_SEED)
    test_env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l, lr_c = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
        ALG_PARAMS["lr_c"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for Lyapunov critic
    lr_c_now = ALG_PARAMS["lr_c"]  # learning rate for q critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_lowerbound = env.action_space.low
    a_upperbound = env.action_space.high

    # Create the Agent
    policy = LAC(
        a_dim, s_dim, act_limits={"low": a_lowerbound, "high": a_upperbound}
    )

    # Load model if retraining is selected
    if TRAIN_PARAMS["continue_training"]:

        # Create retrain model path
        retrain_model_folder = TRAIN_PARAMS["continue_model_folder"]
        retrain_model_path = osp.abspath(
            osp.join(log_dir, "../..", TRAIN_PARAMS["continue_model_folder"])
        )

        # Check if retrain model exists if not throw error
        if not osp.exists(retrain_model_path):
            print(
                colorize(
                    "ERROR: Shutting down training since the model you specified "
                    f"in the `continue_model_folder` `{retrain_model_folder}` "
                    f"argument was not found for the `{ENV_NAME}` environment.",
                    "red",
                    bold=True,
                )
            )
            sys.exit(0)

        # Load old model
        print(
            colorize(
                f"INFO: Restoring model `{retrain_model_path}`.", "cyan", bold=True
            )
        )
        result = policy.restore(
            osp.abspath(osp.join(retrain_model_path, "policy")),
            restore_lagrance_multipliers=(
                not ALG_PARAMS["reset_lagrance_multipliers"]
            ),
        )
        if not result:
            print(
                colorize(
                    "ERROR: Shutting down training as something went wrong while "
                    f"loading model `{retrain_model_folder}`.",
                    "red",
                    bold=True,
                )
            )
            sys.exit(0)

        # Create new storage folder
        log_dir_split = log_dir.split("/")
        log_dir_split[-2] = (
            "_".join(TRAIN_PARAMS["continue_model_folder"].split("/")) + "_finetune"
        )
        log_dir = "/".join(log_dir_split)
    else:
        print(colorize(f"INFO: Train new model `{log_dir}`", "cyan", bold=True))

    # Print logging folder path
    print(colorize(f"INFO: Logging results to `{log_dir}`.", "cyan", bold=True))

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    ####################################################
    # Training loop ####################################
    ####################################################

    # Setup training loop parameters
    t1 = time.time()
    global_step = 0
    global_episodes = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Train the agent in the environment until max_episodes has been reached
    print(colorize("INFO: Training...\n", "cyan", bold=True))
    while 1:  # Keep running episodes until global step has been reached

        # Create variable to store information about the current path
        if policy.use_lyapunov:
            current_path = {
                "rewards": [],
                "lyapunov_error": [],
                "alpha": [],
                "lambda": [],
                "entropy": [],
                "a_loss": [],
                "alpha_loss": [],
                "lambda_loss": [],
            }
        else:
            current_path = {
                "rewards": [],
                "critic_error": [],
                "alpha": [],
                "entropy": [],
                "a_loss": [],
                "alpha_loss": [],
            }

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for jj in range(ENVS_PARAMS[ENV_NAME]["max_ep_steps"]):

            # Break out of loop if global steps have been reached
            if global_step >= TRAIN_PARAMS["max_global_steps"]:

                # Print step count, save model and stop the program
                print(
                    colorize(
                        f"\nINFO: Training stopped after {global_step} steps.",
                        "cyan",
                        bold=True,
                    )
                )
                print(
                    colorize(
                        "INFO: Running time: {}".format(time.time() - t1),
                        "cyan",
                        bold=True,
                    )
                )
                print(colorize("INFO: Saving Model", "cyan", bold=True))
                policy.save_result(log_dir)
                return

            # Save intermediate checkpoints if requested
            if TRAIN_PARAMS["save_checkpoints"]:
                if (
                    global_step % TRAIN_PARAMS["checkpoint_save_freq"] == 0
                    and global_step != 0
                ):

                    # Create intermediate result checkpoint folder
                    checkpoint_save_path = osp.abspath(
                        osp.join(log_dir, "checkpoints", "step_" + str(jj))
                    )
                    os.makedirs(checkpoint_save_path, exist_ok=True)

                    # Save intermediate checkpoint
                    policy.save_result(checkpoint_save_path)

            # Render environment if requested
            if ENVS_PARAMS[ENV_NAME]["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            # NOTE (rickstaa): The scaling operation is already performed inside the
            # policy based on the `act_limits` you supplied.
            a = policy.choose_action(s)

            # Perform action in env
            s_, r, done, _ = env.step(a)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if jj == ENVS_PARAMS[ENV_NAME]["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize network weights and Lagrange multipliers
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    if policy.use_lyapunov:
                        (
                            labda,
                            alpha,
                            l_loss,
                            entropy,
                            a_loss,
                            alpha_loss,
                            labda_loss,
                        ) = policy.learn(lr_a_now, lr_l_now, lr_a, lr_c_now, batch)
                    else:
                        alpha, loss_q, entropy, a_loss, alpha_loss = policy.learn(
                            lr_a_now, lr_l_now, lr_a, lr_c_now, batch
                        )

            # Store current path results
            if training_started:
                if policy.use_lyapunov:
                    current_path["rewards"].append(r)
                    current_path["lyapunov_error"].append(l_loss)
                    current_path["alpha"].append(alpha)
                    current_path["lambda"].append(labda)
                    current_path["entropy"].append(entropy)
                    current_path["a_loss"].append(a_loss)
                    current_path["alpha_loss"].append(alpha_loss)
                    current_path["lambda_loss"].append(labda_loss)
                else:
                    current_path["rewards"].append(r)
                    current_path["critic_error"].append(loss_q.numpy())
                    current_path["alpha"].append(alpha.numpy())
                    current_path["entropy"].append(entropy.numpy())
                    current_path["a_loss"].append(
                        a_loss.numpy()
                    )  # Improve: Check if this is the fastest way
                    current_path["alpha_loss"].append(alpha_loss)

            # Evaluate the current policy performance and log the results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(test_env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                    training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    if policy.use_lyapunov:
                        logger.logkv("lr_l", lr_l_now)
                    else:
                        logger.logkv("lr_c", lr_c_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"]
                            )
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend(
                            [key, ":", str(round(training_diagnostics[key], 2)), "|"]
                        )
                        for key in training_diagnostics.keys()
                    ]
                    prefix = (
                        colorize("LAC|", "green")
                        if ALG_PARAMS["use_lyapunov"]
                        else colorize("SAC|", "yellow")
                    )
                    print(
                        colorize(prefix, "yellow", bold=True)
                        + "".join(string_to_print)
                    )
                logger.dumpkvs()

            # Update state
            s = s_

            # Check if episode is done (continue to next episode)
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                # Decay learning rates
                frac = 1.0 - (global_step - 1.0) / TRAIN_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                lr_c_now = lr_c * frac  # learning rate for q critic
                break  # Continue to next episode

        # Increase episode counter
        global_episodes += 1
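# NOTE: For reference, a hypothetical entry point that builds a timestamped log
# directory and hands it to train(). It assumes the module-level ENV_NAME used above;
# the folder layout itself is an illustrative choice, not part of the variant
# configuration.
import os.path as osp
import time

if __name__ == "__main__":
    # Hypothetical results layout: ./log/<env name>/LAC<timestamp>
    log_dir = osp.join("./log", ENV_NAME, "LAC" + time.strftime("%Y%m%d_%H%M"))
    train(log_dir)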