def train(log_dir):
    """Performs the agent training.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create environment
    print(f"You are training in the {ENV_NAME} environment.\n")
    env = get_env_from_name(ENV_NAME, ENV_SEED)
    test_env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim)

    # Load model if retraining is selected
    if TRAIN_PARAMS["continue_training"]:

        # Create retrain path
        retrain_model_folder = TRAIN_PARAMS["continue_model_folder"]
        retrain_model_path = os.path.abspath(
            os.path.join(log_dir, "../../" + TRAIN_PARAMS["continue_model_folder"]))

        # Check if retrain model exists if not throw error
        if not os.path.exists(retrain_model_path):
            print(
                "Shutting down training since the model you specified in the "
                f"`continue_model_folder` `{retrain_model_folder}` "
                f"argument was not found for the `{ENV_NAME}` environment.")
            sys.exit(0)

        # Load retrain model
        print(f"Restoring model `{retrain_model_path}`")
        result = policy.restore(os.path.abspath(retrain_model_path + "/policy"))
        if not result:
            print(
                "Shutting down training as something went wrong while loading "
                f"model `{retrain_model_folder}`.")
            sys.exit(0)

        # Create new storage folder
        log_dir_split = log_dir.split("/")
        log_dir_split[-2] = (
            "_".join(TRAIN_PARAMS["continue_model_folder"].split("/"))
            + "_finetune"
            # + "_retrained_"
            # + log_dir_split[-2]
        )
        log_dir = "/".join(log_dir_split)

        # Reset Lagrange multipliers if requested
        if ALG_PARAMS["reset_lagrance_multipliers"]:
            policy.sess.run(
                policy.log_alpha.assign(tf.math.log(ALG_PARAMS["alpha"])))
            policy.sess.run(
                policy.log_labda.assign(tf.math.log(ALG_PARAMS["labda"])))
    else:
        print(f"Train new model `{log_dir}`")

    # Print logging folder
    print(f"Logging results to `{log_dir}`.")

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "lyapunov_error": [],
            "alpha": [],
            "lambda": [],
            "entropy": [],
            "a_loss": [],
        }

        # Break out of loop if global steps have been reached
        if global_step > ENV_PARAMS["max_global_steps"]:

            # Print step count, save model and stop the program
            print(f"Training stopped after {global_step} steps.")
            print("Running time: ", time.time() - t1)
            print("Saving Model")
            policy.save_result(log_dir)
            print("Running time: ", time.time() - t1)
            return

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Save intermediate checkpoints if requested
            if TRAIN_PARAMS["save_checkpoints"]:
                if (global_step % TRAIN_PARAMS["checkpoint_save_freq"] == 0
                        and global_step != 0):

                    # Create intermediate result checkpoint folder
                    checkpoint_save_path = os.path.abspath(
                        os.path.join(log_dir, "checkpoints", "step_" + str(j)))
                    os.makedirs(checkpoint_save_path, exist_ok=True)

                    # Save intermediate checkpoint
                    policy.save_result(checkpoint_save_path)

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize weights and parameters using STG
            if (pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                    and global_step % ALG_PARAMS["steps_per_cycle"] == 0):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch)

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (training_started
                    and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                    and global_step > 0):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(test_env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                    training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"])
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend([
                            key, ":",
                            str(round(training_diagnostics[key], 2)), "|"
                        ]) for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Decay learning rate
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                # Decay learning rates
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break
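# The episode loop above maps the policy output `a`, which lies in [-1, 1], onto the
# environment's action bounds before calling `env.step`. A minimal, self-contained
# sketch of that rescaling step (the helper name `scale_action` is illustrative and
# not part of the codebase):
import numpy as np


def scale_action(a, low, high):
    """Linearly map an action in [-1, 1] to the interval [low, high]."""
    return low + (np.asarray(a) + 1.0) * (high - low) / 2


# Example: an action of 0.0 lands exactly in the middle of the bounds.
assert np.allclose(scale_action(0.0, np.array([-2.0]), np.array([4.0])), [1.0])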
def train(log_dir):
    """Performs the agent training.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create environment
    env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim, log_dir=log_dir)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    tb_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Create tensorboard variables
    tb_lr_a = tf.Variable(lr_a, dtype=tf.float32)
    tb_lr_l = tf.Variable(lr_l, dtype=tf.float32)
    tb_lr_lag = tf.Variable(lr_a, dtype=tf.float32)
    tb_ret = tf.Variable(0, dtype=tf.float32)
    tb_len = tf.Variable(0, dtype=tf.float32)
    tb_a_loss = tf.Variable(0, dtype=tf.float32)
    tb_lyapunov_error = tf.Variable(0, dtype=tf.float32)
    tb_entropy = tf.Variable(0, dtype=tf.float32)

    # Initialize tensorboard variables and create summaries
    if USE_TB:
        policy.sess.run(
            [
                tb_lr_a.initializer,
                tb_lr_l.initializer,
                tb_lr_lag.initializer,
                tb_ret.initializer,
                tb_len.initializer,
                tb_a_loss.initializer,
                tb_lyapunov_error.initializer,
                tb_entropy.initializer,
            ]
        )

        # Add tensorboard summaries
        main_sum = tf.compat.v1.summary.merge(
            [
                tf.compat.v1.summary.scalar("lr_a", tb_lr_a),
                tf.compat.v1.summary.scalar("lr_l", tb_lr_l),
                tf.compat.v1.summary.scalar("lr_lag", tb_lr_lag),
                tf.compat.v1.summary.scalar("alpha", policy.alpha),
                tf.compat.v1.summary.scalar("lambda", policy.labda),
            ]
        )
        other_sum = tf.compat.v1.summary.merge(
            [
                tf.compat.v1.summary.scalar("ep_ret", tb_ret),
                tf.compat.v1.summary.scalar("ep_length", tb_len),
                tf.compat.v1.summary.scalar("a_loss", tb_a_loss),
                tf.compat.v1.summary.scalar("lyapunov_error", tb_lyapunov_error),
                tf.compat.v1.summary.scalar("entropy", tb_entropy),
            ]
        )
        policy.tb_writer.add_summary(
            policy.sess.run(main_sum), policy.sess.run(policy.step)
        )
        if WRITE_W_B:
            policy.tb_writer.add_summary(
                policy.sess.run(policy.w_b_sum),
                policy.sess.run(policy.step),
            )
        policy.tb_writer.flush()  # Above summaries are known from the start

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            # a = np.squeeze(np.random.uniform(low=-1.0, high=1.0, size=(1, 2)))  # DEBUG
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Increment tensorboard step counter
            # NOTE: This was done differently from the global_step counter since
            # otherwise there were inconsistencies in the tb log.
            if USE_TB:
                tb_step += 1

            # Optimize weights and parameters using STG
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                    training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"]
                            )
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend(
                            [key, ":", str(round(training_diagnostics[key], 2)), "|"]
                        )
                        for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Decay learning rate
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                    # Get current model performance for tb
                    if USE_TB:
                        training_diagnostics = evaluate_training_rollouts(
                            last_training_paths
                        )

                # Log tb variables
                if USE_TB:
                    if i % TB_FREQ == 0:

                        # Update and log learning rate tb vars
                        policy.sess.run(policy.step.assign(tb_step))
                        policy.sess.run(tb_lr_a.assign(lr_a_now))
                        policy.sess.run(tb_lr_l.assign(lr_l_now))
                        policy.sess.run(tb_lr_lag.assign(lr_a))
                        policy.tb_writer.add_summary(
                            policy.sess.run(main_sum), policy.sess.run(policy.step)
                        )

                        # Update and log other training vars to tensorboard
                        if training_started:

                            # Update and log training vars
                            policy.sess.run(
                                tb_ret.assign(training_diagnostics["return"])
                            )
                            policy.sess.run(
                                tb_len.assign(training_diagnostics["length"])
                            )
                            policy.sess.run(
                                tb_a_loss.assign(training_diagnostics["a_loss"])
                            )
                            policy.sess.run(
                                tb_lyapunov_error.assign(
                                    training_diagnostics["lyapunov_error"]
                                )
                            )
                            policy.sess.run(
                                tb_entropy.assign(training_diagnostics["entropy"])
                            )
                            policy.tb_writer.add_summary(
                                policy.sess.run(other_sum),
                                policy.sess.run(policy.step),
                            )

                            # Log network weights
                            if WRITE_W_B:
                                policy.tb_writer.add_summary(
                                    policy.sess.run(policy.w_b_sum),
                                    policy.sess.run(policy.step),
                                )
                        policy.tb_writer.flush()

                # Decay learning rates
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    # policy.tb_writer.close()
    print("Running time: ", time.time() - t1)
    return
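# Both variants above decay the learning rates linearly towards zero as `global_step`
# approaches `max_global_steps` (frac = 1 - (global_step - 1) / max_global_steps).
# A minimal sketch of that schedule as a standalone helper (the name
# `linear_lr_decay` is illustrative and not part of the codebase):
def linear_lr_decay(lr_init, global_step, max_global_steps):
    """Return the linearly decayed learning rate for the current global step."""
    frac = 1.0 - (global_step - 1.0) / max_global_steps
    return lr_init * frac


# Example: halfway through training the learning rate is roughly halved.
print(linear_lr_decay(1e-4, 5e5, 1e6))  # ~5e-05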
def train(log_dir):
    """Performs the agent training.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create environment
    env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim, log_dir=log_dir)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Log initial values to tensorboard
    if DEBUG_PARAMS["use_tb"]:

        # Trace learn method (Used for debugging)
        if DEBUG_PARAMS["debug"]:
            if DEBUG_PARAMS["trace_net"]:

                # Create dummy input
                batch = {
                    "s": tf.random.uniform((ALG_PARAMS["batch_size"], policy.s_dim)),
                    "a": tf.random.uniform((ALG_PARAMS["batch_size"], policy.a_dim)),
                    "r": tf.random.uniform((ALG_PARAMS["batch_size"], 1)),
                    "terminal": tf.zeros((ALG_PARAMS["batch_size"], 1)),
                    "s_": tf.random.uniform((ALG_PARAMS["batch_size"], policy.s_dim)),
                }

                # Trace learn method and log to tensorboard
                tf.summary.trace_on(graph=True, profiler=True)
                policy.learn(lr_a_now, lr_l_now, lr_a, batch)
                with policy.tb_writer.as_default():
                    tf.summary.trace_export(
                        name="learn", step=0, profiler_outdir=log_dir
                    )

            # Shut down as we are in debug mode
            if DEBUG_PARAMS["trace_net"] or DEBUG_PARAMS["trace_learn"]:
                print(
                    "Shutting down training as a trace was requested in debug mode. "
                    "This was done since during the trace a backward pass was performed "
                    "on dummy data. Please disable the trace to continue training "
                    "while being in debug mode."
                )
                sys.exit(0)

        # Log initial values
        with policy.tb_writer.as_default():
            tf.summary.scalar("lr_a", lr_a_now, step=0)
            tf.summary.scalar("lr_l", lr_l_now, step=0)
            tf.summary.scalar("lr_lag", lr_a, step=0)
            tf.summary.scalar("alpha", policy.alpha, step=0)
            tf.summary.scalar("lambda", policy.labda, step=0)

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            # a = np.squeeze(np.random.uniform(low=-1.0, high=1.0, size=(1, 2)))  # DEBUG
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Increment tensorboard step counter
            # NOTE: This was done differently from the global_step counter since
            # otherwise there were inconsistencies in the tb log.
            if DEBUG_PARAMS["use_tb"]:
                policy.step += 1

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize weights and parameters using STG
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                    training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"]
                            )
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend(
                            [key, ":", str(round(training_diagnostics[key], 2)), "|"]
                        )
                        for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Decay learning rate
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                    # Get current model performance for tb
                    if DEBUG_PARAMS["use_tb"]:
                        training_diagnostics = evaluate_training_rollouts(
                            last_training_paths
                        )

                # Log tb variables
                if DEBUG_PARAMS["use_tb"]:
                    if i % DEBUG_PARAMS["tb_freq"] == 0:

                        # Log learning rate to tb
                        with policy.tb_writer.as_default():
                            tf.summary.scalar("lr_a", lr_a_now, step=policy.step)
                            tf.summary.scalar("lr_l", lr_l_now, step=policy.step)
                            tf.summary.scalar("lr_lag", lr_a, step=policy.step)
                            tf.summary.scalar("alpha", policy.alpha, step=policy.step)
                            tf.summary.scalar("lambda", policy.labda, step=policy.step)

                        # Update and log other training vars to tensorboard
                        if training_started:
                            with policy.tb_writer.as_default():
                                tf.summary.scalar(
                                    "ep_ret", training_diagnostics["return"], step=policy.step
                                )
                                tf.summary.scalar(
                                    "ep_length", training_diagnostics["length"], step=policy.step
                                )
                                tf.summary.scalar(
                                    "a_loss", training_diagnostics["a_loss"], step=policy.step
                                )
                                tf.summary.scalar(
                                    "lyapunov_error",
                                    training_diagnostics["lyapunov_error"],
                                    step=policy.step,
                                )
                                tf.summary.scalar(
                                    "entropy", training_diagnostics["entropy"], step=policy.step
                                )

                            # Log network weights
                            if DEBUG_PARAMS["write_w_b"]:
                                with policy.tb_writer.as_default():

                                    # GaussianActor weights/biases
                                    tf.summary.histogram(
                                        "Ga/l1/weights", policy.ga.net_0.weights[0], step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Ga/l1/bias", policy.ga.net_0.weights[1], step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Ga/l2/weights", policy.ga.net_1.weights[0], step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Ga/l2/bias", policy.ga.net_1.weights[1], step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Ga/mu/weights", policy.ga.mu.weights[0], step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Ga/mu/bias", policy.ga.mu.weights[1], step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Ga/log_sigma/weights",
                                        policy.ga.log_sigma.weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga/log_sigma/bias",
                                        policy.ga.log_sigma.weights[1],
                                        step=policy.step,
                                    )

                                    # Target GaussianActor weights/biases
                                    tf.summary.histogram(
                                        "Ga_/l1/weights", policy.ga_.net_0.weights[0], step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Ga_/l1/bias", policy.ga_.net_0.weights[1], step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Ga_/l2/weights", policy.ga_.net_1.weights[0], step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Ga_/l2/bias", policy.ga_.net_1.weights[1], step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Ga_/mu/weights", policy.ga_.mu.weights[0], step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Ga_/mu/bias", policy.ga_.mu.weights[1], step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Ga_/log_sigma/weights",
                                        policy.ga_.log_sigma.weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga_/log_sigma/bias",
                                        policy.ga_.log_sigma.weights[1],
                                        step=policy.step,
                                    )

                                    # Lyapunov critic weights/biases
                                    tf.summary.histogram(
                                        "Lc/w1_s", policy.lc.w1_s, step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Lc/w1_a", policy.lc.w1_a, step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Lc/b1", policy.lc.b1, step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Lc/net/l2/weights",
                                        policy.lc.net.layers[0].weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Lc/net/l2/bias",
                                        policy.lc.net.layers[0].weights[1],
                                        step=policy.step,
                                    )

                                    # Target Lyapunov critic weights/biases
                                    tf.summary.histogram(
                                        "Lc_/w1_s", policy.lc_.w1_s, step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Lc_/w1_a", policy.lc_.w1_a, step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Lc_/b1", policy.lc_.b1, step=policy.step
                                    )
                                    tf.summary.histogram(
                                        "Lc_/net/l2/weights",
                                        policy.lc_.net.layers[0].weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Lc_/net/l2/bias",
                                        policy.lc_.net.layers[0].weights[1],
                                        step=policy.step,
                                    )

                # Decay learning rates
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    print("Running time: ", time.time() - t1)
    return
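# The TensorFlow 2 variant above assumes `policy.tb_writer` is a summary file writer.
# A minimal sketch of how such a writer is typically created and used with the
# `tf.summary` API (the `./tb_logs` path and example values are illustrative only):
import tensorflow as tf

writer = tf.summary.create_file_writer("./tb_logs")
with writer.as_default():
    tf.summary.scalar("lr_a", 1e-4, step=0)
    tf.summary.scalar("lambda", 0.99, step=0)
writer.flush()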
def train(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    if variant['evaluate'] is True:
        evaluation_env = get_env_from_name(env_name)
    else:
        evaluation_env = None
    env_params = variant['env_params']

    judge_safety_func = get_safety_constraint_func(variant)

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    num_of_paths = variant['num_of_paths']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    min_memory_size = policy_params['min_memory_size']
    noise_scale = policy_params['noise']
    noise_scale_now = noise_scale

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params['lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for lyapunov critic

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])

    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    policy = policy_build_fn(a_dim, s_dim, policy_params)
    logger.logkv('target_entropy', policy.target_entropy)

    # For analyse
    Render = env_params['eval_render']
    ewma_p = 0.95
    ewma_step = np.zeros((1, max_episodes + 1))
    ewma_reward = np.zeros((1, max_episodes + 1))

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    for i in range(max_episodes):
        ep_reward = 0
        l_r = 0

        current_path = {
            'rewards': [],
            'l_rewards': [],
            'l_error': [],
            'critic1_error': [],
            'critic2_error': [],
            'alpha': [],
            'lambda': [],
            'entropy': [],
            'a_loss': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()
        for j in range(max_ep_steps):
            if Render:
                env.render()

            # NOTE: The original code passed the undefined name `noise` here;
            # `noise_scale_now` is assumed to be the intended exploration scale.
            a = policy.choose_action(s, noise_scale_now)
            a = np.clip(a, -np.ones(a_dim), np.ones(a_dim))
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2

            # Run in simulator
            s_, r, done, info = env.step(action)
            l_r = info['l_rewards']
            if j == max_ep_steps - 1:
                done = True
            terminal = 1. if done else 0.

            # Store s, a, s_next and the rewards for the DDPG-style update
            policy.store_transition(s, a, r, l_r, terminal, s_)

            # If the state is close to the safety boundary, also store the
            # transition in the edge memory
            # if policy.use_lyapunov is True and np.abs(s[0]) > env.cons_pos:
            # or np.abs(s[2]) > env.theta_threshold_radians*0.8
            if policy.use_lyapunov is True and judge_safety_func(s_, r, done, info):
                policy.store_edge_transition(s, a, r, l_r, terminal, s_)

            # Learn
            if policy.use_lyapunov is True:
                if policy.pointer > min_memory_size and policy.cons_pointer > 0:
                    # Decay the action randomness
                    training_started = True
                    labda, alpha, c1_loss, c2_loss, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now)
                    global_step += 1
            else:
                if policy.pointer > min_memory_size:
                    # Decay the action randomness
                    training_started = True
                    labda, alpha, c1_loss, c2_loss, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now)
                    global_step += 1

            if training_started:
                current_path['rewards'].append(r)
                current_path['l_rewards'].append(l_r)
                current_path['l_error'].append(l_loss)
                current_path['critic1_error'].append(c1_loss)
                current_path['critic2_error'].append(c2_loss)
                current_path['alpha'].append(alpha)
                current_path['lambda'].append(labda)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)

            # if global_step > 204800:
            #     Render = True

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                if evaluation_env is not None:
                    rollouts = get_evaluation_rollouts(
                        policy, evaluation_env, num_of_paths, max_ep_steps, render=Render)
                    diagnotic = evaluate_rollouts(rollouts)
                    # [diagnotics[key].append(diagnotic[key]) for key in diagnotic.keys()]
                    print(
                        'training_step:', global_step,
                        'average reward:', diagnotic['return-average'],
                        'average length:', diagnotic['episode-length-avg'],
                    )
                    logger.logkv('eval_eprewmean', diagnotic['return-average'])
                    logger.logkv('eval_eprewmin', diagnotic['return-min'])
                    logger.logkv('eval_eprewmax', diagnotic['return-max'])
                    logger.logkv('eval_eplrewmean', diagnotic['lreturn-average'])
                    logger.logkv('eval_eplrewmin', diagnotic['lreturn-min'])
                    logger.logkv('eval_eplrewmax', diagnotic['lreturn-max'])
                    logger.logkv('eval_eplenmean', diagnotic['episode-length-avg'])

                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(last_training_paths)
                if training_diagnotic is not None:
                    # [training_diagnotics[key].append(training_diagnotic[key]) for key in training_diagnotic.keys()]
                    logger.logkv('eprewmean', training_diagnotic['train-return-average'])
                    logger.logkv('eplrewmean', training_diagnotic['train-lreturn-average'])
                    logger.logkv('eplenmean', training_diagnotic['train-episode-length-avg'])
                    logger.logkv('lyapunov_lambda', training_diagnotic['train-lambda-avg'])
                    logger.logkv('entropy', training_diagnotic['train-entropy-avg'])
                    logger.logkv('critic1 error', training_diagnotic['train-critic1-error-avg'])
                    logger.logkv('critic2 error', training_diagnotic['train-critic2-error-avg'])
                    logger.logkv('lyapunov error', training_diagnotic['train-lyapunov-error-avg'])
                    logger.logkv('policy_loss', training_diagnotic['train-a-loss-avg'])
                    logger.logkv('noise_scale', noise_scale_now)
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    print(
                        'training_step:', global_step,
                        'average reward:', round(training_diagnotic['train-return-average'], 2),
                        'average lreward:', round(training_diagnotic['train-lreturn-average'], 2),
                        'average length:', round(training_diagnotic['train-episode-length-avg'], 1),
                        'lyapunov error:', round(training_diagnotic['train-lyapunov-error-avg'], 6),
                        'critic1 error:', round(training_diagnotic['train-critic1-error-avg'], 6),
                        'critic2 error:', round(training_diagnotic['train-critic2-error-avg'], 6),
                        'policy_loss:', round(training_diagnotic['train-a-loss-avg'], 6),
                        'alpha:', round(training_diagnotic['train-alpha-avg'], 6),
                        'lambda:', round(training_diagnotic['train-lambda-avg'], 6),
                        'entropy:', round(training_diagnotic['train-entropy-avg'], 6),
                        'noise_scale', round(noise_scale_now, 6),
                    )
                logger.dumpkvs()

            # State update
            s = s_
            ep_reward += r

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                ewma_step[0, i + 1] = ewma_p * ewma_step[0, i] + (1 - ewma_p) * j
                ewma_reward[0, i + 1] = (
                    ewma_p * ewma_reward[0, i] + (1 - ewma_p) * ep_reward)

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                noise_scale_now = noise_scale * frac
                break

    print('Running time: ', time.time() - t1)
    return
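# The variant above tracks exponentially weighted moving averages (EWMA) of the
# episode length and return with smoothing factor `ewma_p = 0.95`. A minimal sketch
# of that update rule in isolation (the helper name `ewma_update` is illustrative):
def ewma_update(prev, new_value, ewma_p=0.95):
    """Blend a new sample into an exponentially weighted moving average."""
    return ewma_p * prev + (1 - ewma_p) * new_value


# Example: a single high episode reward only nudges the running average.
print(ewma_update(10.0, 100.0))  # 14.5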
def train(log_dir):
    """Performs the agent training.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create environment
    env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize weights and parameters using STG
            if (pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                    and global_step % ALG_PARAMS["steps_per_cycle"] == 0):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch)

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (training_started
                    and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                    and global_step > 0):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                    training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"])
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend([
                            key, ":",
                            str(round(training_diagnostics[key], 2)), "|"
                        ]) for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Decay learning rate
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    print("Running time: ", time.time() - t1)
    return
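# The training loops above only rely on a small replay-buffer interface:
# `store(s, a, r, terminal, s_)`, a `memory_pointer` counter, and `sample(batch_size)`
# returning a dict batch. A minimal NumPy sketch of such a buffer (the class name is
# illustrative; field names match the batch keys consumed by `policy.learn`, the rest
# is an assumption about the `Pool` implementation):
import numpy as np


class SimpleReplayBuffer:
    def __init__(self, s_dim, a_dim, capacity):
        self.capacity = capacity
        self.memory = {
            "s": np.zeros((capacity, s_dim)),
            "a": np.zeros((capacity, a_dim)),
            "r": np.zeros((capacity, 1)),
            "terminal": np.zeros((capacity, 1)),
            "s_": np.zeros((capacity, s_dim)),
        }
        self.memory_pointer = 0

    def store(self, s, a, r, terminal, s_):
        """Write one transition at the current (wrapping) write index."""
        idx = self.memory_pointer % self.capacity
        for key, val in zip(["s", "a", "r", "terminal", "s_"], [s, a, r, terminal, s_]):
            self.memory[key][idx] = val
        self.memory_pointer += 1

    def sample(self, batch_size):
        """Return a dict batch of uniformly sampled stored transitions."""
        high = min(self.memory_pointer, self.capacity)
        idxs = np.random.randint(0, high, size=batch_size)
        return {key: val[idxs] for key, val in self.memory.items()}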
def train(variant):
    Min_cost = 1000000

    data_trajectories = get_data()  # get data (X, W, X_, theta, state)
    env_name = variant['env_name']  # choose your environment
    env = get_env_from_name(env_name)

    env_params = variant['env_params']
    max_episodes = env_params['max_episodes']  # maximum episodes for RL training
    max_ep_steps = env_params['max_ep_steps']  # number of maximum steps in each episode
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    s_dim = env.observation_space.shape[0]  # dimension of state (3 for Battery)
    a_dim = env.action_space.shape[0]  # action space dimension (1 or 2)
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    agent = CAC(a_dim, s_dim, policy_params, max_global_steps=max_global_steps)
    # policy.restore(variant['log_path'] + "/0/policy")

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon'],
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)

    # For analyse
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', agent.target_entropy)

    for i in range(max_episodes):
        print("episode # ", i)
        print("global steps ", global_step)

        current_path = {
            'rewards': [],
            'distance': [],
            'a_loss': [],
            'alpha': [],
            'labda': [],
            'beta': [],
            'lyapunov_error': [],
            'entropy': [],
            'action_distance': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()

        # Random start point
        # traj_id = np.random.randint(0, len(data_trajectories))
        traj_id = np.random.randint(0, variant['num_data_trajectories'])
        # traj_id = 0
        traj = data_trajectories[traj_id]
        # print(len(traj))
        start_point = np.random.randint(0, len(traj))
        # start_point = 0
        s = traj[start_point, 1]

        # current state, theta, next w, desired state
        # this is for decision making
        # 16, 1, 4, 16
        s = np.array([s, traj[start_point, 2], traj[start_point, 4]])
        # print(i, s)
        env.state = s
        env.model.state = traj[start_point, -8:]

        ep_steps = min(start_point + 1 + max_ep_steps, len(traj))
        for j in range(start_point + 1, ep_steps):
            if Render:
                env.render()

            delta = np.zeros(s.shape)
            # ###### NOISE ##############
            # noise = np.random.normal(0, 0.01, 0.01)
            # delta[2:] = noise
            # ######## IF Noise env ##########
            # s = s + delta
            # a = policy.choose_action(s)
            # ###### BIAS ##############
            # noise = s[0:16]*0.01
            # delta[0:16] = noise

            a = agent.act(torch.tensor([s]).float())
            action = a_lowerbound + (a.detach().numpy() + 1.) * (a_upperbound - a_lowerbound) / 2
            # action = traj[j-1, 16]

            a_upperbound = env.action_space.high
            a_lowerbound = env.action_space.low

            # Run in simulator
            _, r, done, X_ = env.step(action)

            # The new s = current state, next omega, next state
            s_ = np.array([X_[1][0], traj[j, 2], traj[j, 4]])
            r = modify_reward(r, s, s_, variant['reward_id'])
            env.state = s_
            # theta_pre = theta

            if training_started:
                global_step += 1
                # agent.scheduler_step()

            if j == max_ep_steps - 1 + start_point:
                done = True
            terminal = 1. if done else 0.

            if j > start_point + 2:
                pool.store(s, a.detach().numpy().flatten(), np.zeros([1]),
                           np.zeros([1]), r, terminal, s_, _s)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True
                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    alpha_loss, beta_loss, labda_loss, actor_loss, lyapunov_loss = agent.learn(batch)
                    if global_step % 200 == 0:
                        print("labda = ", agent.labda.item(),
                              " | alpha = ", agent.alpha.item(),
                              " | l_loss = ", lyapunov_loss.item(),
                              " | constraint loss : ", agent.lyapunov_loss.item(),
                              " | entropy = ", agent.log_pis.mean().item(),
                              " | a_loss = ", actor_loss.item(),
                              " | alpha_loss = ", alpha_loss.item(),
                              " | labda_loss = ", labda_loss.item(),
                              " | lr_a = ", agent.LR_A,
                              " | lr_l = ", agent.LR_L,
                              " | lr_labda = ", agent.LR_lag,
                              " | log alpha grad = ", agent.log_alpha.grad.item(),
                              " | log labda grad = ", agent.log_labda.grad.item(),
                              " | predicted_l : ", agent.l.mean().item(),
                              " | predicted_l_ : ", agent.l_.mean().item())

            if training_started:
                current_path['rewards'].append(r)
                current_path['lyapunov_error'].append(lyapunov_loss.detach().numpy())
                current_path['alpha'].append(agent.alpha.detach().numpy())
                current_path['entropy'].append(agent.log_pis.mean().detach().cpu().numpy())
                current_path['a_loss'].append(actor_loss.detach().numpy())
                current_path['beta'].append(agent.beta.detach().numpy())
                # current_path['action_distance'].append(action_distance)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                logger.logkv("total_timesteps", global_step)
                training_diagnotic = evaluate_training_rollouts(last_training_paths)
                # print(training_diagnotic)
                if training_diagnotic is not None:
                    print("doing training evaluation")
                    eval_diagnotic = training_evaluation(variant, env, agent)
                    [
                        logger.logkv(key, eval_diagnotic[key])
                        for key in eval_diagnotic.keys()
                    ]
                    training_diagnotic.pop('return')
                    [
                        logger.logkv(key, training_diagnotic[key])
                        for key in training_diagnotic.keys()
                    ]
                    logger.logkv('lr_actor_alpha', agent.LR_A)
                    logger.logkv('lr_lyapunov', agent.LR_L)
                    logger.logkv('lr_labda', agent.LR_lag)

                    string_to_print = ['time_step:', str(global_step), '|']
                    [
                        string_to_print.extend([key, ':', str(eval_diagnotic[key]), '|'])
                        for key in eval_diagnotic.keys()
                    ]
                    [
                        string_to_print.extend(
                            [key, ':', str(round(training_diagnotic[key], 2)), '|'])
                        for key in training_diagnotic.keys()
                    ]
                    print(''.join(string_to_print))
                logger.dumpkvs()

                if eval_diagnotic['test_return'] / eval_diagnotic['test_average_length'] <= Min_cost:
                    Min_cost = eval_diagnotic['test_return'] / eval_diagnotic['test_average_length']
                    print("New lowest cost:", Min_cost)
                    agent.save_result(log_path)
                else:
                    print("cost did not improve.")
                    print("The best cost is ", Min_cost)
                    print("avg cost was ",
                          eval_diagnotic['test_return'] / eval_diagnotic['test_average_length'])

            if training_started and global_step % (10 * evaluation_frequency) == 0 and global_step > 0:
                agent.save_result(log_path)

            # State Update
            _s = s
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                break

    agent.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
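# The CAC variant above keeps the checkpoint with the lowest average cost, defined as
# `test_return / test_average_length`. A minimal sketch of that selection logic in
# isolation (the helper name and `save_fn` callback are illustrative):
def maybe_save_best(eval_diagnostic, best_cost, save_fn):
    """Save the agent when the evaluation cost per step improves; return the new best."""
    avg_cost = eval_diagnostic["test_return"] / eval_diagnostic["test_average_length"]
    if avg_cost <= best_cost:
        save_fn()
        return avg_cost
    return best_cost


# Example usage with a dummy save callback.
best = maybe_save_best(
    {"test_return": 50.0, "test_average_length": 100}, float("inf"), lambda: print("saved"))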
def train(log_dir):
    """Performs the agent training.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create train and test environments
    print(
        colorize(
            f"INFO: You are training in the {ENV_NAME} environment.",
            "cyan",
            bold=True,
        ))
    env = get_env_from_name(ENV_NAME, ENV_SEED)
    test_env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l, lr_c = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
        ALG_PARAMS["lr_c"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for Lyapunov critic
    lr_c_now = ALG_PARAMS["lr_c"]  # learning rate for q critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_lowerbound = env.action_space.low
    a_upperbound = env.action_space.high

    # Create the Agent
    policy = LAC(a_dim, s_dim, act_limits={
        "low": a_lowerbound,
        "high": a_upperbound
    })

    # Load model if retraining is selected
    if TRAIN_PARAMS["continue_training"]:

        # Create retrain model path
        retrain_model_folder = TRAIN_PARAMS["continue_model_folder"]
        retrain_model_path = osp.abspath(
            osp.join(log_dir, "../..", TRAIN_PARAMS["continue_model_folder"]))

        # Check if retrain model exists if not throw error
        if not osp.exists(retrain_model_path):
            print(
                colorize(
                    ("ERROR: Shutting down training since the model you specified "
                     f"in the `continue_model_folder` `{retrain_model_folder}` "
                     f"argument was not found for the `{ENV_NAME}` environment."),
                    "red",
                    bold=True,
                ))
            sys.exit(0)

        # Load old model
        print(
            colorize(f"INFO: Restoring model `{retrain_model_path}`.",
                     "cyan",
                     bold=True))
        result = policy.restore(
            osp.abspath(osp.join(retrain_model_path, "policy")),
            restore_lagrance_multipliers=(
                not ALG_PARAMS["reset_lagrance_multipliers"]),
        )
        if not result:
            print(
                colorize(
                    "ERROR: Shutting down training as something went wrong while "
                    f"loading model `{retrain_model_folder}`.",
                    "red",
                    bold=True,
                ))
            sys.exit(0)

        # Create new storage folder
        log_dir_split = log_dir.split("/")
        log_dir_split[-2] = (
            "_".join(TRAIN_PARAMS["continue_model_folder"].split("/")) + "_finetune")
        log_dir = "/".join(log_dir_split)
    else:
        print(colorize(f"INFO: Train new model `{log_dir}`", "cyan", bold=True))

    # Print logging folder path
    print(colorize(f"INFO: Logging results to `{log_dir}`.", "cyan", bold=True))

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    ####################################################
    # Training loop ####################################
    ####################################################

    # Setup training loop parameters
    t1 = time.time()
    global_step = 0
    global_episodes = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Train the agent in the environment until max_episodes has been reached
    print(colorize("INFO: Training...\n", "cyan", bold=True))
    while 1:  # Keep running episodes until global step has been reached

        # Create variable to store information about the current path
        if policy.use_lyapunov:
            current_path = {
                "rewards": [],
                "lyapunov_error": [],
                "alpha": [],
                "lambda": [],
                "entropy": [],
                "a_loss": [],
                "alpha_loss": [],
                "lambda_loss": [],
            }
        else:
            current_path = {
                "rewards": [],
                "critic_error": [],
                "alpha": [],
                "entropy": [],
                "a_loss": [],
                "alpha_loss": [],
            }

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for jj in range(ENVS_PARAMS[ENV_NAME]["max_ep_steps"]):

            # Break out of loop if global steps have been reached
            if global_step >= TRAIN_PARAMS["max_global_steps"]:

                # Print step count, save model and stop the program
                print(
                    colorize(
                        f"\nINFO: Training stopped after {global_step} steps.",
                        "cyan",
                        bold=True,
                    ))
                print(
                    colorize(
                        "INFO: Running time: {}".format(time.time() - t1),
                        "cyan",
                        bold=True,
                    ))
                print(colorize("INFO: Saving Model", "cyan", bold=True))
                policy.save_result(log_dir)
                return

            # Save intermediate checkpoints if requested
            if TRAIN_PARAMS["save_checkpoints"]:
                if (global_step % TRAIN_PARAMS["checkpoint_save_freq"] == 0
                        and global_step != 0):

                    # Create intermediate result checkpoint folder
                    checkpoint_save_path = osp.abspath(
                        osp.join(log_dir, "checkpoints", "step_" + str(jj)))
                    os.makedirs(checkpoint_save_path, exist_ok=True)

                    # Save intermediate checkpoint
                    policy.save_result(checkpoint_save_path)

            # Render environment if requested
            if ENVS_PARAMS[ENV_NAME]["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            # NOTE (rickstaa): The scaling operation is already performed inside the
            # policy based on the `act_limits` you supplied.
            a = policy.choose_action(s)

            # Perform action in env
            s_, r, done, _ = env.step(a)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if jj == ENVS_PARAMS[ENV_NAME]["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize network weights and Lagrange multipliers
            if (pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                    and global_step % ALG_PARAMS["steps_per_cycle"] == 0):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    if policy.use_lyapunov:
                        (
                            labda,
                            alpha,
                            l_loss,
                            entropy,
                            a_loss,
                            alpha_loss,
                            labda_loss,
                        ) = policy.learn(lr_a_now, lr_l_now, lr_a, lr_c_now, batch)
                    else:
                        alpha, loss_q, entropy, a_loss, alpha_loss = policy.learn(
                            lr_a_now, lr_l_now, lr_a, lr_c_now, batch)

            # Store current path results
            if training_started:
                if policy.use_lyapunov:
                    current_path["rewards"].append(r)
                    current_path["lyapunov_error"].append(l_loss)
                    current_path["alpha"].append(alpha)
                    current_path["lambda"].append(labda)
                    current_path["entropy"].append(entropy)
                    current_path["a_loss"].append(a_loss)
                    current_path["alpha_loss"].append(alpha_loss)
                    current_path["lambda_loss"].append(labda_loss)
                else:
                    current_path["rewards"].append(r)
                    current_path["critic_error"].append(loss_q.numpy())
                    current_path["alpha"].append(alpha.numpy())
                    current_path["entropy"].append(entropy.numpy())
                    current_path["a_loss"].append(
                        a_loss.numpy())  # Improve: Check if this is the fastest way
                    current_path["alpha_loss"].append(alpha_loss)

            # Evaluate the current policy performance and log the results
            if (training_started
                    and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                    and global_step > 0):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(test_env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                    training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    if policy.use_lyapunov:
                        logger.logkv("lr_l", lr_l_now)
                    else:
                        logger.logkv("lr_c", lr_c_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"])
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend([
                            key, ":",
                            str(round(training_diagnostics[key], 2)), "|"
                        ]) for key in training_diagnostics.keys()
                    ]
                    prefix = (colorize("LAC|", "green")
                              if ALG_PARAMS["use_lyapunov"] else colorize(
                                  "SAC|", "yellow"))
                    print(
                        colorize(prefix, "yellow", bold=True) +
                        "".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Check if episode is done (continue to next episode)
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                # Decay learning rates
                frac = 1.0 - (global_step - 1.0) / TRAIN_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                lr_c_now = lr_c * frac  # learning rate for q critic
                break  # Continue to next episode

        # Increase episode counter
        global_episodes += 1
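# The `train` functions above expect a log directory; a run script would typically
# build a timestamped folder and pass it in. A minimal sketch (the folder layout and
# the "Example-v0" placeholder are assumptions, not the repository's actual entry point):
if __name__ == "__main__":
    import os.path as osp
    import time

    LOG_DIR = osp.join("./log", "Example-v0", time.strftime("%Y%m%d_%H%M"))
    # train(LOG_DIR)  # Uncomment inside the actual training script.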