def train(log_dir):
    """Train the LAC agent while logging diagnostics to CSV and tensorboard.

    Runs at most ``ENV_PARAMS["max_episodes"]`` episodes and stops early once
    the global step counter exceeds ``ENV_PARAMS["max_global_steps"]``. Every
    transition is stored in a replay pool; once ``pool.memory_pointer`` exceeds
    ``ALG_PARAMS["min_memory_size"]`` the policy is updated through
    ``policy.learn``. The learning rates are decayed linearly with the global
    step count at the end of every episode and the policy is saved to
    ``log_dir`` when training finishes.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create the training environment
    env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Use a dedicated environment for the evaluation rollouts. Evaluation is
    # triggered in the middle of a training episode, so handing the training
    # environment to `training_evaluation` would (presumably) reset/step it and
    # desynchronise it from the state tracked by the training loop.
    evaluate_policy = TRAIN_PARAMS["num_of_evaluation_paths"] > 0
    test_env = get_env_from_name(ENV_NAME, ENV_SEED) if evaluate_policy else None

    # Set initial learning rates
    lr_a, lr_l = ALG_PARAMS["lr_a"], ALG_PARAMS["lr_l"]
    lr_a_now = lr_a  # learning rate for actor, lambda and alpha
    lr_l_now = lr_l  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim, log_dir=log_dir)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    tb_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Create tensorboard variables
    tb_lr_a = tf.Variable(lr_a, dtype=tf.float32)
    tb_lr_l = tf.Variable(lr_l, dtype=tf.float32)
    tb_lr_lag = tf.Variable(lr_a, dtype=tf.float32)
    tb_ret = tf.Variable(0, dtype=tf.float32)
    tb_len = tf.Variable(0, dtype=tf.float32)
    tb_a_loss = tf.Variable(0, dtype=tf.float32)
    tb_lyapunov_error = tf.Variable(0, dtype=tf.float32)
    tb_entropy = tf.Variable(0, dtype=tf.float32)

    # Initialize tensorboard variables and create summaries
    if USE_TB:
        policy.sess.run(
            [
                tb_lr_a.initializer,
                tb_lr_l.initializer,
                tb_lr_lag.initializer,
                tb_ret.initializer,
                tb_len.initializer,
                tb_a_loss.initializer,
                tb_lyapunov_error.initializer,
                tb_entropy.initializer,
            ]
        )

        # Add tensorboard summaries
        main_sum = tf.compat.v1.summary.merge(
            [
                tf.compat.v1.summary.scalar("lr_a", tb_lr_a),
                tf.compat.v1.summary.scalar("lr_l", tb_lr_l),
                tf.compat.v1.summary.scalar("lr_lag", tb_lr_lag),
                tf.compat.v1.summary.scalar("alpha", policy.alpha),
                tf.compat.v1.summary.scalar("lambda", policy.labda),
            ]
        )
        other_sum = tf.compat.v1.summary.merge(
            [
                tf.compat.v1.summary.scalar("ep_ret", tb_ret),
                tf.compat.v1.summary.scalar("ep_length", tb_len),
                tf.compat.v1.summary.scalar("a_loss", tb_a_loss),
                tf.compat.v1.summary.scalar("lyapunov_error", tb_lyapunov_error),
                tf.compat.v1.summary.scalar("entropy", tb_entropy),
            ]
        )

        # The summaries below are known from the start, so write them right away
        policy.tb_writer.add_summary(
            policy.sess.run(main_sum), policy.sess.run(policy.step)
        )
        if WRITE_W_B:
            policy.tb_writer.add_summary(
                policy.sess.run(policy.w_b_sum), policy.sess.run(policy.step),
            )
        policy.tb_writer.flush()

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve the action from the current policy and rescale it to the
            # action bounds of the environment (assumes `a` lies in [-1, 1]).
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count (only counts steps taken after the
            # first policy update)
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Increment tensorboard step counter
            # NOTE: This was done differently from the global_step counter since
            # otherwise there were inconsistencies in the tb log.
            if USE_TB:
                tb_step += 1

            # Update the policy once enough experience has been collected
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform a set number of update steps (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results (uses the values returned by the latest update)
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    eval_diagnostics = {}
                    if evaluate_policy:
                        eval_diagnostics = training_evaluation(test_env, policy)
                        for key, value in eval_diagnostics.items():
                            logger.logkv(key, value)

                        # The training return is dropped when evaluation
                        # results are logged
                        training_diagnostics.pop("return")
                    for key, value in training_diagnostics.items():
                        logger.logkv(key, value)
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)

                    # Print a one-line summary of the logged diagnostics
                    string_to_print = ["time_step:", str(global_step), "|"]
                    for key, value in eval_diagnostics.items():
                        string_to_print.extend([key, ":", str(value), "|"])
                    for key, value in training_diagnostics.items():
                        string_to_print.extend([key, ":", str(round(value, 2)), "|"])
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Finish the episode
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                    # Get current model performance for tb
                    if USE_TB:
                        training_diagnostics = evaluate_training_rollouts(
                            last_training_paths
                        )

                # Log tb variables
                # NOTE(review): every `.assign(...)` call below builds a new op;
                # with a session-based graph this adds nodes on each log step.
                # Consider creating the assign ops once outside the loop.
                if USE_TB and i % TB_FREQ == 0:

                    # Update and log learning rate tb vars
                    policy.sess.run(policy.step.assign(tb_step))
                    policy.sess.run(tb_lr_a.assign(lr_a_now))
                    policy.sess.run(tb_lr_l.assign(lr_l_now))
                    policy.sess.run(tb_lr_lag.assign(lr_a))
                    policy.tb_writer.add_summary(
                        policy.sess.run(main_sum), policy.sess.run(policy.step)
                    )

                    # Update and log other training vars to tensorboard
                    if training_started:
                        tb_updates = (
                            (tb_ret, "return"),
                            (tb_len, "length"),
                            (tb_a_loss, "a_loss"),
                            (tb_lyapunov_error, "lyapunov_error"),
                            (tb_entropy, "entropy"),
                        )
                        for tb_var, key in tb_updates:
                            policy.sess.run(tb_var.assign(training_diagnostics[key]))
                        policy.tb_writer.add_summary(
                            policy.sess.run(other_sum), policy.sess.run(policy.step)
                        )

                        # Log network weights
                        if WRITE_W_B:
                            policy.tb_writer.add_summary(
                                policy.sess.run(policy.w_b_sum),
                                policy.sess.run(policy.step),
                            )
                    policy.tb_writer.flush()

                # Decay learning rates linearly with the global step count
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    print("Running time: ", time.time() - t1)
# Example no. 2
# 0
def train(log_dir):
    """Train (or continue training) the LAC agent and log diagnostics to CSV.

    When ``TRAIN_PARAMS["continue_training"]`` is set, the policy is first
    restored from ``TRAIN_PARAMS["continue_model_folder"]`` and the results
    are written to a ``*_finetune`` folder derived from ``log_dir``. The
    training loop then runs at most ``ENV_PARAMS["max_episodes"]`` episodes and
    stops early once the global step counter exceeds
    ``ENV_PARAMS["max_global_steps"]``. The policy is saved when the loop ends,
    regardless of which of the two limits ended it.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create the training environment and a separate evaluation environment
    print(f"Your training in the {ENV_NAME} environment.\n")
    env = get_env_from_name(ENV_NAME, ENV_SEED)
    test_env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = ALG_PARAMS["lr_a"], ALG_PARAMS["lr_l"]
    lr_a_now = lr_a  # learning rate for actor, lambda and alpha
    lr_l_now = lr_l  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim)

    # Load model if retraining is selected
    if TRAIN_PARAMS["continue_training"]:

        # Create retrain path (resolved relative to two levels above log_dir)
        retrain_model_folder = TRAIN_PARAMS["continue_model_folder"]
        retrain_model_path = os.path.abspath(
            os.path.join(log_dir, "../../" + retrain_model_folder)
        )

        # Abort with a non-zero exit status if the retrain model does not exist
        if not os.path.exists(retrain_model_path):
            print(
                "Shutting down training since the model you specified in the "
                f"`continue_model_folder` `{retrain_model_folder}` "
                f"argument was not found for the `{ENV_NAME}` environment."
            )
            sys.exit(1)

        # Load retrain model
        print(f"Restoring model `{retrain_model_path}`")
        result = policy.restore(os.path.join(retrain_model_path, "policy"))
        if not result:
            print(
                "Shuting down training as something went wrong while loading "
                f"model `{retrain_model_folder}`."
            )
            sys.exit(1)

        # Create new storage folder: replace the second-to-last component of
        # log_dir with the flattened model folder name plus a finetune suffix
        log_dir_split = log_dir.split("/")
        log_dir_split[-2] = retrain_model_folder.replace("/", "_") + "_finetune"
        log_dir = "/".join(log_dir_split)

        # Reset lagrance multipliers if requested
        if ALG_PARAMS["reset_lagrance_multipliers"]:
            policy.sess.run(
                policy.log_alpha.assign(tf.math.log(ALG_PARAMS["alpha"]))
            )
            policy.sess.run(
                policy.log_labda.assign(tf.math.log(ALG_PARAMS["labda"]))
            )
    else:
        print(f"Train new model `{log_dir}`")

    # Print logging folder
    print(f"Logging results to `{log_dir}`.")

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False
    evaluate_policy = TRAIN_PARAMS["num_of_evaluation_paths"] > 0

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for _episode in range(ENV_PARAMS["max_episodes"]):

        # Break out of loop if global steps have been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "lyapunov_error": [],
            "alpha": [],
            "lambda": [],
            "entropy": [],
            "a_loss": [],
        }

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Save intermediate checkpoints if requested
            if (
                TRAIN_PARAMS["save_checkpoints"]
                and global_step % TRAIN_PARAMS["checkpoint_save_freq"] == 0
                and global_step != 0
            ):

                # Create intermediate result checkpoint folder. The folder is
                # named after the global step so checkpoints taken in different
                # episodes do not overwrite each other.
                checkpoint_save_path = os.path.abspath(
                    os.path.join(log_dir, "checkpoints", "step_" + str(global_step))
                )
                os.makedirs(checkpoint_save_path, exist_ok=True)

                # Save intermediate checkpoint
                policy.save_result(checkpoint_save_path)

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve the action from the current policy and rescale it to the
            # action bounds of the environment (assumes `a` lies in [-1, 1]).
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count (only counts steps taken after the
            # first policy update)
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Update the policy once enough experience has been collected
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform a set number of update steps (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results (uses the values returned by the latest update)
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    eval_diagnostics = {}
                    if evaluate_policy:
                        eval_diagnostics = training_evaluation(test_env, policy)
                        for key, value in eval_diagnostics.items():
                            logger.logkv(key, value)

                        # The training return is dropped when evaluation
                        # results are logged
                        training_diagnostics.pop("return")
                    for key, value in training_diagnostics.items():
                        logger.logkv(key, value)
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)

                    # Print a one-line summary of the logged diagnostics
                    string_to_print = ["time_step:", str(global_step), "|"]
                    for key, value in eval_diagnostics.items():
                        string_to_print.extend([key, ":", str(value), "|"])
                    for key, value in training_diagnostics.items():
                        string_to_print.extend([key, ":", str(round(value, 2)), "|"])
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Finish the episode
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                # Decay learning rates linearly with the global step count
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Print step count, save model and print the running time. This runs both
    # when the global step limit was hit and when all episodes were used up, so
    # the trained model is never discarded.
    print(f"Training stopped after {global_step} steps.")
    print("Running time: ", time.time() - t1)
    print("Saving Model")
    policy.save_result(log_dir)
    print("Running time: ", time.time() - t1)
# Example no. 3
# 0
def train(log_dir):
    """Train the LAC agent and log diagnostics to CSV.

    Runs at most ``ENV_PARAMS["max_episodes"]`` episodes and stops early once
    the global step counter exceeds ``ENV_PARAMS["max_global_steps"]``. Every
    transition is stored in a replay pool; once ``pool.memory_pointer`` exceeds
    ``ALG_PARAMS["min_memory_size"]`` the policy is updated through
    ``policy.learn``. The learning rates are decayed linearly with the global
    step count at the end of every episode and the policy is saved to
    ``log_dir`` when training finishes.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create the training environment
    env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Use a dedicated environment for the evaluation rollouts. Evaluation is
    # triggered in the middle of a training episode, so handing the training
    # environment to `training_evaluation` would (presumably) reset/step it and
    # desynchronise it from the state tracked by the training loop.
    evaluate_policy = TRAIN_PARAMS["num_of_evaluation_paths"] > 0
    test_env = get_env_from_name(ENV_NAME, ENV_SEED) if evaluate_policy else None

    # Set initial learning rates
    lr_a, lr_l = ALG_PARAMS["lr_a"], ALG_PARAMS["lr_l"]
    lr_a_now = lr_a  # learning rate for actor, lambda and alpha
    lr_l_now = lr_l  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for _episode in range(ENV_PARAMS["max_episodes"]):

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve the action from the current policy and rescale it to the
            # action bounds of the environment (assumes `a` lies in [-1, 1]).
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count (only counts steps taken after the
            # first policy update)
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Update the policy once enough experience has been collected
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform a set number of update steps (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results (uses the values returned by the latest update)
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    eval_diagnostics = {}
                    if evaluate_policy:
                        eval_diagnostics = training_evaluation(test_env, policy)
                        for key, value in eval_diagnostics.items():
                            logger.logkv(key, value)

                        # The training return is dropped when evaluation
                        # results are logged
                        training_diagnostics.pop("return")
                    for key, value in training_diagnostics.items():
                        logger.logkv(key, value)
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)

                    # Print a one-line summary of the logged diagnostics
                    string_to_print = ["time_step:", str(global_step), "|"]
                    for key, value in eval_diagnostics.items():
                        string_to_print.extend([key, ":", str(value), "|"])
                    for key, value in training_diagnostics.items():
                        string_to_print.extend([key, ":", str(round(value, 2)), "|"])
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Finish the episode
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                # Decay learning rates linearly with the global step count
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    print("Running time: ", time.time() - t1)
# Example no. 4
# 0
def train(log_dir):
    """Performs the agent training.

    Creates the environment, the LAC policy and the replay buffer and then runs
    training episodes until either ``ENV_PARAMS["max_episodes"]`` episodes were
    performed or ``global_step`` exceeds ``ENV_PARAMS["max_global_steps"]``.
    Diagnostics are written to the csv logger and, when ``DEBUG_PARAMS["use_tb"]``
    is set, to tensorboard. The policy is saved to ``log_dir`` afterwards.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create environment
    env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = ALG_PARAMS["lr_a"], ALG_PARAMS["lr_l"]
    lr_a_now = lr_a  # learning rate for actor, lambda and alpha
    lr_l_now = lr_l  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim, log_dir=log_dir)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Log initial values to tensorboard
    if DEBUG_PARAMS["use_tb"]:

        # Trace learn method (Used for debugging)
        if DEBUG_PARAMS["debug"]:
            if DEBUG_PARAMS["trace_net"]:

                # Create dummy input
                batch_size = ALG_PARAMS["batch_size"]
                batch = {
                    "s": tf.random.uniform((batch_size, policy.s_dim)),
                    "a": tf.random.uniform((batch_size, policy.a_dim)),
                    "r": tf.random.uniform((batch_size, 1)),
                    "terminal": tf.zeros((batch_size, 1)),
                    "s_": tf.random.uniform((batch_size, policy.s_dim)),
                }

                # Trace learn method and log to tensorboard
                tf.summary.trace_on(graph=True, profiler=True)
                policy.learn(lr_a_now, lr_l_now, lr_a, batch)
                with policy.tb_writer.as_default():
                    tf.summary.trace_export(
                        name="learn", step=0, profiler_outdir=log_dir
                    )

            # Shut down as we are in debug mode
            if DEBUG_PARAMS["trace_net"] or DEBUG_PARAMS["trace_learn"]:
                print(
                    "Shutting down training as a trace was requested in debug mode. "
                    "This was done since during the trace a backward pass was performed "
                    "on dummy data. Please disable the trace to continue training "
                    "while being in debug mode."
                )
                sys.exit(0)

        # Log initial values
        with policy.tb_writer.as_default():
            _tb_log_learning_state(policy, lr_a_now, lr_l_now, lr_a, 0)

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve the action of the current policy and rescale it linearly such
            # that a = -1 maps to the lower and a = 1 to the upper action bound
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count (only counts steps after training started)
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Increment tensorboard step counter
            # NOTE: This was done differently from the global_step counter since
            # otherwise there were inconsistencies in the tb log.
            if DEBUG_PARAMS["use_tb"]:
                policy.step += 1

            # Store experience in replay buffer (the unscaled action is stored)
            pool.store(s, a, r, terminal, s_)

            # Optimize weights and parameters using STG
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results (uses the values returned by the latest learn call)
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    run_evaluation = TRAIN_PARAMS["num_of_evaluation_paths"] > 0
                    eval_diagnostics = {}
                    if run_evaluation:
                        eval_diagnostics = training_evaluation(env, policy)
                        for key, value in eval_diagnostics.items():
                            logger.logkv(key, value)

                        # The evaluation diagnostics are logged instead of the
                        # training return
                        training_diagnostics.pop("return")
                    for key, value in training_diagnostics.items():
                        logger.logkv(key, value)
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)

                    # Print the same diagnostics to the console
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if run_evaluation:
                        for key, value in eval_diagnostics.items():
                            string_to_print.extend([key, ":", str(value), "|"])
                    for key, value in training_diagnostics.items():
                        string_to_print.extend([key, ":", str(round(value, 2)), "|"])
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Finish the episode: store path, log to tensorboard and decay the
            # learning rates
            if done:

                # Store paths
                tb_diagnostics = None
                if training_started:
                    last_training_paths.appendleft(current_path)

                    # Get current model performance for tb
                    if DEBUG_PARAMS["use_tb"]:
                        tb_diagnostics = evaluate_training_rollouts(
                            last_training_paths
                        )

                # Log tb variables
                if DEBUG_PARAMS["use_tb"] and i % DEBUG_PARAMS["tb_freq"] == 0:
                    with policy.tb_writer.as_default():

                        # Log learning rates and lagrance multipliers to tb
                        _tb_log_learning_state(
                            policy, lr_a_now, lr_l_now, lr_a, policy.step
                        )

                        if training_started:

                            # Log other training vars to tensorboard. This is
                            # skipped when no diagnostics could be computed, as
                            # subscripting `None` would raise a TypeError.
                            if tb_diagnostics is not None:
                                for tag, key in (
                                    ("ep_ret", "return"),
                                    ("ep_length", "length"),
                                    ("a_loss", "a_loss"),
                                    ("lyapunov_error", "lyapunov_error"),
                                    ("entropy", "entropy"),
                                ):
                                    tf.summary.scalar(
                                        tag, tb_diagnostics[key], step=policy.step
                                    )

                            # Log network weights
                            if DEBUG_PARAMS["write_w_b"]:
                                _tb_log_network_histograms(policy)

                # Decay learning rates linearly with the number of training steps
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    print("Running time: ", time.time() - t1)


def _tb_log_learning_state(policy, lr_a_now, lr_l_now, lr_lag, step):
    """Writes the learning rates and the policy's alpha/labda values to tensorboard.

    Has to be called inside a ``policy.tb_writer.as_default()`` context.

    Args:
        policy: The agent of which the ``alpha`` and ``labda`` attributes are logged.
        lr_a_now (float): The current actor learning rate (tag ``lr_a``).
        lr_l_now (float): The current lyapunov critic learning rate (tag ``lr_l``).
        lr_lag (float): The value logged under the ``lr_lag`` tag.
        step: The tensorboard step the values are logged at.
    """
    tf.summary.scalar("lr_a", lr_a_now, step=step)
    tf.summary.scalar("lr_l", lr_l_now, step=step)
    tf.summary.scalar("lr_lag", lr_lag, step=step)
    tf.summary.scalar("alpha", policy.alpha, step=step)
    tf.summary.scalar("lambda", policy.labda, step=step)


def _tb_log_network_histograms(policy):
    """Writes weight/bias histograms of the actor and critic networks to tensorboard.

    Logs the gaussian actor (``Ga``), the target gaussian actor (``Ga_``), the
    lyapunov critic (``Lc``) and the target lyapunov critic (``Lc_``) at step
    ``policy.step``. Has to be called inside a ``policy.tb_writer.as_default()``
    context.

    Args:
        policy: The agent that holds the ``ga``, ``ga_``, ``lc`` and ``lc_`` networks.
    """
    step = policy.step

    # (Target) GaussianActor weights/biases
    for prefix, actor in (("Ga", policy.ga), ("Ga_", policy.ga_)):
        for layer_name, layer in (
            ("l1", actor.net_0),
            ("l2", actor.net_1),
            ("mu", actor.mu),
            ("log_sigma", actor.log_sigma),
        ):
            tf.summary.histogram(
                f"{prefix}/{layer_name}/weights", layer.weights[0], step=step
            )
            tf.summary.histogram(
                f"{prefix}/{layer_name}/bias", layer.weights[1], step=step
            )

    # (Target) Lyapunov critic weights/biases
    for prefix, critic in (("Lc", policy.lc), ("Lc_", policy.lc_)):
        hidden_layer = critic.net.layers[0]
        tf.summary.histogram(f"{prefix}/w1_s", critic.w1_s, step=step)
        tf.summary.histogram(f"{prefix}/w1_a", critic.w1_a, step=step)
        tf.summary.histogram(f"{prefix}/b1", critic.b1, step=step)
        tf.summary.histogram(
            f"{prefix}/net/l2/weights", hidden_layer.weights[0], step=step
        )
        tf.summary.histogram(
            f"{prefix}/net/l2/bias", hidden_layer.weights[1], step=step
        )
Esempio n. 5
0
def train(log_dir):
    """Performs the agent training.

    Creates the train and test environments, the agent and the replay buffer,
    optionally restores a previously trained model and then keeps running training
    episodes until ``TRAIN_PARAMS["max_global_steps"]`` training steps have been
    performed. At that point the model is saved to ``log_dir`` and the function
    returns. Intermediate checkpoints are stored in
    ``<log_dir>/checkpoints/step_<global_step>`` when
    ``TRAIN_PARAMS["save_checkpoints"]`` is enabled.

    Args:
        log_dir (str): The directory in which the final model (policy) and the log data
            is saved.
    """

    # Create train and test environments
    print(
        colorize(
            f"INFO: You are training in the {ENV_NAME} environment.",
            "cyan",
            bold=True,
        )
    )
    env = get_env_from_name(ENV_NAME, ENV_SEED)
    test_env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l, lr_c = ALG_PARAMS["lr_a"], ALG_PARAMS["lr_l"], ALG_PARAMS["lr_c"]
    lr_a_now = lr_a  # learning rate for actor, lambda and alpha
    lr_l_now = lr_l  # learning rate for Lyapunov critic
    lr_c_now = lr_c  # learning rate for q critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_lowerbound = env.action_space.low
    a_upperbound = env.action_space.high

    # Create the Agent
    policy = LAC(
        a_dim, s_dim, act_limits={"low": a_lowerbound, "high": a_upperbound}
    )

    # Load model if retraining is selected
    if TRAIN_PARAMS["continue_training"]:

        # Create retrain model path
        retrain_model_folder = TRAIN_PARAMS["continue_model_folder"]
        retrain_model_path = osp.abspath(
            osp.join(log_dir, "../..", retrain_model_folder)
        )

        # Check if retrain model exists if not throw error
        if not osp.exists(retrain_model_path):
            print(
                colorize(
                    (
                        "ERROR: Shutting down training since the model you specified "
                        f"in the `continue_model_folder` `{retrain_model_folder}` "
                        f"argument was not found for the `{ENV_NAME}` environment."
                    ),
                    "red",
                    bold=True,
                )
            )
            sys.exit(0)

        # Load old model
        print(
            colorize(
                f"INFO: Restoring model `{retrain_model_path}`.", "cyan", bold=True
            )
        )
        result = policy.restore(
            osp.abspath(osp.join(retrain_model_path, "policy")),
            restore_lagrance_multipliers=(
                not ALG_PARAMS["reset_lagrance_multipliers"]
            ),
        )
        if not result:
            print(
                colorize(
                    "ERROR: Shuting down training as something went wrong while "
                    "loading "
                    f"model `{retrain_model_folder}`.",
                    "red",
                    bold=True,
                )
            )
            sys.exit(0)

        # Create new storage folder by replacing the second to last component of
        # the log path with `<continue_model_folder>_finetune`
        log_dir_split = log_dir.split("/")
        log_dir_split[-2] = "_".join(retrain_model_folder.split("/")) + "_finetune"
        log_dir = "/".join(log_dir_split)
    else:
        print(colorize(f"INFO: Train new model `{log_dir}`", "cyan", bold=True))

    # Print logging folder path
    print(colorize(f"INFO: Logging results to `{log_dir}`.", "cyan", bold=True))

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    ####################################################
    # Training loop ####################################
    ####################################################

    # Setup training loop parameters
    t1 = time.time()
    global_step = 0
    global_episodes = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Train the agent in the environment until max_global_steps has been reached
    print(colorize("INFO: Training...\n", "cyan", bold=True))
    while True:  # Keep running episodes until global step has been reached

        # Create variable to store information about the current path
        if policy.use_lyapunov:
            path_keys = (
                "rewards",
                "lyapunov_error",
                "alpha",
                "lambda",
                "entropy",
                "a_loss",
                "alpha_loss",
                "lambda_loss",
            )
        else:
            path_keys = (
                "rewards",
                "critic_error",
                "alpha",
                "entropy",
                "a_loss",
                "alpha_loss",
            )
        current_path = {key: [] for key in path_keys}

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for jj in range(ENVS_PARAMS[ENV_NAME]["max_ep_steps"]):

            # Break out of loop if global steps have been reached
            if global_step >= TRAIN_PARAMS["max_global_steps"]:

                # Print step count, save model and stop the program
                print(
                    colorize(
                        f"\nINFO: Training stopped after {global_step} steps.",
                        "cyan",
                        bold=True,
                    )
                )
                print(
                    colorize(
                        "INFO: Running time: {}".format(time.time() - t1),
                        "cyan",
                        bold=True,
                    )
                )
                print(colorize("INFO: Saving Model", "cyan", bold=True))
                policy.save_result(log_dir)
                return

            # Save intermediate checkpoints if requested
            if (
                TRAIN_PARAMS["save_checkpoints"]
                and global_step % TRAIN_PARAMS["checkpoint_save_freq"] == 0
                and global_step != 0
            ):

                # Create intermediate result checkpoint folder
                # NOTE: The folder is named after the global step count. Naming it
                # after the episode step index would let checkpoints of different
                # episodes overwrite each other.
                checkpoint_save_path = osp.abspath(
                    osp.join(log_dir, "checkpoints", "step_" + str(global_step))
                )
                os.makedirs(checkpoint_save_path, exist_ok=True)

                # Save intermediate checkpoint
                policy.save_result(checkpoint_save_path)

            # Render environment if requested
            if ENVS_PARAMS[ENV_NAME]["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            # NOTE (rickstaa): The scaling operation is already performed inside the
            # policy based on the `act_limits` you supplied.
            a = policy.choose_action(s)

            # Perform action in env
            s_, r, done, _ = env.step(a)

            # Increment global step count (only counts steps after training started)
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if jj == ENVS_PARAMS[ENV_NAME]["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize network weights and lagrance multipliers
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    if policy.use_lyapunov:
                        (
                            labda,
                            alpha,
                            l_loss,
                            entropy,
                            a_loss,
                            alpha_loss,
                            labda_loss,
                        ) = policy.learn(lr_a_now, lr_l_now, lr_a, lr_c_now, batch)
                    else:
                        alpha, loss_q, entropy, a_loss, alpha_loss = policy.learn(
                            lr_a_now, lr_l_now, lr_a, lr_c_now, batch
                        )

            # Store current path results (uses the values of the latest learn call)
            if training_started:
                current_path["rewards"].append(r)
                if policy.use_lyapunov:
                    current_path["lyapunov_error"].append(l_loss)
                    current_path["alpha"].append(alpha)
                    current_path["lambda"].append(labda)
                    current_path["entropy"].append(entropy)
                    current_path["a_loss"].append(a_loss)
                    current_path["alpha_loss"].append(alpha_loss)
                    current_path["lambda_loss"].append(labda_loss)
                else:
                    current_path["critic_error"].append(loss_q.numpy())
                    current_path["alpha"].append(alpha.numpy())
                    current_path["entropy"].append(entropy.numpy())
                    current_path["a_loss"].append(
                        a_loss.numpy()
                    )  # Improve: Check if this is the fastest way
                    current_path["alpha_loss"].append(alpha_loss)

            # Evaluate the current policy performance and log the results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    run_evaluation = TRAIN_PARAMS["num_of_evaluation_paths"] > 0
                    eval_diagnostics = {}
                    if run_evaluation:
                        eval_diagnostics = training_evaluation(test_env, policy)
                        for key, value in eval_diagnostics.items():
                            logger.logkv(key, value)

                        # The evaluation diagnostics are logged instead of the
                        # training return
                        training_diagnostics.pop("return")
                    for key, value in training_diagnostics.items():
                        logger.logkv(key, value)
                    logger.logkv("lr_a", lr_a_now)
                    if policy.use_lyapunov:
                        logger.logkv("lr_l", lr_l_now)
                    else:
                        logger.logkv("lr_c", lr_c_now)

                    # Print the same diagnostics to the console
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if run_evaluation:
                        for key, value in eval_diagnostics.items():
                            string_to_print.extend([key, ":", str(value), "|"])
                    for key, value in training_diagnostics.items():
                        string_to_print.extend([key, ":", str(round(value, 2)), "|"])
                    prefix = (
                        colorize("LAC|", "green")
                        if ALG_PARAMS["use_lyapunov"]
                        else colorize("SAC|", "yellow")
                    )
                    print(
                        colorize(prefix, "yellow", bold=True)
                        + "".join(string_to_print)
                    )
                logger.dumpkvs()

            # Update state
            s = s_

            # Check if episode is done (continue to next episode)
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                # Decay learning rates linearly with the number of training steps
                frac = 1.0 - (global_step - 1.0) / TRAIN_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                lr_c_now = lr_c * frac  # learning rate for q critic
                break  # Continue to next episode

        # Increase episode counter
        # NOTE: This has to happen inside the episode loop; after the loop it would
        # never be reached since the loop is only left through the return above.
        global_episodes += 1