Example #1
def train(params,
          model_name,
          save_interval=10,
          eval_interval=10,
          record_eval=True,
          restart=False):
    # Training parameters
    learning_rate = params["learning_rate"]
    lr_decay = params["lr_decay"]
    discount_factor = params["discount_factor"]
    gae_lambda = params["gae_lambda"]
    ppo_epsilon = params["ppo_epsilon"]
    value_scale = params["value_scale"]
    entropy_scale = params["entropy_scale"]
    horizon = params["horizon"]
    num_epochs = params["num_epochs"]
    num_episodes = params["num_episodes"]
    batch_size = params["batch_size"]
    vae_model = params["vae_model"]
    vae_model_type = params["vae_model_type"]
    vae_z_dim = params["vae_z_dim"]

    if vae_z_dim is None:
        vae_z_dim = params["vae_z_dim"] = int(
            re.findall(r"zdim(\d+)", vae_model)[0])
    if vae_model_type is None:
        vae_model_type = params["vae_model_type"] = (
            "mlp" if "mlp" in vae_model else "cnn")
    VAEClass = MlpVAE if vae_model_type == "mlp" else ConvVAE

    print("")
    print("Training parameters:")
    for k, v in params.items():
        print(f"  {k}: {v}")
    print("")

    # Load pre-trained variational autoencoder
    vae = VAEClass(input_shape=(84, 84, 1),
                   z_dim=vae_z_dim,
                   models_dir="vae",
                   model_name=vae_model,
                   training=False)
    vae.init_session(init_logging=False)
    if not vae.load_latest_checkpoint():
        raise Exception("Failed to load VAE")

    # State encoding fn
    with_measurements = False
    stack = None
    encode_state_fn = create_encode_state_fn(
        vae, with_measurements=with_measurements, stack=stack)

    # Create env
    print("Creating environment")
    env = make_env(model_name, frame_skip=0, encode_state_fn=encode_state_fn)
    test_env = make_env(model_name + " (Test)",
                        encode_state_fn=encode_state_fn)

    # Environment constants
    input_shape = np.array([vae_z_dim])
    if with_measurements: input_shape[0] += 3
    if isinstance(stack, int): input_shape[0] *= stack
    num_actions = env.action_space.shape[0]
    action_min = env.action_space.low
    action_max = env.action_space.high

    # Create model
    print("Creating model")
    model = PPO(input_shape,
                num_actions,
                action_min,
                action_max,
                learning_rate=learning_rate,
                lr_decay=lr_decay,
                epsilon=ppo_epsilon,
                value_scale=value_scale,
                entropy_scale=entropy_scale,
                output_dir=os.path.join("models", model_name))

    # Prompt to load existing model if any
    if not restart:
        if os.path.isdir(model.log_dir) and len(os.listdir(model.log_dir)) > 0:
            answer = input(
                "Model \"{}\" already exists. Do you wish to continue (C) or restart training (R)? "
                .format(model_name))
            if answer.upper() == "C":
                model.load_latest_checkpoint()
            elif answer.upper() == "R":
                restart = True
            else:
                raise Exception(
                    "There are already log files for model \"{}\". Please delete it or change model_name and try again"
                    .format(model_name))
    if restart:
        shutil.rmtree(model.output_dir)
        for d in model.dirs:
            os.makedirs(d)
    model.init_logging()
    model.write_dict_to_summary("hyperparameters", params, 0)

    # For every episode
    while model.get_episode_idx() < num_episodes:
        episode_idx = model.get_episode_idx()

        # Save model periodically
        if episode_idx % save_interval == 0:
            model.save()

        # Run evaluation periodically
        if episode_idx % eval_interval == 0:
            video_filename = os.path.join(model.video_dir,
                                          "episode{}.avi".format(episode_idx))
            eval_reward, eval_score = test_agent(test_env,
                                                 model,
                                                 video_filename=video_filename)
            model.write_value_to_summary("eval/score", eval_score, episode_idx)
            model.write_value_to_summary("eval/reward", eval_reward,
                                         episode_idx)

        # Reset environment
        state, terminal_state, total_reward, total_value = env.reset(), False, 0, 0

        # While episode not done
        print(f"Episode {episode_idx} (Step {model.get_train_step_idx()})")
        while not terminal_state:
            states, taken_actions, values, rewards, dones = [], [], [], [], []
            for _ in range(horizon):
                action, value = model.predict([state], write_to_summary=True)

                # Show value on-screen
                env.env.value_label.text = "V(s)={:.2f}".format(value)

                # Perform action
                new_state, reward, terminal_state, _ = env.step(action)
                env.render()
                total_reward += reward
                total_value += value

                # Store state, action and reward
                states.append(state)  # [T, *input_shape]
                taken_actions.append(action)  # [T,  num_actions]
                values.append(value)  # [T]
                rewards.append(reward)  # [T]
                dones.append(terminal_state)  # [T]
                state = new_state

                if terminal_state:
                    break

            # Calculate last value (bootstrap value)
            _, last_values = model.predict([state])  # []

            # Compute GAE
            advantages = compute_gae(rewards, values, last_values, dones,
                                     discount_factor, gae_lambda)
            returns = advantages + values
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            # Flatten arrays
            states = np.array(states)
            taken_actions = np.array(taken_actions)
            returns = np.array(returns)
            advantages = np.array(advantages)

            T = len(rewards)
            assert states.shape == (T, *input_shape)
            assert taken_actions.shape == (T, num_actions)
            assert returns.shape == (T, )
            assert advantages.shape == (T, )

            # Train for some number of epochs
            model.update_old_policy()  # θ_old <- θ
            for _ in range(num_epochs):
                num_samples = len(states)
                indices = np.arange(num_samples)
                np.random.shuffle(indices)
                for i in range(int(np.ceil(num_samples / batch_size))):
                    # Sample mini-batch randomly
                    begin = i * batch_size
                    end = begin + batch_size
                    if end > num_samples:
                        end = None
                    mb_idx = indices[begin:end]

                    # Optimize network
                    model.train(states[mb_idx], taken_actions[mb_idx],
                                returns[mb_idx], advantages[mb_idx])

        # Write episodic values
        model.write_value_to_summary("train/score", env.env.reward,
                                     episode_idx)
        model.write_value_to_summary("train/reward", total_reward, episode_idx)
        model.write_value_to_summary("train/value", total_value, episode_idx)
        model.write_episodic_summaries()
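
Both examples delegate advantage estimation to a `compute_gae` helper defined elsewhere in the repository. For reference, a minimal sketch of the standard GAE(λ) recursion such a helper is expected to implement could look like the following; the exact signature, return dtype, and handling of the bootstrap value are assumptions, not the repository's actual code.

import numpy as np

def compute_gae(rewards, values, bootstrap_value, dones, gamma, lam):
    """Generalized Advantage Estimation (sketch).

    delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
    """
    # Append the bootstrap value so values[t + 1] is defined for the last step
    values = np.append(np.asarray(values, dtype=np.float32).ravel(),
                       np.asarray(bootstrap_value, dtype=np.float32).ravel())
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    # Walk the trajectory backwards, zeroing the recursion at terminal steps
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * non_terminal - values[t]
        gae = delta + gamma * lam * non_terminal * gae
        advantages[t] = gae
    return advantages

Returning a NumPy array keeps the call sites above valid, since they compute `advantages.mean()` and `advantages.std()` and add the result to the list of per-step values.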
Example #2
def train(params, start_carla=True, restart=False):
    # Read parameters
    learning_rate    = params["learning_rate"]
    lr_decay         = params["lr_decay"]
    discount_factor  = params["discount_factor"]
    gae_lambda       = params["gae_lambda"]
    ppo_epsilon      = params["ppo_epsilon"]
    initial_std      = params["initial_std"]
    value_scale      = params["value_scale"]
    entropy_scale    = params["entropy_scale"]
    horizon          = params["horizon"]
    num_epochs       = params["num_epochs"]
    num_episodes     = params["num_episodes"]
    batch_size       = params["batch_size"]
    vae_model        = params["vae_model"]
    vae_model_type   = params["vae_model_type"]
    vae_z_dim        = params["vae_z_dim"]
    synchronous      = params["synchronous"]
    fps              = params["fps"]
    action_smoothing = params["action_smoothing"]
    model_name       = params["model_name"]
    reward_fn        = params["reward_fn"]
    seed             = params["seed"]
    eval_interval    = params["eval_interval"]
    record_eval      = params["record_eval"]

    # Set seeds
    if isinstance(seed, int):
        tf.random.set_random_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

    # Load VAE
    vae = load_vae(vae_model, vae_z_dim, vae_model_type)
    
    # Override params for logging
    params["vae_z_dim"] = vae.z_dim
    params["vae_model_type"] = "mlp" if isinstance(vae, MlpVAE) else "cnn"

    print("")
    print("Training parameters:")
    for k, v in params.items(): print(f"  {k}: {v}")
    print("")

    # Create state encoding fn
    measurements_to_include = set(["steer", "throttle", "speed"])
    encode_state_fn = create_encode_state_fn(vae, measurements_to_include)

    # Create env
    print("Creating environment")
    env = CarlaEnv(obs_res=(160, 80),
                   action_smoothing=action_smoothing,
                   encode_state_fn=encode_state_fn,
                   reward_fn=reward_functions[reward_fn],
                   synchronous=synchronous,
                   fps=fps,
                   start_carla=start_carla)
    if isinstance(seed, int):
        env.seed(seed)
    best_eval_reward = -float("inf")

    # Environment constants
    input_shape = np.array([vae.z_dim + len(measurements_to_include)])
    num_actions = env.action_space.shape[0]

    # Create model
    print("Creating model")
    model = PPO(input_shape, env.action_space,
                learning_rate=learning_rate, lr_decay=lr_decay,
                epsilon=ppo_epsilon, initial_std=initial_std,
                value_scale=value_scale, entropy_scale=entropy_scale,
                model_dir=os.path.join("models", model_name))

    # Prompt to load existing model if any
    if not restart:
        if os.path.isdir(model.log_dir) and len(os.listdir(model.log_dir)) > 0:
            answer = input("Model \"{}\" already exists. Do you wish to continue (C) or restart training (R)? ".format(model_name))
            if answer.upper() == "C":
                pass
            elif answer.upper() == "R":
                restart = True
            else:
                raise Exception("There are already log files for model \"{}\". Please delete it or change model_name and try again".format(model_name))
    
    if restart:
        shutil.rmtree(model.model_dir)
        for d in model.dirs:
            os.makedirs(d)
    model.init_session()
    if not restart:
        model.load_latest_checkpoint()
    model.write_dict_to_summary("hyperparameters", params, 0)

    # For every episode
    while num_episodes <= 0 or model.get_episode_idx() < num_episodes:
        episode_idx = model.get_episode_idx()
        
        # Run evaluation periodically
        if episode_idx % eval_interval == 0:
            video_filename = os.path.join(model.video_dir, "episode{}.avi".format(episode_idx))
            eval_reward = run_eval(env, model, video_filename=video_filename)
            model.write_value_to_summary("eval/reward", eval_reward, episode_idx)
            model.write_value_to_summary("eval/distance_traveled", env.distance_traveled, episode_idx)
            model.write_value_to_summary("eval/average_speed", 3.6 * env.speed_accum / env.step_count, episode_idx)
            model.write_value_to_summary("eval/center_lane_deviation", env.center_lane_deviation, episode_idx)
            model.write_value_to_summary("eval/average_center_lane_deviation", env.center_lane_deviation / env.step_count, episode_idx)
            model.write_value_to_summary("eval/distance_over_deviation", env.distance_traveled / env.center_lane_deviation, episode_idx)
            if eval_reward > best_eval_reward:
                model.save()
                best_eval_reward = eval_reward

        # Reset environment
        state, terminal_state, total_reward = env.reset(), False, 0
        
        # While episode not done
        print(f"Episode {episode_idx} (Step {model.get_train_step_idx()})")
        while not terminal_state:
            states, taken_actions, values, rewards, dones = [], [], [], [], []
            for _ in range(horizon):
                action, value = model.predict(state, write_to_summary=True)

                # Perform action
                new_state, reward, terminal_state, info = env.step(action)

                if info["closed"] == True:
                    exit(0)
                    
                env.extra_info.extend([
                    "Episode {}".format(episode_idx),
                    "Training...",
                    "",
                    "Value:  % 20.2f" % value
                ])

                env.render()
                total_reward += reward

                # Store state, action and reward
                states.append(state)         # [T, *input_shape]
                taken_actions.append(action) # [T,  num_actions]
                values.append(value)         # [T]
                rewards.append(reward)       # [T]
                dones.append(terminal_state) # [T]
                state = new_state

                if terminal_state:
                    break

            # Calculate last value (bootstrap value)
            _, last_values = model.predict(state) # []
            
            # Compute GAE
            advantages = compute_gae(rewards, values, last_values, dones, discount_factor, gae_lambda)
            returns = advantages + values
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            # Flatten arrays
            states        = np.array(states)
            taken_actions = np.array(taken_actions)
            returns       = np.array(returns)
            advantages    = np.array(advantages)

            T = len(rewards)
            assert states.shape == (T, *input_shape)
            assert taken_actions.shape == (T, num_actions)
            assert returns.shape == (T,)
            assert advantages.shape == (T,)

            # Train for some number of epochs
            model.update_old_policy() # θ_old <- θ
            for _ in range(num_epochs):
                num_samples = len(states)
                indices = np.arange(num_samples)
                np.random.shuffle(indices)
                for i in range(int(np.ceil(num_samples / batch_size))):
                    # Sample mini-batch randomly
                    begin = i * batch_size
                    end   = begin + batch_size
                    if end > num_samples:
                        end = None
                    mb_idx = indices[begin:end]

                    # Optimize network
                    model.train(states[mb_idx], taken_actions[mb_idx],
                                returns[mb_idx], advantages[mb_idx])

        # Write episodic values
        model.write_value_to_summary("train/reward", total_reward, episode_idx)
        model.write_value_to_summary("train/distance_traveled", env.distance_traveled, episode_idx)
        model.write_value_to_summary("train/average_speed", 3.6 * env.speed_accum / env.step_count, episode_idx)
        model.write_value_to_summary("train/center_lane_deviation", env.center_lane_deviation, episode_idx)
        model.write_value_to_summary("train/average_center_lane_deviation", env.center_lane_deviation / env.step_count, episode_idx)
        model.write_value_to_summary("train/distance_over_deviation", env.distance_traveled / env.center_lane_deviation, episode_idx)
        model.write_episodic_summaries()
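
For context, calling either `train` function requires a `params` dictionary containing every key read at the top of the function. The sketch below targets Example #2's signature; all hyperparameter values, the VAE model name, the reward-function key, and the model name are illustrative placeholders, not the repository's defaults.

# Illustrative only: keys match what Example #2 reads from `params`,
# values and names are placeholders.
params = {
    "learning_rate": 3e-4,
    "lr_decay": 1.0,
    "discount_factor": 0.99,
    "gae_lambda": 0.95,
    "ppo_epsilon": 0.2,
    "initial_std": 1.0,
    "value_scale": 0.5,
    "entropy_scale": 0.01,
    "horizon": 128,
    "num_epochs": 3,
    "num_episodes": 0,           # <= 0 keeps training indefinitely
    "batch_size": 32,
    "vae_model": "my_vae_zdim64",  # hypothetical name; "zdim<N>" lets Example #1 infer z_dim
    "vae_model_type": None,      # inferred when None
    "vae_z_dim": None,           # inferred when None
    "synchronous": True,
    "fps": 30,
    "action_smoothing": 0.9,
    "model_name": "my_ppo_model",
    "reward_fn": "my_reward_fn",  # hypothetical key into reward_functions
    "seed": 0,
    "eval_interval": 5,
    "record_eval": True,
}

train(params, start_carla=True, restart=False)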