Example #1
def train(params, model_name, save_interval=1000, eval_interval=200,
          record_episodes=True, restart=False):
    try:
        # Create test env
        print("[INFO] Creating test environment")
        test_env = gym.make(env_name)

        # Training parameters
        initial_lr = params["initial_lr"]
        discount_factor = params["discount_factor"]
        gae_lambda = params["gae_lambda"]
        ppo_epsilon = params["ppo_epsilon"]
        value_scale = params["value_scale"]
        entropy_scale = params["entropy_scale"]
        horizon = params["horizon"]
        num_epochs = params["num_epochs"]
        batch_size = params["batch_size"]
        num_envs = params["num_envs"]

        # Learning rate schedule: decay by a factor of 0.85 every 10,000 steps
        def lr_scheduler(step_idx):
            return initial_lr * 0.85 ** (step_idx // 10000)

        # Environment constants
        frame_stack_size = 4
        input_shape = (84, 84, frame_stack_size)
        num_actions = test_env.action_space.shape[0]
        action_min = test_env.action_space.low
        action_max = test_env.action_space.high

        # Create model
        print("[INFO] Creating model")
        model = PPO(input_shape, num_actions, action_min, action_max,
                    epsilon=ppo_epsilon,
                    value_scale=value_scale, entropy_scale=entropy_scale,
                    model_name=model_name)

        print("[INFO] Creating environments")
        envs = SubprocVecEnv([make_env for _ in range(num_envs)])

        initial_frames = envs.reset()
        envs.get_images()
        frame_stacks = [FrameStack(initial_frames[i], stack_size=frame_stack_size,
                                   preprocess_fn=preprocess_frame) for i in range(num_envs)]

        print("[INFO] Training loop")
        while True:
            # Collect a fresh rollout of experience
            states, taken_actions, values, rewards, dones = [], [], [], [], []

            # Simulate game for some number of steps
            for _ in range(horizon):
                # Predict action and value given state
                # π(a_t | s_t; θ_old)
                states_t = [frame_stacks[i].get_state()
                            for i in range(num_envs)]
                actions_t, values_t = model.predict(states_t)

                # Step the environments with the sampled actions
                envs.step_async(actions_t)
                frames, rewards_t, dones_t, _ = envs.step_wait()
                envs.get_images()  # render

                # Store state, action and reward
                states.append(states_t)                      # [T, N, 84, 84, 4]
                taken_actions.append(actions_t)              # [T, N, 3]
                values.append(np.squeeze(values_t, axis=-1))  # [T, N]
                rewards.append(rewards_t)                    # [T, N]
                dones.append(dones_t)                        # [T, N]

                # Get new state
                for i in range(num_envs):
                    # Reset environment's frame stack if done
                    if dones_t[i]:
                        for _ in range(frame_stack_size):
                            frame_stacks[i].add_frame(frames[i])
                    else:
                        frame_stacks[i].add_frame(frames[i])

            # Calculate last values (bootstrap values)
            states_last = [frame_stacks[i].get_state()
                           for i in range(num_envs)]
            last_values = np.squeeze(model.predict(
                states_last)[1], axis=-1)  # [N]

            advantages = compute_gae(
                rewards, values, last_values, dones, discount_factor, gae_lambda)
            returns = advantages + values
            # Normalize advantages (after computing returns, so the value
            # targets are built from the raw advantages)
            advantages = (advantages - advantages.mean()) / \
                (advantages.std() + 1e-8)
            # Flatten arrays
            states = np.array(states).reshape((-1, *input_shape))  # [T x N, 84, 84, 4]
            taken_actions = np.array(taken_actions).reshape((-1, num_actions))  # [T x N, 3]
            returns = returns.flatten()  # [T x N]
            advantages = advantages.flatten()  # [T x N]

            T = len(rewards)
            N = num_envs
            assert states.shape == (
                T * N, input_shape[0], input_shape[1], frame_stack_size)
            assert taken_actions.shape == (T * N, num_actions)
            assert returns.shape == (T * N,)
            assert advantages.shape == (T * N,)

            # Train for some number of epochs
            model.update_old_policy()  # θ_old <- θ
            for _ in range(num_epochs):
                num_samples = len(states)
                indices = np.arange(num_samples)
                np.random.shuffle(indices)
                for i in range(int(np.ceil(num_samples / batch_size))):
                    # Evaluate model
                    if model.step_idx % eval_interval == 0:
                        print("[INFO] Running evaluation...")
                        avg_reward, value_error = evaluate(
                            model, test_env, discount_factor, frame_stack_size, make_video=True)
                        model.write_to_summary("eval_avg_reward", avg_reward)
                        model.write_to_summary("eval_value_error", value_error)

                    # Save model
                    if model.step_idx % save_interval == 0:
                        model.save()

                    # Sample mini-batch randomly
                    begin = i * batch_size
                    end = begin + batch_size
                    if end > num_samples:
                        end = None
                    mb_idx = indices[begin:end]

                    # Optimize network
                    model.train(states[mb_idx], taken_actions[mb_idx],
                                returns[mb_idx], advantages[mb_idx])
    except KeyboardInterrupt:
        model.save()
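
Both examples call a compute_gae helper whose implementation is not shown on this page. The sketch below is a minimal, assumed version of Generalized Advantage Estimation that matches the call compute_gae(rewards, values, last_values, dones, discount_factor, gae_lambda) with per-step rollout arrays of shape [T, N]; the actual helper used above may differ in its details.

import numpy as np


def compute_gae(rewards, values, bootstrap_values, dones, discount_factor, gae_lambda):
    """Generalized Advantage Estimation over a rollout of shape [T, N].

    rewards, values and dones are lists (length T) of per-environment arrays [N];
    bootstrap_values is the critic's estimate V(s_T) for each environment, shape [N].
    Returns advantages of shape [T, N].
    """
    rewards = np.asarray(rewards, dtype=np.float32)                  # [T, N]
    values = np.asarray(values, dtype=np.float32)                    # [T, N]
    dones = np.asarray(dones, dtype=np.float32)                      # [T, N]
    bootstrap_values = np.asarray(bootstrap_values, dtype=np.float32)  # [N]
    T = rewards.shape[0]

    # V(s_{t+1}) for every step, bootstrapping the final step with bootstrap_values
    next_values = np.concatenate([values[1:], bootstrap_values[None, :]], axis=0)

    advantages = np.zeros_like(rewards)
    last_gae = np.zeros_like(bootstrap_values)
    for t in reversed(range(T)):
        non_terminal = 1.0 - dones[t]
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        delta = rewards[t] + discount_factor * next_values[t] * non_terminal - values[t]
        # Recurrence: A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
        last_gae = delta + discount_factor * gae_lambda * non_terminal * last_gae
        advantages[t] = last_gae
    return advantages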
Example #2
def main():
    # Create test env
    print("Creating test environment")
    test_env = gym.make(env_name)

    # Training parameters
    lr_scheduler = Scheduler(initial_value=3e-4, interval=1000,
                             decay_factor=1)  #0.75)
    std_scheduler = Scheduler(initial_value=2.0,
                              interval=1000,
                              decay_factor=0.75)
    discount_factor = 0.99
    gae_lambda = 0.95
    ppo_epsilon = 0.2
    t_max = 10  #180
    num_epochs = 10
    batch_size = 40  #64
    save_interval = 500
    eval_interval = 100
    training = True

    # Environment constants
    frame_stack_size = 4
    input_shape = (84, 84, frame_stack_size)
    num_actions = 1  #envs.action_space.shape[0]
    action_min = np.array([-1.0])  #np.array([-1.0, 0.0, 0.0])
    action_max = np.array([1.0])  #np.array([ 1.0, 1.0, 1.0])

    # Create model
    print("Creating model")
    model_checkpoint = None  #"./models/CarRacing-v0/run2/episode0_step455000.ckpt"
    model = PPO(num_actions,
                input_shape,
                action_min,
                action_max,
                ppo_epsilon,
                value_scale=0.5,
                entropy_scale=0.0001,
                model_checkpoint=model_checkpoint,
                model_name="CarRacing-v0")

    if training:
        print("Creating environments")
        num_envs = 4
        envs = SubprocVecEnv([make_env for _ in range(num_envs)])

        initial_frames = envs.reset()
        initial_frames = envs.get_images()
        frame_stacks = [
            FrameStack(initial_frames[i], preprocess_fn=preprocess_frame)
            for i in range(num_envs)
        ]

        print("Main loop")
        step = 0
        while training:
            # Collect a fresh rollout of experience
            print("Training...")
            states, taken_actions, values, rewards, dones = [], [], [], [], []
            learning_rate = np.maximum(lr_scheduler.get_value(), 1e-6)
            std = np.maximum(std_scheduler.get_value(), 0.2)

            # Simulate game for some number of steps
            for _ in range(t_max):
                # Predict action and value given state
                # π(a_t | s_t; θ_old)
                states_t = [
                    frame_stacks[i].get_state() for i in range(num_envs)
                ]
                actions_t, values_t = model.predict(states_t,
                                                    use_old_policy=True,
                                                    std=std)
                for i in range(num_envs):
                    actions_t[i] = 0 if actions_t[i] < 0 else 1
                actions_t = np.squeeze(actions_t.astype(np.int32), axis=-1)

                # Step the environments with the chosen (discretized) actions
                envs.step_async(actions_t)
                frames, rewards_t, dones_t, infos = envs.step_wait()
                frames = envs.get_images()  # render

                # Store state, action and reward
                states.append(states_t)  # [T, N, 84, 84, 4]
                taken_actions.append(actions_t)  # [T, N]
                values.append(np.squeeze(values_t, axis=-1))  # [T, N]
                rewards.append(rewards_t)  # [T, N]
                dones.append(dones_t)  # [T, N]

                # Get new state
                for i in range(num_envs):
                    frame_stacks[i].add_frame(frames[i])

            # Calculate last values (bootstrap values)
            states_last = [
                frame_stacks[i].get_state() for i in range(num_envs)
            ]
            last_values = np.squeeze(model.predict(states_last)[-1],
                                     axis=-1)  # [N]

            # Compute returns
            returns = compute_returns(rewards, last_values, dones,
                                      discount_factor)

            # Compute advantages
            advantages = compute_gae(rewards, values, last_values, dones,
                                     discount_factor, gae_lambda)

            # Normalize advantages
            advantages = (advantages - np.mean(advantages)) / \
                (np.std(advantages) + 1e-8)

            # Flatten arrays
            states = np.array(states).reshape((-1, *input_shape))  # [T x N, 84, 84, 4]
            taken_actions = np.array(taken_actions).reshape((-1, num_actions))  # [T x N, 1]
            returns = returns.flatten()  # [T x N]
            advantages = advantages.flatten()  # [T x N]

            # Train for some number of epochs
            model.update_old_policy()  # θ_old <- θ
            for _ in range(num_epochs):
                # Sample mini-batch randomly and train
                mb_idx = np.random.choice(len(states),
                                          batch_size,
                                          replace=False)

                # Optimize network
                model.train(states[mb_idx],
                            taken_actions[mb_idx],
                            returns[mb_idx],
                            advantages[mb_idx],
                            learning_rate=learning_rate,
                            std=std)

            # Reset environment's frame stack if done
            for i, done in enumerate(dones_t):
                if done:
                    frame_stacks[i].add_frame(frames[i])

            # Save model
            step += 1
            if step % save_interval == 0:
                model.save()
            if step % eval_interval == 0:
                avg_reward = evaluate(model, test_env, 10)
                model.write_to_summary("eval_avg_reward", avg_reward)

    # Training complete, evaluate model
    avg_reward = evaluate(model, test_env, 10)
    print("Model achieved a final reward of:", avg_reward)