Example #1
def create_predictor(trainer, model_type, use_gpu, action_dim=None):
    if model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        predictor = GymSACPredictor(trainer, action_dim)
    elif model_type in (
        ModelType.PYTORCH_DISCRETE_DQN.value,
        ModelType.PYTORCH_PARAMETRIC_DQN.value,
    ):
        predictor = GymDQNPredictor(trainer, action_dim)
    else:
        raise NotImplementedError()
    return predictor
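A minimal usage sketch of this factory, assuming the ModelType enum and predictor classes imported in the example above; sac_trainer and the action_dim value are placeholders for objects built elsewhere:

# Usage sketch (assumption): `sac_trainer` stands in for a trainer object
# constructed elsewhere; ModelType is the enum used in the example above.
predictor = create_predictor(
    trainer=sac_trainer,
    model_type=ModelType.SOFT_ACTOR_CRITIC.value,
    use_gpu=False,
    action_dim=2,
)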
Example #2
def create_predictor(trainer, model_type, use_gpu):
    if model_type == ModelType.CONTINUOUS_ACTION.value:
        predictor = GymDDPGPredictor(trainer)
    elif model_type in (
            ModelType.PYTORCH_DISCRETE_DQN.value,
            ModelType.PYTORCH_PARAMETRIC_DQN.value,
    ):
        predictor = GymDQNPredictor(trainer)
    else:
        raise NotImplementedError()
    return predictor
Example #3
def create_predictor(trainer, model_type, use_gpu):
    c2_device = core.DeviceOption(caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU)
    if model_type == ModelType.CONTINUOUS_ACTION.value:
        predictor = GymDDPGPredictor(trainer)
    elif model_type in (
        ModelType.PYTORCH_DISCRETE_DQN.value,
        ModelType.PYTORCH_PARAMETRIC_DQN.value,
    ):
        predictor = GymDQNPredictorPytorch(trainer)
    else:
        predictor = GymDQNPredictor(trainer, c2_device)
    return predictor
Example #4
def run(
    c2_device,
    gym_env,
    model_type,
    trainer,
    test_run_name,
    score_bar,
    num_episodes=301,
    max_steps=None,
    train_every_ts=100,
    train_after_ts=10,
    test_every_ts=100,
    test_after_ts=10,
    num_train_batches=10,
    avg_over_num_episodes=100,
    render=False,
    render_every=10,
    save_timesteps_to_dataset=None,
):
    avg_reward_history = []

    if model_type == ModelType.CONTINUOUS_ACTION.value:
        predictor = GymDDPGPredictor(trainer)
    else:
        predictor = GymDQNPredictor(trainer, c2_device)

    total_timesteps = 0

    for i in range(num_episodes):
        terminal = False
        next_state = gym_env.transform_state(gym_env.env.reset())
        next_action = gym_env.policy(predictor, next_state, False)
        reward_sum = 0
        ep_timesteps = 0

        if model_type == ModelType.CONTINUOUS_ACTION.value:
            trainer.noise.clear()

        while not terminal:
            state = next_state
            action = next_action

            if render:
                gym_env.env.render()

            if gym_env.action_type == EnvType.DISCRETE_ACTION:
                action_index = np.argmax(action)
                next_state, reward, terminal, _ = gym_env.env.step(action_index)
            else:
                next_state, reward, terminal, _ = gym_env.env.step(action)
            next_state = gym_env.transform_state(next_state)

            ep_timesteps += 1
            total_timesteps += 1
            next_action = gym_env.policy(predictor, next_state, False)
            reward_sum += reward

            (possible_next_actions,
             possible_next_actions_lengths) = get_possible_next_actions(
                gym_env, model_type, terminal)

            gym_env.insert_into_memory(
                np.float32(state),
                action,
                np.float32(reward),
                np.float32(next_state),
                next_action,
                terminal,
                possible_next_actions,
                possible_next_actions_lengths,
                1,
            )

            if save_timesteps_to_dataset:
                # TODO: handle continuous/parametric actions.
                assert (
                    gym_env.action_type == EnvType.DISCRETE_ACTION
                ), "Save to file supports discrete actions only."
                action_str = str(np.argmax(action).item())
                possible_actions = [str(a) for a in range(gym_env.action_dim)]
                save_timesteps_to_dataset.insert(
                    i,
                    ep_timesteps - 1,
                    np.float32(state).tolist(),
                    action_str,
                    np.float32(reward).item(),
                    possible_actions,
                )
                if terminal:
                    save_timesteps_to_dataset.insert(
                        i,
                        ep_timesteps,
                        np.float32(next_state).tolist(),
                        None,
                        0.0,
                        [],
                    )

            # Training loop
            if (
                total_timesteps % train_every_ts == 0
                and total_timesteps > train_after_ts
                and len(gym_env.replay_memory) >= trainer.minibatch_size
            ):
                for _ in range(num_train_batches):
                    if model_type == ModelType.CONTINUOUS_ACTION.value:
                        samples = gym_env.sample_memories(trainer.minibatch_size)
                        trainer.train(samples)
                    else:
                        with core.DeviceScope(c2_device):
                            gym_env.sample_and_load_training_data_c2(
                                trainer.minibatch_size,
                                model_type,
                                trainer.maxq_learning,
                            )
                            trainer.train()

            # Evaluation loop
            if total_timesteps % test_every_ts == 0 and total_timesteps > test_after_ts:
                avg_rewards = gym_env.run_ep_n_times(
                    avg_over_num_episodes, predictor, test=True
                )
                avg_reward_history.append(avg_rewards)
                logger.info(
                    "Achieved an average reward score of {} over {} evaluations."
                    " Total episodes: {}, total timesteps: {}.".format(
                        avg_rewards, avg_over_num_episodes, i + 1, total_timesteps
                    )
                )
                if score_bar is not None and avg_rewards > score_bar:
                    logger.info(
                        "Avg. reward history for {}: {}".format(
                            test_run_name, avg_reward_history
                        )
                    )
                    return avg_reward_history

            if max_steps and ep_timesteps >= max_steps:
                break

        # Always eval on last episode if previous eval loop didn't return.
        if i == num_episodes - 1:
            avg_rewards = gym_env.run_ep_n_times(
                avg_over_num_episodes, predictor, test=True
            )
            avg_reward_history.append(avg_rewards)
            logger.info(
                "Achieved an average reward score of {} over {} evaluations."
                " Total episodes: {}, total timesteps: {}.".format(
                    avg_rewards, avg_over_num_episodes, i + 1, total_timesteps
                )
            )

    logger.info(
        "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
    )
    return avg_reward_history
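The helper get_possible_next_actions referenced near the top of the training loop above is not shown on this page. A plausible reconstruction, mirroring the equivalent inlined branch logic in the last example below, might look like the sketch that follows; the actual helper in the source tree may differ (for instance, it may also match the PYTORCH_* model types), and ModelType is assumed to come from the same imports as the examples:

import numpy as np

def get_possible_next_actions(gym_env, model_type, terminal):
    # Reconstruction based on the inlined branches in the last example;
    # not the verbatim library helper.
    if model_type == ModelType.DISCRETE_ACTION.value:
        possible_next_actions = [
            0 if terminal else 1 for _ in range(gym_env.action_dim)
        ]
        possible_next_actions_lengths = gym_env.action_dim
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        if terminal:
            possible_next_actions = np.array([])
            possible_next_actions_lengths = 0
        else:
            possible_next_actions = np.eye(gym_env.action_dim)
            possible_next_actions_lengths = gym_env.action_dim
    else:
        # Continuous-action models carry no "possible next actions".
        possible_next_actions = None
        possible_next_actions_lengths = None
    return possible_next_actions, possible_next_actions_lengths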
Example #5
def run(
    c2_device,
    gym_env,
    model_type,
    trainer,
    test_run_name,
    score_bar,
    num_episodes=301,
    max_steps=None,
    train_every_ts=100,
    train_after_ts=10,
    test_every_ts=100,
    test_after_ts=10,
    num_train_batches=1,
    avg_over_num_episodes=100,
    render=False,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
    batch_rl_file_path=None,
):

    if model_type == ModelType.CONTINUOUS_ACTION.value:
        predictor = GymDDPGPredictor(trainer)
    elif model_type in (
            ModelType.PYTORCH_DISCRETE_DQN.value,
            ModelType.PYTORCH_PARAMETRIC_DQN.value,
    ):
        predictor = GymDQNPredictorPytorch(trainer)
    else:
        predictor = GymDQNPredictor(trainer, c2_device)

    if batch_rl_file_path is not None:
        return train_gym_batch_rl(
            model_type,
            trainer,
            predictor,
            batch_rl_file_path,
            gym_env,
            num_train_batches,
            test_every_ts,
            test_after_ts,
            avg_over_num_episodes,
            score_bar,
            test_run_name,
        )

    else:
        return train_gym_online_rl(
            c2_device,
            gym_env,
            model_type,
            trainer,
            predictor,
            test_run_name,
            score_bar,
            num_episodes,
            max_steps,
            train_every_ts,
            train_after_ts,
            test_every_ts,
            test_after_ts,
            num_train_batches,
            avg_over_num_episodes,
            render,
            save_timesteps_to_dataset,
            start_saving_from_episode,
        )
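A hedged sketch of how this dispatcher might be invoked for online training, using a Caffe2 CPU DeviceOption as built in Example #3; env, dqn_trainer, and the run name are placeholders for objects constructed elsewhere:

from caffe2.proto import caffe2_pb2
from caffe2.python import core

# Placeholders (assumptions): `env` is a Gym environment wrapper and
# `dqn_trainer` a trainer object, both built elsewhere.
c2_device = core.DeviceOption(caffe2_pb2.CPU)
history = run(
    c2_device,
    env,
    ModelType.PYTORCH_DISCRETE_DQN.value,
    dqn_trainer,
    test_run_name="cartpole_dqn_smoke_test",
    score_bar=None,
    num_episodes=50,
    save_timesteps_to_dataset=None,
)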
Example #6
def __init__(self, trainer, action_dim):
    GymDQNPredictor.__init__(self, trainer, action_dim)
Example #7
def run(
    c2_device,
    gym_env,
    model_type,
    trainer,
    test_run_name,
    score_bar,
    num_episodes=301,
    max_steps=None,
    train_every_ts=100,
    train_after_ts=10,
    test_every_ts=100,
    test_after_ts=10,
    num_train_batches=10,
    avg_over_num_episodes=100,
    render=False,
    render_every=10,
):
    avg_reward_history = []

    if model_type == ModelType.CONTINUOUS_ACTION.value:
        predictor = GymDDPGPredictor(trainer)
    else:
        predictor = GymDQNPredictor(trainer, c2_device)

    total_timesteps = 0

    for i in range(num_episodes):
        terminal = False
        next_state = gym_env.transform_state(gym_env.env.reset())
        next_action = gym_env.policy(predictor, next_state, False)
        reward_sum = 0
        ep_timesteps = 0

        if model_type == ModelType.CONTINUOUS_ACTION.value:
            trainer.noise.clear()

        while not terminal:
            state = next_state
            action = next_action

            if render:
                gym_env.env.render()

            if gym_env.action_type == EnvType.DISCRETE_ACTION:
                action_index = np.argmax(action)
                next_state, reward, terminal, _ = gym_env.env.step(action_index)
            else:
                next_state, reward, terminal, _ = gym_env.env.step(action)
            next_state = gym_env.transform_state(next_state)

            ep_timesteps += 1
            total_timesteps += 1
            next_action = gym_env.policy(predictor, next_state, False)
            reward_sum += reward

            if model_type == ModelType.DISCRETE_ACTION.value:
                possible_next_actions = [
                    0 if terminal else 1 for __ in range(gym_env.action_dim)
                ]
                possible_next_actions_lengths = gym_env.action_dim
            elif model_type == ModelType.PARAMETRIC_ACTION.value:
                if terminal:
                    possible_next_actions = np.array([])
                    possible_next_actions_lengths = 0
                else:
                    possible_next_actions = np.eye(gym_env.action_dim)
                    possible_next_actions_lengths = gym_env.action_dim
            elif model_type == ModelType.CONTINUOUS_ACTION.value:
                possible_next_actions = None
                possible_next_actions_lengths = None

            gym_env.insert_into_memory(
                np.float32(state),
                action,
                np.float32(reward),
                np.float32(next_state),
                next_action,
                terminal,
                possible_next_actions,
                possible_next_actions_lengths,
                1,
            )

            # Training loop
            if (
                total_timesteps % train_every_ts == 0
                and total_timesteps > train_after_ts
                and len(gym_env.replay_memory) >= trainer.minibatch_size
            ):
                for _ in range(num_train_batches):
                    if model_type == ModelType.CONTINUOUS_ACTION.value:
                        samples = gym_env.sample_memories(trainer.minibatch_size)
                        trainer.train(samples)
                    else:
                        with core.DeviceScope(c2_device):
                            gym_env.sample_and_load_training_data_c2(
                                trainer.minibatch_size,
                                model_type,
                                trainer.maxq_learning,
                            )
                            trainer.train(episode_values=None, evaluator=None)

            # Evaluation loop
            if total_timesteps % test_every_ts == 0 and total_timesteps > test_after_ts:
                avg_rewards = gym_env.run_ep_n_times(
                    avg_over_num_episodes, predictor, test=True
                )
                avg_reward_history.append(avg_rewards)
                logger.info(
                    "Achieved an average reward score of {} over {} evaluations."
                    " Total episodes: {}, total timesteps: {}.".format(
                        avg_rewards, avg_over_num_episodes, i + 1, total_timesteps
                    )
                )
                if score_bar is not None and avg_rewards > score_bar:
                    logger.info(
                        "Avg. reward history for {}: {}".format(
                            test_run_name, avg_reward_history
                        )
                    )
                    return avg_reward_history

            if max_steps and ep_timesteps >= max_steps:
                break

        # Always eval on last episode if previous eval loop didn't return.
        if i == num_episodes - 1:
            avg_rewards = gym_env.run_ep_n_times(
                avg_over_num_episodes, predictor, test=True
            )
            avg_reward_history.append(avg_rewards)
            logger.info(
                "Achieved an average reward score of {} over {} evaluations."
                " Total episodes: {}, total timesteps: {}.".format(
                    avg_rewards, avg_over_num_episodes, i + 1, total_timesteps
                )
            )

    logger.info(
        "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
    )
    return avg_reward_history