Code example #1
File: eval_cartpole.py  Project: jiyulongxu/Horizon
def main(model_path):
    predictor = DQNPredictor.load(model_path, "minidb", int_features=False)

    env = OpenAIGymEnvironment(gymenv=ENV)

    avg_rewards, avg_discounted_rewards = env.run_ep_n_times(AVG_OVER_NUM_EPS,
                                                             predictor,
                                                             test=True)

    logger.info(
        "Achieved an average reward score of {} over {} evaluations.".format(
            avg_rewards, AVG_OVER_NUM_EPS))
Code example #2
File: test_environment.py  Project: sethips/Horizon
    def test_open_ai_gym_generate_samples_multi_step(self):
        env = OpenAIGymEnvironment(
            "CartPole-v0",
            epsilon=1.0,  # take random actions to collect training data
            softmax_policy=False,
            gamma=0.9,
        )
        num_samples = 1000
        num_steps = 5
        samples = env.generate_random_samples(
            num_samples, use_continuous_action=True, epsilon=1.0, multi_steps=num_steps
        )
        self._check_samples(samples, num_samples, num_steps, True)
Code example #3
File: eval_cartpole.py  Project: zzs4026/ReAgent
def main(model_path, temperature):
    model_path = glob.glob(model_path)[0]
    predictor = DiscreteDqnTorchPredictor(torch.jit.load(model_path))
    predictor.softmax_temperature = temperature

    env = OpenAIGymEnvironment(gymenv=ENV)

    avg_rewards, avg_discounted_rewards = env.run_ep_n_times(AVG_OVER_NUM_EPS,
                                                             predictor,
                                                             test=True)

    logger.info(
        "Achieved an average reward score of {} over {} evaluations.".format(
            avg_rewards, AVG_OVER_NUM_EPS))
Code example #4
File: eval_cartpole.py  Project: sra4077/Horizon
def main(model_path):
    predictor = DQNPredictor.load(model_path, "minidb", int_features=False)

    env = OpenAIGymEnvironment(gymenv=ENV)

    avg_rewards, avg_discounted_rewards = env.run_ep_n_times(
        AVG_OVER_NUM_EPS, predictor, test=True
    )

    logger.info(
        "Achieved an average reward score of {} over {} evaluations.".format(
            avg_rewards, AVG_OVER_NUM_EPS
        )
    )
Code example #5
File: mdnrnn_gym.py  Project: ananthc/ReAgent
def mdnrnn_gym(params, gpu_id, feature_importance):
    logger.info("Running gym with params")
    logger.info(params)

    env_type = params["env"]
    env = OpenAIGymEnvironment(env_type,
                               epsilon=1.0,
                               softmax_policy=True,
                               gamma=0.99)

    use_gpu = gpu_id != USE_CPU
    if use_gpu:
        raise NotImplementedError()

    trainer = create_trainer(params, env)
    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, int(gpu_id))
    _, _, trainer = train_sgd(
        c2_device,
        env,
        trainer,
        "{} test run".format(env_type),
        params["mdnrnn"]["minibatch_size"],
        **params["run_details"],
    )
    if feature_importance:
        calculate_feature_importance(env, trainer, **params["run_details"])
Code example #6
File: state_embed_gym.py  Project: zzs4026/ReAgent
def run_gym(
    params: OpenAiGymParameters,
    score_bar,
    embed_rl_dataset: RLDataset,
    gym_env: Env,
    mdnrnn: MemoryNetwork,
    max_embed_seq_len: int,
):
    assert params.rl is not None
    rl_parameters = params.rl

    env_type = params.env
    model_type = params.model_type
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train=True, rl_parameters=rl_parameters, params=params
    )

    replay_buffer = OpenAIGymMemoryPool(params.max_replay_memory_size)
    for row in embed_rl_dataset.rows:
        replay_buffer.insert_into_memory(**row)

    assert replay_buffer.memory_buffer is not None
    state_mem = replay_buffer.memory_buffer.state
    state_min_value = torch.min(state_mem).item()
    state_max_value = torch.max(state_mem).item()
    state_embed_env = StateEmbedGymEnvironment(
        gym_env, mdnrnn, max_embed_seq_len, state_min_value, state_max_value
    )
    open_ai_env = OpenAIGymEnvironment(
        state_embed_env,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
    )
    rl_trainer = create_trainer(params, open_ai_env)
    rl_predictor = create_predictor(
        rl_trainer, model_type, params.use_gpu, open_ai_env.action_dim
    )

    assert (
        params.run_details.max_steps is not None
        and params.run_details.offline_train_epochs is not None
    ), "Missing data required for offline training: {}".format(str(params.run_details))
    return train_gym_offline_rl(
        gym_env=open_ai_env,
        replay_buffer=replay_buffer,
        model_type=model_type,
        trainer=rl_trainer,
        predictor=rl_predictor,
        test_run_name="{} offline rl state embed".format(env_type),
        score_bar=score_bar,
        max_steps=params.run_details.max_steps,
        avg_over_num_episodes=params.run_details.avg_over_num_episodes,
        offline_train_epochs=params.run_details.offline_train_epochs,
        num_batch_per_epoch=None,
    )
Code example #7
File: run_gym.py  Project: odellus/ReAgent
def run_gym(
    params: OpenAiGymParameters,
    offline_train,
    score_bar,
    seed=None,
    save_timesteps_to_dataset=None,
    start_saving_from_score=None,
    path_to_pickled_transitions=None,
    warm_trainer=None,
    reward_shape_func=None,
):
    use_gpu = params.use_gpu
    logger.info("Running gym with params")
    logger.info(params)
    assert params.rl is not None
    rl_parameters = params.rl

    env_type = params.env
    model_type = params.model_type

    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train, rl_parameters, params
    )
    env = OpenAIGymEnvironment(
        env_type,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
        seed,
    )
    replay_buffer = create_replay_buffer(
        env, params, model_type, offline_train, path_to_pickled_transitions
    )

    trainer = warm_trainer if warm_trainer else create_trainer(params, env)
    predictor = create_predictor(trainer, model_type, use_gpu, env.action_dim)

    c2_device = core.DeviceOption(caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU)
    return train(
        c2_device,
        env,
        offline_train,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        params.run_details,
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_score=start_saving_from_score,
        reward_shape_func=reward_shape_func,
    )
Code example #8
def run_gym(
    params,
    use_gpu,
    score_bar,
    embed_rl_dataset: RLDataset,
    gym_env: Env,
    mdnrnn: MemoryNetwork,
    max_embed_seq_len: int,
):
    rl_parameters = RLParameters(**params["rl"])
    env_type = params["env"]
    model_type = params["model_type"]
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train=True, rl_parameters=rl_parameters, params=params
    )

    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    for row in embed_rl_dataset.rows:
        replay_buffer.insert_into_memory(**row)

    state_mem = torch.cat([m[0] for m in replay_buffer.replay_memory])
    state_min_value = torch.min(state_mem).item()
    state_max_value = torch.max(state_mem).item()
    state_embed_env = StateEmbedGymEnvironment(
        gym_env, mdnrnn, max_embed_seq_len, state_min_value, state_max_value
    )
    open_ai_env = OpenAIGymEnvironment(
        state_embed_env,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
    )
    rl_trainer = create_trainer(
        params["model_type"], params, rl_parameters, use_gpu, open_ai_env
    )
    rl_predictor = create_predictor(
        rl_trainer, model_type, use_gpu, open_ai_env.action_dim
    )

    return train_gym_offline_rl(
        open_ai_env,
        replay_buffer,
        model_type,
        rl_trainer,
        rl_predictor,
        "{} offline rl state embed".format(env_type),
        score_bar,
        max_steps=params["run_details"]["max_steps"],
        avg_over_num_episodes=params["run_details"]["avg_over_num_episodes"],
        offline_train_epochs=params["run_details"]["offline_train_epochs"],
        bcq_imitator_hyper_params=None,
    )
Code example #9
File: run_gym.py  Project: HaysS/Horizon-1
def run_gym(
    params,
    offline_train,
    score_bar,
    gpu_id,
    seed=None,
    save_timesteps_to_dataset=None,
    start_saving_from_score=None,
    path_to_pickled_transitions=None,
):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    model_type = params["model_type"]

    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train, rl_parameters, params)
    env = OpenAIGymEnvironment(
        env_type,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
        seed,
    )
    replay_buffer = create_replay_buffer(env, params, model_type,
                                         offline_train,
                                         path_to_pickled_transitions)

    use_gpu = gpu_id != USE_CPU
    trainer = create_trainer(params["model_type"], params, rl_parameters,
                             use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu, env.action_dim)

    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, int(gpu_id))
    return train(
        c2_device,
        env,
        offline_train,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_score=start_saving_from_score,
    )
Code example #10
File: mdnrnn_gym.py  Project: zzs4026/ReAgent
def mdnrnn_gym(
    params: OpenAiGymParameters,
    feature_importance: bool = False,
    feature_sensitivity: bool = False,
    save_embedding_to_path: Optional[str] = None,
    seed: Optional[int] = None,
):
    assert params.mdnrnn is not None
    use_gpu = params.use_gpu
    logger.info("Running gym with params")
    logger.info(params)

    env_type = params.env
    env = OpenAIGymEnvironment(
        env_type, epsilon=1.0, softmax_policy=False, gamma=0.99, random_seed=seed
    )

    # create test data once
    assert params.run_details.max_steps is not None
    test_replay_buffer = get_replay_buffer(
        params.run_details.num_test_episodes,
        params.run_details.seq_len,
        params.run_details.max_steps,
        env,
    )
    test_batch = test_replay_buffer.sample_memories(
        test_replay_buffer.memory_size, use_gpu=use_gpu, batch_first=True
    )

    trainer = create_trainer(params, env, use_gpu)
    _, _, trainer = train_sgd(
        env,
        trainer,
        use_gpu,
        "{} test run".format(env_type),
        params.mdnrnn.minibatch_size,
        params.run_details,
        test_batch=test_batch,
    )
    feature_importance_map, feature_sensitivity_map, dataset = None, None, None
    if feature_importance:
        feature_importance_map = calculate_feature_importance(
            env, trainer, use_gpu, params.run_details, test_batch=test_batch
        )
    if feature_sensitivity:
        feature_sensitivity_map = calculate_feature_sensitivity_by_actions(
            env, trainer, use_gpu, params.run_details, test_batch=test_batch
        )
    if save_embedding_to_path:
        dataset = RLDataset(save_embedding_to_path)
        create_embed_rl_dataset(env, trainer, dataset, use_gpu, params.run_details)
        dataset.save()
    return env, trainer, feature_importance_map, feature_sensitivity_map, dataset
Code example #11
def run_parametric_dqn_cartpole(config):
    trainer = build_trainer(config)
    num_episodes = PARAMETRIC_DQN_CARTPOLE_NUM_EPISODES
    env = gym.make(config.env)
    wrapped_env = OpenAIGymEnvironment(config.env)
    action_shape = np.array(wrapped_env.actions).shape
    action_type = np.float32
    replay_buffer = ReplayBuffer(
        observation_shape=env.reset().shape,
        stack_size=1,
        replay_capacity=config.max_replay_memory_size,
        batch_size=trainer.minibatch_size,
        observation_dtype=np.float32,
        action_shape=action_shape,
        action_dtype=action_type,
        reward_shape=(),
        reward_dtype=np.float32,
        extra_storage_types=[
            ReplayElement("possible_actions_mask", action_shape, action_type),
            ReplayElement("log_prob", (), np.float32),
        ],
    )

    actions = wrapped_env.actions
    normalization = wrapped_env.normalization

    policy = Policy(
        scorer=parametric_dqn_scorer(len(actions), trainer.q_network),
        sampler=SoftmaxActionSampler(),
        policy_preprocessor=tiled_numpy_policy_preprocessor(len(actions)),
    )
    agent = Agent(
        policy=policy,
        action_preprocessor=discrete_action_preprocessor,
        replay_buffer=replay_buffer,
        replay_buffer_add_fn=replay_buffer_add_fn,
        replay_buffer_train_fn=replay_buffer_train_fn(
            trainer=trainer,
            trainer_preprocessor=parametric_dqn_trainer_preprocessor(
                len(actions), normalization),
            training_freq=config.run_details.train_every_ts,
            batch_size=trainer.minibatch_size,
            replay_burnin=config.run_details.train_after_ts,
        ),
    )

    reward_history = run(
        env=env,
        agent=agent,
        num_episodes=num_episodes,
        max_steps=config.run_details.max_steps,
    )
    return reward_history
Code example #12
def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
):

    # Caffe2 core uses the min of caffe2_log_level and minloglevel
    # to determine loglevel. See caffe2/caffe2/core/logging.cc for more info.
    core.GlobalInit(["caffe2", "--caffe2_log_level=2", "--minloglevel=2"])

    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
    )
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    model_type = params["model_type"]

    use_gpu = gpu_id != USE_CPU
    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu)

    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, gpu_id
    )
    return train_sgd(
        c2_device,
        env,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
    )
Code example #13
File: run_gym.py  Project: ystar2016/Horizon
def run_gym(
    params,
    offline_train,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    if offline_train:
        # take random actions during data collection
        epsilon = 1.0
    else:
        epsilon = rl_parameters.epsilon
    env = OpenAIGymEnvironment(
        env_type, epsilon, rl_parameters.softmax_policy, rl_parameters.gamma
    )
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    model_type = params["model_type"]

    use_gpu = gpu_id != USE_CPU
    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu)

    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, int(gpu_id)
    )
    return train_sgd(
        c2_device,
        env,
        offline_train,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
    )
Code example #14
File: run_gym.py  Project: keithmgould/BlueWhale
def main(args):
    parser = argparse.ArgumentParser(
        description="Train a RL net to play in an OpenAI Gym environment.")
    parser.add_argument("-p",
                        "--parameters",
                        help="Path to JSON parameters file.")
    parser.add_argument("-s",
                        "--score-bar",
                        help="Bar for averaged tests scores.",
                        type=float,
                        default=None)
    parser.add_argument(
        "-g",
        "--gpu_id",
        help="If set, will use GPU with specified ID. Otherwise will use CPU.",
        default=USE_CPU)
    args = parser.parse_args(args)
    with open(args.parameters, 'r') as f:
        params = json.load(f)

    rl_settings = params['rl']
    training_settings = params['training']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']
    training_settings['gamma'] = training_settings['learning_rate_decay']
    del training_settings['learning_rate_decay']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])

    trainer_params = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(**rl_settings),
        training=TrainingParameters(**training_settings))

    device = core.DeviceOption(
        caffe2_pb2.CPU if args.gpu_id == USE_CPU else caffe2_pb2.CUDA,
        args.gpu_id)
    with core.DeviceScope(device):
        trainer = DiscreteActionTrainer(env.normalization,
                                        trainer_params,
                                        skip_normalization=True)
        return run(env, trainer, "{} test run".format(env_type),
                   args.score_bar, **params["run_details"])
Code example #15
def mdnrnn_gym(
    params: OpenAiGymParameters,
    feature_importance: bool = False,
    feature_sensitivity: bool = False,
    save_embedding_to_path: Optional[str] = None,
):
    assert params.mdnrnn is not None
    use_gpu = params.use_gpu
    logger.info("Running gym with params")
    logger.info(params)

    env_type = params.env
    env = OpenAIGymEnvironment(env_type,
                               epsilon=1.0,
                               softmax_policy=True,
                               gamma=0.99)

    trainer = create_trainer(params, env, use_gpu)
    _, _, trainer = train_sgd(
        env,
        trainer,
        use_gpu,
        "{} test run".format(env_type),
        params.mdnrnn.minibatch_size,
        params.run_details,
    )
    feature_importance_map, feature_sensitivity_map, dataset = None, None, None
    if feature_importance:
        feature_importance_map = calculate_feature_importance(
            env, trainer, use_gpu, params.run_details)
    if feature_sensitivity:
        feature_sensitivity_map = calculate_feature_sensitivity_by_actions(
            env, trainer, use_gpu, params.run_details)
    if save_embedding_to_path:
        dataset = RLDataset(save_embedding_to_path)
        create_embed_rl_dataset(env, trainer, dataset, use_gpu,
                                params.run_details)
        dataset.save()
    return env, trainer, feature_importance_map, feature_sensitivity_map, dataset
Code example #16
File: run_gym.py  Project: tony32769/BlueWhale
def run_gym(params, score_bar, gpu_id):
    rl_settings = params['rl']
    training_settings = params['training']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']
    training_settings['gamma'] = training_settings['learning_rate_decay']
    del training_settings['learning_rate_decay']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])
    trainer_params = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(**rl_settings),
        training=TrainingParameters(**training_settings))

    device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA,
        gpu_id,
    )
    with core.DeviceScope(device):
        if env.img:
            trainer = DiscreteActionConvTrainer(
                DiscreteActionConvModelParameters(
                    fc_parameters=trainer_params,
                    cnn_parameters=CNNModelParameters(**params['cnn']),
                    num_input_channels=env.num_input_channels,
                    img_height=env.height,
                    img_width=env.width),
                env.normalization,
            )
        else:
            trainer = DiscreteActionTrainer(
                trainer_params,
                env.normalization,
            )
        return run(env, trainer, "{} test run".format(env_type), score_bar,
                   **params["run_details"])
Code example #17
File: run_gym.py  Project: darbour/BlueWhale
def run_gym(params, score_bar, gpu_id, save_timesteps_to_dataset=None):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        params["max_replay_memory_size"],
    )
    model_type = params["model_type"]
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id
    )

    if model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params["shared_training"]
        actor_settings = params["actor_training"]
        critic_settings = params["critic_training"]
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )

        # DDPG can handle continuous and discrete action spaces
        if env.action_type == EnvType.CONTINUOUS_ACTION:
            action_range = env.action_space.high
        else:
            action_range = None

        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            use_gpu=False,
            action_range=action_range,
        )

    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return run(
        c2_device,
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
    )
Code example #18
File: run_gym.py  Project: zzs4026/ReAgent
def train_gym_offline_rl(
    gym_env: OpenAIGymEnvironment,
    replay_buffer: OpenAIGymMemoryPool,
    model_type: str,
    trainer: RLTrainer,
    predictor: OnPolicyPredictor,
    test_run_name: str,
    score_bar: Optional[float],
    max_steps: int,
    avg_over_num_episodes: int,
    offline_train_epochs: int,
    num_batch_per_epoch: Optional[int],
    bcq_imitator_hyper_params: Optional[Dict[str, Any]] = None,
):
    if num_batch_per_epoch is None:
        num_batch_per_epoch = replay_buffer.size // trainer.minibatch_size
    assert num_batch_per_epoch > 0, "The size of replay buffer is not sufficient"

    logger.info(
        "{} offline transitions in replay buffer.\n"
        "Training will take {} epochs, with each epoch having {} mini-batches"
        " and each mini-batch having {} samples".format(
            replay_buffer.size,
            offline_train_epochs,
            num_batch_per_epoch,
            trainer.minibatch_size,
        )
    )

    avg_reward_history, epoch_history = [], []

    # Pre-train a GBDT imitator if doing batch constrained q-learning in Gym
    if getattr(trainer, "bcq", None):
        assert bcq_imitator_hyper_params is not None
        gbdt = GradientBoostingClassifier(
            n_estimators=bcq_imitator_hyper_params["gbdt_trees"],
            max_depth=bcq_imitator_hyper_params["max_depth"],
        )
        samples = replay_buffer.sample_memories(replay_buffer.size, model_type)
        X, y = samples.states.numpy(), torch.max(samples.actions, dim=1)[1].numpy()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        logger.info("Fitting GBDT...")
        gbdt.fit(X_train, y_train)
        train_score = round(gbdt.score(X_train, y_train) * 100, 1)
        test_score = round(gbdt.score(X_test, y_test) * 100, 1)
        logger.info(
            "GBDT train accuracy {}% || test accuracy {}%".format(
                train_score, test_score
            )
        )
        trainer.bcq_imitator = gbdt.predict_proba  # type: ignore

    # Offline training
    for i_epoch in range(offline_train_epochs):
        for _ in range(num_batch_per_epoch):
            samples = replay_buffer.sample_memories(trainer.minibatch_size, model_type)
            samples.set_device(trainer.device)
            trainer.train(samples)

        batch_td_loss = float(
            torch.mean(
                torch.tensor(
                    [stat.td_loss for stat in trainer.loss_reporter.incoming_stats]
                )
            )
        )
        trainer.loss_reporter.flush()
        logger.info(
            "Average TD loss: {} in epoch {}".format(batch_td_loss, i_epoch + 1)
        )

        # test model performance for this epoch
        avg_rewards, avg_discounted_rewards = gym_env.run_ep_n_times(
            avg_over_num_episodes, predictor, test=True, max_steps=max_steps
        )
        avg_reward_history.append(avg_rewards)

        # For offline training, use epoch number as timestep history since
        # we have a fixed batch of data to count epochs over.
        epoch_history.append(i_epoch)
        logger.info(
            "Achieved an average reward score of {} over {} evaluations"
            " after epoch {}.".format(avg_rewards, avg_over_num_episodes, i_epoch)
        )
        if score_bar is not None and avg_rewards > score_bar:
            logger.info(
                "Avg. reward history for {}: {}".format(
                    test_run_name, avg_reward_history
                )
            )
            return avg_reward_history, epoch_history, trainer, predictor, gym_env

    logger.info(
        "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
    )
    return avg_reward_history, epoch_history, trainer, predictor, gym_env
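
Note: the GBDT imitator pre-training in the example above relies only on scikit-learn, so the idea can be sketched in isolation. The snippet below is a minimal illustration with synthetic data standing in for the replay-buffer samples (the array shapes and hyper-parameter values are made up); in the function above, the fitted model's predict_proba is what gets assigned to trainer.bcq_imitator.

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Synthetic stand-ins for replay-buffer states and discrete action indices
X = np.random.randn(1000, 4).astype(np.float32)
y = np.random.randint(0, 2, size=1000)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
gbdt = GradientBoostingClassifier(n_estimators=50, max_depth=3)
gbdt.fit(X_train, y_train)
print("GBDT test accuracy:", gbdt.score(X_test, y_test))

# Batch-constrained Q-learning then uses the class probabilities as the imitator
action_probs = gbdt.predict_proba(X_test[:5])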
Code example #19
def build_trainer(config):
    return create_trainer(config, OpenAIGymEnvironment(config.env))
Code example #20
File: test_environment.py  Project: ananthc/ReAgent
    def test_open_ai_gym_generate_samples_multi_step(self):
        env = OpenAIGymEnvironment(
            "CartPole-v0",
            epsilon=1.0,  # take random actions to collect training data
            softmax_policy=False,
            gamma=0.9,
        )
        num_samples = 1000
        num_steps = 5
        samples = env.generate_random_samples(num_samples,
                                              use_continuous_action=True,
                                              epsilon=1.0,
                                              multi_steps=num_steps)
        for i in range(num_samples):
            if samples.terminals[i][0]:
                break
            if i < num_samples - 1:
                self.assertEqual(samples.mdp_ids[i], samples.mdp_ids[i + 1])
                self.assertEqual(samples.sequence_numbers[i] + 1,
                                 samples.sequence_numbers[i + 1])
            for j in range(len(samples.terminals[i])):
                self.assertEqual(samples.rewards[i][j],
                                 samples.rewards[i + j][0])
                self.assertDictEqual(samples.next_states[i][j],
                                     samples.next_states[i + j][0])
                self.assertDictEqual(samples.next_actions[i][j],
                                     samples.next_actions[i + j][0])
                self.assertEqual(samples.terminals[i][j],
                                 samples.terminals[i + j][0])
                self.assertListEqual(
                    samples.possible_next_actions[i][j],
                    samples.possible_next_actions[i + j][0],
                )
                if samples.terminals[i][j]:
                    continue
                self.assertDictEqual(samples.next_states[i][j],
                                     samples.states[i + j + 1])
                self.assertDictEqual(samples.next_actions[i][j],
                                     samples.actions[i + j + 1])
                self.assertListEqual(
                    samples.possible_next_actions[i][j],
                    samples.possible_actions[i + j + 1],
                )

        single_step_samples = samples.to_single_step()
        for i in range(num_samples):
            if single_step_samples.terminals[i] is True:
                break
            self.assertEqual(single_step_samples.mdp_ids[i],
                             samples.mdp_ids[i])
            self.assertEqual(single_step_samples.sequence_numbers[i],
                             samples.sequence_numbers[i])
            self.assertDictEqual(single_step_samples.states[i],
                                 samples.states[i])
            self.assertDictEqual(single_step_samples.actions[i],
                                 samples.actions[i])
            self.assertEqual(
                single_step_samples.action_probabilities[i],
                samples.action_probabilities[i],
            )
            self.assertEqual(single_step_samples.rewards[i],
                             samples.rewards[i][0])
            self.assertListEqual(single_step_samples.possible_actions[i],
                                 samples.possible_actions[i])
            self.assertDictEqual(single_step_samples.next_states[i],
                                 samples.next_states[i][0])
            self.assertDictEqual(single_step_samples.next_actions[i],
                                 samples.next_actions[i][0])
            self.assertEqual(single_step_samples.terminals[i],
                             samples.terminals[i][0])
            self.assertListEqual(
                single_step_samples.possible_next_actions[i],
                samples.possible_next_actions[i][0],
            )
Code example #21
def multi_step_sample_generator(
    gym_env: OpenAIGymEnvironment,
    num_transitions: int,
    max_steps: Optional[int],
    multi_steps: int,
    include_shorter_samples_at_start: bool,
    include_shorter_samples_at_end: bool,
):
    """
    Convert gym env multi-step sample format to mdn-rnn multi-step sample format

    :param gym_env: The environment used to generate multi-step samples
    :param num_transitions: # of samples to return
    :param max_steps: An episode terminates when the horizon is beyond max_steps
    :param multi_steps: # of steps of states and actions per sample
    :param include_shorter_samples_at_start: Whether to keep samples of shorter steps
        which are generated at the beginning of an episode
    :param include_shorter_samples_at_end: Whether to keep samples of shorter steps
        which are generated at the end of an episode
    """
    samples = gym_env.generate_random_samples(
        num_transitions=num_transitions,
        use_continuous_action=True,
        max_step=max_steps,
        multi_steps=multi_steps,
        include_shorter_samples_at_start=include_shorter_samples_at_start,
        include_shorter_samples_at_end=include_shorter_samples_at_end,
    )

    for j in range(num_transitions):
        sample_steps = len(samples.terminals[j])  # type: ignore
        state = dict_to_np(samples.states[j],
                           np_size=gym_env.state_dim,
                           key_offset=0)
        action = dict_to_np(samples.actions[j],
                            np_size=gym_env.action_dim,
                            key_offset=gym_env.state_dim)
        next_actions = np.float32(  # type: ignore
            [
                dict_to_np(
                    samples.next_actions[j][k],
                    np_size=gym_env.action_dim,
                    key_offset=gym_env.state_dim,
                ) for k in range(sample_steps)
            ])
        next_states = np.float32(  # type: ignore
            [
                dict_to_np(samples.next_states[j][k],
                           np_size=gym_env.state_dim,
                           key_offset=0) for k in range(sample_steps)
            ])
        rewards = np.float32(samples.rewards[j])  # type: ignore
        terminals = np.float32(samples.terminals[j])  # type: ignore
        not_terminals = np.logical_not(terminals)
        ordered_states = np.vstack((state, next_states))
        ordered_actions = np.vstack((action, next_actions))
        mdnrnn_states = ordered_states[:-1]
        mdnrnn_actions = ordered_actions[:-1]
        mdnrnn_next_states = ordered_states[-multi_steps:]
        mdnrnn_next_actions = ordered_actions[-multi_steps:]

        # Pad with zeros so that all samples have an equal number of steps.
        # The general rule is to pad zeros at the end of a sequence.
        # In addition, if the sequence has only one step (i.e., the
        # first state of an episode), pad one zero row ahead of the
        # sequence, which allows embeddings to be generated properly
        # for one-step samples.
        num_padded_top_rows = 1 if multi_steps > 1 and sample_steps == 1 else 0
        num_padded_bottom_rows = multi_steps - sample_steps - num_padded_top_rows
        sample_steps_next = len(mdnrnn_next_states)
        num_padded_top_rows_next = 0
        num_padded_bottom_rows_next = multi_steps - sample_steps_next
        yield (
            np.pad(
                mdnrnn_states,
                ((num_padded_top_rows, num_padded_bottom_rows), (0, 0)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                mdnrnn_actions,
                ((num_padded_top_rows, num_padded_bottom_rows), (0, 0)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                rewards,
                ((num_padded_top_rows, num_padded_bottom_rows)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                mdnrnn_next_states,
                ((num_padded_top_rows_next, num_padded_bottom_rows_next),
                 (0, 0)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                mdnrnn_next_actions,
                ((num_padded_top_rows_next, num_padded_bottom_rows_next),
                 (0, 0)),
                "constant",
                constant_values=0.0,
            ),
            np.pad(
                not_terminals,
                ((num_padded_top_rows, num_padded_bottom_rows)),
                "constant",
                constant_values=0.0,
            ),
            sample_steps,
            sample_steps_next,
        )
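
The zero-padding rule in the generator above can be checked in isolation with plain NumPy. Below is a minimal sketch (the dimensions are made up; only the padding logic mirrors the code above): for a one-step sample with multi_steps = 3, one zero row is padded above the real row and one below, so every yielded sequence ends up with exactly multi_steps rows.

import numpy as np

# Hypothetical sizes: state_dim = 4, multi_steps = 3, and a sample holding
# only one transition (the first step of an episode).
state_dim, multi_steps, sample_steps = 4, 3, 1
mdnrnn_states = np.ones((sample_steps, state_dim), dtype=np.float32)

# Same rule as in the generator: one zero row on top only for one-step
# samples, and the remainder as zero rows at the bottom.
num_padded_top_rows = 1 if multi_steps > 1 and sample_steps == 1 else 0
num_padded_bottom_rows = multi_steps - sample_steps - num_padded_top_rows

padded = np.pad(
    mdnrnn_states,
    ((num_padded_top_rows, num_padded_bottom_rows), (0, 0)),
    "constant",
    constant_values=0.0,
)
print(padded.shape)  # (3, 4): zero row, real row, zero row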
Code example #22
File: run_gym.py  Project: onisimchukv/BlueWhale
def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
    batch_rl_file_path=None,
):

    # Caffe2 core uses the min of caffe2_log_level and minloglevel
    # to determine loglevel. See caffe2/caffe2/core/logging.cc for more info.
    core.GlobalInit(["caffe2", "--caffe2_log_level=2", "--minloglevel=2"])

    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        params["max_replay_memory_size"],
        rl_parameters.gamma,
    )
    model_type = params["model_type"]
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id)
    use_gpu = gpu_id != USE_CPU

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_settings = params["training"]
        training_parameters = TrainingParameters(**training_settings)
        if env.img:
            assert (training_parameters.cnn_parameters
                    is not None), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters = CNNParameters(
                **training_settings["cnn_parameters"])
            training_parameters.cnn_parameters.conv_dims[
                0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels)
        else:
            assert (training_parameters.cnn_parameters is
                    None), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters)
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)

    elif model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (training_parameters.cnn_parameters
                        is not None), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"])
                training_parameters.cnn_parameters.conv_dims[
                    0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels)
            else:
                assert (training_parameters.cnn_parameters is
                        None), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions,
                rl=rl_parameters,
                training=training_parameters)
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_settings = params["training"]
        training_parameters = TrainingParameters(**training_settings)
        if env.img:
            assert (training_parameters.cnn_parameters
                    is not None), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters = CNNParameters(
                **training_settings["cnn_parameters"])
            training_parameters.cnn_parameters.conv_dims[
                0] = env.num_input_channels
        else:
            assert (training_parameters.cnn_parameters is
                    None), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            knn=KnnParameters(model_type="DQN"),
        )
        trainer = ParametricDQNTrainer(trainer_params, env.normalization,
                                       env.normalization_action, use_gpu)
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (training_parameters.cnn_parameters
                        is not None), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"])
                training_parameters.cnn_parameters.conv_dims[
                    0] = env.num_input_channels
            else:
                assert (training_parameters.cnn_parameters is
                        None), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(trainer_params,
                                                 env.normalization,
                                                 env.normalization_action)
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params["shared_training"]
        actor_settings = params["actor_training"]
        critic_settings = params["critic_training"]
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )

        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)

        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )

    else:
        raise NotImplementedError(
            "Model of type {} not supported".format(model_type))

    return run(
        c2_device,
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
        batch_rl_file_path=batch_rl_file_path,
    )
Code example #23
def run_gym(params, score_bar, gpu_id):
    rl_settings = params['rl']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])
    model_type = params['model_type']
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA,
        gpu_id,
    )

    if model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params['training']
            training_settings['gamma'] = training_settings[
                'learning_rate_decay']
            del training_settings['learning_rate_decay']
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions,
                rl=RLParameters(**rl_settings),
                training=TrainingParameters(**training_settings))
            if env.img:
                trainer = DiscreteActionConvTrainer(
                    DiscreteActionConvModelParameters(
                        fc_parameters=trainer_params,
                        cnn_parameters=CNNModelParameters(**params['cnn']),
                        num_input_channels=env.num_input_channels,
                        img_height=env.height,
                        img_width=env.width),
                    env.normalization,
                )
            else:
                trainer = DiscreteActionTrainer(
                    trainer_params,
                    env.normalization,
                )
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params['training']
            training_settings['gamma'] = training_settings[
                'learning_rate_decay']
            del training_settings['learning_rate_decay']
            trainer_params = ContinuousActionModelParameters(
                rl=RLParameters(**rl_settings),
                training=TrainingParameters(**training_settings),
                knn=KnnParameters(model_type='DQN', ),
            )
            trainer = ContinuousActionDQNTrainer(trainer_params,
                                                 env.normalization,
                                                 env.normalization_action)
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params['shared_training']
        training_settings['gamma'] = training_settings['learning_rate_decay']
        del training_settings['learning_rate_decay']
        actor_settings = params['actor_training']
        critic_settings = params['critic_training']
        trainer_params = DDPGModelParameters(
            rl=DDPGRLParameters(**rl_settings),
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )
        trainer = DDPGTrainer(
            trainer_params,
            EnvDetails(
                state_dim=env.state_dim,
                action_dim=env.action_dim,
                action_range=(env.action_space.low, env.action_space.high),
            ))
    else:
        raise NotImplementedError(
            "Model of type {} not supported".format(model_type))

    return run(env, model_type, trainer, "{} test run".format(env_type),
               score_bar, **params["run_details"])