def __init__(
     self,
     parameters,
     normalization_parameters: Dict[int, NormalizationParameters],
 ) -> None:
     self._quantile_states: Any = collections.deque(
         maxlen=parameters.action_budget.window_size
     )
     self._quantile = 100 - parameters.action_budget.action_limit
     self.quantile_value = 0
     self._limited_action = np.argmax(
         np.array(parameters.actions) ==
         parameters.action_budget.limited_action
     )
     self._discount_factor = parameters.rl.gamma
     self._quantile_update_rate = \
         parameters.action_budget.quantile_update_rate
     self._quantile_update_frequency = \
         parameters.action_budget.quantile_update_frequency
     self._update_counter = 0
     DiscreteActionTrainer.__init__(
         self,
         parameters,
         normalization_parameters,
     )
     self._max_q = parameters.rl.maxq_learning
Example #2
0
    def test_trainer_maxq(self):
        env = Env(self.state_dims, self.action_dims)
        env.seed(42)
        maxq_parameters = DiscreteActionModelParameters(
            actions=env.actions,
            rl=RLParameters(
                gamma=0.99,
                target_update_rate=1.0,
                reward_burnin=100,
                maxq_learning=True,
            ),
            training=TrainingParameters(
                layers=self.layers,
                activations=self.activations,
                minibatch_size=self.minibatch_size,
                learning_rate=1.0,
                optimizer="ADAM",
            ),
        )
        maxq_trainer = DiscreteActionTrainer(maxq_parameters,
                                             env.normalization)
        # predictor = maxq_trainer.predictor()

        logger.info("Generating constant_reward MDPs..")

        states, actions, rewards, next_states, next_actions, is_terminal, possible_next_actions = env.generate_samples_discrete(
            self.num_samples)

        logger.info("Preprocessing constant_reward MDPs..")

        tdps = env.preprocess_samples_discrete(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            self.minibatch_size,
        )

        for epoch in range(self.epochs):
            logger.info("Training.. " + str(epoch))
            for tdp in tdps:
                maxq_trainer.train_numpy(tdp, None)
            logger.info(" ".join([
                "Training epoch",
                str(epoch),
                "average q values",
                str(np.mean(workspace.FetchBlob(maxq_trainer.q_score_output))),
                "td_loss",
                str(workspace.FetchBlob(maxq_trainer.loss_blob)),
            ]))

        # Q value should converge to very close to 100
        avg_q_value_after_training = np.mean(
            workspace.FetchBlob(maxq_trainer.q_score_output))

        self.assertLess(avg_q_value_after_training, 101)
        self.assertGreater(avg_q_value_after_training, 99)
Example #3
0
    def test_pure_q_learning_all_cheat(self):
        q_learning_parameters = DiscreteActionModelParameters(
            actions=self._env.ACTIONS,
            rl=self._rl_parameters_all_cheat_maxq,
            training=TrainingParameters(
                layers=[self._env.width * self._env.height, 1],
                activations=['linear'],
                minibatch_size=self.minibatch_size,
                learning_rate=0.05,
                optimizer='SGD',
                lr_policy='fixed',
            )
        )

        trainer = DiscreteActionTrainer(
            q_learning_parameters,
            self._env.normalization,
        )

        predictor = trainer.predictor()

        policy = _build_policy(self._env, predictor, 1)
        initial_state = self._env.reset()
        iteration_result = _collect_samples(
            self._env, policy, 20000, initial_state
        )
        num_iterations = 50
        for _ in range(num_iterations):
            tdps = self._env.preprocess_samples(
                iteration_result.states,
                iteration_result.actions,
                iteration_result.rewards,
                iteration_result.next_states,
                iteration_result.next_actions,
                iteration_result.is_terminals,
                iteration_result.possible_next_actions,
                None,
                self.minibatch_size,
            )
            for tdp in tdps:
                trainer.train_numpy(tdp, None)
            initial_state = self._env.reset()
            policy = _build_policy(self._env, predictor, 0.1)
            iteration_result = _collect_samples(
                self._env, policy, 20000, initial_state
            )
        policy = _build_policy(self._env, predictor, 0)
        initial_state = self._env.reset()
        iteration_result = _collect_samples(
            self._env, policy, 1000, initial_state
        )
        # 100% should be cheat.  Will fix in the future.
        self.assertGreater(
            np.sum(np.array(iteration_result.actions) == 'C'), 800
        )
Example #4
0
    def test_trainer_maxq(self):
        environment = Gridworld()
        maxq_sarsa_parameters = DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=RLParameters(
                gamma=DISCOUNT,
                target_update_rate=0.5,
                reward_burnin=10,
                maxq_learning=True,
            ),
            training=TrainingParameters(
                layers=[-1, 1],
                activations=["linear"],
                minibatch_size=self.minibatch_size,
                learning_rate=0.01,
                optimizer="ADAM",
            ),
        )
        # construct the new trainer that using maxq
        maxq_trainer = DiscreteActionTrainer(
            maxq_sarsa_parameters, environment.normalization
        )

        samples = environment.generate_samples(100000, 1.0)
        predictor = maxq_trainer.predictor()
        tdps = environment.preprocess_samples(samples, self.minibatch_size)
        evaluator = GridworldEvaluator(environment, True)

        evaluator.evaluate(predictor)
        print(
            "Pre-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.reward_doubly_robust[-1],
        )
        self.assertGreater(evaluator.mc_loss[-1], 0.3)

        for _ in range(5):
            for tdp in tdps:
                maxq_trainer.train_numpy(tdp, None)

        evaluator.evaluate(predictor)
        print(
            "Post-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.reward_doubly_robust[-1],
        )
        self.assertLess(evaluator.mc_loss[-1], 0.1)

        self.assertGreater(
            evaluator.reward_doubly_robust[-1], evaluator.reward_doubly_robust[-2]
        )
Example #5
0
 def get_sarsa_trainer_reward_boost(self, environment, reward_shape):
     rl_parameters = RLParameters(
         gamma=DISCOUNT,
         target_update_rate=1.0,
         reward_burnin=10,
         maxq_learning=False,
         reward_boost=reward_shape,
     )
     training_parameters = TrainingParameters(
         layers=[-1, -1],
         activations=["linear"],
         minibatch_size=self.minibatch_size,
         learning_rate=0.125,
         optimizer="ADAM",
     )
     return DiscreteActionTrainer(
         DiscreteActionModelParameters(
             actions=environment.ACTIONS,
             rl=rl_parameters,
             training=training_parameters,
             rainbow=RainbowDQNParameters(double_q_learning=True,
                                          dueling_architecture=False),
             in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
         ),
         environment.normalization,
     )
Example #6
0
    def test_trainer_maxq(self):
        environment = Gridworld()
        maxq_sarsa_parameters = DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=RLParameters(gamma=DISCOUNT,
                            target_update_rate=0.5,
                            reward_burnin=10,
                            maxq_learning=True),
            training=TrainingParameters(
                layers=[-1, 1],
                activations=['linear'],
                minibatch_size=self.minibatch_size,
                learning_rate=0.01,
                optimizer='ADAM',
            ))
        # construct the new trainer that using maxq
        maxq_trainer = DiscreteActionTrainer(
            maxq_sarsa_parameters,
            environment.normalization,
        )
        states, actions, rewards, next_states, next_actions, is_terminal,\
            possible_next_actions, reward_timelines = \
            environment.generate_samples(100000, 1.0)
        predictor = maxq_trainer.predictor()
        tdps = environment.preprocess_samples(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )
        evaluator = GridworldEvaluator(environment, True)
        print("Pre-Training eval", evaluator.evaluate(predictor))
        self.assertGreater(evaluator.evaluate(predictor), 0.3)

        for _ in range(2):
            for tdp in tdps:
                maxq_trainer.stream_tdp(tdp, None)
            evaluator.evaluate(predictor)

        print("Post-Training eval", evaluator.evaluate(predictor))
        self.assertLess(evaluator.evaluate(predictor), 0.1)
    def test_pure_q_learning_all_cheat(self):
        q_learning_parameters = DiscreteActionModelParameters(
            actions=self._env.ACTIONS,
            rl=self._rl_parameters_all_cheat_maxq,
            training=TrainingParameters(
                layers=[self._env.width * self._env.height, 1],
                activations=['linear'],
                minibatch_size=32,
                learning_rate=0.05,
                optimizer='SGD',
                lr_policy='fixed'))

        trainer = DiscreteActionTrainer(self._env.normalization,
                                        q_learning_parameters)

        predictor = trainer.predictor()

        policy = _build_policy(self._env, predictor, 1)
        initial_state = self._env.reset()
        iteration_result = _collect_samples(self._env, policy, 10000,
                                            initial_state)
        num_iterations = 50
        for _ in range(num_iterations):
            policy = _build_policy(self._env, predictor, 0)

            tdp = self._env.preprocess_samples(
                iteration_result.states,
                iteration_result.actions,
                iteration_result.rewards,
                iteration_result.next_states,
                iteration_result.next_actions,
                iteration_result.is_terminals,
                iteration_result.possible_next_actions,
                None,
            )
            trainer.stream_tdp(tdp, None)
            initial_state = iteration_result.current_state
        initial_state = self._env.reset()
        iteration_result = _collect_samples(self._env, policy, 10000,
                                            initial_state)
        self.assertTrue(np.all(np.array(iteration_result.actions) == 'C'))
Example #8
0
 def get_sarsa_trainer(self, environment):
     rl_parameters = RLParameters(gamma=DISCOUNT,
                                  target_update_rate=0.5,
                                  reward_burnin=10,
                                  maxq_learning=False)
     training_parameters = TrainingParameters(
         layers=[-1, 1],
         activations=['linear'],
         minibatch_size=1024,
         learning_rate=0.01,
         optimizer='ADAM',
     )
     return DiscreteActionTrainer(
         environment.normalization,
         DiscreteActionModelParameters(actions=environment.ACTIONS,
                                       rl=rl_parameters,
                                       training=training_parameters))
Example #9
0
 def test_sarsa_layer_validation(self):
     env = Gridworld()
     invalid_sarsa_params = DiscreteActionModelParameters(
         actions=env.ACTIONS,
         rl=RLParameters(gamma=DISCOUNT,
                         target_update_rate=0.5,
                         reward_burnin=10,
                         maxq_learning=False),
         training=TrainingParameters(
             layers=[-1, 3],
             activations=['linear'],
             minibatch_size=32,
             learning_rate=0.1,
             optimizer='SGD',
         ))
     with self.assertRaises(Exception):
         # layers[-1] should be 1
         DiscreteActionTrainer(env.normalization, invalid_sarsa_params)
Example #10
0
def main(args):
    parser = argparse.ArgumentParser(
        description="Train a RL net to play in an OpenAI Gym environment.")
    parser.add_argument("-p",
                        "--parameters",
                        help="Path to JSON parameters file.")
    parser.add_argument("-s",
                        "--score-bar",
                        help="Bar for averaged tests scores.",
                        type=float,
                        default=None)
    parser.add_argument(
        "-g",
        "--gpu_id",
        help="If set, will use GPU with specified ID. Otherwise will use CPU.",
        default=USE_CPU)
    args = parser.parse_args(args)
    with open(args.parameters, 'r') as f:
        params = json.load(f)

    rl_settings = params['rl']
    training_settings = params['training']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']
    training_settings['gamma'] = training_settings['learning_rate_decay']
    del training_settings['learning_rate_decay']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])

    trainer_params = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(**rl_settings),
        training=TrainingParameters(**training_settings))

    device = core.DeviceOption(
        caffe2_pb2.CPU if args.gpu_id == USE_CPU else caffe2_pb2.CUDA,
        args.gpu_id)
    with core.DeviceScope(device):
        trainer = DiscreteActionTrainer(env.normalization,
                                        trainer_params,
                                        skip_normalization=True)
        return run(env, trainer, "{} test run".format(env_type),
                   args.score_bar, **params["run_details"])
Example #11
0
 def get_sarsa_trainer_reward_boost(self, environment, reward_shape):
     rl_parameters = RLParameters(
         gamma=DISCOUNT,
         target_update_rate=0.5,
         reward_burnin=10,
         maxq_learning=False,
         reward_boost=reward_shape,
     )
     training_parameters = TrainingParameters(
         layers=[-1, -1],
         activations=["linear"],
         minibatch_size=self.minibatch_size,
         learning_rate=0.01,
         optimizer="ADAM",
     )
     return DiscreteActionTrainer(
         DiscreteActionModelParameters(
             actions=environment.ACTIONS,
             rl=rl_parameters,
             training=training_parameters,
         ),
         environment.normalization,
     )
Example #12
0
def run_gym(params, score_bar, gpu_id):
    rl_settings = params['rl']
    training_settings = params['training']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']
    training_settings['gamma'] = training_settings['learning_rate_decay']
    del training_settings['learning_rate_decay']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])
    trainer_params = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(**rl_settings),
        training=TrainingParameters(**training_settings))

    device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA,
        gpu_id,
    )
    with core.DeviceScope(device):
        if env.img:
            trainer = DiscreteActionConvTrainer(
                DiscreteActionConvModelParameters(
                    fc_parameters=trainer_params,
                    cnn_parameters=CNNModelParameters(**params['cnn']),
                    num_input_channels=env.num_input_channels,
                    img_height=env.height,
                    img_width=env.width),
                env.normalization,
            )
        else:
            trainer = DiscreteActionTrainer(
                trainer_params,
                env.normalization,
            )
        return run(env, trainer, "{} test run".format(env_type), score_bar,
                   **params["run_details"])
Example #13
0
def run_gym(params, score_bar, gpu_id, save_timesteps_to_dataset=None):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        params["max_replay_memory_size"],
    )
    model_type = params["model_type"]
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id
    )

    if model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params["shared_training"]
        actor_settings = params["actor_training"]
        critic_settings = params["critic_training"]
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )

        # DDPG can handle continuous and discrete action spaces
        if env.action_type == EnvType.CONTINUOUS_ACTION:
            action_range = env.action_space.high
        else:
            action_range = None

        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            use_gpu=False,
            action_range=action_range,
        )

    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return run(
        c2_device,
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
    )
Example #14
0
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    c2_device = core.DeviceOption(caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU)

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)

    elif model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_parameters = params["training"]
            if isinstance(training_parameters, dict):
                training_parameters = TrainingParameters(**training_parameters)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            knn=KnnParameters(model_type="DQN"),
            rainbow=rainbow_parameters,
        )
        trainer = ParametricDQNTrainer(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_parameters = params["training"]
            if isinstance(training_parameters, dict):
                training_parameters = TrainingParameters(**training_parameters)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_parameters = params["shared_training"]
        if isinstance(training_parameters, dict):
            training_parameters = DDPGTrainingParameters(**training_parameters)

        actor_parameters = params["actor_training"]
        if isinstance(actor_parameters, dict):
            actor_parameters = DDPGNetworkParameters(**actor_parameters)

        critic_parameters = params["critic_training"]
        if isinstance(critic_parameters, dict):
            critic_parameters = DDPGNetworkParameters(**critic_parameters)

        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=training_parameters,
            actor_training=actor_parameters,
            critic_training=critic_parameters,
        )

        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)

        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )

    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer
Example #15
0
def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
    batch_rl_file_path=None,
):

    # Caffe2 core uses the min of caffe2_log_level and minloglevel
    # to determine loglevel. See caffe2/caffe2/core/logging.cc for more info.
    core.GlobalInit(["caffe2", "--caffe2_log_level=2", "--minloglevel=2"])

    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        params["max_replay_memory_size"],
        rl_parameters.gamma,
    )
    model_type = params["model_type"]
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id)
    use_gpu = gpu_id != USE_CPU

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_settings = params["training"]
        training_parameters = TrainingParameters(**training_settings)
        if env.img:
            assert (training_parameters.cnn_parameters
                    is not None), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters = CNNParameters(
                **training_settings["cnn_parameters"])
            training_parameters.cnn_parameters.conv_dims[
                0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels)
        else:
            assert (training_parameters.cnn_parameters is
                    None), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters)
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)

    elif model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (training_parameters.cnn_parameters
                        is not None), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"])
                training_parameters.cnn_parameters.conv_dims[
                    0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels)
            else:
                assert (training_parameters.cnn_parameters is
                        None), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions,
                rl=rl_parameters,
                training=training_parameters)
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_settings = params["training"]
        training_parameters = TrainingParameters(**training_settings)
        if env.img:
            assert (training_parameters.cnn_parameters
                    is not None), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters = CNNParameters(
                **training_settings["cnn_parameters"])
            training_parameters.cnn_parameters.conv_dims[
                0] = env.num_input_channels
        else:
            assert (training_parameters.cnn_parameters is
                    None), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            knn=KnnParameters(model_type="DQN"),
        )
        trainer = ParametricDQNTrainer(trainer_params, env.normalization,
                                       env.normalization_action, use_gpu)
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (training_parameters.cnn_parameters
                        is not None), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"])
                training_parameters.cnn_parameters.conv_dims[
                    0] = env.num_input_channels
            else:
                assert (training_parameters.cnn_parameters is
                        None), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(trainer_params,
                                                 env.normalization,
                                                 env.normalization_action)
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params["shared_training"]
        actor_settings = params["actor_training"]
        critic_settings = params["critic_training"]
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )

        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)

        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )

    else:
        raise NotImplementedError(
            "Model of type {} not supported".format(model_type))

    return run(
        c2_device,
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
        batch_rl_file_path=batch_rl_file_path,
    )
Example #16
0
def run_gym(params, score_bar, gpu_id):
    rl_settings = params['rl']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])
    model_type = params['model_type']
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA,
        gpu_id,
    )

    if model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params['training']
            training_settings['gamma'] = training_settings[
                'learning_rate_decay']
            del training_settings['learning_rate_decay']
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions,
                rl=RLParameters(**rl_settings),
                training=TrainingParameters(**training_settings))
            if env.img:
                trainer = DiscreteActionConvTrainer(
                    DiscreteActionConvModelParameters(
                        fc_parameters=trainer_params,
                        cnn_parameters=CNNModelParameters(**params['cnn']),
                        num_input_channels=env.num_input_channels,
                        img_height=env.height,
                        img_width=env.width),
                    env.normalization,
                )
            else:
                trainer = DiscreteActionTrainer(
                    trainer_params,
                    env.normalization,
                )
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params['training']
            training_settings['gamma'] = training_settings[
                'learning_rate_decay']
            del training_settings['learning_rate_decay']
            trainer_params = ContinuousActionModelParameters(
                rl=RLParameters(**rl_settings),
                training=TrainingParameters(**training_settings),
                knn=KnnParameters(model_type='DQN', ),
            )
            trainer = ContinuousActionDQNTrainer(trainer_params,
                                                 env.normalization,
                                                 env.normalization_action)
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params['shared_training']
        training_settings['gamma'] = training_settings['learning_rate_decay']
        del training_settings['learning_rate_decay']
        actor_settings = params['actor_training']
        critic_settings = params['critic_training']
        trainer_params = DDPGModelParameters(
            rl=DDPGRLParameters(**rl_settings),
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )
        trainer = DDPGTrainer(
            trainer_params,
            EnvDetails(
                state_dim=env.state_dim,
                action_dim=env.action_dim,
                action_range=(env.action_space.low, env.action_space.high),
            ))
    else:
        raise NotImplementedError(
            "Model of type {} not supported".format(model_type))

    return run(env, model_type, trainer, "{} test run".format(env_type),
               score_bar, **params["run_details"])