def get_trainer(self,
                    environment,
                    parameters=None,
                    use_gpu=False,
                    use_all_avail_gpus=False):
        layers = [256, 128]
        activations = ["relu", "relu"]
        parameters = parameters or self.get_sarsa_parameters()
        q_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(
                environment.normalization_action),
            sizes=layers,
            activations=activations,
        )
        reward_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(
                environment.normalization_action),
            sizes=layers,
            activations=activations,
        )
        if use_gpu:
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_distributed_data_parallel_model()
                reward_network = reward_network.get_distributed_data_parallel_model()

        q_network_target = q_network.get_target_network()
        param_dict = parameters.asdict()  # type: ignore
        trainer = ParametricDQNTrainer(q_network, q_network_target,
                                       reward_network, **param_dict)
        return trainer
Example 2
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
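    # The target network starts as a copy of the online Q-network; the trainer updates it toward the online network during training.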
    q_network_target = q_network.get_target_network()

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()
        q_network_target = q_network_target.cuda()
        reward_network = reward_network.cuda()

    if use_all_avail_gpus:
        q_network = q_network.get_distributed_data_parallel_model()
    q_network_target = q_network_target.get_distributed_data_parallel_model()
        reward_network = reward_network.get_distributed_data_parallel_model()

    return ParametricDQNTrainer(q_network, q_network_target, reward_network,
                                model, use_gpu)
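The [1:-1] and [:-1] slices above assume the training config lists every layer width, including placeholder input and output entries, plus an activation for the output layer; only the hidden portion is handed to FullyConnectedParametricDQN, which derives the input width from the normalization data and adds its own scalar Q-value head. A self-contained illustration of that assumption (the config values are made up):

layers = [-1, 256, 128, 1]              # placeholder input width, two hidden layers, scalar output
activations = ["relu", "relu", "linear"]
hidden_sizes = layers[1:-1]             # [256, 128] -> passed as sizes
hidden_activations = activations[:-1]   # ["relu", "relu"] -> passed as activations
print(hidden_sizes, hidden_activations)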
Example 3
    def get_trainer(
        self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
    ):
        parameters = parameters or self.get_sarsa_parameters()
        q_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        reward_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        if use_gpu:
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_distributed_data_parallel_model()
                reward_network = reward_network.get_distributed_data_parallel_model()

        q_network_target = q_network.get_target_network()
        trainer = ParametricDQNTrainer(
            q_network, q_network_target, reward_network, parameters
        )
        return trainer
Example 4
    def __init__(
        self,
        model_params: ContinuousActionModelParameters,
        preprocess_handler: PreprocessHandler,
        state_normalization: Dict[int, NormalizationParameters],
        action_normalization: Dict[int, NormalizationParameters],
        use_gpu: bool,
        use_all_avail_gpus: bool,
    ):
        logger.info("Running Parametric DQN workflow with params:")
        logger.info(model_params)

        trainer = ParametricDQNTrainer(
            model_params,
            state_normalization,
            action_normalization,
            use_gpu=use_gpu,
            use_all_avail_gpus=use_all_avail_gpus,
        )
        trainer = update_model_for_warm_start(trainer)
        assert (type(trainer) == ParametricDQNTrainer
                ), "Warm started wrong model type: " + str(type(trainer))

        evaluator = Evaluator(
            None,
            model_params.rl.gamma,
            trainer,
            metrics_to_score=trainer.metrics_to_score,
        )

        super(ParametricDqnWorkflow,
              self).__init__(preprocess_handler, trainer, evaluator,
                             model_params.training.minibatch_size)
Example 5
 def get_sarsa_trainer_exporter(
     self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
 ):
     parameters = parameters or self.get_sarsa_parameters()
     trainer = ParametricDQNTrainer(
         parameters,
         environment.normalization,
         environment.normalization_action,
         use_gpu=use_gpu,
         use_all_avail_gpus=use_all_avail_gpus,
     )
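      # The trainer also serves as its own exporter here, so it fills both slots of the returned pair.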
     return (trainer, trainer)
Example 6
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    q_network_target = q_network.get_target_network()

    if use_gpu:
        q_network = q_network.cuda()
        q_network_target = q_network_target.cuda()
        reward_network = reward_network.cuda()

    if use_all_avail_gpus:
        q_network = q_network.get_distributed_data_parallel_model()
    q_network_target = q_network_target.get_distributed_data_parallel_model()
        reward_network = reward_network.get_distributed_data_parallel_model()

    trainer_parameters = ParametricDQNTrainerParameters(  # type: ignore
        rl=model.rl,
        double_q_learning=model.rainbow.double_q_learning,
        minibatch_size=model.training.minibatch_size,
        optimizer=OptimizerParameters(
            optimizer=model.training.optimizer,
            learning_rate=model.training.learning_rate,
            l2_decay=model.training.l2_decay,
        ),
    )

    return ParametricDQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        use_gpu=use_gpu,
        **trainer_parameters.asdict()  # type: ignore
    )
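The trailing **trainer_parameters.asdict() relies on the parameter object exposing its fields as a dict of keyword arguments for the trainer constructor. A minimal sketch of that pattern using the stdlib dataclass helper; the class and constructor below are illustrative, not the library's own:

from dataclasses import asdict, dataclass

@dataclass
class SketchTrainerParameters:
    minibatch_size: int = 1024
    double_q_learning: bool = True

def make_trainer(minibatch_size, double_q_learning, **extra):
    # Stand-in for a trainer constructor that takes the parameter fields as kwargs.
    return {"minibatch_size": minibatch_size, "double_q_learning": double_q_learning}

print(make_trainer(**asdict(SketchTrainerParameters())))  # fields become keyword arguments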
Example 7
    def get_modular_sarsa_trainer_exporter(self,
                                           environment,
                                           parameters=None,
                                           use_gpu=False,
                                           use_all_avail_gpus=False):
        parameters = parameters or self.get_sarsa_parameters()
        q_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(
                environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        reward_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(
                environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        if use_gpu:
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_distributed_data_parallel_model()
                reward_network = reward_network.get_distributed_data_parallel_model()

        q_network_target = q_network.get_target_network()
        trainer = ParametricDQNTrainer(q_network, q_network_target,
                                       reward_network, parameters)
        feature_extractor = PredictorFeatureExtractor(
            state_normalization_parameters=environment.normalization,
            action_normalization_parameters=environment.normalization_action,
        )
        output_transformer = ParametricActionOutputTransformer()
        exporter = ParametricDQNExporter(q_network, feature_extractor,
                                         output_transformer)
        return (trainer, exporter)
Example 8
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (training_parameters.cnn_parameters
                    is not None), "Missing CNN parameters for image input"
            if isinstance(training_parameters.cnn_parameters, dict):
                training_parameters.cnn_parameters = CNNParameters(
                    **training_parameters.cnn_parameters)
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels)
        else:
            assert (training_parameters.cnn_parameters is
                    None), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)

    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (training_parameters.cnn_parameters
                    is not None), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (training_parameters.cnn_parameters is
                    None), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters)
        trainer = ParametricDQNTrainer(trainer_params, env.normalization,
                                       env.normalization_action, use_gpu)
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_parameters = params["shared_training"]
        if isinstance(training_parameters, dict):
            training_parameters = DDPGTrainingParameters(**training_parameters)

        actor_parameters = params["actor_training"]
        if isinstance(actor_parameters, dict):
            actor_parameters = DDPGNetworkParameters(**actor_parameters)

        critic_parameters = params["critic_training"]
        if isinstance(critic_parameters, dict):
            critic_parameters = DDPGNetworkParameters(**critic_parameters)

        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=training_parameters,
            actor_training=actor_parameters,
            critic_training=critic_parameters,
        )

        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)

        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )

    elif model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        trainer_params = SACModelParameters(
            rl=rl_parameters,
            training=SACTrainingParameters(
                minibatch_size=params["sac_training"]["minibatch_size"],
                use_2_q_functions=params["sac_training"]["use_2_q_functions"],
                q_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["q_network_optimizer"]),
                value_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["value_network_optimizer"]),
                actor_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["actor_network_optimizer"]),
                entropy_temperature=params["sac_training"]
                ["entropy_temperature"],
            ),
            q_network=FeedForwardParameters(**params["sac_q_training"]),
            value_network=FeedForwardParameters(
                **params["sac_value_training"]),
            actor_network=FeedForwardParameters(
                **params["sac_actor_training"]),
        )
        trainer = get_sac_trainer(env, trainer_params, use_gpu)

    else:
        raise NotImplementedError(
            "Model of type {} not supported".format(model_type))

    return trainer
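The repeated `if isinstance(x, dict): x = SomeParameters(**x)` blocks normalize configuration that may arrive either as a raw dict (e.g. parsed from JSON) or as an already-built parameter object. A self-contained illustration of the pattern; the dataclass here is made up for the sketch:

from dataclasses import dataclass

@dataclass
class SketchTrainingParameters:
    minibatch_size: int
    learning_rate: float

def coerce(training_parameters):
    # Accept either a raw dict or an existing parameter object.
    if isinstance(training_parameters, dict):
        training_parameters = SketchTrainingParameters(**training_parameters)
    return training_parameters

print(coerce({"minibatch_size": 1024, "learning_rate": 0.001}))
print(coerce(SketchTrainingParameters(minibatch_size=512, learning_rate=0.01)))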
Example 9
    def create_from_tensors_parametric_dqn(
        cls,
        trainer: ParametricDQNTrainer,
        mdp_ids: np.ndarray,
        sequence_numbers: torch.Tensor,
        states: rlt.PreprocessedFeatureVector,
        actions: rlt.PreprocessedFeatureVector,
        propensities: torch.Tensor,
        rewards: torch.Tensor,
        possible_actions_mask: torch.Tensor,
        possible_actions: rlt.PreprocessedFeatureVector,
        max_num_actions: int,
        metrics: Optional[torch.Tensor] = None,
    ):
        old_q_train_state = trainer.q_network.training
        old_reward_train_state = trainer.reward_network.training
        trainer.q_network.train(False)
        trainer.reward_network.train(False)

        state_action_pairs = rlt.PreprocessedStateAction(state=states, action=actions)
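        # Repeat each state max_num_actions times so it can be paired with every possible action below.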
        tiled_state = states.float_features.repeat(1, max_num_actions).reshape(
            -1, states.float_features.shape[1]
        )
        assert possible_actions is not None
        # Get Q-value of action taken
        possible_actions_state_concat = rlt.PreprocessedStateAction(
            state=rlt.PreprocessedFeatureVector(float_features=tiled_state),
            action=possible_actions,
        )

        # FIXME: model_values, model_values_for_logged_action, and model_metrics_values
        # should be calculated using q_network_cpe (as in discrete dqn).
        # q_network_cpe has not been added in parametric dqn yet.
        model_values = trainer.q_network(
            possible_actions_state_concat
        ).q_value  # type: ignore
        optimal_q_values, _ = trainer.get_detached_q_values(
            possible_actions_state_concat.state, possible_actions_state_concat.action
        )
        eval_action_idxs = None

        assert (
            model_values.shape[1] == 1
            and model_values.shape[0]
            == possible_actions_mask.shape[0] * possible_actions_mask.shape[1]
        ), (
            "Invalid shapes: "
            + str(model_values.shape)
            + " != "
            + str(possible_actions_mask.shape)
        )
        model_values = model_values.reshape(possible_actions_mask.shape)
        optimal_q_values = optimal_q_values.reshape(possible_actions_mask.shape)
        model_propensities = masked_softmax(
            optimal_q_values, possible_actions_mask, trainer.rl_temperature
        )

        rewards_and_metric_rewards = trainer.reward_network(
            possible_actions_state_concat
        ).q_value  # type: ignore
        model_rewards = rewards_and_metric_rewards[:, :1]
        assert (
            model_rewards.shape[0] * model_rewards.shape[1]
            == possible_actions_mask.shape[0] * possible_actions_mask.shape[1]
        ), (
            "Invalid shapes: "
            + str(model_rewards.shape)
            + " != "
            + str(possible_actions_mask.shape)
        )
        model_rewards = model_rewards.reshape(possible_actions_mask.shape)

        model_metrics = rewards_and_metric_rewards[:, 1:]
        model_metrics = model_metrics.reshape(possible_actions_mask.shape[0], -1)

        model_values_for_logged_action = trainer.q_network(state_action_pairs).q_value
        model_rewards_and_metrics_for_logged_action = trainer.reward_network(
            state_action_pairs
        ).q_value
        model_rewards_for_logged_action = model_rewards_and_metrics_for_logged_action[
            :, :1
        ]

        action_dim = possible_actions.float_features.shape[1]
        action_mask = torch.all(
            possible_actions.float_features.view(-1, max_num_actions, action_dim)
            == actions.float_features.unsqueeze(dim=1),
            dim=2,
        ).float()
        assert torch.all(action_mask.sum(dim=1) == 1)
        num_metrics = model_metrics.shape[1] // max_num_actions

        model_metrics_values = None
        model_metrics_for_logged_action = None
        model_metrics_values_for_logged_action = None
        if num_metrics > 0:
            # FIXME: calculate model_metrics_values when q_network_cpe is added
            # to parametric dqn
            model_metrics_values = model_values.repeat(1, num_metrics)

        trainer.q_network.train(old_q_train_state)  # type: ignore
        trainer.reward_network.train(old_reward_train_state)  # type: ignore

        return cls(
            mdp_id=mdp_ids,
            sequence_number=sequence_numbers,
            logged_propensities=propensities,
            logged_rewards=rewards,
            action_mask=action_mask,
            model_rewards=model_rewards,
            model_rewards_for_logged_action=model_rewards_for_logged_action,
            model_values=model_values,
            model_values_for_logged_action=model_values_for_logged_action,
            model_metrics_values=model_metrics_values,
            model_metrics_values_for_logged_action=model_metrics_values_for_logged_action,
            model_propensities=model_propensities,
            logged_metrics=metrics,
            model_metrics=model_metrics,
            model_metrics_for_logged_action=model_metrics_for_logged_action,
            # Will compute later
            logged_values=None,
            logged_metrics_values=None,
            possible_actions_mask=possible_actions_mask,
            optimal_q_values=optimal_q_values,
            eval_action_idxs=eval_action_idxs,
        )
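The model_propensities above come from masked_softmax, which turns Q-values over the possible actions into a probability distribution restricted to the allowed ones. A hedged, self-contained re-implementation for illustration only; the library's own helper may differ in details such as NaN handling:

import torch

def masked_softmax_sketch(q_values, mask, temperature=1.0):
    # Scale by temperature, push disallowed actions to -inf, then normalize.
    scores = q_values / temperature
    scores = scores.masked_fill(mask == 0, float("-inf"))
    probs = torch.softmax(scores, dim=1)
    # A row whose mask is all zeros would produce NaNs; report zeros instead.
    return torch.nan_to_num(probs, nan=0.0)

q = torch.tensor([[1.0, 2.0, 3.0]])
allowed = torch.tensor([[1.0, 1.0, 0.0]])
print(masked_softmax_sketch(q, allowed))  # probability mass only on the two allowed actions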
Example 10
def train_network(params):
    logger.info("Running Parametric DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"])

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"])
    else:
        in_training_cpe_parameters = None

    trainer_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
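    # Integer division: any trailing partial batch is dropped.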
    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    trainer = ParametricDQNTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, params["use_gpu"])
    action_preprocessor = Preprocessor(action_normalization, params["use_gpu"])

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            None,
            100,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
        )
    else:
        evaluator = Evaluator(
            None,
            100,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
        )

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch,
                                   params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor,
                batch,
                action_preprocessor=action_preprocessor)

            tdp.set_type(trainer.dtype)
            trainer.train(tdp, evaluator)

            evaluator.collect_parametric_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers.cpu().numpy(),
                logged_state_actions=np.concatenate(
                    (tdp.states.cpu().numpy(), tdp.actions.cpu().numpy()),
                    axis=1),
                logged_rewards=tdp.rewards.cpu().numpy(),
                logged_propensities=tdp.propensities.cpu().numpy(),
                logged_terminals=(1.0 - tdp.not_terminals),
                possible_state_actions=tdp.state_pas_concat.cpu().numpy(),
                pas_lens=tdp.possible_actions_lengths.cpu().numpy(),
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe(trainer_params.rl.gamma)
        evaluator.clear_collected_samples()
        logger.info("CPE evaluation took {} seconds.".format(time.time() -
                                                             cpe_start_time))

    through_put = (len(dataset) * params["epochs"]) / (time.time() -
                                                       start_time)
    logger.info("Training finished. Processed ~{} examples / s.".format(
        round(through_put)))

    return export_trainer_and_predictor(trainer, params["model_output_path"])
Example 11
def train_network(params):
    logger.info("Running Parametric DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"])

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"])
    else:
        in_training_cpe_parameters = None

    trainer_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    eval_dataset = JSONDataset(params["eval_data_path"],
                               batch_size=training_parameters.minibatch_size)
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    trainer = ParametricDQNTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, False)
    action_preprocessor = Preprocessor(action_normalization, False)

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            None,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
            metrics_to_score=trainer.metrics_to_score,
        )
    else:
        evaluator = Evaluator(
            None,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
            metrics_to_score=trainer.metrics_to_score,
        )

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch,
                                   params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor,
                batch,
                action_preprocessor=action_preprocessor)

            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
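        # Note: reuses num_batches from the training set; assumes eval_dataset has at least that many batches.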
        for batch_idx in range(num_batches):
            batch = eval_dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor,
                batch,
                action_preprocessor=action_preprocessor)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info("CPE evaluation took {} seconds.".format(time.time() -
                                                             cpe_start_time))

    through_put = (len(dataset) * params["epochs"]) / (time.time() -
                                                       start_time)
    logger.info("Training finished. Processed ~{} examples / s.".format(
        round(through_put)))

    return export_trainer_and_predictor(trainer, params["model_output_path"])
Example 12
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    c2_device = core.DeviceOption(caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU)
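    # Only the Caffe2-based branches below build their nets inside this device scope; the PyTorch branches ignore it.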

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)

    elif model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_parameters = params["training"]
            if isinstance(training_parameters, dict):
                training_parameters = TrainingParameters(**training_parameters)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            knn=KnnParameters(model_type="DQN"),
            rainbow=rainbow_parameters,
        )
        trainer = ParametricDQNTrainer(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_parameters = params["training"]
            if isinstance(training_parameters, dict):
                training_parameters = TrainingParameters(**training_parameters)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_parameters = params["shared_training"]
        if isinstance(training_parameters, dict):
            training_parameters = DDPGTrainingParameters(**training_parameters)

        actor_parameters = params["actor_training"]
        if isinstance(actor_parameters, dict):
            actor_parameters = DDPGNetworkParameters(**actor_parameters)

        critic_parameters = params["critic_training"]
        if isinstance(critic_parameters, dict):
            critic_parameters = DDPGNetworkParameters(**critic_parameters)

        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=training_parameters,
            actor_training=actor_parameters,
            critic_training=critic_parameters,
        )

        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)

        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )

    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer
Example 13
 def get_sarsa_trainer(self, environment, parameters=None):
     parameters = parameters or self.get_sarsa_parameters()
     return ParametricDQNTrainer(parameters, environment.normalization,
                                 environment.normalization_action)
Example 14
def train_network(params):
    logger.info("Running Parametric DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = ContinuousActionModelParameters(
        rl=rl_parameters, training=training_parameters, rainbow=rainbow_parameters
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    eval_dataset = JSONDataset(params["eval_data_path"], batch_size=16)
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = ParametricDQNTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, False)
    action_preprocessor = Preprocessor(action_normalization, False)

    evaluator = Evaluator(
        None,
        trainer_params.rl.gamma,
        trainer,
        metrics_to_score=trainer.metrics_to_score,
    )

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        batch_idx = -1
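        # Stream batches until the dataset is exhausted; read_batch() returns None past the end.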
        while True:
            batch_idx += 1
            report_training_status(batch_idx, num_batches, epoch, params["epochs"])
            batch = dataset.read_batch()
            if batch is None:
                break
            tdp = preprocess_batch_for_training(
                state_preprocessor, batch, action_preprocessor=action_preprocessor
            )
            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
        while True:
            batch = eval_dataset.read_batch()
            if batch is None:
                break
            tdp = preprocess_batch_for_training(
                state_preprocessor, batch, action_preprocessor=action_preprocessor
            )
            tdp.set_type(trainer.dtype)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * params["epochs"]) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    return export_trainer_and_predictor(trainer, params["model_output_path"])
Example 15
def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
    batch_rl_file_path=None,
):

    # Caffe2 core uses the min of caffe2_log_level and minloglevel
    # to determine loglevel. See caffe2/caffe2/core/logging.cc for more info.
    core.GlobalInit(["caffe2", "--caffe2_log_level=2", "--minloglevel=2"])

    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        params["max_replay_memory_size"],
        rl_parameters.gamma,
    )
    model_type = params["model_type"]
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id)
    use_gpu = gpu_id != USE_CPU

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_settings = params["training"]
        training_parameters = TrainingParameters(**training_settings)
        if env.img:
            assert (training_parameters.cnn_parameters
                    is not None), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters = CNNParameters(
                **training_settings["cnn_parameters"])
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels)
        else:
            assert (training_parameters.cnn_parameters is
                    None), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters)
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)

    elif model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (training_parameters.cnn_parameters
                        is not None), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"])
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels)
            else:
                assert (training_parameters.cnn_parameters is
                        None), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions,
                rl=rl_parameters,
                training=training_parameters)
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_settings = params["training"]
        training_parameters = TrainingParameters(**training_settings)
        if env.img:
            assert (training_parameters.cnn_parameters
                    is not None), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters = CNNParameters(
                **training_settings["cnn_parameters"])
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (training_parameters.cnn_parameters is
                    None), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            knn=KnnParameters(model_type="DQN"),
        )
        trainer = ParametricDQNTrainer(trainer_params, env.normalization,
                                       env.normalization_action, use_gpu)
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (training_parameters.cnn_parameters
                        is not None), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"])
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (training_parameters.cnn_parameters is
                        None), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(trainer_params,
                                                 env.normalization,
                                                 env.normalization_action)
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params["shared_training"]
        actor_settings = params["actor_training"]
        critic_settings = params["critic_training"]
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )

        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)

        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )

    else:
        raise NotImplementedError(
            "Model of type {} not supported".format(model_type))

    return run(
        c2_device,
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
        batch_rl_file_path=batch_rl_file_path,
    )
Example 16
 def get_sarsa_trainer(self, environment):
     return ParametricDQNTrainer(
         self.get_sarsa_parameters(),
         environment.normalization,
         environment.normalization_action,
     )