Example 1
 def get_sarsa_trainer_reward_boost(self, environment, reward_shape):
     rl_parameters = RLParameters(
         gamma=DISCOUNT,
         target_update_rate=1.0,
         reward_burnin=10,
         maxq_learning=False,
         reward_boost=reward_shape,
     )
     training_parameters = TrainingParameters(
         layers=[-1, -1],
         activations=["linear"],
         minibatch_size=self.minibatch_size,
         learning_rate=0.125,
         optimizer="ADAM",
     )
     return DiscreteActionTrainer(
         DiscreteActionModelParameters(
             actions=environment.ACTIONS,
             rl=rl_parameters,
             training=training_parameters,
             rainbow=RainbowDQNParameters(double_q_learning=True,
                                          dueling_architecture=False),
             in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
         ),
         environment.normalization,
     )
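The reward_shape argument above is passed straight through as RLParameters.reward_boost, which appears to be a per-action reward bonus keyed by action name (an assumption here; the action names and boost values below are hypothetical). A call would then look roughly like:

    # Hypothetical usage: boost the logged reward of two actions during training.
    trainer = self.get_sarsa_trainer_reward_boost(
        environment,
        reward_shape={"L": 100.0, "R": 200.0},
    )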
Example 2
 def get_sarsa_parameters(self, environment, reward_shape, dueling,
                          categorical, clip_grad_norm):
     rl_parameters = RLParameters(
         gamma=DISCOUNT,
         target_update_rate=1.0,
         maxq_learning=False,
         reward_boost=reward_shape,
     )
     training_parameters = TrainingParameters(
         layers=[-1, 128, -1] if dueling else [-1, -1],
         activations=["relu", "linear"] if dueling else ["linear"],
         minibatch_size=self.minibatch_size,
         learning_rate=0.05,
         optimizer="ADAM",
         clip_grad_norm=clip_grad_norm,
     )
     return DiscreteActionModelParameters(
         actions=environment.ACTIONS,
         rl=rl_parameters,
         training=training_parameters,
         rainbow=RainbowDQNParameters(
             double_q_learning=True,
             dueling_architecture=dueling,
             categorical=categorical,
         ),
     )
Example 3
 def get_sarsa_trainer_reward_boost(
     self,
     environment,
     reward_shape,
     dueling,
     use_gpu=False,
     use_all_avail_gpus=False,
 ):
     rl_parameters = RLParameters(
         gamma=DISCOUNT,
         target_update_rate=1.0,
         reward_burnin=10,
         maxq_learning=False,
         reward_boost=reward_shape,
     )
     training_parameters = TrainingParameters(
         layers=[-1, 128, -1] if dueling else [-1, -1],
         activations=["relu", "linear"] if dueling else ["linear"],
         minibatch_size=self.minibatch_size,
         learning_rate=0.05,
         optimizer="ADAM",
     )
     return DQNTrainer(
         DiscreteActionModelParameters(
             actions=environment.ACTIONS,
             rl=rl_parameters,
             training=training_parameters,
             rainbow=RainbowDQNParameters(
                 double_q_learning=True, dueling_architecture=dueling
             ),
         ),
         environment.normalization,
         use_gpu=use_gpu,
         use_all_avail_gpus=use_all_avail_gpus,
     )
Example 4
 def get_sarsa_parameters_factorized(self):
     return ContinuousActionModelParameters(
         rl=RLParameters(
             gamma=DISCOUNT,
             target_update_rate=1.0,
             reward_burnin=100,
             maxq_learning=False,
         ),
         training=TrainingParameters(
             # These are used by reward network
             layers=[-1, 256, 128, -1],
             activations=["relu", "relu", "linear"],
             factorization_parameters=FactorizationParameters(
                 state=FeedForwardParameters(layers=[-1, 128, 64],
                                             activations=["relu",
                                                          "linear"]),
                 action=FeedForwardParameters(
                     layers=[-1, 128, 64], activations=["relu", "linear"]),
             ),
             minibatch_size=self.minibatch_size,
             learning_rate=0.03,
             optimizer="ADAM",
         ),
         rainbow=RainbowDQNParameters(double_q_learning=True,
                                      dueling_architecture=False),
         in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
     )
Example 5
    def test_trainer_maxq(self):
        env = Env(self.state_dims, self.action_dims)
        env.seed(42)
        maxq_parameters = DiscreteActionModelParameters(
            actions=env.actions,
            rl=RLParameters(
                gamma=0.99,
                target_update_rate=0.9,
                reward_burnin=100,
                maxq_learning=True,
            ),
            rainbow=RainbowDQNParameters(double_q_learning=True,
                                         dueling_architecture=False),
            training=TrainingParameters(
                layers=self.layers,
                activations=self.activations,
                minibatch_size=self.minibatch_size,
                learning_rate=0.25,
                optimizer="ADAM",
            ),
        )
        maxq_trainer = DQNTrainer(maxq_parameters, env.normalization)

        logger.info("Generating constant_reward MDPs..")

        (states, actions, rewards, next_states, next_actions, is_terminal,
         possible_actions, possible_next_actions) = env.generate_samples_discrete(
             self.num_samples)

        logger.info("Preprocessing constant_reward MDPs..")

        for epoch in range(self.epochs):
            tdps = env.preprocess_samples_discrete(
                states,
                actions,
                rewards,
                next_states,
                next_actions,
                is_terminal,
                possible_actions,
                possible_next_actions,
                self.minibatch_size,
            )
            logger.info("Training.. " + str(epoch))
            for tdp in tdps:
                maxq_trainer.train(tdp)
            logger.info(" ".join([
                "Training epoch",
                str(epoch),
                "average q values",
                str(torch.mean(maxq_trainer.all_action_scores)),
            ]))

        # Q value should converge to very close to 100
        avg_q_value_after_training = torch.mean(maxq_trainer.all_action_scores)

        self.assertLess(avg_q_value_after_training, 101)
        self.assertGreater(avg_q_value_after_training, 99)
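A quick sanity check on the asserted bounds, assuming the constant-reward environment emits a reward of 1 on every step: with gamma = 0.99 the discounted return of any state-action pair converges to

    Q* = sum over t of 0.99^t * 1 = 1 / (1 - 0.99) = 100

so the trained Q-values are expected to land between 99 and 101.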
Example 6
def main(params):
    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"])

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    model_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters)
    state_normalization = BaseWorkflow.read_norm_file(
        params["state_norm_data_path"])
    action_normalization = BaseWorkflow.read_norm_file(
        params["action_norm_data_path"])

    writer = SummaryWriter(log_dir=params["model_output_path"])
    logger.info("TensorBoard logging location is: {}".format(writer.log_dir))

    preprocess_handler = ParametricDqnPreprocessHandler(
        Preprocessor(state_normalization, False),
        Preprocessor(action_normalization, False),
        PandasSparseToDenseProcessor(),
    )

    workflow = ParametricDqnWorkflow(
        model_params,
        preprocess_handler,
        state_normalization,
        action_normalization,
        params["use_gpu"],
        params["use_all_avail_gpus"],
    )

    train_dataset = JSONDatasetReader(
        params["training_data_path"],
        batch_size=training_parameters.minibatch_size)
    eval_dataset = JSONDatasetReader(params["eval_data_path"], batch_size=16)

    with summary_writer_context(writer):
        workflow.train_network(train_dataset, eval_dataset,
                               int(params["epochs"]))

    exporter = ParametricDQNExporter(
        workflow.trainer.q_network,
        PredictorFeatureExtractor(
            state_normalization_parameters=state_normalization,
            action_normalization_parameters=action_normalization,
        ),
        ParametricActionOutputTransformer(),
    )
    return export_trainer_and_predictor(workflow.trainer,
                                        params["model_output_path"],
                                        exporter=exporter)  # noqa
Example 7
    def test_minibatches_per_step(self):
        _epochs = self.epochs
        self.epochs = 2
        rl_parameters = RLParameters(gamma=0.95,
                                     target_update_rate=0.9,
                                     maxq_learning=True)
        rainbow_parameters = RainbowDQNParameters(double_q_learning=True,
                                                  dueling_architecture=False)
        training_parameters1 = TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=1024,
            minibatches_per_step=1,
            learning_rate=0.25,
            optimizer="ADAM",
        )
        training_parameters2 = TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=128,
            minibatches_per_step=8,
            learning_rate=0.25,
            optimizer="ADAM",
        )
        env1 = Env(self.state_dims, self.action_dims)
        env2 = Env(self.state_dims, self.action_dims)
        model_parameters1 = DiscreteActionModelParameters(
            actions=env1.actions,
            rl=rl_parameters,
            rainbow=rainbow_parameters,
            training=training_parameters1,
        )
        model_parameters2 = DiscreteActionModelParameters(
            actions=env2.actions,
            rl=rl_parameters,
            rainbow=rainbow_parameters,
            training=training_parameters2,
        )
        # minibatch_size / 8, minibatches_per_step * 8 should give the same result
        logger.info("Training model 1")
        trainer1 = self._train(model_parameters1, env1)
        SummaryWriterContext._reset_globals()
        logger.info("Training model 2")
        trainer2 = self._train(model_parameters2, env2)

        weight1 = trainer1.q_network.fc.layers[-1].weight.detach().numpy()
        weight2 = trainer2.q_network.fc.layers[-1].weight.detach().numpy()

        # Due to accumulated floating-point error, this tolerance has to be fairly high
        self.assertTrue(np.allclose(weight1, weight2, rtol=0.0, atol=1e-3))
        self.epochs = _epochs
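The two configurations are expected to reach the same weights because they consume the same amount of data per optimizer step: 1024 samples x 1 minibatch and 128 samples x 8 minibatches both total 1024 samples per step, which is presumably why the final-layer weights are compared with a small absolute tolerance rather than exact equality.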
Example 8
 def get_sarsa_parameters(self):
     return ContinuousActionModelParameters(
         rl=RLParameters(gamma=DISCOUNT,
                         target_update_rate=1.0,
                         maxq_learning=False),
         training=TrainingParameters(
             layers=[-1, 256, 128, -1],
             activations=["relu", "relu", "linear"],
             minibatch_size=self.minibatch_size,
             learning_rate=0.05,
             optimizer="ADAM",
         ),
         rainbow=RainbowDQNParameters(double_q_learning=True,
                                      dueling_architecture=False),
     )
Example 9
def train_network(params):
    logger.info("Running DQN workflow with params:")
    logger.info(params)

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    norm_data = JSONDataset(params["state_norm_data_path"])
    state_normalization = read_norm_params(norm_data.read_all())

    num_batches = int(len(dataset) / training_parameters.minibatch_size)

    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    trainer = DQNTrainer(trainer_params, state_normalization,
                         params["use_gpu"])

    for epoch in range(params["epochs"]):
        for batch_idx in range(num_batches):
            helpers.report_training_status(batch_idx, num_batches, epoch,
                                           params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(action_names, batch,
                                                state_normalization)
            trainer.train(tdp)

    logger.info("Training finished. Saving PyTorch model to {}".format(
        params["pytorch_output_path"]))
    helpers.save_model_to_file(trainer, params["pytorch_output_path"])
Example 10
 def get_sarsa_parameters(self):
     return ContinuousActionModelParameters(
         rl=RLParameters(
             gamma=DISCOUNT,
             target_update_rate=1.0,
             reward_burnin=100,
             maxq_learning=False,
         ),
         training=TrainingParameters(
             layers=[-1, 256, 128, -1],
             activations=["relu", "relu", "linear"],
             minibatch_size=self.minibatch_size,
             learning_rate=0.05,
             optimizer="ADAM",
         ),
         knn=KnnParameters(model_type="DQN"),
         rainbow=RainbowDQNParameters(double_q_learning=True,
                                      dueling_architecture=False),
         in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
     )
Example 11
    def test_trainer_maxq(self):
        env = Env(self.state_dims, self.action_dims)
        maxq_parameters = DiscreteActionModelParameters(
            actions=env.actions,
            rl=RLParameters(gamma=0.95,
                            target_update_rate=0.9,
                            maxq_learning=True),
            rainbow=RainbowDQNParameters(double_q_learning=True,
                                         dueling_architecture=False),
            training=TrainingParameters(
                layers=self.layers,
                activations=self.activations,
                minibatch_size=1024,
                learning_rate=0.25,
                optimizer="ADAM",
            ),
        )

        # Q value should converge to very close to 20
        trainer = self._train(maxq_parameters, env)
        avg_q_value_after_training = torch.mean(trainer.all_action_scores)
        self.assertLess(avg_q_value_after_training, 22)
        self.assertGreater(avg_q_value_after_training, 18)
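As in the earlier maxq test, the asserted range follows from the discounted return of a constant reward, assuming a reward of 1 per step: with gamma = 0.95,

    Q* = 1 / (1 - 0.95) = 20

which is why the average Q-value is expected to fall between 18 and 22.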
Example 12
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            if isinstance(training_parameters.cnn_parameters, dict):
                training_parameters.cnn_parameters = CNNParameters(
                    **training_parameters.cnn_parameters
                )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = create_dqn_trainer_from_params(
            trainer_params, env.normalization, use_gpu
        )

    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters, training=training_parameters, rainbow=rainbow_parameters
        )
        trainer = create_parametric_dqn_trainer_from_params(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )

    elif model_type == ModelType.TD3.value:
        trainer_params = TD3ModelParameters(
            rl=rl_parameters,
            training=TD3TrainingParameters(
                minibatch_size=params["td3_training"]["minibatch_size"],
                q_network_optimizer=OptimizerParameters(
                    **params["td3_training"]["q_network_optimizer"]
                ),
                actor_network_optimizer=OptimizerParameters(
                    **params["td3_training"]["actor_network_optimizer"]
                ),
                use_2_q_functions=params["td3_training"]["use_2_q_functions"],
                exploration_noise=params["td3_training"]["exploration_noise"],
                initial_exploration_ts=params["td3_training"]["initial_exploration_ts"],
                target_policy_smoothing=params["td3_training"][
                    "target_policy_smoothing"
                ],
                noise_clip=params["td3_training"]["noise_clip"],
                delayed_policy_update=params["td3_training"]["delayed_policy_update"],
            ),
            q_network=FeedForwardParameters(**params["td3_q_training"]),
            actor_network=FeedForwardParameters(**params["td3_actor_training"]),
        )
        trainer = get_td3_trainer(env, trainer_params, use_gpu)

    elif model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        value_network = None
        value_network_optimizer = None
        alpha_optimizer = None
        if params["sac_training"]["use_value_network"]:
            value_network = FeedForwardParameters(**params["sac_value_training"])
            value_network_optimizer = OptimizerParameters(
                **params["sac_training"]["value_network_optimizer"]
            )
        if "alpha_optimizer" in params["sac_training"]:
            alpha_optimizer = OptimizerParameters(
                **params["sac_training"]["alpha_optimizer"]
            )
        entropy_temperature = params["sac_training"].get("entropy_temperature", None)
        target_entropy = params["sac_training"].get("target_entropy", None)

        trainer_params = SACModelParameters(
            rl=rl_parameters,
            training=SACTrainingParameters(
                minibatch_size=params["sac_training"]["minibatch_size"],
                use_2_q_functions=params["sac_training"]["use_2_q_functions"],
                use_value_network=params["sac_training"]["use_value_network"],
                q_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["q_network_optimizer"]
                ),
                value_network_optimizer=value_network_optimizer,
                actor_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["actor_network_optimizer"]
                ),
                entropy_temperature=entropy_temperature,
                target_entropy=target_entropy,
                alpha_optimizer=alpha_optimizer,
            ),
            q_network=FeedForwardParameters(**params["sac_q_training"]),
            value_network=value_network,
            actor_network=FeedForwardParameters(**params["sac_actor_training"]),
        )
        trainer = get_sac_trainer(env, trainer_params, use_gpu)

    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer
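For reference, a minimal sketch of the params dict that the PYTORCH_DISCRETE_DQN branch above expects ("training" and "rainbow" are the only keys it reads in that branch; the field values below are hypothetical and mirror the other examples on this page, and rl_parameters and env must be constructed separately):

    params = {
        "training": {
            "layers": [-1, 128, -1],
            "activations": ["relu", "linear"],
            "minibatch_size": 1024,
            "learning_rate": 0.05,
            "optimizer": "ADAM",
        },
        "rainbow": {"double_q_learning": True, "dueling_architecture": False},
    }
    trainer = create_trainer(
        ModelType.PYTORCH_DISCRETE_DQN.value, params, rl_parameters, use_gpu=False, env=env
    )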
Example 13
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(log_dir=params["model_output_path"])

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"])

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    eval_dataset = JSONDataset(params["eval_data_path"], batch_size=16)
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    trainer = DQNTrainer(
        trainer_params,
        state_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, False)

    evaluator = Evaluator(
        trainer_params.actions,
        trainer_params.rl.gamma,
        trainer,
        metrics_to_score=trainer.metrics_to_score,
    )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch,
                                   int(params["epochs"]))
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(preprocessor, batch,
                                                action_names)

            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
        while True:
            batch = eval_dataset.read_batch(batch_idx)
            if batch is None:
                break
            tdp = preprocess_batch_for_training(preprocessor, batch,
                                                action_names)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info("CPE evaluation took {} seconds.".format(time.time() -
                                                             cpe_start_time))

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() -
                                                            start_time)
    logger.info("Training finished. Processed ~{} examples / s.".format(
        round(through_put)))

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
Example 14
def single_process_main(gpu_index, *args):
    params = args[0]
    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    action_names = params["actions"]

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    model_params = DiscreteActionModelParameters(
        actions=action_names,
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )
    state_normalization = BaseWorkflow.read_norm_file(params["state_norm_data_path"])

    writer = SummaryWriter(log_dir=params["model_output_path"])
    logger.info("TensorBoard logging location is: {}".format(writer.log_dir))

    if params["use_all_avail_gpus"]:
        BaseWorkflow.init_multiprocessing(
            int(params["num_processes_per_node"]),
            int(params["num_nodes"]),
            int(params["node_index"]),
            gpu_index,
            params["init_method"],
        )

    workflow = DqnWorkflow(
        model_params,
        state_normalization,
        params["use_gpu"],
        params["use_all_avail_gpus"],
    )

    sorted_features, _ = sort_features_by_normalization(state_normalization)
    preprocess_handler = DiscreteDqnPreprocessHandler(
        action_names, PandasSparseToDenseProcessor(sorted_features)
    )

    train_dataset = JSONDatasetReader(
        params["training_data_path"],
        batch_size=training_parameters.minibatch_size,
        preprocess_handler=preprocess_handler,
    )
    eval_dataset = JSONDatasetReader(
        params["eval_data_path"],
        batch_size=training_parameters.minibatch_size,
        preprocess_handler=preprocess_handler,
    )

    with summary_writer_context(writer):
        workflow.train_network(train_dataset, eval_dataset, int(params["epochs"]))

    exporter = DQNExporter(
        workflow.trainer.q_network,
        PredictorFeatureExtractor(state_normalization_parameters=state_normalization),
        DiscreteActionOutputTransformer(model_params.actions),
    )

    if int(params["node_index"]) == 0 and gpu_index == 0:
        export_trainer_and_predictor(
            workflow.trainer, params["model_output_path"], exporter=exporter
        )  # noqa
Example 15
def train_network(params):
    logger.info("Running Parametric DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"])

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"])
    else:
        in_training_cpe_parameters = None

    trainer_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    eval_dataset = JSONDataset(params["eval_data_path"],
                               batch_size=training_parameters.minibatch_size)
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    trainer = ParametricDQNTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, False)
    action_preprocessor = Preprocessor(action_normalization, False)

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            None,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
            metrics_to_score=trainer.metrics_to_score,
        )
    else:
        evaluator = Evaluator(
            None,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
            metrics_to_score=trainer.metrics_to_score,
        )

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch,
                                   params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor,
                batch,
                action_preprocessor=action_preprocessor)

            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
        for batch_idx in range(num_batches):
            batch = eval_dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor,
                batch,
                action_preprocessor=action_preprocessor)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info("CPE evaluation took {} seconds.".format(time.time() -
                                                             cpe_start_time))

    through_put = (len(dataset) * params["epochs"]) / (time.time() -
                                                       start_time)
    logger.info("Training finished. Processed ~{} examples / s.".format(
        round(through_put)))

    return export_trainer_and_predictor(trainer, params["model_output_path"])
Example 16
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(
            log_dir=os.path.join(
                os.path.expanduser(params["model_output_path"]), "training_data"
            )
        )

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"]
        )
    else:
        in_training_cpe_parameters = None

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(trainer_params, state_normalization, params["use_gpu"])
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, params["use_gpu"])

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            trainer_params.actions,
            10,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
        )
    else:
        evaluator = Evaluator(
            trainer_params.actions,
            10,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
        )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch, int(params["epochs"]))
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)

            trainer.train(tdp)

            trainer.evaluate(
                evaluator, tdp.actions, None, tdp.rewards, tdp.episode_values
            )

            evaluator.collect_discrete_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers.cpu().numpy(),
                states=tdp.states.cpu().numpy(),
                logged_actions=tdp.actions.cpu().numpy(),
                logged_rewards=tdp.rewards.cpu().numpy(),
                logged_propensities=tdp.propensities.cpu().numpy(),
                logged_terminals=np.invert(
                    tdp.not_terminals.cpu().numpy().astype(bool)
                ),
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe()
        if writer is not None:
            evaluator.log_to_tensorboard(writer, epoch)
        evaluator.clear_collected_samples()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
Example 17
def train_network(params):
    logger.info("Running Parametric DQN workflow with params:")
    logger.info(params)

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"])
    else:
        in_training_cpe_parameters = None

    trainer_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    trainer = ParametricDQNTrainer(trainer_params, state_normalization,
                                   action_normalization, params["use_gpu"])
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, params["use_gpu"])
    action_preprocessor = Preprocessor(action_normalization, params["use_gpu"])

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            None,
            100,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
        )
    else:
        evaluator = Evaluator(
            None,
            100,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
        )

    start_time = time.time()
    for epoch in range(params["epochs"]):
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch,
                                   params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor,
                batch,
                action_preprocessor=action_preprocessor)

            trainer.train(tdp)

            trainer.evaluate(evaluator, None, None, tdp.episode_values)
            evaluator.collect_parametric_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers,
                logged_state_actions=np.concatenate(
                    (tdp.states.cpu().numpy(), tdp.actions.cpu().numpy()),
                    axis=1),
                logged_rewards=tdp.rewards,
                logged_propensities=tdp.propensities,
                logged_terminals=(1.0 - tdp.not_terminals),
                possible_state_actions=tdp.state_pas_concat.cpu().numpy(),
                pas_lens=tdp.possible_actions_lengths,
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe()
        evaluator.clear_collected_samples()
        logger.info("CPE evaluation took {} seconds.".format(time.time() -
                                                             cpe_start_time))

    through_put = (len(dataset) * params["epochs"]) / (time.time() -
                                                       start_time)
    logger.info("Training finished. Processed ~{} examples / s.".format(
        round(through_put)))

    return export_trainer_and_predictor(trainer, params["model_output_path"])
Example 18
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    c2_device = core.DeviceOption(caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU)

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)

    elif model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_parameters = params["training"]
            if isinstance(training_parameters, dict):
                training_parameters = TrainingParameters(**training_parameters)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            knn=KnnParameters(model_type="DQN"),
            rainbow=rainbow_parameters,
        )
        trainer = ParametricDQNTrainer(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_parameters = params["training"]
            if isinstance(training_parameters, dict):
                training_parameters = TrainingParameters(**training_parameters)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_parameters = params["shared_training"]
        if isinstance(training_parameters, dict):
            training_parameters = DDPGTrainingParameters(**training_parameters)

        actor_parameters = params["actor_training"]
        if isinstance(actor_parameters, dict):
            actor_parameters = DDPGNetworkParameters(**actor_parameters)

        critic_parameters = params["critic_training"]
        if isinstance(critic_parameters, dict):
            critic_parameters = DDPGNetworkParameters(**critic_parameters)

        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=training_parameters,
            actor_training=actor_parameters,
            critic_training=critic_parameters,
        )

        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)

        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )

    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer
Example 19
def create_park_trainer(model_type, params, rl_parameters, use_gpu, env):
    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (training_parameters.cnn_parameters
                    is not None), "Missing CNN parameters for image input"
            if isinstance(training_parameters.cnn_parameters, dict):
                training_parameters.cnn_parameters = CNNParameters(
                    **training_parameters.cnn_parameters)
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels)
        else:
            assert (training_parameters.cnn_parameters is
                    None), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = create_park_dqn_trainer_from_params(
            model=trainer_params,
            normalization_parameters=env.normalization,
            use_gpu=use_gpu,
            env=env.env)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (training_parameters.cnn_parameters
                    is not None), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (training_parameters.cnn_parameters is
                    None), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters)
        trainer = create_parametric_dqn_trainer_from_params(
            trainer_params,
            env.normalization,
            env.normalization_action,
            use_gpu,
            env=env.env)
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_parameters = params["shared_training"]
        if isinstance(training_parameters, dict):
            training_parameters = DDPGTrainingParameters(**training_parameters)

        actor_parameters = params["actor_training"]
        if isinstance(actor_parameters, dict):
            actor_parameters = DDPGNetworkParameters(**actor_parameters)

        critic_parameters = params["critic_training"]
        if isinstance(critic_parameters, dict):
            critic_parameters = DDPGNetworkParameters(**critic_parameters)

        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=training_parameters,
            actor_training=actor_parameters,
            critic_training=critic_parameters,
        )

        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)

        state_dim = get_num_output_features(env.normalization)
        action_dim = get_num_output_features(env.normalization_action)

        # Build Actor Network
        actor_network = ActorNetModel(
            layers=([state_dim] + trainer_params.actor_training.layers[1:-1] +
                    [action_dim]),
            activations=trainer_params.actor_training.activations,
            fl_init=trainer_params.shared_training.final_layer_init,
            state_dim=state_dim,
            action_dim=action_dim,
            use_gpu=use_gpu,
            use_all_avail_gpus=False,
        )

        # Build Critic Network
        critic_network = CriticNetModel(
            # Ensure dims match input state and scalar output
            layers=[state_dim] + trainer_params.critic_training.layers[1:-1] + [1],
            activations=trainer_params.critic_training.activations,
            fl_init=trainer_params.shared_training.final_layer_init,
            state_dim=state_dim,
            action_dim=action_dim,
            use_gpu=use_gpu,
            use_all_avail_gpus=False,
        )

        trainer = DDPGTrainer(
            actor_network,
            critic_network,
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )

    elif model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        value_network = None
        value_network_optimizer = None
        if params["sac_training"]["use_value_network"]:
            value_network = FeedForwardParameters(
                **params["sac_value_training"])
            value_network_optimizer = OptimizerParameters(
                **params["sac_training"]["value_network_optimizer"])

        trainer_params = SACModelParameters(
            rl=rl_parameters,
            training=SACTrainingParameters(
                minibatch_size=params["sac_training"]["minibatch_size"],
                use_2_q_functions=params["sac_training"]["use_2_q_functions"],
                use_value_network=params["sac_training"]["use_value_network"],
                q_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["q_network_optimizer"]),
                value_network_optimizer=value_network_optimizer,
                actor_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["actor_network_optimizer"]),
                entropy_temperature=params["sac_training"]["entropy_temperature"],
            ),
            q_network=FeedForwardParameters(**params["sac_q_training"]),
            value_network=value_network,
            actor_network=FeedForwardParameters(
                **params["sac_actor_training"]),
        )
        trainer = horizon_runner.get_sac_trainer(env, trainer_params, use_gpu)

    else:
        raise NotImplementedError(
            "Model of type {} not supported".format(model_type))

    return trainer
Example 20
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (training_parameters.cnn_parameters
                    is not None), "Missing CNN parameters for image input"
            if isinstance(training_parameters.cnn_parameters, dict):
                training_parameters.cnn_parameters = CNNParameters(
                    **training_parameters.cnn_parameters)
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels)
        else:
            assert (training_parameters.cnn_parameters is
                    None), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)

    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (training_parameters.cnn_parameters
                    is not None), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (training_parameters.cnn_parameters is
                    None), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters)
        trainer = ParametricDQNTrainer(trainer_params, env.normalization,
                                       env.normalization_action, use_gpu)
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_parameters = params["shared_training"]
        if isinstance(training_parameters, dict):
            training_parameters = DDPGTrainingParameters(**training_parameters)

        actor_parameters = params["actor_training"]
        if isinstance(actor_parameters, dict):
            actor_parameters = DDPGNetworkParameters(**actor_parameters)

        critic_parameters = params["critic_training"]
        if isinstance(critic_parameters, dict):
            critic_parameters = DDPGNetworkParameters(**critic_parameters)

        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=training_parameters,
            actor_training=actor_parameters,
            critic_training=critic_parameters,
        )

        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)

        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )

    elif model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        trainer_params = SACModelParameters(
            rl=rl_parameters,
            training=SACTrainingParameters(
                minibatch_size=params["sac_training"]["minibatch_size"],
                use_2_q_functions=params["sac_training"]["use_2_q_functions"],
                q_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["q_network_optimizer"]),
                value_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["value_network_optimizer"]),
                actor_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["actor_network_optimizer"]),
                entropy_temperature=params["sac_training"]["entropy_temperature"],
            ),
            q_network=FeedForwardParameters(**params["sac_q_training"]),
            value_network=FeedForwardParameters(
                **params["sac_value_training"]),
            actor_network=FeedForwardParameters(
                **params["sac_actor_training"]),
        )
        trainer = get_sac_trainer(env, trainer_params, use_gpu)

    else:
        raise NotImplementedError(
            "Model of type {} not supported".format(model_type))

    return trainer
Example 21
def train_network(params):
    logger.info("Running Parametric DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"])

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"])
    else:
        in_training_cpe_parameters = None

    trainer_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    trainer = ParametricDQNTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, False)
    action_preprocessor = Preprocessor(action_normalization, False)

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            None,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
            metrics_to_score=trainer.metrics_to_score,
        )
    else:
        evaluator = Evaluator(
            None,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
            metrics_to_score=trainer.metrics_to_score,
        )

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch,
                                   params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor,
                batch,
                action_preprocessor=action_preprocessor)

            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

            evaluator.collect_parametric_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers.cpu().numpy(),
                logged_actions=tdp.actions.cpu().numpy(),
                logged_possible_actions_mask=tdp.possible_actions_mask.cpu().numpy(),
                logged_rewards=tdp.rewards.cpu().numpy(),
                logged_propensities=tdp.propensities.cpu().numpy(),
                logged_terminals=(1.0 - tdp.not_terminal),
                possible_state_actions=tdp.possible_actions_state_concat.cpu().numpy(),
                num_possible_actions=tdp.possible_actions_mask.shape[1],
                # Dummy until Parametric CPE on metrics is implemented
                metrics=tdp.rewards.cpu().numpy(),
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe(trainer_params.rl.gamma)
        evaluator.clear_collected_samples()
        logger.info("CPE evaluation took {} seconds.".format(time.time() -
                                                             cpe_start_time))

    through_put = (len(dataset) * params["epochs"]) / (time.time() -
                                                       start_time)
    logger.info("Training finished. Processed ~{} examples / s.".format(
        round(through_put)))

    return export_trainer_and_predictor(trainer, params["model_output_path"])