def get_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    use_gpu=False,
    use_all_avail_gpus=False,
):
    rl_parameters = RLParameters(
        gamma=DISCOUNT,
        target_update_rate=1.0,
        reward_burnin=10,
        maxq_learning=False,
        reward_boost=reward_shape,
    )
    training_parameters = TrainingParameters(
        layers=[-1, 128, -1] if dueling else [-1, -1],
        activations=["relu", "linear"] if dueling else ["linear"],
        minibatch_size=self.minibatch_size,
        learning_rate=0.05,
        optimizer="ADAM",
    )
    return DQNTrainer(
        DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=RainbowDQNParameters(
                double_q_learning=True, dueling_architecture=dueling
            ),
            in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
        ),
        environment.normalization,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
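# A minimal sketch of how the reward-boost factory above might be exercised in a
# test. The environment helper and the shape of the reward-boost dict
# (action name -> scalar bonus) are illustrative assumptions, not taken from this file.
def test_reward_boost_sketch(self):
    environment = self._create_test_environment()  # hypothetical helper
    reward_boost = {action: 0.5 for action in environment.ACTIONS}
    trainer = self.get_sarsa_trainer_reward_boost(
        environment, reward_boost, dueling=False
    )
    self.assertIsNotNone(trainer)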
def get_sarsa_parameters_factorized(self):
    return ContinuousActionModelParameters(
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=1.0,
            reward_burnin=100,
            maxq_learning=False,
        ),
        training=TrainingParameters(
            # These are used by the reward network
            layers=[-1, 256, 128, -1],
            activations=["relu", "relu", "linear"],
            factorization_parameters=FactorizationParameters(
                state=FeedForwardParameters(
                    layers=[-1, 128, 64], activations=["relu", "linear"]
                ),
                action=FeedForwardParameters(
                    layers=[-1, 128, 64], activations=["relu", "linear"]
                ),
            ),
            minibatch_size=self.minibatch_size,
            learning_rate=0.03,
            optimizer="ADAM",
        ),
        rainbow=RainbowDQNParameters(
            double_q_learning=True, dueling_architecture=False
        ),
        in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
    )
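# For intuition, a standalone sketch of the architecture the factorization_parameters
# above describe: separate state and action towers of width [-1, 128, 64] producing
# embeddings of equal size. How the trainer combines the two embeddings is not shown
# in this file; the dot product used below is an assumption for illustration only.
import torch
import torch.nn as nn


class FactorizedQNetworkSketch(nn.Module):
    """Illustrative only: mirrors the state/action towers configured above."""

    def __init__(self, state_dim, action_dim, hidden=128, embed=64):
        super().__init__()
        self.state_tower = nn.Sequential(
            nn.Linear(state_dim, hidden), nn.ReLU(), nn.Linear(hidden, embed)
        )
        self.action_tower = nn.Sequential(
            nn.Linear(action_dim, hidden), nn.ReLU(), nn.Linear(hidden, embed)
        )

    def forward(self, state, action):
        # Assumed combination: dot product of the embeddings -> Q(s, a)
        s = self.state_tower(state)
        a = self.action_tower(action)
        return (s * a).sum(dim=1, keepdim=True)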
def get_ddpg_parameters(self):
    return DDPGModelParameters(
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=100,
            maxq_learning=True,
        ),
        shared_training=DDPGTrainingParameters(
            minibatch_size=self.minibatch_size,
            final_layer_init=0.003,
            optimizer="ADAM",
        ),
        actor_training=DDPGNetworkParameters(
            layers=[-1, 256, 128, -1],
            activations=["relu", "relu", "tanh"],
            learning_rate=0.05,
            l2_decay=0.01,
        ),
        critic_training=DDPGNetworkParameters(
            layers=[-1, 256, 256, 128, -1],
            activations=["relu", "relu", "relu", "linear"],
            learning_rate=0.05,
            l2_decay=0.01,
        ),
    )
def test_trainer_maxq(self):
    env = Env(self.state_dims, self.action_dims)
    env.seed(42)
    maxq_parameters = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(
            gamma=0.99,
            target_update_rate=1.0,
            reward_burnin=100,
            maxq_learning=True,
        ),
        rainbow=RainbowDQNParameters(
            double_q_learning=True, dueling_architecture=False
        ),
        training=TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=self.minibatch_size,
            learning_rate=1.0,
            optimizer="ADAM",
        ),
    )
    maxq_trainer = DQNTrainer(maxq_parameters, env.normalization)

    logger.info("Generating constant_reward MDPs..")
    states, actions, rewards, next_states, next_actions, is_terminal, possible_next_actions = env.generate_samples_discrete(
        self.num_samples
    )

    logger.info("Preprocessing constant_reward MDPs..")
    for epoch in range(self.epochs):
        tdps = env.preprocess_samples_discrete(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            self.minibatch_size,
        )
        logger.info("Training.. " + str(epoch))
        for tdp in tdps:
            maxq_trainer.train(tdp, None)
        logger.info(
            " ".join(
                [
                    "Training epoch",
                    str(epoch),
                    "average q values",
                    str(torch.mean(maxq_trainer.all_action_scores)),
                ]
            )
        )

    # Q value should converge to very close to 100
    avg_q_value_after_training = torch.mean(maxq_trainer.all_action_scores)
    self.assertLess(avg_q_value_after_training, 101)
    self.assertGreater(avg_q_value_after_training, 99)
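# Why the 100 target in the assertions above: with a constant per-step reward
# (assumed to be 1.0 from the "constant_reward" sample generator) and gamma = 0.99,
# the discounted return converges to the geometric-series limit r / (1 - gamma).
# A tiny sanity-check sketch of that arithmetic:
def test_expected_constant_reward_q_sketch(self):
    # Q = sum_{t >= 0} gamma**t * r = r / (1 - gamma) = 1.0 / 0.01 = 100
    gamma, reward = 0.99, 1.0
    self.assertAlmostEqual(reward / (1.0 - gamma), 100.0)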
def get_sac_parameters(self, use_2_q_functions=False):
    return SACModelParameters(
        rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.5, reward_burnin=100),
        training=SACTrainingParameters(
            minibatch_size=self.minibatch_size,
            use_2_q_functions=use_2_q_functions,
            q_network_optimizer=OptimizerParameters(),
            value_network_optimizer=OptimizerParameters(),
            actor_network_optimizer=OptimizerParameters(),
        ),
        q_network=FeedForwardParameters(
            layers=[128, 64], activations=["relu", "relu"]
        ),
        value_network=FeedForwardParameters(
            layers=[128, 64], activations=["relu", "relu"]
        ),
        actor_network=FeedForwardParameters(
            layers=[128, 64], activations=["relu", "relu"]
        ),
    )
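# A small sketch of how the two variants of the SAC factory above might be compared
# in a test; the assertions only touch fields that get_sac_parameters sets explicitly.
def test_sac_parameters_sketch(self):
    single_q = self.get_sac_parameters()
    twin_q = self.get_sac_parameters(use_2_q_functions=True)
    self.assertFalse(single_q.training.use_2_q_functions)
    self.assertTrue(twin_q.training.use_2_q_functions)
    self.assertEqual(twin_q.training.minibatch_size, self.minibatch_size)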
def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
    )
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    model_type = params["model_type"]
    use_gpu = gpu_id != USE_CPU

    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu)

    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, int(gpu_id)
    )
    return train_sgd(
        c2_device,
        env,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
    )
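# Illustrative only: a params dict with the keys run_gym reads above. The env id,
# model_type string, and run_details contents are assumptions; depending on the
# model_type, create_trainer will also expect its own model sections (for example a
# "training" block) that are omitted here.
EXAMPLE_GYM_PARAMS = {
    "env": "CartPole-v0",
    "model_type": "pytorch_discrete_dqn",  # assumed model_type label
    "max_replay_memory_size": 10000,
    "rl": {
        "gamma": 0.99,
        "target_update_rate": 0.2,
        "reward_burnin": 1,
        "maxq_learning": True,
        "epsilon": 0.05,
        "softmax_policy": 0,
    },
    "run_details": {},  # forwarded as keyword arguments to train_sgd
}
# run_gym(EXAMPLE_GYM_PARAMS, score_bar=None, gpu_id=USE_CPU)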
def train_network(params):
    logger.info("Running DDPG workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["shared_training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = DDPGTrainingParameters(**params["shared_training"])
    actor_parameters = DDPGNetworkParameters(**params["actor_training"])
    critic_parameters = DDPGNetworkParameters(**params["critic_training"])

    trainer_params = DDPGModelParameters(
        rl=rl_parameters,
        shared_training=training_parameters,
        actor_training=actor_parameters,
        critic_training=critic_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    min_action_range_tensor_serving, max_action_range_tensor_serving = construct_action_scale_tensor(
        action_normalization, trainer_params.action_rescale_map
    )

    trainer = DDPGTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        min_action_range_tensor_serving,
        max_action_range_tensor_serving,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, params["use_gpu"])
    action_preprocessor = Preprocessor(action_normalization, params["use_gpu"])

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch, params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor, batch, action_preprocessor=action_preprocessor
            )
            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

    through_put = (len(dataset) * params["epochs"]) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    return export_trainer_and_predictor(trainer, params["model_output_path"])
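# Illustrative only: the shape of the params dict the DDPG workflow above expects,
# assembled from the keys it reads. Paths are placeholders, and the hyperparameter
# values simply mirror get_ddpg_parameters earlier in this section.
EXAMPLE_DDPG_WORKFLOW_PARAMS = {
    "use_gpu": False,
    "use_all_avail_gpus": False,
    "epochs": 10,
    "training_data_path": "training_data.json",  # placeholder path
    "state_norm_data_path": "state_norm.json",  # placeholder path
    "action_norm_data_path": "action_norm.json",  # placeholder path
    "model_output_path": "/tmp/ddpg_model",  # placeholder path
    "rl": {
        "gamma": 0.99,
        "target_update_rate": 0.5,
        "reward_burnin": 100,
        "maxq_learning": True,
    },
    "shared_training": {
        "minibatch_size": 1024,
        "final_layer_init": 0.003,
        "optimizer": "ADAM",
    },
    "actor_training": {
        "layers": [-1, 256, 128, -1],
        "activations": ["relu", "relu", "tanh"],
        "learning_rate": 0.05,
        "l2_decay": 0.01,
    },
    "critic_training": {
        "layers": [-1, 256, 256, 128, -1],
        "activations": ["relu", "relu", "relu", "linear"],
        "learning_rate": 0.05,
        "l2_decay": 0.01,
    },
}
# train_network(EXAMPLE_DDPG_WORKFLOW_PARAMS)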
def train_network(params):
    logger.info("Running Parametric DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"]
        )
    else:
        in_training_cpe_parameters = None

    trainer_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = ParametricDQNTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, params["use_gpu"])
    action_preprocessor = Preprocessor(action_normalization, params["use_gpu"])

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            None,
            100,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
        )
    else:
        evaluator = Evaluator(
            None,
            100,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
        )

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch, params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor, batch, action_preprocessor=action_preprocessor
            )
            tdp.set_type(trainer.dtype)
            trainer.train(tdp, evaluator)

            evaluator.collect_parametric_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers.cpu().numpy(),
                logged_state_actions=np.concatenate(
                    (tdp.states.cpu().numpy(), tdp.actions.cpu().numpy()), axis=1
                ),
                logged_rewards=tdp.rewards.cpu().numpy(),
                logged_propensities=tdp.propensities.cpu().numpy(),
                logged_terminals=(1.0 - tdp.not_terminals),
                possible_state_actions=tdp.state_pas_concat.cpu().numpy(),
                pas_lens=tdp.possible_actions_lengths.cpu().numpy(),
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe(trainer_params.rl.gamma)
        evaluator.clear_collected_samples()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * params["epochs"]) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    return export_trainer_and_predictor(trainer, params["model_output_path"])