def test_minibatches_per_step(self):
    _epochs = self.epochs
    self.epochs = 2
    rl_parameters = RLParameters(
        gamma=0.95, target_update_rate=0.9, maxq_learning=True
    )
    rainbow_parameters = RainbowDQNParameters(
        double_q_learning=True, dueling_architecture=False
    )
    training_parameters1 = TrainingParameters(
        layers=self.layers,
        activations=self.activations,
        minibatch_size=1024,
        minibatches_per_step=1,
        learning_rate=0.25,
        optimizer="ADAM",
    )
    training_parameters2 = TrainingParameters(
        layers=self.layers,
        activations=self.activations,
        minibatch_size=128,
        minibatches_per_step=8,
        learning_rate=0.25,
        optimizer="ADAM",
    )
    env1 = Env(self.state_dims, self.action_dims)
    env2 = Env(self.state_dims, self.action_dims)
    model_parameters1 = DiscreteActionModelParameters(
        actions=env1.actions,
        rl=rl_parameters,
        rainbow=rainbow_parameters,
        training=training_parameters1,
    )
    model_parameters2 = DiscreteActionModelParameters(
        actions=env2.actions,
        rl=rl_parameters,
        rainbow=rainbow_parameters,
        training=training_parameters2,
    )

    # minibatch_size / 8 combined with minibatches_per_step * 8 should give
    # the same result
    logger.info("Training model 1")
    trainer1 = self._train(model_parameters1, env1)
    SummaryWriterContext._reset_globals()
    logger.info("Training model 2")
    trainer2 = self._train(model_parameters2, env2)

    weight1 = trainer1.q_network.fc.layers[-1].weight.detach().numpy()
    weight2 = trainer2.q_network.fc.layers[-1].weight.detach().numpy()

    # Due to numerical instability, this tolerance has to be fairly high
    self.assertTrue(np.allclose(weight1, weight2, rtol=0.0, atol=1e-3))
    self.epochs = _epochs
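# The test above leans on a standard equivalence: one optimizer step on a
# 1024-sample batch should match accumulating gradients over 8 sub-batches of
# 128 and stepping once. A minimal sketch of that accumulation in plain
# PyTorch (names here are illustrative, not the trainer's internals):
import torch

def step_with_accumulation(model, optimizer, loss_fn, batch, minibatches_per_step):
    # Accumulate grads over equal-sized chunks, then take a single step.
    optimizer.zero_grad()
    for chunk in torch.chunk(batch, minibatches_per_step):
        # Scale so the summed gradient equals the full-batch mean gradient
        # (exact only when the chunks are equal-sized).
        loss = loss_fn(model(chunk)) / minibatches_per_step
        loss.backward()
    optimizer.step()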
def test_trainer_maxq(self):
    env = Env(self.state_dims, self.action_dims)
    env.seed(42)
    maxq_parameters = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(
            gamma=0.99,
            target_update_rate=1.0,
            reward_burnin=100,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=self.minibatch_size,
            learning_rate=1.0,
            optimizer="ADAM",
        ),
    )
    maxq_trainer = DiscreteActionTrainer(maxq_parameters, env.normalization)
    # predictor = maxq_trainer.predictor()

    logger.info("Generating constant_reward MDPs..")
    (
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
    ) = env.generate_samples_discrete(self.num_samples)

    logger.info("Preprocessing constant_reward MDPs..")
    tdps = env.preprocess_samples_discrete(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        self.minibatch_size,
    )

    for epoch in range(self.epochs):
        logger.info("Training.. " + str(epoch))
        for tdp in tdps:
            maxq_trainer.train_numpy(tdp, None)
        logger.info(
            " ".join(
                [
                    "Training epoch",
                    str(epoch),
                    "average q values",
                    str(np.mean(workspace.FetchBlob(maxq_trainer.q_score_output))),
                    "td_loss",
                    str(workspace.FetchBlob(maxq_trainer.loss_blob)),
                ]
            )
        )

    # Q value should converge to very close to 100
    avg_q_value_after_training = np.mean(
        workspace.FetchBlob(maxq_trainer.q_score_output)
    )
    self.assertLess(avg_q_value_after_training, 101)
    self.assertGreater(avg_q_value_after_training, 99)
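# Sanity check on the bounds above: for an MDP paying a constant reward of 1.0
# per step (which the asserted range implies, though the generator's exact
# reward is an assumption here), the discounted return under gamma = 0.99 is
# the geometric series 1 / (1 - 0.99) = 100, so a converged Q network should
# average very close to 100.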
def get_sarsa_trainer_reward_boost(self, environment, reward_shape):
    rl_parameters = RLParameters(
        gamma=DISCOUNT,
        target_update_rate=1.0,
        reward_burnin=10,
        maxq_learning=False,
        reward_boost=reward_shape,
    )
    training_parameters = TrainingParameters(
        layers=[-1, -1],
        activations=["linear"],
        minibatch_size=self.minibatch_size,
        learning_rate=0.125,
        optimizer="ADAM",
    )
    return DiscreteActionTrainer(
        DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=RainbowDQNParameters(
                double_q_learning=True, dueling_architecture=False
            ),
            in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
        ),
        environment.normalization,
    )
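# A minimal sketch (an assumption about semantics, not the library's code) of
# what a per-action reward boost does: each logged reward is shifted by a
# constant keyed on the action taken, before TD targets are computed.
def apply_reward_boost(rewards, actions, reward_boost):
    """rewards: floats; actions: action names; reward_boost: dict name -> bonus."""
    return [r + reward_boost.get(a, 0.0) for r, a in zip(rewards, actions)]

# e.g. apply_reward_boost([1.0, 1.0], ["L", "R"], {"L": 100.0, "R": -100.0})
# -> [101.0, -99.0]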
def test_no_soft_update(self):
    model = Model()
    target_model = copy.deepcopy(model)

    # deepcopy yields distinct parameter tensors with identical values
    for param, target_param in zip(model.parameters(), target_model.parameters()):
        self.assertIsNot(param, target_param)

    optimizer = torch.optim.Adam(model.parameters())
    x = torch.tensor([1, 2], dtype=torch.int64)
    emb = model(x)

    loss = emb.sum()
    loss.backward()
    optimizer.step()

    params = list(model.parameters())
    self.assertEqual(1, len(params))
    param = params[0].detach().numpy()

    trainer = RLTrainer(
        DiscreteActionModelParameters(rl=RLParameters()), use_gpu=False
    )
    # tau = 1.0 makes the "soft" update a hard copy, so the target must end
    # up identical to the freshly optimized network
    trainer._soft_update(model, target_model, 1.0)

    target_params = list(target_model.parameters())
    self.assertEqual(1, len(target_params))
    target_param = target_params[0].detach().numpy()

    npt.assert_array_equal(target_param, param)
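# A minimal sketch of the Polyak-style soft update exercised above, assuming
# the usual convention target <- tau * online + (1 - tau) * target (names and
# placement are illustrative, not the library's internals). With tau = 1.0 the
# blend degenerates to a hard copy, matching the equality assertion above.
import torch

def soft_update(online_net, target_net, tau):
    # Blend each target parameter toward its online counterpart, in place.
    for online_p, target_p in zip(online_net.parameters(), target_net.parameters()):
        target_p.data.mul_(1.0 - tau).add_(online_p.data, alpha=tau)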
def get_sarsa_trainer_reward_boost(
    self,
    environment,
    reward_shape,
    dueling,
    use_gpu=False,
    use_all_avail_gpus=False,
):
    rl_parameters = RLParameters(
        gamma=DISCOUNT,
        target_update_rate=1.0,
        reward_burnin=10,
        maxq_learning=False,
        reward_boost=reward_shape,
    )
    training_parameters = TrainingParameters(
        layers=[-1, 128, -1] if dueling else [-1, -1],
        activations=["relu", "linear"] if dueling else ["linear"],
        minibatch_size=self.minibatch_size,
        learning_rate=0.05,
        optimizer="ADAM",
    )
    return DQNTrainer(
        DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=RainbowDQNParameters(
                double_q_learning=True, dueling_architecture=dueling
            ),
        ),
        environment.normalization,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
def get_sarsa_parameters(
    self, environment, reward_shape, dueling, categorical, clip_grad_norm
):
    rl_parameters = RLParameters(
        gamma=DISCOUNT,
        target_update_rate=1.0,
        maxq_learning=False,
        reward_boost=reward_shape,
    )
    training_parameters = TrainingParameters(
        layers=[-1, 128, -1] if dueling else [-1, -1],
        activations=["relu", "linear"] if dueling else ["linear"],
        minibatch_size=self.minibatch_size,
        learning_rate=0.05,
        optimizer="ADAM",
        clip_grad_norm=clip_grad_norm,
    )
    return DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=rl_parameters,
        training=training_parameters,
        rainbow=RainbowDQNParameters(
            double_q_learning=True,
            dueling_architecture=dueling,
            categorical=categorical,
        ),
    )
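# get_sarsa_parameters threads clip_grad_norm into TrainingParameters. A
# minimal sketch of what gradient-norm clipping does at each step, using
# PyTorch's built-in utility (the surrounding loop is illustrative):
import torch

def clipped_step(model, optimizer, loss, clip_grad_norm):
    optimizer.zero_grad()
    loss.backward()
    if clip_grad_norm is not None:
        # Rescale all gradients so their global L2 norm is at most clip_grad_norm.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
    optimizer.step()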
def test_trainer_maxq(self):
    env = Env(self.state_dims, self.action_dims)
    env.seed(42)
    maxq_parameters = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(
            gamma=0.99,
            target_update_rate=0.9,
            reward_burnin=100,
            maxq_learning=True,
        ),
        rainbow=RainbowDQNParameters(
            double_q_learning=True, dueling_architecture=False
        ),
        training=TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=self.minibatch_size,
            learning_rate=0.25,
            optimizer="ADAM",
        ),
    )
    maxq_trainer = DQNTrainer(maxq_parameters, env.normalization)

    logger.info("Generating constant_reward MDPs..")
    (
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_actions,
        possible_next_actions,
    ) = env.generate_samples_discrete(self.num_samples)

    logger.info("Preprocessing constant_reward MDPs..")
    for epoch in range(self.epochs):
        tdps = env.preprocess_samples_discrete(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_actions,
            possible_next_actions,
            self.minibatch_size,
        )
        logger.info("Training.. " + str(epoch))
        for tdp in tdps:
            maxq_trainer.train(tdp)
        logger.info(
            " ".join(
                [
                    "Training epoch",
                    str(epoch),
                    "average q values",
                    str(torch.mean(maxq_trainer.all_action_scores)),
                ]
            )
        )

    # Q value should converge to very close to 100
    avg_q_value_after_training = torch.mean(maxq_trainer.all_action_scores)
    self.assertLess(avg_q_value_after_training, 101)
    self.assertGreater(avg_q_value_after_training, 99)
def test_pure_q_learning_all_cheat(self):
    q_learning_parameters = DiscreteActionModelParameters(
        actions=self._env.ACTIONS,
        rl=self._rl_parameters_all_cheat_maxq,
        training=TrainingParameters(
            layers=[self._env.width * self._env.height, 1],
            activations=['linear'],
            minibatch_size=self.minibatch_size,
            learning_rate=0.05,
            optimizer='SGD',
            lr_policy='fixed',
        ),
    )
    trainer = DiscreteActionTrainer(
        q_learning_parameters,
        self._env.normalization,
    )
    predictor = trainer.predictor()

    policy = _build_policy(self._env, predictor, 1)
    initial_state = self._env.reset()
    iteration_result = _collect_samples(self._env, policy, 20000, initial_state)

    num_iterations = 50
    for _ in range(num_iterations):
        tdps = self._env.preprocess_samples(
            iteration_result.states,
            iteration_result.actions,
            iteration_result.rewards,
            iteration_result.next_states,
            iteration_result.next_actions,
            iteration_result.is_terminals,
            iteration_result.possible_next_actions,
            None,
            self.minibatch_size,
        )
        for tdp in tdps:
            trainer.train_numpy(tdp, None)
        initial_state = self._env.reset()
        policy = _build_policy(self._env, predictor, 0.1)
        iteration_result = _collect_samples(self._env, policy, 20000, initial_state)

    policy = _build_policy(self._env, predictor, 0)
    initial_state = self._env.reset()
    iteration_result = _collect_samples(self._env, policy, 1000, initial_state)

    # 100% should be cheat. Will fix in the future.
    self.assertGreater(np.sum(np.array(iteration_result.actions) == 'C'), 800)
def main(params):
    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    model_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )
    state_normalization = BaseWorkflow.read_norm_file(params["state_norm_data_path"])

    writer = SummaryWriter(log_dir=params["model_output_path"])
    logger.info("TensorBoard logging location is: {}".format(writer.log_dir))

    preprocess_handler = DqnPreprocessHandler(
        Preprocessor(state_normalization, False),
        np.array(model_params.actions),
        PandasSparseToDenseProcessor(),
    )

    workflow = DqnWorkflow(
        model_params,
        preprocess_handler,
        state_normalization,
        params["use_gpu"],
        params["use_all_avail_gpus"],
    )

    train_dataset = JSONDatasetReader(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    eval_dataset = JSONDatasetReader(params["eval_data_path"], batch_size=16)

    with summary_writer_context(writer):
        workflow.train_network(train_dataset, eval_dataset, int(params["epochs"]))

    exporter = DQNExporter(
        workflow.trainer.q_network,
        PredictorFeatureExtractor(
            state_normalization_parameters=state_normalization
        ),
        DiscreteActionOutputTransformer(model_params.actions),
    )

    return export_trainer_and_predictor(
        workflow.trainer, params["model_output_path"], exporter=exporter
    )  # noqa
def test_trainer_maxq(self):
    environment = Gridworld()
    maxq_sarsa_parameters = DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=[-1, 1],
            activations=["linear"],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer="ADAM",
        ),
    )
    # construct the new trainer that uses maxq
    maxq_trainer = DiscreteActionTrainer(
        maxq_sarsa_parameters, environment.normalization
    )

    samples = environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    evaluator = GridworldEvaluator(environment, True)

    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertGreater(evaluator.mc_loss[-1], 0.3)

    for _ in range(5):
        for tdp in tdps:
            maxq_trainer.train_numpy(tdp, None)

    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)
    self.assertGreater(
        evaluator.reward_doubly_robust[-1], evaluator.reward_doubly_robust[-2]
    )
def test_trainer_maxq(self):
    environment = Gridworld()
    maxq_sarsa_parameters = DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=[-1, 1],
            activations=['linear'],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer='ADAM',
        ),
    )
    # construct the new trainer that uses maxq
    maxq_trainer = DiscreteActionTrainer(
        maxq_sarsa_parameters,
        environment.normalization,
    )
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    evaluator = GridworldEvaluator(environment, True)
    print("Pre-Training eval", evaluator.evaluate(predictor))
    self.assertGreater(evaluator.evaluate(predictor), 0.3)

    for _ in range(2):
        for tdp in tdps:
            maxq_trainer.stream_tdp(tdp, None)
        evaluator.evaluate(predictor)

    print("Post-Training eval", evaluator.evaluate(predictor))
    self.assertLess(evaluator.evaluate(predictor), 0.1)
def train_network(params):
    logger.info("Running DQN workflow with params:")
    logger.info(params)

    action_names = np.array(params["actions"])

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    norm_data = JSONDataset(params["state_norm_data_path"])
    state_normalization = read_norm_params(norm_data.read_all())

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(trainer_params, state_normalization, params["use_gpu"])

    for epoch in range(params["epochs"]):
        for batch_idx in range(num_batches):
            helpers.report_training_status(
                batch_idx, num_batches, epoch, params["epochs"]
            )
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                action_names, batch, state_normalization
            )
            trainer.train(tdp)

    logger.info(
        "Training finished. Saving PyTorch model to {}".format(
            params["pytorch_output_path"]
        )
    )
    helpers.save_model_to_file(trainer, params["pytorch_output_path"])
def get_sarsa_trainer(self, environment):
    rl_parameters = RLParameters(
        gamma=DISCOUNT,
        target_update_rate=0.5,
        reward_burnin=10,
        maxq_learning=False,
    )
    training_parameters = TrainingParameters(
        layers=[-1, 1],
        activations=['linear'],
        minibatch_size=1024,
        learning_rate=0.01,
        optimizer='ADAM',
    )
    return DiscreteActionTrainer(
        environment.normalization,
        DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=rl_parameters,
            training=training_parameters,
        ),
    )
def test_sarsa_layer_validation(self):
    env = Gridworld()
    invalid_sarsa_params = DiscreteActionModelParameters(
        actions=env.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=False,
        ),
        training=TrainingParameters(
            layers=[-1, 3],
            activations=['linear'],
            minibatch_size=32,
            learning_rate=0.1,
            optimizer='SGD',
        ),
    )
    with self.assertRaises(Exception):
        # layers[-1] should be 1
        DiscreteActionTrainer(env.normalization, invalid_sarsa_params)
def main(args):
    parser = argparse.ArgumentParser(
        description="Train an RL net to play in an OpenAI Gym environment."
    )
    parser.add_argument(
        "-p", "--parameters", help="Path to JSON parameters file."
    )
    parser.add_argument(
        "-s",
        "--score-bar",
        help="Bar for averaged test scores.",
        type=float,
        default=None,
    )
    parser.add_argument(
        "-g",
        "--gpu_id",
        help="If set, will use GPU with specified ID. Otherwise will use CPU.",
        default=USE_CPU,
    )
    args = parser.parse_args(args)

    with open(args.parameters, 'r') as f:
        params = json.load(f)

    rl_settings = params['rl']
    training_settings = params['training']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']
    training_settings['gamma'] = training_settings['learning_rate_decay']
    del training_settings['learning_rate_decay']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])

    trainer_params = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(**rl_settings),
        training=TrainingParameters(**training_settings),
    )

    device = core.DeviceOption(
        caffe2_pb2.CPU if args.gpu_id == USE_CPU else caffe2_pb2.CUDA,
        args.gpu_id,
    )
    with core.DeviceScope(device):
        trainer = DiscreteActionTrainer(
            env.normalization, trainer_params, skip_normalization=True
        )
        return run(
            env,
            trainer,
            "{} test run".format(env_type),
            args.score_bar,
            **params["run_details"],
        )
def test_pure_q_learning_all_cheat(self):
    q_learning_parameters = DiscreteActionModelParameters(
        actions=self._env.ACTIONS,
        rl=self._rl_parameters_all_cheat_maxq,
        training=TrainingParameters(
            layers=[self._env.width * self._env.height, 1],
            activations=['linear'],
            minibatch_size=32,
            learning_rate=0.05,
            optimizer='SGD',
            lr_policy='fixed',
        ),
    )
    trainer = DiscreteActionTrainer(self._env.normalization, q_learning_parameters)
    predictor = trainer.predictor()

    policy = _build_policy(self._env, predictor, 1)
    initial_state = self._env.reset()
    iteration_result = _collect_samples(self._env, policy, 10000, initial_state)

    num_iterations = 50
    for _ in range(num_iterations):
        policy = _build_policy(self._env, predictor, 0)
        tdp = self._env.preprocess_samples(
            iteration_result.states,
            iteration_result.actions,
            iteration_result.rewards,
            iteration_result.next_states,
            iteration_result.next_actions,
            iteration_result.is_terminals,
            iteration_result.possible_next_actions,
            None,
        )
        trainer.stream_tdp(tdp, None)
        initial_state = iteration_result.current_state
        initial_state = self._env.reset()
        iteration_result = _collect_samples(self._env, policy, 10000, initial_state)

    self.assertTrue(np.all(np.array(iteration_result.actions) == 'C'))
def get_sarsa_trainer_reward_boost(self, environment, reward_shape):
    rl_parameters = RLParameters(
        gamma=DISCOUNT,
        target_update_rate=0.5,
        reward_burnin=10,
        maxq_learning=False,
        reward_boost=reward_shape,
    )
    training_parameters = TrainingParameters(
        layers=[-1, -1],
        activations=["linear"],
        minibatch_size=self.minibatch_size,
        learning_rate=0.01,
        optimizer="ADAM",
    )
    return DiscreteActionTrainer(
        DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=rl_parameters,
            training=training_parameters,
        ),
        environment.normalization,
    )
def test_trainer_maxq(self):
    env = Env(self.state_dims, self.action_dims)
    maxq_parameters = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(gamma=0.95, target_update_rate=0.9, maxq_learning=True),
        rainbow=RainbowDQNParameters(
            double_q_learning=True, dueling_architecture=False
        ),
        training=TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=1024,
            learning_rate=0.25,
            optimizer="ADAM",
        ),
    )

    # Q value should converge to very close to 20
    trainer = self._train(maxq_parameters, env)
    avg_q_value_after_training = torch.mean(trainer.all_action_scores)
    self.assertLess(avg_q_value_after_training, 22)
    self.assertGreater(avg_q_value_after_training, 18)
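# Same arithmetic as the constant-reward tests above: with gamma = 0.95 and an
# assumed constant reward of 1.0 per step, Q* = 1 / (1 - 0.95) = 20, which is
# why the assertions bracket the average Q value in (18, 22).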
def run_gym(params, score_bar, gpu_id):
    rl_settings = params['rl']
    training_settings = params['training']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']
    training_settings['gamma'] = training_settings['learning_rate_decay']
    del training_settings['learning_rate_decay']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])
    trainer_params = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(**rl_settings),
        training=TrainingParameters(**training_settings),
    )

    device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA,
        gpu_id,
    )
    with core.DeviceScope(device):
        if env.img:
            trainer = DiscreteActionConvTrainer(
                DiscreteActionConvModelParameters(
                    fc_parameters=trainer_params,
                    cnn_parameters=CNNModelParameters(**params['cnn']),
                    num_input_channels=env.num_input_channels,
                    img_height=env.height,
                    img_width=env.width,
                ),
                env.normalization,
            )
        else:
            trainer = DiscreteActionTrainer(
                trainer_params,
                env.normalization,
            )
        return run(
            env,
            trainer,
            "{} test run".format(env_type),
            score_bar,
            **params["run_details"],
        )
def run_gym(params, score_bar, gpu_id):
    rl_settings = params['rl']
    rl_settings['gamma'] = rl_settings['reward_discount_factor']
    del rl_settings['reward_discount_factor']

    env_type = params['env']
    env = OpenAIGymEnvironment(env_type, rl_settings['epsilon'])
    model_type = params['model_type']
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA,
        gpu_id,
    )

    if model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params['training']
            training_settings['gamma'] = training_settings['learning_rate_decay']
            del training_settings['learning_rate_decay']
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions,
                rl=RLParameters(**rl_settings),
                training=TrainingParameters(**training_settings),
            )
            if env.img:
                trainer = DiscreteActionConvTrainer(
                    DiscreteActionConvModelParameters(
                        fc_parameters=trainer_params,
                        cnn_parameters=CNNModelParameters(**params['cnn']),
                        num_input_channels=env.num_input_channels,
                        img_height=env.height,
                        img_width=env.width,
                    ),
                    env.normalization,
                )
            else:
                trainer = DiscreteActionTrainer(
                    trainer_params,
                    env.normalization,
                )
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params['training']
            training_settings['gamma'] = training_settings['learning_rate_decay']
            del training_settings['learning_rate_decay']
            trainer_params = ContinuousActionModelParameters(
                rl=RLParameters(**rl_settings),
                training=TrainingParameters(**training_settings),
                knn=KnnParameters(model_type='DQN'),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params['shared_training']
        training_settings['gamma'] = training_settings['learning_rate_decay']
        del training_settings['learning_rate_decay']
        actor_settings = params['actor_training']
        critic_settings = params['critic_training']
        trainer_params = DDPGModelParameters(
            rl=DDPGRLParameters(**rl_settings),
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )
        trainer = DDPGTrainer(
            trainer_params,
            EnvDetails(
                state_dim=env.state_dim,
                action_dim=env.action_dim,
                action_range=(env.action_space.low, env.action_space.high),
            ),
        )
    else:
        raise NotImplementedError(
            "Model of type {} not supported".format(model_type)
        )

    return run(
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
    )
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            if isinstance(training_parameters.cnn_parameters, dict):
                training_parameters.cnn_parameters = CNNParameters(
                    **training_parameters.cnn_parameters
                )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters, training=training_parameters, rainbow=rainbow_parameters
        )
        trainer = ParametricDQNTrainer(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_parameters = params["shared_training"]
        if isinstance(training_parameters, dict):
            training_parameters = DDPGTrainingParameters(**training_parameters)
        actor_parameters = params["actor_training"]
        if isinstance(actor_parameters, dict):
            actor_parameters = DDPGNetworkParameters(**actor_parameters)
        critic_parameters = params["critic_training"]
        if isinstance(critic_parameters, dict):
            critic_parameters = DDPGNetworkParameters(**critic_parameters)
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=training_parameters,
            actor_training=actor_parameters,
            critic_training=critic_parameters,
        )
        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)
        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )
    elif model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        trainer_params = SACModelParameters(
            rl=rl_parameters,
            training=SACTrainingParameters(
                minibatch_size=params["sac_training"]["minibatch_size"],
                use_2_q_functions=params["sac_training"]["use_2_q_functions"],
                q_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["q_network_optimizer"]
                ),
                value_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["value_network_optimizer"]
                ),
                actor_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["actor_network_optimizer"]
                ),
                entropy_temperature=params["sac_training"]["entropy_temperature"],
            ),
            q_network=FeedForwardParameters(**params["sac_q_training"]),
            value_network=FeedForwardParameters(**params["sac_value_training"]),
            actor_network=FeedForwardParameters(**params["sac_actor_training"]),
        )
        trainer = get_sac_trainer(env, trainer_params, use_gpu)
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer
def test_q_learning_limited(self):
    # TODO: This model oscillates pretty badly; will investigate in the future.
    target_cheat_percentage = 50
    epsilon = 0.2
    num_iterations = 30
    self.minibatch_size = 1024
    num_steps = self.minibatch_size * 10
    updates_per_iteration = 1

    q_learning_parameters = DiscreteActionModelParameters(
        actions=self._env.ACTIONS,
        rl=self._rl_parameters_maxq,
        training=TrainingParameters(
            layers=[-1, -1],
            activations=["linear"],
            minibatch_size=self.minibatch_size,
            learning_rate=0.05,
            optimizer="ADAM",
        ),
        action_budget=ActionBudget(
            limited_action="C",
            action_limit=target_cheat_percentage,
            quantile_update_rate=0.2,
            quantile_update_frequency=1,
            window_size=1000,
        ),
    )
    trainer = LimitedActionDiscreteActionTrainer(
        q_learning_parameters, self._env.normalization
    )
    predictor = trainer.predictor()
    policy = _build_policy(self._env, predictor, epsilon)
    initial_state = self._env.reset()

    for iteration in range(num_iterations):
        policy = _build_policy(self._env, predictor, epsilon)
        iteration_result = _collect_samples(
            self._env, policy, num_steps, initial_state
        )
        tdps = self._env.preprocess_samples(
            iteration_result.states,
            iteration_result.actions,
            iteration_result.propensities,
            iteration_result.rewards,
            iteration_result.next_states,
            iteration_result.next_actions,
            iteration_result.is_terminals,
            iteration_result.possible_next_actions,
            None,
            self.minibatch_size,
        )
        print(
            "iter: {} ({}), ratio: {}, steps to solve: {}, quantile: {}".format(
                iteration,
                num_steps,
                iteration_result.cheat_ratio,
                np.mean(iteration_result.lengths),
                trainer.quantile_value,
            )
        )
        initial_state = iteration_result.current_state
        for _ in range(updates_per_iteration):
            for tdp in tdps:
                trainer.train_numpy(tdp, None)

    state = self._env.reset()
    evaluation_results = _collect_samples(self._env, policy, 10000, state)
    print(
        np.sum(np.array(evaluation_results.lengths) <= 14)
        / len(evaluation_results.lengths)
    )
    optimality_ratio = np.sum(np.array(evaluation_results.lengths) <= 14) / len(
        evaluation_results.lengths
    )
    self.assertGreater(optimality_ratio, 0.5)

    accuracy = np.abs(
        evaluation_results.cheat_ratio - target_cheat_percentage / 100
    )
    print("ACCURACY", evaluation_results.cheat_ratio, target_cheat_percentage)
    # TODO: Would like to get this accuracy up in the future
    self.assertTrue(accuracy < 0.4)
def run_gym(params, score_bar, gpu_id, save_timesteps_to_dataset=None):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        params["max_replay_memory_size"],
    )
    model_type = params["model_type"]
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id
    )

    if model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params["shared_training"]
        actor_settings = params["actor_training"]
        critic_settings = params["critic_training"]
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )
        # DDPG can handle continuous and discrete action spaces
        if env.action_type == EnvType.CONTINUOUS_ACTION:
            action_range = env.action_space.high
        else:
            action_range = None
        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            use_gpu=False,
            action_range=action_range,
        )
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return run(
        c2_device,
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
    )
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(log_dir=params["model_output_path"])

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    eval_dataset = JSONDataset(params["eval_data_path"], batch_size=16)
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(
        trainer_params,
        state_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, False)

    evaluator = Evaluator(
        trainer_params.actions,
        trainer_params.rl.gamma,
        trainer,
        metrics_to_score=trainer.metrics_to_score,
    )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(
                batch_idx, num_batches, epoch, int(params["epochs"])
            )
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)
            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        eval_dataset.reset_iterator()
        accumulated_edp = None
        while True:
            batch = eval_dataset.read_batch(batch_idx)
            if batch is None:
                break
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            if isinstance(training_parameters.cnn_parameters, dict):
                training_parameters.cnn_parameters = CNNParameters(
                    **training_parameters.cnn_parameters
                )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = create_dqn_trainer_from_params(
            trainer_params, env.normalization, use_gpu
        )
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters, training=training_parameters, rainbow=rainbow_parameters
        )
        trainer = create_parametric_dqn_trainer_from_params(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.TD3.value:
        trainer_params = TD3ModelParameters(
            rl=rl_parameters,
            training=TD3TrainingParameters(
                minibatch_size=params["td3_training"]["minibatch_size"],
                q_network_optimizer=OptimizerParameters(
                    **params["td3_training"]["q_network_optimizer"]
                ),
                actor_network_optimizer=OptimizerParameters(
                    **params["td3_training"]["actor_network_optimizer"]
                ),
                use_2_q_functions=params["td3_training"]["use_2_q_functions"],
                exploration_noise=params["td3_training"]["exploration_noise"],
                initial_exploration_ts=params["td3_training"]["initial_exploration_ts"],
                target_policy_smoothing=params["td3_training"][
                    "target_policy_smoothing"
                ],
                noise_clip=params["td3_training"]["noise_clip"],
                delayed_policy_update=params["td3_training"]["delayed_policy_update"],
            ),
            q_network=FeedForwardParameters(**params["td3_q_training"]),
            actor_network=FeedForwardParameters(**params["td3_actor_training"]),
        )
        trainer = get_td3_trainer(env, trainer_params, use_gpu)
    elif model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        value_network = None
        value_network_optimizer = None
        alpha_optimizer = None
        if params["sac_training"]["use_value_network"]:
            value_network = FeedForwardParameters(**params["sac_value_training"])
            value_network_optimizer = OptimizerParameters(
                **params["sac_training"]["value_network_optimizer"]
            )
        if "alpha_optimizer" in params["sac_training"]:
            alpha_optimizer = OptimizerParameters(
                **params["sac_training"]["alpha_optimizer"]
            )
        entropy_temperature = params["sac_training"].get("entropy_temperature", None)
        target_entropy = params["sac_training"].get("target_entropy", None)
        trainer_params = SACModelParameters(
            rl=rl_parameters,
            training=SACTrainingParameters(
                minibatch_size=params["sac_training"]["minibatch_size"],
                use_2_q_functions=params["sac_training"]["use_2_q_functions"],
                use_value_network=params["sac_training"]["use_value_network"],
                q_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["q_network_optimizer"]
                ),
                value_network_optimizer=value_network_optimizer,
                actor_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["actor_network_optimizer"]
                ),
                entropy_temperature=entropy_temperature,
                target_entropy=target_entropy,
                alpha_optimizer=alpha_optimizer,
            ),
            q_network=FeedForwardParameters(**params["sac_q_training"]),
            value_network=value_network,
            actor_network=FeedForwardParameters(**params["sac_actor_training"]),
        )
        trainer = get_sac_trainer(env, trainer_params, use_gpu)
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(
            log_dir=os.path.join(
                os.path.expanduser(params["model_output_path"]), "training_data"
            )
        )

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"]
        )
    else:
        in_training_cpe_parameters = None

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(trainer_params, state_normalization, params["use_gpu"])
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, params["use_gpu"])

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            trainer_params.actions,
            10,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
        )
    else:
        evaluator = Evaluator(
            trainer_params.actions,
            10,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
        )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        for batch_idx in range(num_batches):
            report_training_status(
                batch_idx, num_batches, epoch, int(params["epochs"])
            )
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)

            trainer.train(tdp)

            trainer.evaluate(
                evaluator, tdp.actions, None, tdp.rewards, tdp.episode_values
            )

            evaluator.collect_discrete_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers.cpu().numpy(),
                states=tdp.states.cpu().numpy(),
                logged_actions=tdp.actions.cpu().numpy(),
                logged_rewards=tdp.rewards.cpu().numpy(),
                logged_propensities=tdp.propensities.cpu().numpy(),
                logged_terminals=np.invert(
                    tdp.not_terminals.cpu().numpy().astype(np.bool)
                ),
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe()
        if writer is not None:
            evaluator.log_to_tensorboard(writer, epoch)
        evaluator.clear_collected_samples()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
    batch_rl_file_path=None,
):
    # Caffe2 core uses the min of caffe2_log_level and minloglevel
    # to determine loglevel. See caffe2/caffe2/core/logging.cc for more info.
    core.GlobalInit(["caffe2", "--caffe2_log_level=2", "--minloglevel=2"])

    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        params["max_replay_memory_size"],
        rl_parameters.gamma,
    )
    model_type = params["model_type"]
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id
    )
    use_gpu = gpu_id != USE_CPU

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_settings = params["training"]
        training_parameters = TrainingParameters(**training_settings)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters = CNNParameters(
                **training_settings["cnn_parameters"]
            )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions, rl=rl_parameters, training=training_parameters
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)
    elif model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_settings = params["training"]
        training_parameters = TrainingParameters(**training_settings)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters = CNNParameters(
                **training_settings["cnn_parameters"]
            )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            knn=KnnParameters(model_type="DQN"),
        )
        trainer = ParametricDQNTrainer(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_settings = params["training"]
            training_parameters = TrainingParameters(**training_settings)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters = CNNParameters(
                    **training_settings["cnn_parameters"]
                )
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params["shared_training"]
        actor_settings = params["actor_training"]
        critic_settings = params["critic_training"]
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )
        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)
        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return run(
        c2_device,
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
        batch_rl_file_path=batch_rl_file_path,
    )
def single_process_main(gpu_index, *args):
    params = args[0]
    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    action_names = params["actions"]

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])

    model_params = DiscreteActionModelParameters(
        actions=action_names,
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
    )
    state_normalization = BaseWorkflow.read_norm_file(params["state_norm_data_path"])

    writer = SummaryWriter(log_dir=params["model_output_path"])
    logger.info("TensorBoard logging location is: {}".format(writer.log_dir))

    if params["use_all_avail_gpus"]:
        BaseWorkflow.init_multiprocessing(
            int(params["num_processes_per_node"]),
            int(params["num_nodes"]),
            int(params["node_index"]),
            gpu_index,
            params["init_method"],
        )

    workflow = DqnWorkflow(
        model_params,
        state_normalization,
        params["use_gpu"],
        params["use_all_avail_gpus"],
    )

    sorted_features, _ = sort_features_by_normalization(state_normalization)
    preprocess_handler = DiscreteDqnPreprocessHandler(
        action_names, PandasSparseToDenseProcessor(sorted_features)
    )

    train_dataset = JSONDatasetReader(
        params["training_data_path"],
        batch_size=training_parameters.minibatch_size,
        preprocess_handler=preprocess_handler,
    )
    eval_dataset = JSONDatasetReader(
        params["eval_data_path"],
        batch_size=training_parameters.minibatch_size,
        preprocess_handler=preprocess_handler,
    )

    with summary_writer_context(writer):
        workflow.train_network(train_dataset, eval_dataset, int(params["epochs"]))

    exporter = DQNExporter(
        workflow.trainer.q_network,
        PredictorFeatureExtractor(state_normalization_parameters=state_normalization),
        DiscreteActionOutputTransformer(model_params.actions),
    )

    if int(params["node_index"]) == 0 and gpu_index == 0:
        export_trainer_and_predictor(
            workflow.trainer, params["model_output_path"], exporter=exporter
        )  # noqa
def create_park_trainer(model_type, params, rl_parameters, use_gpu, env):
    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            if isinstance(training_parameters.cnn_parameters, dict):
                training_parameters.cnn_parameters = CNNParameters(
                    **training_parameters.cnn_parameters
                )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = create_park_dqn_trainer_from_params(
            model=trainer_params,
            normalization_parameters=env.normalization,
            use_gpu=use_gpu,
            env=env.env,
        )
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters, training=training_parameters, rainbow=rainbow_parameters
        )
        trainer = create_parametric_dqn_trainer_from_params(
            trainer_params,
            env.normalization,
            env.normalization_action,
            use_gpu,
            env=env.env,
        )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_parameters = params["shared_training"]
        if isinstance(training_parameters, dict):
            training_parameters = DDPGTrainingParameters(**training_parameters)
        actor_parameters = params["actor_training"]
        if isinstance(actor_parameters, dict):
            actor_parameters = DDPGNetworkParameters(**actor_parameters)
        critic_parameters = params["critic_training"]
        if isinstance(critic_parameters, dict):
            critic_parameters = DDPGNetworkParameters(**critic_parameters)
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=training_parameters,
            actor_training=actor_parameters,
            critic_training=critic_parameters,
        )
        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)
        state_dim = get_num_output_features(env.normalization)
        action_dim = get_num_output_features(env.normalization_action)

        # Build Actor Network
        actor_network = ActorNetModel(
            layers=(
                [state_dim] + trainer_params.actor_training.layers[1:-1] + [action_dim]
            ),
            activations=trainer_params.actor_training.activations,
            fl_init=trainer_params.shared_training.final_layer_init,
            state_dim=state_dim,
            action_dim=action_dim,
            use_gpu=use_gpu,
            use_all_avail_gpus=False,
        )

        # Build Critic Network; ensure dims match input state and scalar output
        critic_network = CriticNetModel(
            layers=[state_dim] + trainer_params.critic_training.layers[1:-1] + [1],
            activations=trainer_params.critic_training.activations,
            fl_init=trainer_params.shared_training.final_layer_init,
            state_dim=state_dim,
            action_dim=action_dim,
            use_gpu=use_gpu,
            use_all_avail_gpus=False,
        )

        trainer = DDPGTrainer(
            actor_network,
            critic_network,
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )
    elif model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        value_network = None
        value_network_optimizer = None
        if params["sac_training"]["use_value_network"]:
            value_network = FeedForwardParameters(**params["sac_value_training"])
            value_network_optimizer = OptimizerParameters(
                **params["sac_training"]["value_network_optimizer"]
            )
        trainer_params = SACModelParameters(
            rl=rl_parameters,
            training=SACTrainingParameters(
                minibatch_size=params["sac_training"]["minibatch_size"],
                use_2_q_functions=params["sac_training"]["use_2_q_functions"],
                use_value_network=params["sac_training"]["use_value_network"],
                q_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["q_network_optimizer"]
                ),
                value_network_optimizer=value_network_optimizer,
                actor_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["actor_network_optimizer"]
                ),
                entropy_temperature=params["sac_training"]["entropy_temperature"],
            ),
            q_network=FeedForwardParameters(**params["sac_q_training"]),
            value_network=value_network,
            actor_network=FeedForwardParameters(**params["sac_actor_training"]),
        )
        trainer = horizon_runner.get_sac_trainer(env, trainer_params, use_gpu)
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    c2_device = core.DeviceOption(caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU)

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)
    elif model_type == ModelType.DISCRETE_ACTION.value:
        with core.DeviceScope(c2_device):
            training_parameters = params["training"]
            if isinstance(training_parameters, dict):
                training_parameters = TrainingParameters(**training_parameters)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
                training_parameters.cnn_parameters.input_height = env.height
                training_parameters.cnn_parameters.input_width = env.width
                training_parameters.cnn_parameters.num_input_channels = (
                    env.num_input_channels
                )
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            knn=KnnParameters(model_type="DQN"),
            rainbow=rainbow_parameters,
        )
        trainer = ParametricDQNTrainer(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_parameters = params["training"]
            if isinstance(training_parameters, dict):
                training_parameters = TrainingParameters(**training_parameters)
            if env.img:
                assert (
                    training_parameters.cnn_parameters is not None
                ), "Missing CNN parameters for image input"
                training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            else:
                assert (
                    training_parameters.cnn_parameters is None
                ), "Extra CNN parameters for non-image input"
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_parameters = params["shared_training"]
        if isinstance(training_parameters, dict):
            training_parameters = DDPGTrainingParameters(**training_parameters)
        actor_parameters = params["actor_training"]
        if isinstance(actor_parameters, dict):
            actor_parameters = DDPGNetworkParameters(**actor_parameters)
        critic_parameters = params["critic_training"]
        if isinstance(critic_parameters, dict):
            critic_parameters = DDPGNetworkParameters(**critic_parameters)
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=training_parameters,
            actor_training=actor_parameters,
            critic_training=critic_parameters,
        )
        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)
        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer