from functools import partial

from ray.rllib.agents.pg import PGTrainer
from ray.tune.logger import UnifiedLogger


def get_pg_train(name, pols, env, logdir, gamma, shape, lr, batch_size):
    config = {
        "gamma": gamma,
        "sample_batch_size": batch_size,
        # 0.01 is too high and 0.0001 too low; 0.001 seems to work
        # (the first 5 runs used 0.005, the rest 0.001).
        "lr": lr,
        "multiagent": {
            "policies": pols,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": [name],
        },
        "model": {
            "fcnet_activation": "tanh",
            # Sizes of the hidden layers of the fully connected net.
            "fcnet_hiddens": shape,
        },
        # Disable filters; otherwise we would need to synchronize those
        # as well to the DQN agent.
        "observation_filter": "NoFilter",
        "callbacks": {
            "on_episode_end": partial(on_episode_end, gamma=gamma),
        },
    }
    return PGTrainer(
        env=env,
        config=config,
        logger_creator=lambda _: UnifiedLogger(config, logdir))
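# Usage sketch for get_pg_train (illustration only). The spaces, the env id,
# and the bodies of policy_mapping_fn / on_episode_end -- helpers the config
# above references but this section does not define -- are assumptions; only
# the call signature comes from the function itself.
from gym.spaces import Box, Discrete


def policy_mapping_fn(agent_id):
    # Hypothetical mapping: route every agent to the single learned policy.
    return "pg_policy"


def on_episode_end(info, gamma):
    # Hypothetical callback body: stash gamma as a custom metric so the
    # old-style callbacks dict above has something concrete to call.
    info["episode"].custom_metrics["gamma_used"] = gamma


obs_space = Box(-1.0, 1.0, (4,))  # hypothetical observation space
act_space = Discrete(2)           # hypothetical action space
# None as the policy class selects the trainer's default (PG) policy.
pols = {"pg_policy": (None, obs_space, act_space, {})}

trainer = get_pg_train(
    name="pg_policy", pols=pols, env="my_multiagent_env",  # assumed env id
    logdir="/tmp/pg_run", gamma=0.99, shape=[64, 64],
    lr=0.001, batch_size=200)
print(trainer.train()["episode_reward_mean"])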
import tensorflow as tf
from ray.rllib.agents.pg import PGTrainer, PGTFPolicy


def run_with_custom_entropy_loss():
    """Example of customizing the loss function of an existing policy.

    This performs about the same as the default loss does."""

    # Older RLlib loss_fn signature: (policy, batch_tensors).
    def entropy_policy_gradient_loss(policy, batch_tensors):
        actions = batch_tensors["actions"]
        advantages = batch_tensors["advantages"]
        return (-0.1 * policy.action_dist.entropy() -
                tf.reduce_mean(policy.action_dist.logp(actions) * advantages))

    EntropyPolicy = PGTFPolicy.with_updates(
        loss_fn=entropy_policy_gradient_loss)
    EntropyLossPG = PGTrainer.with_updates(
        name="EntropyPG", get_policy_class=lambda _: EntropyPolicy)
    run_heuristic_vs_learned(use_lstm=True, trainer=EntropyLossPG)
def run_with_custom_entropy_loss():
    """Example of customizing the loss function of an existing policy.

    This performs about the same as the default loss does."""

    # Newer RLlib loss_fn signature: (policy, model, dist_class, train_batch).
    def entropy_policy_gradient_loss(policy, model, dist_class, train_batch):
        logits, _ = model.from_batch(train_batch)
        action_dist = dist_class(logits, model)
        return (-0.1 * action_dist.entropy() - tf.reduce_mean(
            action_dist.logp(train_batch["actions"]) *
            train_batch["advantages"]))

    EntropyPolicy = PGTFPolicy.with_updates(
        loss_fn=entropy_policy_gradient_loss)
    EntropyLossPG = PGTrainer.with_updates(
        name="EntropyPG", get_policy_class=lambda _: EntropyPolicy)
    run_heuristic_vs_learned(use_lstm=True, trainer=EntropyLossPG)
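# Standalone numeric sketch of the same objective outside RLlib, to make the
# math concrete: loss = -0.1 * H(pi(.|s)) - mean(log pi(a|s) * A). The toy
# logits, actions, and advantages below are made up for illustration.
import tensorflow as tf

logits = tf.constant([[2.0, 0.5], [0.1, 1.2], [0.7, 0.7]])  # 3 states, 2 actions
actions = tf.constant([0, 1, 0])
advantages = tf.constant([1.0, -0.5, 0.25])

logp_all = tf.nn.log_softmax(logits)                   # log pi(.|s)
probs = tf.exp(logp_all)
entropy = -tf.reduce_sum(probs * logp_all, axis=-1)    # H(pi(.|s)) per state
logp_act = tf.gather(logp_all, actions, batch_dims=1)  # log pi(a|s)

# Like the RLlib variant above, this is a per-state vector (the entropy term
# is not averaged); the optimizer minimizes its sum.
loss = -0.1 * entropy - tf.reduce_mean(logp_act * advantages)
print(loss.numpy())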
if args.run == "DQN":
    agent = DQNTrainer(
        env="srv",
        config={
            "num_workers": 0,
            "env_config": {
                # Use the connector server to generate experiences.
                "input": (
                    lambda ioctx: PolicyServerInput(
                        ioctx, SERVER_ADDRESS, SERVER_PORT)
                ),
                "observation_size": args.observation_size,
                "action_size": args.action_size,
            },
        })
elif args.run == "PG":
    agent = PGTrainer(
        env="srv",
        config={
            "num_workers": 0,
            "env_config": {
                # Use the connector server to generate experiences.
                "input": (
                    lambda ioctx: PolicyServerInput(
                        ioctx, SERVER_ADDRESS, SERVER_PORT)
                ),
                "observation_size": args.observation_size,
                "action_size": args.action_size,
            },
        })
else:
    raise ValueError("--run must be DQN or PG")

# Attempt to restore from a checkpoint, if possible.
if os.path.exists(args.checkpoint_file):
    checkpoint_path = open(args.checkpoint_file).read()
    print("Restoring from checkpoint path", checkpoint_path)
    agent.restore(checkpoint_path)
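# Client-side counterpart sketch, based on RLlib's PolicyClient API (the
# consumer of PolicyServerInput above). The gym env and the address are
# assumptions: SERVER_ADDRESS and SERVER_PORT are referenced above but not
# defined in this section, so localhost:9900 stands in for them here.
import gym
from ray.rllib.env.policy_client import PolicyClient

env = gym.make("CartPole-v0")  # assumed env; must match the server's spaces
client = PolicyClient("http://localhost:9900")

obs = env.reset()
episode_id = client.start_episode(training_enabled=True)
done = False
while not done:
    # Actions are computed server-side; rewards are logged back for training.
    action = client.get_action(episode_id, obs)
    obs, reward, done, info = env.step(action)
    client.log_returns(episode_id, reward)
client.end_episode(episode_id, obs)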