def test_gpus_in_non_local_mode(self):
    # Non-local mode.
    ray.init()

    actual_gpus = torch.cuda.device_count()
    print(f"Actual GPUs found (by torch): {actual_gpus}")

    config = A2C_DEFAULT_CONFIG.copy()
    config["num_workers"] = 2
    config["env"] = "CartPole-v0"

    # Expect errors when we run a config w/ num_gpus>0 w/o a GPU
    # and _fake_gpus=False.
    for num_gpus in [0, 0.1, 1, actual_gpus + 4]:
        # Only allow possible num_gpus_per_worker (so test would not
        # block infinitely due to a down worker).
        per_worker = [0] if actual_gpus == 0 or actual_gpus < num_gpus \
            else [0, 0.5, 1]
        for num_gpus_per_worker in per_worker:
            for fake_gpus in [False] + ([] if num_gpus == 0 else [True]):
                config["num_gpus"] = num_gpus
                config["num_gpus_per_worker"] = num_gpus_per_worker
                config["_fake_gpus"] = fake_gpus
                print(f"\n------------\nnum_gpus={num_gpus} "
                      f"num_gpus_per_worker={num_gpus_per_worker} "
                      f"_fake_gpus={fake_gpus}")

                frameworks = ("tf", "torch") if num_gpus > 1 else \
                    ("tf2", "tf", "torch")
                for _ in framework_iterator(config, frameworks=frameworks):
                    # Expect that Trainer creation causes a num_gpu error.
                    if actual_gpus < num_gpus + 2 * num_gpus_per_worker \
                            and not fake_gpus:
                        # "Direct" RLlib (create Trainer on the driver).
                        # Cannot run through ray.tune.run() as it would
                        # simply wait infinitely for the resources to
                        # become available.
                        print("direct RLlib")
                        self.assertRaisesRegex(
                            RuntimeError,
                            "Found 0 GPUs on your machine",
                            lambda: A2CTrainer(config, env="CartPole-v0"),
                        )
                    # If actual_gpus >= num_gpus or faked, expect no error.
                    else:
                        print("direct RLlib")
                        trainer = A2CTrainer(config, env="CartPole-v0")
                        trainer.stop()
                        # Cannot run through ray.tune.run() w/ fake GPUs
                        # as it would simply wait infinitely for the
                        # resources to become available (even though we
                        # wouldn't really need them).
                        if num_gpus == 0:
                            print("via ray.tune.run()")
                            tune.run(
                                "A2C",
                                config=config,
                                stop={"training_iteration": 0})

    ray.shutdown()
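# Hypothetical scaffold for the GPU test above (and test_gpus_in_local_mode
# below): both read as methods of a unittest.TestCase and assume roughly
# these Ray 1.x-era imports; exact module paths vary across Ray versions.
import unittest

import torch

import ray
from ray import tune
from ray.rllib.agents.a3c.a2c import A2CTrainer, A2C_DEFAULT_CONFIG
from ray.rllib.utils.test_utils import framework_iterator


class TestGPUs(unittest.TestCase):
    ...  # test_gpus_in_non_local_mode / test_gpus_in_local_mode go here.


if __name__ == "__main__":
    unittest.main()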
# Note: `config` below refers to a module-level config dict that must be
# defined before this function (it is captured as the default argument).
def a2cTraining(env='MiniGrid-TrapMazeS9N5-v0', config=config,
                iterations=10000):
    trainer = A2CTrainer(env=env, config=config)
    for it in range(iterations):
        print("it {}".format(it))
        print(trainer.train())
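# A minimal, hypothetical driver for a2cTraining() above. CartPole-v0 is
# substituted for the custom MiniGrid env so the sketch runs without any
# extra env registration; the import path matches Ray 1.x-era RLlib.
import ray
from ray.rllib.agents.a3c.a2c import A2C_DEFAULT_CONFIG

if __name__ == "__main__":
    ray.init()
    cfg = A2C_DEFAULT_CONFIG.copy()
    cfg["num_workers"] = 1
    a2cTraining(env="CartPole-v0", config=cfg, iterations=10)
    ray.shutdown()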
def test_gpus_in_local_mode(self):
    # Local mode.
    ray.init(local_mode=True)

    actual_gpus_available = torch.cuda.device_count()

    config = A2C_DEFAULT_CONFIG.copy()
    config["num_workers"] = 2
    config["env"] = "CartPole-v0"

    # Expect no errors in local mode.
    for num_gpus in [0, 0.1, 1, actual_gpus_available + 4]:
        print(f"num_gpus={num_gpus}")
        for fake_gpus in [False, True]:
            print(f"_fake_gpus={fake_gpus}")
            config["num_gpus"] = num_gpus
            config["_fake_gpus"] = fake_gpus
            frameworks = ("tf", "torch") if num_gpus > 1 else \
                ("tf2", "tf", "torch")
            for _ in framework_iterator(config, frameworks=frameworks):
                print("direct RLlib")
                trainer = A2CTrainer(config, env="CartPole-v0")
                trainer.stop()
                print("via ray.tune.run()")
                tune.run(
                    "A2C", config=config,
                    stop={"training_iteration": 0})

    ray.shutdown()
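# Standalone illustration of the _fake_gpus switch the tests above exercise:
# with _fake_gpus=True, RLlib partitions batches as if `num_gpus` devices
# existed but runs the updates on CPU. A minimal sketch, assuming a
# Ray 1.x-era import path.
import ray
from ray.rllib.agents.a3c.a2c import A2CTrainer, A2C_DEFAULT_CONFIG

ray.init()
config = A2C_DEFAULT_CONFIG.copy()
config.update({"num_gpus": 2, "_fake_gpus": True, "framework": "torch"})
trainer = A2CTrainer(config=config, env="CartPole-v0")
print(trainer.train()["episode_reward_mean"])
trainer.stop()
ray.shutdown()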
if train_algo == "A2C":
    # A2C
    print("Training algorithm: A2C")
    trainer = A2CTrainer(
        env=env_title,
        config={
            "num_workers": num_workers,
            "num_cpus_per_worker": num_cpus_per_worker,
            "num_gpus": num_gpus,
            "num_gpus_per_worker": num_gpus_per_worker,
            "model": nw_model,
            "lr": lr,
            "gamma": gamma,
            "lambda": lambda_trainer,
            # Old multiagent API: "policy_graphs" was renamed to
            # "policies" in later Ray versions.
            "multiagent": {
                "policy_graphs": policy_graphs,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": [
                    "agent_policy{}".format(i) for i in range(n_agents)
                ],
            },
            # Old callbacks API (dict of tune.function-wrapped callables).
            "callbacks": {
                "on_episode_start": tune.function(on_episode_start),
                "on_episode_step": tune.function(on_episode_step),
                "on_episode_end": tune.function(on_episode_end),
            },
            "log_level": "ERROR",
        })
elif train_algo == "DDPG":
    # Deep Deterministic Policy Gradient (DDPG)
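# Hypothetical construction of the multiagent pieces the A2C config above
# assumes. In the old API, "policy_graphs" maps policy ids to
# (policy_cls, obs_space, act_space, config) tuples, with None selecting
# the trainer's default policy. The spaces, n_agents, and agent-id format
# here are placeholders.
import numpy as np
from gym.spaces import Box, Discrete

n_agents = 2
obs_space = Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32)
act_space = Discrete(2)

policy_graphs = {
    "agent_policy{}".format(i): (None, obs_space, act_space, {})
    for i in range(n_agents)
}

def policy_mapping_fn(agent_id):
    # Assumes env agent ids like "agent_0" -> "agent_policy0".
    return "agent_policy{}".format(str(agent_id).split("_")[-1])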
import math

# Depending on the Ray version, these ops live in ray.rllib.execution.*
# (as imported here) or, in the earliest pipeline prototypes, in
# ray.rllib.utils.experimental_dsl.
from ray.rllib.execution.rollout_ops import ParallelRollouts, ConcatBatches
from ray.rllib.execution.train_ops import (
    ApplyGradients, AverageGradients, ComputeGradients, TrainOneStep)
from ray.rllib.execution.metric_ops import StandardMetricsReporting


def training_pipeline(workers, config):
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if config["microbatch_size"]:
        num_microbatches = math.ceil(
            config["train_batch_size"] / config["microbatch_size"])
        # In microbatch mode, we want to compute gradients on experience
        # microbatches, average a number of these microbatches, and then
        # apply the averaged gradient in one SGD step. This conserves GPU
        # memory, allowing for extremely large experience batches to be
        # used.
        train_op = (
            rollouts.combine(
                ConcatBatches(
                    min_batch_size=config["microbatch_size"]))
            .for_each(ComputeGradients(workers))  # (grads, info)
            .batch(num_microbatches)  # List[(grads, info)]
            .for_each(AverageGradients())  # (avg_grads, info)
            .for_each(ApplyGradients(workers)))
    else:
        # In normal mode, we execute one SGD step per train batch.
        train_op = rollouts \
            .combine(ConcatBatches(
                min_batch_size=config["train_batch_size"])) \
            .for_each(TrainOneStep(workers))

    return StandardMetricsReporting(train_op, workers, config)


A2CPipeline = A2CTrainer.with_updates(training_pipeline=training_pipeline)
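# Hypothetical usage of the A2CPipeline variant defined above; the batch
# sizes are illustrative. With microbatch_size set, gradients are computed
# on 500-sample chunks and averaged over ceil(4000 / 500) = 8 microbatches
# before each apply step.
import ray

ray.init()
pipeline_trainer = A2CPipeline(
    env="CartPole-v0",
    config={"train_batch_size": 4000, "microbatch_size": 500})
print(pipeline_trainer.train())
pipeline_trainer.stop()
ray.shutdown()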
# SyncSamplesOptimizer comes from the old (pre-execution-plan) optimizer
# API, i.e. `from ray.rllib.optimizers import SyncSamplesOptimizer`.
def choose_policy_optimizer_modified(workers, config):
    if config["microbatch_size"]:
        raise ValueError("microbatch_size is not supported by ANA2C")
    else:
        return SyncSamplesOptimizer(
            workers,
            train_batch_size=config["train_batch_size"],
            standardize_fields=["advantages"]  # <<== Here!
        )


# ANA3CTFPolicy and get_policy_class_modified are assumed to be defined
# elsewhere in the source project.
ANA2CTrainer = A2CTrainer.with_updates(
    name="ANA2C",
    make_policy_optimizer=choose_policy_optimizer_modified,
    default_policy=ANA3CTFPolicy,
    get_policy_class=get_policy_class_modified)


if __name__ == '__main__':
    import yaml
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--file", "-f", type=str, default="")
    parser.add_argument("--num-samples", type=int, default=5)
    args = parser.parse_args()

    if args.file:
        with open(args.file, "r") as f:
            config = yaml.safe_load(f)
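# What standardize_fields=["advantages"] accomplishes, in isolation: each
# train batch's advantage column is z-scored before the SGD step. An
# illustrative numpy sketch of that normalization (not RLlib's own code):
import numpy as np

def standardize(advantages, eps=1e-4):
    adv = np.asarray(advantages, dtype=np.float32)
    return (adv - adv.mean()) / max(eps, adv.std())

print(standardize([1.0, 2.0, 3.0, 4.0]))  # zero mean, ~unit std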