def get_apex_trainer(strategy):
    """Build an ``ApexTrainer`` for ``MountainCar`` configured for *strategy*.

    The base settings are taken from ``APEX_DEFAULT_CONFIG`` and then
    overridden below. *strategy* selects the additional overrides:

    * ``"with_dueling"``, ``"custom_reward_n_dueling"``,
      ``"curriculum_n_dueling"``: keep the default ``hiddens`` / ``dueling``
      settings; every other strategy sets ``hiddens=[]`` and
      ``dueling=False``.
    * ``"action_masking"``: registers ``ParametricActionsModel`` as
      ``"pa_model"`` and enables ``use_action_masking`` in the env config.
    * ``"custom_reward"`` / ``"custom_reward_n_dueling"``: env config asks for
      the ``"custom_reward"`` reward function.
    * ``"curriculum"`` / ``"curriculum_n_dueling"``: env config starts at
      lesson ``0``.
    * ``"demonstration"``: reads input from ``DEMO_DATA_DIR`` with
      exploration disabled.

    Any other value only receives the base settings.

    Returns:
        A ``(trainer, env_config)`` tuple, where ``env_config`` is the
        ``"env_config"`` entry of the config handed to the trainer.
    """
    import copy

    # Deep copy: a shallow ``.copy()`` would leave nested dicts (notably
    # "env_config", which is returned to the caller) shared with the
    # module-level default, so caller-side mutation would leak into it.
    config = copy.deepcopy(APEX_DEFAULT_CONFIG)
    config["env"] = MountainCar
    config["buffer_size"] = 1000000
    config["learning_starts"] = 10000
    config["target_network_update_freq"] = 50000
    config["rollout_fragment_length"] = 200
    config["timesteps_per_iteration"] = 10000
    config["num_gpus"] = 1
    config["num_workers"] = 20
    config["evaluation_num_workers"] = 10
    config["evaluation_interval"] = 1

    dueling_strategies = (
        "with_dueling",
        "custom_reward_n_dueling",
        "curriculum_n_dueling",
    )
    if strategy not in dueling_strategies:
        config["hiddens"] = []
        config["dueling"] = False

    if strategy == "action_masking":
        ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
        config["env_config"] = {"use_action_masking": True}
        config["model"] = {
            "custom_model": "pa_model",
        }
    elif strategy in ("custom_reward", "custom_reward_n_dueling"):
        config["env_config"] = {"reward_fun": "custom_reward"}
    elif strategy in ("curriculum", "curriculum_n_dueling"):
        config["env_config"] = {"lesson": 0}
    elif strategy == "demonstration":
        # Alternative kept from the original author: mix live sampling with
        # demonstrations via {"sampler": 0.7, DEMO_DATA_DIR: 0.3}.
        config["input"] = DEMO_DATA_DIR
        config["explore"] = False
        config["input_evaluation"] = []
        config["n_step"] = 1

    trainer = ApexTrainer(config=config)
    # NOTE(review): relies on the default config providing "env_config" for
    # strategies that do not set it above -- confirm against the RLlib version.
    return trainer, config["env_config"]
config['model']["conv_filters"] = filters_84x84 config['min_iter_time_s'] = 5 config['n_step'] = 2 config['target_network_update_freq'] = 0 config['timesteps_per_iteration'] = 50000 config['train_batch_size'] = 128 config['lr'] = 0.0050 # === Evaluation === config['evaluation_interval'] = 50 config['evaluation_num_episodes'] = 5 agent = ApexTrainer(config, "TetrisA-v2") reward = -999 epoch = 0 # This has a memory leak. After around 25 iterations it consumes all its object store memory # Then it explodes after being unable to put any more items into the object store. while reward < 200: result = agent.train() print(f'=========== RESULT {epoch} =================') result = dict(result) print(result) reward = result['episode_reward_mean'] if np.isnan(reward):
def main(argv):
    """Start a serving-and-training loop for the ``"srv"`` environment.

    Parses ``-n/--number-worker`` (default 2) from *argv*, initialises Ray,
    registers ``CartpoleServing`` as the ``"srv"`` env and builds either an
    ``ApexTrainer`` (when ``ALGORITHM == "APEX"``) or a ``DQNTrainer``.
    The policy graph is written to the trainer's log directory, the trainer
    is restored from the path stored in ``CHECKPOINT_FILE`` if that file
    exists, and then the function trains forever, saving a checkpoint and
    recording its path in ``CHECKPOINT_FILE`` after every iteration.

    Args:
        argv: Command-line arguments without the program name.

    Exits with status 2 on an unrecognised option and with status 0 after
    printing help for ``-h``. Never returns otherwise.
    """
    ModelCatalog.register_custom_model("my_model", MyModelClass)

    model = {
        # Custom model options
        "custom_model": "my_model",
        "custom_preprocessor": None,
        # Extra options to pass to the custom classes
        "custom_options": {},
        # Built-in options
        # Number of hidden layers for fully connected net
        "fcnet_hiddens": [256, 256, 256, 256],
    }

    num_workers = 2

    # Read out command line arguments
    try:
        opts, _args = getopt.getopt(argv, "hn:", ["number-worker="])
    except getopt.GetoptError:
        print('ray_server.py -n <number-worker>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('ray_server.py -n <number-worker>')
            print('-n --number-worker - number of worker to start')
            sys.exit()
        elif opt in ("-n", "--number-worker"):
            num_workers = int(arg)

    ray.init()
    print("[RAY] Initialized")
    register_env("srv", lambda _: CartpoleServing())

    if ALGORITHM == "APEX":
        dqn = ApexTrainer(
            env="srv",
            config={
                # Model
                "model": model,
                "gamma": 0.99,
                "noisy": False,
                "num_gpus": 1,

                # Evaluation
                # everything default, see dqn.py

                # Exploration
                "target_network_update_freq": 500000,
                # rest: everything default, see dqn.py

                # Replay buffer
                # Size of the replay buffer. Note that if async_updates is set, then
                # each worker will have a replay buffer of this size. default 50000
                "buffer_size": 2000000,
                # If True prioritized replay buffer will be used.
                "prioritized_replay": True,
                # many further parameters exist here, left untouched (see dqn.py)

                # Optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.0001,
                # Size of rollout batch
                # Default sample batch size (unroll length). Batches of this size are
                # collected from workers until train_batch_size is met. When using
                # multiple envs per worker, this is multiplied by num_envs_per_worker.
                "sample_batch_size": 4,
                # Training batch size, if applicable. Should be >= sample_batch_size.
                # Samples batches will be concatenated together to this size for training.
                "train_batch_size": 64,
                # How many steps of the model to sample before learning starts
                "learning_starts": 50000,

                # Parallelism
                "num_workers": num_workers,
                # Distribute epsilon over workers (default for apex)
                "per_worker_exploration": True,
                # Determine per worker which experience should be prioritized,
                # before giving those to the shared experience memory
                "worker_side_prioritization": True,

                # "schedule_max_timesteps": 100000,  # what does it do?
                # "timesteps_per_iteration": 25000,  # what does it do?
                # "min_iter_time_s": 30,  # what does it do?
            })
    else:
        dqn = DQNTrainer(
            env="srv",
            config={
                # Model
                # Multiple threads for workers! Set to False for debugging.
                # "sample_async": True,
                # "grad_clip": 0.5,
                "model": model,
                "gamma": 0.99,
                "noisy": False,
                "num_gpus": 1,

                # Whether to use dueling dqn
                "dueling": False,
                # Whether to use double dqn
                "double_q": False,

                # Evaluation
                # everything default, see dqn.py

                # Exploration
                "target_network_update_freq": 500000,
                # rest: everything default, see dqn.py

                # Replay buffer
                # Size of the replay buffer. Note that if async_updates is set, then
                # each worker will have a replay buffer of this size. default 50000
                "buffer_size": 2000000,
                # If True prioritized replay buffer will be used.
                "prioritized_replay": False,
                # many further parameters exist here, left untouched (see dqn.py)

                # Optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.0001,
                # Update the replay buffer with this many samples at once. Note that
                # this setting applies per-worker if num_workers > 1.
                # "sample_batch_size": 1024,
                # How many steps of the model to sample before learning starts
                "learning_starts": 50000,
                # Size of a batched sampled from replay buffer for training. Note that
                # if async_updates is set, then each worker returns gradients for a
                # batch of this size. (Minibatch size) Should be >= sample_batch_size.
                # Samples batches will be concatenated together to this size for training.
                "train_batch_size": 2048,

                # Parallelism
                # Number of workers for collecting samples with. This only makes sense
                # to increase if your environment is particularly slow to sample, or if
                # you're using the Async or Ape-X optimizers.
                "num_workers": num_workers,
                # Distribute epsilon over workers
                "per_worker_exploration": True,
                # Compute worker-side prioritization (False, because this was not
                # implemented in DQN)
                "worker_side_prioritization": False,
            })

    # Write policy graph to tensorboard (for debugging purposes)
    policy_graph = dqn.local_evaluator.policy_map["default_policy"].sess.graph
    writer = tf.summary.FileWriter(dqn._result_logger.logdir, policy_graph)
    writer.close()

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        # Read through a context manager so the handle is closed promptly
        # instead of being left to the garbage collector.
        with open(CHECKPOINT_FILE) as f:
            checkpoint_path = f.read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
def execution_plan(
    workers: WorkerSet,
    config: dict,
    **kwargs
) -> LocalIterator[dict]:
    """Return the execution plan produced by APEX-DQN.

    All arguments are forwarded unchanged to ``ApexTrainer.execution_plan``.
    """
    apex_plan = ApexTrainer.execution_plan(workers, config, **kwargs)
    return apex_plan
def training_iteration(self) -> ResultDict:
    """Run one training iteration by delegating to APEX-DQN.

    Calls ``ApexTrainer.training_iteration`` with this instance and returns
    its result unchanged.
    """
    iteration_result = ApexTrainer.training_iteration(self)
    return iteration_result
def setup(self, config: PartialTrainerConfigDict):
    """Set this trainer up by delegating to ``ApexTrainer.setup``.

    *config* is forwarded unchanged, and whatever ``ApexTrainer.setup``
    returns is returned to the caller.
    """
    setup_result = ApexTrainer.setup(self, config)
    return setup_result