config['target_network_update_freq'] = 0
config['timesteps_per_iteration'] = 50000
config['train_batch_size'] = 128
config['lr'] = 0.0050

# === Evaluation ===
config['evaluation_interval'] = 50
config['evaluation_num_episodes'] = 5

agent = ApexTrainer(config, "TetrisA-v2")

reward = -999
epoch = 0
# This has a memory leak. After around 25 iterations it consumes all of its
# object store memory, then it crashes once it can no longer put any more
# items into the object store.
while reward < 200:
    result = agent.train()
    print(f'=========== RESULT {epoch} =================')
    result = dict(result)
    print(result)
    reward = result['episode_reward_mean']
    if np.isnan(reward):
        reward = -999
    # Move to the next epoch
    epoch += 1
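One possible workaround for the leak, sketched below under the assumption that `config` is built as above: cap the object store when calling `ray.init` (`object_store_memory` is a standard `ray.init` argument), and periodically checkpoint, stop, and rebuild the trainer so accumulated objects are released along with the old worker processes. The 20-iteration restart interval is an arbitrary assumption, not a measured value.

import numpy as np
import ray
from ray.rllib.agents.dqn import ApexTrainer

# Cap the object store so a leak fails fast instead of consuming all RAM.
# 4 GB is an assumed value; size it for your machine.
ray.init(object_store_memory=4 * 1024 ** 3)

agent = ApexTrainer(config, "TetrisA-v2")
reward = -999
epoch = 0
while reward < 200:
    result = agent.train()
    reward = result['episode_reward_mean']
    if np.isnan(reward):
        reward = -999
    epoch += 1
    # Workaround: every 20 iterations (arbitrary), checkpoint the trainer,
    # tear it down, and restore into fresh worker processes.
    if epoch % 20 == 0:
        checkpoint = agent.save()
        agent.stop()
        agent = ApexTrainer(config, "TetrisA-v2")
        agent.restore(checkpoint)

Restoring from the checkpoint keeps the learned weights, so each restart only costs the time needed to relaunch the workers.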
def main(argv):
    ModelCatalog.register_custom_model("my_model", MyModelClass)

    model = {
        # custom model options
        "custom_model": "my_model",
        "custom_preprocessor": None,
        # Extra options to pass to the custom classes
        "custom_options": {},
        # built-in options
        # Number of hidden layers for the fully connected net
        "fcnet_hiddens": [256, 256, 256, 256],
    }

    num_workers = 2
    # Read the command line arguments.
    try:
        opts, args = getopt.getopt(argv, "hn:", ["number-worker="])
    except getopt.GetoptError:
        print('ray_server.py -n <number-worker>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('ray_server.py -n <number-worker>')
            print('-n --number-worker - number of workers to start')
            sys.exit()
        elif opt in ("-n", "--number-worker"):
            num_workers = int(arg)

    ray.init()
    print("[RAY] Initialized")
    register_env("srv", lambda _: CartpoleServing())

    if ALGORITHM == "APEX":
        dqn = ApexTrainer(
            env="srv",
            config={
                # model
                "model": model,
                "gamma": 0.99,
                "noisy": False,
                "num_gpus": 1,
                # evaluation: everything default, see dqn.py
                # exploration
                "target_network_update_freq": 500000,
                # rest: everything default, see dqn.py
                # replay buffer
                # Size of the replay buffer. Note that if async_updates is set,
                # each worker will have a replay buffer of this size. Default: 50000.
                "buffer_size": 2000000,
                # If True, a prioritized replay buffer will be used.
                "prioritized_replay": True,
                # Many parameters are left untouched here (see dqn.py).
                # optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.0001,
                # Default sample batch size (unroll length). Batches of this size
                # are collected from workers until train_batch_size is met. When
                # using multiple envs per worker, this is multiplied by
                # num_envs_per_worker.
                "sample_batch_size": 4,
                # Training batch size, if applicable. Should be >= sample_batch_size.
                # Sample batches are concatenated together to this size for training.
                "train_batch_size": 64,
                # How many steps of the model to sample before learning starts.
                "learning_starts": 50000,
                # parallelism
                "num_workers": num_workers,
                # distribute epsilon over the workers (default for Ape-X)
                "per_worker_exploration": True,
                # Each worker decides which of its experiences to prioritize
                # before handing them to the shared replay memory.
                "worker_side_prioritization": True,
                # "schedule_max_timesteps": 100000,  # what does this do?
                # "timesteps_per_iteration": 25000,  # what does this do?
                # "min_iter_time_s": 30,  # what does this do?
            })
    else:
        dqn = DQNTrainer(
            env="srv",
            config={
                # model
                # Several threads per worker; set to False for debugging.
                # "sample_async": True,
                # "grad_clip": 0.5,
                "model": model,
                "gamma": 0.99,
                "noisy": False,
                "num_gpus": 1,
                # Whether to use dueling DQN
                "dueling": False,
                # Whether to use double DQN
                "double_q": False,
                # evaluation: everything default, see dqn.py
                # exploration
                "target_network_update_freq": 500000,
                # rest: everything default, see dqn.py
                # replay buffer
                # Size of the replay buffer. Note that if async_updates is set,
                # each worker will have a replay buffer of this size. Default: 50000.
                "buffer_size": 2000000,
                # If True, a prioritized replay buffer will be used.
                "prioritized_replay": False,
                # Many parameters are left untouched here (see dqn.py).
                # optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.0001,
                # Update the replay buffer with this many samples at once. Note
                # that this setting applies per-worker if num_workers > 1.
                # "sample_batch_size": 1024,
                # How many steps of the model to sample before learning starts.
                "learning_starts": 50000,
                # Size of a batch sampled from the replay buffer for training.
                # Note that if async_updates is set, each worker returns
                # gradients for a batch of this size (minibatch size). Should be
                # >= sample_batch_size; sample batches are concatenated together
                # to this size for training.
                "train_batch_size": 2048,
                # parallelism
                # Number of workers for collecting samples with. This only makes
                # sense to increase if your environment is particularly slow to
                # sample, or if you're using the Async or Ape-X optimizers.
                "num_workers": num_workers,
                # distribute epsilon over the workers
                "per_worker_exploration": True,
                # Worker-side prioritization is disabled because plain DQN does
                # not implement it.
                "worker_side_prioritization": False,
            })

    # Write the policy graph to TensorBoard (for debugging purposes).
    policy_graph = dqn.local_evaluator.policy_map["default_policy"].sess.graph
    writer = tf.summary.FileWriter(dqn._result_logger.logdir, policy_graph)
    writer.close()

    # Attempt to restore from a checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
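The `CartpoleServing` environment registered above is not shown here. For context, a minimal sketch modeled on RLlib's classic cartpole_server.py example: an `ExternalEnv` whose `run()` method starts a `PolicyServer` that remote clients drive. The `SERVER_ADDRESS` and `SERVER_PORT` constants are assumptions for illustration.

import gym
import numpy as np
from ray.rllib.env.external_env import ExternalEnv
from ray.rllib.utils.policy_server import PolicyServer

SERVER_ADDRESS = "localhost"  # assumed value
SERVER_PORT = 9900            # assumed value

class CartpoleServing(ExternalEnv):
    def __init__(self):
        ExternalEnv.__init__(
            self,
            action_space=gym.spaces.Discrete(2),
            observation_space=gym.spaces.Box(
                low=-10, high=10, shape=(4,), dtype=np.float32))

    def run(self):
        # Serve policy queries from external clients; episodes are driven
        # by client calls rather than by an in-process simulator.
        print("Starting policy server at {}:{}".format(
            SERVER_ADDRESS, SERVER_PORT))
        server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT)
        server.serve_forever()

A remote client would then connect with `ray.rllib.utils.policy_client.PolicyClient`, calling `start_episode`, `get_action`, `log_returns`, and `end_episode` to feed experience through this server into the training loop above.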