def test_rollout_dict_space(self):
    register_env("nested", lambda _: NestedDictEnv())
    agent = PGTrainer(env="nested", config={"framework": "tf"})
    agent.train()
    path = agent.save()
    agent.stop()

    # Test train works on restore
    agent2 = PGTrainer(env="nested", config={"framework": "tf"})
    agent2.restore(path)
    agent2.train()

    # Test rollout works on restore
    rollout(agent2, "nested", 100)
def testRolloutDictSpace(self):
    register_env("nested", lambda _: NestedDictEnv())
    agent = PGTrainer(env="nested")
    agent.train()
    path = agent.save()
    agent.stop()

    # Test train works on restore
    agent2 = PGTrainer(env="nested")
    agent2.restore(path)
    agent2.train()

    # Test rollout works on restore
    rollout(agent2, "nested", 100)
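
# `NestedDictEnv` is not defined in the snippets above; it is provided
# elsewhere in the test file. A minimal sketch of what such an environment
# might look like, assuming the classic `gym` API; the space layout, reward,
# and horizon below are illustrative assumptions, not the actual test env.
import gym
from gym.spaces import Box, Dict, Discrete


class NestedDictEnv(gym.Env):
    """Hypothetical env whose observations live in a nested Dict space."""

    def __init__(self):
        self.observation_space = Dict({
            "sensors": Dict({
                "position": Box(low=-1.0, high=1.0, shape=(3,)),
                "velocity": Box(low=-1.0, high=1.0, shape=(3,)),
            }),
            "inner_state": Discrete(4),
        })
        self.action_space = Discrete(2)
        self.steps = 0

    def reset(self):
        self.steps = 0
        return self.observation_space.sample()

    def step(self, action):
        self.steps += 1
        done = self.steps >= 10  # short fixed horizon keeps the test fast
        return self.observation_space.sample(), 1.0, done, {}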
len_moving_average = np.convolve(
    episode_len_mean, np.ones((20,)) / 20, mode='valid')
reward_moving_average = np.convolve(
    episode_reward_mean, np.ones((20,)) / 20, mode='valid')
print('Current ::: Len:: Mean: ' + str(episode_len_mean[-1]) +
      '; Reward:: Mean: ' + str(episode_reward_mean[-1]) +
      ', Max: ' + str(episode_reward_max[-1]) +
      ', Min: ' + str(episode_reward_min[-1]))
print('mAverage20 ::: Len:: Mean: ' +
      str(np.round(len_moving_average[-1], 1)) +
      '; Reward:: Mean: ' + str(np.round(reward_moving_average[-1], 1)))
if result['training_iteration'] % 50 == 0:
    checkpoint = PG_trainer.save()
    print("checkpoint saved at", checkpoint)
output = {
    'episode_len_mean': episode_len_mean,
    'episode_reward_mean': episode_reward_mean,
    'episode_reward_max': episode_reward_max,
    'episode_reward_min': episode_reward_min,
    'num_steps_trained': num_steps_trained,
    'clock_time': clock_time,
    'training_iteration': training_iteration,
    'len_moving_average': len_moving_average,
    'reward_moving_average': reward_moving_average
}
output_path = PG_trainer._logdir + '/_running_results.pkl'
with open(output_path, 'wb') as handle:
    pickle.dump(output, handle, protocol=pickle.HIGHEST_PROTOCOL)
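
# The block above assumes a surrounding driver loop that accumulates
# per-iteration stats from the `result` dict returned by `PG_trainer.train()`.
# A minimal sketch of such a loop, assuming RLlib's standard result keys, an
# older `ray.rllib.agents.pg` import path, and an illustrative env choice;
# none of these specifics come from the snippet itself.
import pickle

import numpy as np
import ray
from ray.rllib.agents.pg import PGTrainer  # assumed import path (older RLlib)

ray.init()
PG_trainer = PGTrainer(env="CartPole-v0")  # illustrative env

episode_len_mean, episode_reward_mean = [], []
episode_reward_max, episode_reward_min = [], []
num_steps_trained, clock_time, training_iteration = [], [], []

for _ in range(1000):  # illustrative iteration budget
    result = PG_trainer.train()
    episode_len_mean.append(result['episode_len_mean'])
    episode_reward_mean.append(result['episode_reward_mean'])
    episode_reward_max.append(result['episode_reward_max'])
    episode_reward_min.append(result['episode_reward_min'])
    num_steps_trained.append(result['info']['num_steps_trained'])
    clock_time.append(result['time_total_s'])
    training_iteration.append(result['training_iteration'])
    # ...followed by the logging/checkpointing block shown above.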
config={
    # Use a single process to avoid needing to set up a load balancer
    "num_workers": 0,
    # "multiagent": {
    #     # "grouping": grouping,
    #     "policies": {
    #         # the first tuple value is None -> uses default policy
    #         "function_1": (None, obs_space_1, action_space_1, {}),
    #         "function_2": (None, obs_space_2, action_space_2, {}),
    #     },
    #     "policy_mapping_fn":
    #         # tune.function(lambda agent_id: "agent_{}".format(agent_id + 1)),
    #         tune.function(lambda agent_id: "function_1"
    #                       if agent_id == "group_1" else "function_2"),
    # },
})

# Attempt to restore from checkpoint, if possible.
# if os.path.exists(CHECKPOINT_FILE):
#     checkpoint_path = open(CHECKPOINT_FILE).read()
#     print("Restoring from checkpoint path", checkpoint_path)
#     dqn.restore(checkpoint_path)

# Serving and training loop
while True:
    print(pretty_print(dqn.train()))
    checkpoint_path = dqn.save()
    print("Last checkpoint", checkpoint_path)
    with open(CHECKPOINT_FILE, "w") as f:
        f.write(checkpoint_path)
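
# The serving loop above presupposes an already-constructed trainer (`dqn`)
# and a `CHECKPOINT_FILE` path, neither of which appears in the snippet. A
# minimal sketch of that setup, assuming an older RLlib API; the env name
# and checkpoint-file path are illustrative assumptions.
import ray
from ray.rllib.agents.dqn import DQNTrainer  # assumed import path (older RLlib)
from ray.tune.logger import pretty_print

CHECKPOINT_FILE = "last_checkpoint.out"  # hypothetical path

ray.init()
dqn = DQNTrainer(
    env="CartPole-v0",  # illustrative env
    config={
        # Use a single process to avoid needing to set up a load balancer
        "num_workers": 0,
    })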