def create_agents(self, pre_trained, path="backups/RL_agent6iter200000.zip"):
    dummy_agent_4 = DQN.load("backups/Lane_iter200000_lane4.zip")
    dummy_agent_6 = DQN.load(path)
    for index, partition in enumerate(self.env.get_partitions()):
        self.partitions.append(partition)
        if pre_trained:
            # Shallow copy: the cloned agents share the underlying network weights.
            if partition.num_lanes == 4:
                agent = copy.copy(dummy_agent_4)
            else:
                agent = copy.copy(dummy_agent_6)
            agent.env = partition
            agent.exploration_initial_eps = 0
            agent.exploration_final_eps = 0
        else:
            agent = DQN(MlpPolicy, partition, verbose=2, gamma=0.9,
                        exploration_fraction=0.6, exploration_final_eps=0)
        print("Agent added: ", index)
        self.agents.append(agent)
def run_test(self):
    env = CustomEnv(self.path_planner, self.behavior_planner, self.event)
    env = make_vec_env(lambda: env, n_envs=1)

    # Both scenarios currently load the same checkpoint.
    if self.event == Scenario.LANE_CHANGE:
        model = DQN.load(MODEL_LOAD_PATH)
    elif self.event == Scenario.PEDESTRIAN:
        model = DQN.load(MODEL_LOAD_PATH)

    obs = env.reset()
    count = 0
    success = 0
    while count < 500:
        done = False
        while not done:
            action, _ = model.predict(obs)
            print("Action taken:", RLDecision(action))
            obs, reward, done, info = env.step(action)
            # print("Reward", reward)
        count += 1
        if info[0]["success"]:
            success += 1
        print("Count ", count, "Success ", success,
              "Success Rate:", success * 100 / float(count), "%")
    print("Success Rate ", success / count, success, count)
def train_multiple(cfg, version, trained_model, double_agent=False):
    # double_agent refers to both agents having learned in the multi-agent environment
    if double_agent:
        gym_wrapper = MultiAgentCustomEnv(cfg)
        model_trained = DQN.load("{0}models/{1}".format(
            cfg["study_results"], trained_model), env=gym_wrapper)
    else:
        gym_wrapper = CustomEnv(cfg)
        model_trained = DQN.load("{0}models/{1}".format(
            cfg["study_results"], trained_model), env=gym_wrapper)

    gym_wrapper = MultiAgentCustomEnv(cfg, model_trained, single=not double_agent)
    model = DQN(MlpPolicy, gym_wrapper, verbose=1,
                double_q=cfg["double-dqn"],
                prioritized_replay=cfg["prioritized"],
                policy_kwargs=dict(dueling=cfg["dueling"]),
                exploration_fraction=cfg["exploration_frac"],
                tensorboard_log=cfg["study_results"] + "tensorboard/experiments/")
    model.learn(total_timesteps=cfg["timesteps"], tb_log_name=cfg["experiment_name"])
    model.save("{0}models/{2}-v{1}".format(cfg["study_results"], version,
                                           cfg["experiment_name"]))
def get_environment_figures(model, *, source_folder=TRAINED_MODEL_FOLDER_DOCKER, vector=False):
    if not vector:
        # Try the Docker folder first, then fall back to the local folder.
        try:
            model_to_load = source_folder + model
            trained_model = DQN.load(model_to_load, ENV_DISP)
        except Exception:
            try:
                source_folder = TRAINED_MODEL_FOLDER_LOCAL
                model_to_load = source_folder + model
                trained_model = DQN.load(model_to_load, ENV_DISP)
            except Exception as e:
                print("Failed to load model.")
                print("If the model is not inside the trained_model folder, "
                      "override source_folder to match the desired folder")
                print(str(e))
                os._exit(0)

        # Show the result of the training
        obs = ENV_DISP.reset()
        for episode in range(1):
            done = False
            while not done:
                action, _states = trained_model.predict(obs)
                obs, rewards, done, info = ENV_DISP.step(action)
        fig_xy = ENV_DISP.get_xy_plane_figure()
        fig_xz = ENV_DISP.get_xz_plane_figure()
        fig_3d = ENV_DISP.get_3d_figure()
        return fig_xy, fig_xz, fig_3d
    else:
        print("Vectorized env not implemented yet")
def launchAgent():
    from stable_baselines import DQN
    import Reinforcement_AI.env.c_seperate_env as sep_env
    from queue import Queue
    from threading import Thread

    minimap_env = sep_env.MinimapEnv()
    allenv = sep_env.AllEnv()

    minimap_model = DQN(
        "CnnPolicy",              # policy
        minimap_env,              # environment
        double_q=True,            # enable double Q-learning
        prioritized_replay=True,  # enable prioritized replay buffer
        verbose=0                 # suppress log output
    )
    allenv_model = DQN(
        "MlpPolicy",
        allenv,
        double_q=True,
        prioritized_replay=True,
        verbose=0
    )

    for i in range(100):
        if i != 0:
            minimap_model = DQN.load("KR_minimap_" + str(i))
            allenv_model = DQN.load("KR_allenv_" + str(i))

        que = Queue()
        minimap_model.set_env(minimap_env)
        allenv_model.set_env(allenv)

        # Train allenv_model in a worker thread (result returned via the queue)
        # while minimap_model trains on the main thread.
        allenv_thread = Thread(target=lambda q, arg1: q.put(allenv_model.learn(arg1)),
                               args=(que, 50000))
        allenv_thread.start()
        minimap_model.learn(total_timesteps=50000)
        allenv_thread.join()
        allenv_model = que.get()

        minimap_model.save("KR_minimap_" + str(i + 1))
        allenv_model.save("KR_allenv_" + str(i + 1))
def pre_trained_model(env):
    model = DQN(
        CnnPolicy,
        env,
        # tensorboard_log=graph_path,
        double_q=True,
        prioritized_replay=True,
        prioritized_replay_alpha=0.99,
        learning_starts=MAX_STEPS * 0.5,
        verbose=1)
    # DQN.load is a classmethod that returns a new model, so calling it on the
    # instance would discard the loaded weights; load_parameters loads them in place.
    model.load_parameters(pre_trained_path)
    return model
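# Alternative sketch (an assumption, not from the original): because DQN.load
# is a classmethod that also restores the saved hyperparameters, the function
# above could be collapsed into a single call that rebuilds the model from the
# checkpoint and attaches the environment. pre_trained_path is the same
# hypothetical path used above.
def pre_trained_model_alt(env):
    return DQN.load(pre_trained_path, env=env)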
def main(mode="train"): env = gym.make("balancebot-v0") if mode == "train": model = deepq(policy=LnMlpPolicy, env=env, double_q=True, prioritized_replay=True, learning_rate=1e-3, buffer_size=100, verbose=0, tensorboard_log="dqn_balancebot_tensorboard") model.learn( total_timesteps=2000, callback=callback ) print("Saving model to balance_dqn.pkl") model.save("balance_dqn.pkl") del model # remove to demonstrate saving and loading if mode == "test": model = deepq.load("balance_dqn.pkl") obs = env.reset() done = False env.set_done(5000) while not done: action, _states = model.predict(obs) obs, rewards, done, info = env.step(action) # env.render() print(obs)
def LunarLander_v2_DQN():  # TODO: currently raises an error
    # Create environment
    env = gym.make('LunarLander-v2')

    # Instantiate the agent
    model = DQN('MlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, verbose=1)
    # Train the agent
    model.learn(total_timesteps=100000)
    # Save the agent
    model.save("dqn_lunar")
    del model  # delete trained model to demonstrate loading

    # Load the trained agent
    model = DQN.load("dqn_lunar")

    # Evaluate the agent
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    print(mean_reward, std_reward)

    # Enjoy the trained agent
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
def predict_with_DQN():
    simulation_start_time = time.time()
    model = DQN.load("deepq_jobshop")

    scores = []  # list of final scores after each episode
    episodes = 1  # 30
    max_periods = 8000  # 8000

    for episode in range(episodes):
        # Reset the game state, done flag and score before every episode
        next_state = env.reset()
        score = 0
        for period in range(max_periods):  # predict for x periods
            action, _states = model.predict(next_state)
            next_state, reward, done, info = env.step(action)
            score += reward
        scores.append(score)
        print("Episode: {}/{}, score: {}".format(episode + 1, episodes, score))
        # print("Observation space at the end: " + str(next_state))

    print("Prediction finished after " +
          str(round(time.time() - simulation_start_time, 4)) + " seconds")
    print("Final average score over " + str(episodes) + " episodes: " + str(mean(scores)))
    return scores
def initialize(self, model_path: str):
    """
    Loads the model.

    :param model_path: (str) the path to the model
    """
    self.neural_network = DQN.load(model_path)
def testLookAheadAgent(env, original_env, agent):
    agent_path = 'trained-agents/{0}'.format(agent)
    model = DQN.load(load_path=agent_path, env=env)

    state = env.reset()
    writeLineToFile('Step; State; Reward', csv_output_filename)

    # Give the flows time to start so active_flows is not empty
    time.sleep(5)

    step = 0
    while True:
        active_flows = original_env.getActiveFlows()
        for flow in active_flows:
            if original_env.isElephantFlow(flow):
                state = original_env.getState()
                action, _ = model.predict(state, deterministic=False)
                state, reward, done, info = env.step(action, flow)
                output_data_line = '{0}; {1}; {2}'.format(step, state, reward)
                writeLineToFile(output_data_line, csv_output_filename)
                step += 1
        print()
        time.sleep(1)
def testModel(model_path, episodes, max_timesteps_per_episode, sqil=True):
    """
    Runs a model for "episodes" iterations and collects the reward for each episode.
    Each episode stops at max_timesteps_per_episode or earlier.
    """
    env = gym.make("cartpole_custom-v0")

    if sqil:
        model = SQIL_DQN.load(model_path)
    else:
        model = DQN.load(model_path)

    rewards = np.zeros((episodes))
    for i in range(0, episodes):
        obs = env.reset()
        timestep = 0
        done = False
        while not done:
            timestep += 1
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            rewards[i] += reward
            if timestep > max_timesteps_per_episode:
                break

    return rewards
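# Minimal usage sketch for testModel (the checkpoint paths and counts below
# are illustrative assumptions, not from the original): compare a SQIL-trained
# checkpoint against a plain DQN one on the same custom cartpole environment.
sqil_rewards = testModel("models/sqil_cartpole", episodes=50,
                         max_timesteps_per_episode=500, sqil=True)
dqn_rewards = testModel("models/dqn_cartpole", episodes=50,
                        max_timesteps_per_episode=500, sqil=False)
print("SQIL mean reward:", sqil_rewards.mean(), "DQN mean reward:", dqn_rewards.mean())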
def main():
    if len(sys.argv) == 2:
        games = int(sys.argv[1])
    else:
        print("python", sys.argv[0], "(number of games)")
        sys.exit(1)

    env = TetrisEnv()
    model = DQN.load("tetris_model")
    total_model_score = 0
    total_random_score = 0

    for _ in range(games):
        # Play one game using the trained model
        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            current_board = env.game.get_board()
            for board in env.game.board_list:
                if np.array_equal(np.array(current_board), np.array(board)):
                    obs, rewards, dones, info = env.step(3)  # move down if the board repeats
                    break
            if dones:
                total_model_score += env.game.score
                break

        # Play one game using random actions
        obs = env.reset()
        while True:
            action = env.action_space.sample()
            obs, rewards, dones, info = env.step(action)
            if dones:
                total_random_score += env.game.score
                break

    print("Final avg score using model: ", total_model_score / games)
    print("Final avg score using random actions: ", total_random_score / games)
def testAgent(env, original_env, agent, timesteps):
    num_steps = int(timesteps)
    agent_path = 'trained-agents/{0}'.format(agent)
    model = DQN.load(load_path=agent_path, env=env)

    state = env.reset()
    writeLineToFile('Step; State; Reward', csv_output_filename)

    for step in range(num_steps):
        print('Step ', step)
        print('State ', state)
        action, _ = model.predict(state, deterministic=False)
        print('Action: ', action)
        state, reward, done, info = env.step(action)
        extra_info = info[0]
        is_action_for_elephant_flow = extra_info['is_action_for_elephant_flow']
        flow_label = 'EF' if is_action_for_elephant_flow else 'MF'
        print('Flow: ', flow_label)
        output_data_line = '{0}; {1}; {2}; {3}'.format(step, state, reward, flow_label)
        writeLineToFile(output_data_line, csv_output_filename)
def setup():
    global e, model, obs
    p.size(800, 600)
    e = Env(width, height)
    model = DQN.load("ball_blast_dqn_25000")
    obs = e.reset()
def main():
    env = make_env()
    max_screen_x = 0
    model = DQN.load(saved_model_file_path)
    obs = env.reset()

    # Throttle playback so the rendered run stays close to real time.
    fps = 60
    frames_per_timestep = 4
    speed_up_factor = 1.5
    wait_time = frames_per_timestep / fps / speed_up_factor

    while True:
        t1 = time.time()
        action, _states = model.predict(obs)
        t2 = time.time()
        t3 = wait_time - (t2 - t1)
        if t3 > 0:
            time.sleep(t3)
        obs, rewards, done, info = env.step(action)
        if info['screen_x'] > max_screen_x:
            max_screen_x = info['screen_x']
            logger.info("Max screen x: " + str(max_screen_x))
        if done:
            env.reset()
        else:
            env.render()
def load_model(model_dir, model_type="PPO"):
    if model_type == "PPO":
        model = PPO2.load(model_dir)
    elif model_type == "DQN":
        model = DQN.load(model_dir)
    else:
        raise ValueError("Unsupported model_type: " + model_type)
    return model
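# Minimal usage sketch (the checkpoint path is an illustrative assumption):
# load_model dispatches on model_type, so the file must have been saved by
# the matching algorithm.
dqn_model = load_model("trained_models/dqn_agent.zip", model_type="DQN")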
class Agent:
    # Loaded once at class definition and shared across all instances.
    model = DQN.load("best_model.pkl")

    def __init__(self):
        self.name = "DQNAgent"

    def act(self, stateObs, actions):
        action, _states = self.model.predict(stateObs)
        return action
def run(model_name, iteration, world, stage):
    world_stage = 'SuperMarioBros-{}-{}-v2'.format(world, stage)
    env = gym_super_mario_bros.make(world_stage)
    env = JoypadSpace(env, RIGHT_ONLY)
    env = WarpFrame(env)
    env = FrameStack(env, n_frames=4)
    env = EpisodicLifeEnv(env)
    # env = MaxAndSkipEnv(env)

    # Save a checkpoint every 5000 steps
    checkpoint_callback = CheckpointCallback(save_freq=5000,
                                             save_path='./logs/',
                                             name_prefix=model_name)
    eval_callback = EvalCallback(env,
                                 best_model_save_path='./logs/',
                                 log_path='./logs/',
                                 eval_freq=10000,
                                 deterministic=True,
                                 render=False)

    print("Compiling model...")
    steps = 10000

    if iteration > 0:
        model = DQN.load('models/{}'.format(model_name),
                         env=env,
                         verbose=1,
                         learning_starts=2500,
                         learning_rate=1e-4,
                         exploration_final_eps=0.01,
                         prioritized_replay=True,
                         prioritized_replay_alpha=0.6,
                         train_freq=4,
                         tensorboard_log="./mario_tensorboard/")
    else:
        model = DQN(CnnPolicy,
                    env,
                    verbose=1,
                    learning_starts=2500,
                    learning_rate=1e-4,
                    exploration_final_eps=0.01,
                    prioritized_replay=True,
                    prioritized_replay_alpha=0.6,
                    train_freq=4,
                    tensorboard_log="./mario_tensorboard/")

    print("Training starting...")
    with ProgressBarManager(steps) as progress_callback:
        # add eval_callback, checkpoint_callback to the list to enable them
        model.learn(total_timesteps=steps,
                    callback=[progress_callback],
                    tb_log_name=model_name)
    print("Finished training model on env...\n")
    model.save("models/{}".format(model_name))
def loader(algo, env_name):
    if algo == 'dqn':
        return DQN.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'ppo2':
        return PPO2.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'a2c':
        return A2C.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'acer':
        return ACER.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'trpo':
        return TRPO.load("trained_agents/" + algo + "/" + env_name + ".pkl")
def predict(self, symbol, loadpath=None, sd=dt.datetime(2018, 1, 29),
            ed=dt.datetime(2019, 12, 18), fwd=False):
    # update data
    dp.pull(symbol, should_save=True)
    dp.pull('SPY', should_save=True)

    # load data and add a phantom SPY trading day
    df = self._load_data([symbol], sd, ed)
    if fwd:
        lastspy = df.loc['SPY'].tail(1).copy()
        lastspy.index = lastspy.index.shift(1, freq='D')
        lastspy['Symbol'] = 'SPY'
        lastspy = lastspy.reset_index().set_index(['Symbol', 'Date'])
        df = df.append(lastspy).sort_index()

    # load model and predict for the test range
    self.model = DQN.load(loadpath)
    if fwd:
        # Sweep a range of hypothetical prices for the next day and record
        # the action the model would take at each one.
        chgs = np.linspace(-0.5, 0.5, num=101)
        pxs = chgs + df.loc[symbol].tail(1).copy().AdjClose.values[0]
        pxchgs = np.zeros((101,))
        actions = np.zeros((101,))
        for i, px in enumerate(pxs):
            last = df.loc[symbol].tail(1).copy()
            last.index = last.index.shift(1, freq='D')
            pxchgs[i] = px / last.AdjClose - 1
            last.AdjClose = px
            last.Close = px
            last['Symbol'] = symbol
            last = last.reset_index().set_index(['Symbol', 'Date'])
            df_tmp = df.append(last).sort_index()

            # predict
            df_met = self._get_indicators(symbol, df_tmp)
            ob = df_met.tail(1).drop(['Date', 'AdjClose'], axis=1)
            action, _ = self.model.predict(ob)
            actions[i] = action
        df_preds = pd.DataFrame({
            'Price': pxs,
            'Chg': pxchgs,
            'Action': actions
        })
        return df_preds
    else:
        df_met = self._get_indicators(symbol, df)
        ob = df_met.tail(1).drop(['Date', 'AdjClose'], axis=1)
        action, _ = self.model.predict(ob)
        return action
def main(log_dir=None, name_results_root_folder="results"):
    args = parseArgs()
    time_steps = TIME_STEPS

    # If no log_dir is given, use a default one containing the training start time.
    if log_dir is None:
        if args.restart_training:
            # find the latest training folder
            latest_log_dir = os.path.join(
                name_results_root_folder,
                sorted(os.listdir(name_results_root_folder))[-1])
            logdir = latest_log_dir
        else:
            default_log_dir = os.path.join(name_results_root_folder,
                                           "DQN_" + getTimeStr())
            os.makedirs(default_log_dir, exist_ok=True)
            logdir = default_log_dir
    else:
        logdir = log_dir

    reward_bound = REWARD_BOUND

    # get arena environments and custom callback
    env = Monitor(Arena2dEnvWrapper(0, True), os.path.join(logdir, "arena_env0"))
    call_back = SaveOnBestTrainingRewardCallback(500, logdir, 1, reward_bound)

    # Temporary model path; if training is interrupted from the keyboard,
    # the current model parameters will be saved there.
    path_temp_model = os.path.join(logdir, "DQN_TEMP")

    if not args.restart_training:
        model = DQN(MlpPolicy,
                    env,
                    gamma=GAMMA,
                    learning_rate=LEARNING_RATE,
                    buffer_size=BUFFER_SIZE,
                    target_network_update_freq=SYNC_TARGET_STEPS,
                    tensorboard_log=logdir,
                    verbose=1)
        reset_num_timesteps = True
    else:
        if os.path.exists(path_temp_model + ".zip"):
            print("continue training the model...")
            model = DQN.load(path_temp_model, env=env)
            reset_num_timesteps = False
        else:
            print("Can't load the model with the path: {}, please check again!"
                  .format(path_temp_model))
            env.close()
            exit(-1)

    try:
        model.learn(time_steps,
                    log_interval=200,
                    callback=call_back,
                    reset_num_timesteps=reset_num_timesteps)
        model.save(os.path.join(logdir, "DQN_final"))
    except KeyboardInterrupt:
        # Assumed handler, reconstructed from the comments above: save the
        # current parameters to the temporary path on keyboard interrupt.
        model.save(path_temp_model)
def get_existing_model(model_path):
    print('--- Training from existing model', model_path, '---')

    # Load model
    model = DQN.load(model_path, tensorboard_log=TENSORBOARD_PATH)

    # Set environment
    env = QWOPEnv()  # SubprocVecEnv([lambda: QWOPEnv()])
    model.set_env(env)

    return model
def launchAgent(model_name: str):
    """
    :param model_name: the type of model to run; must be HER, DDPG, PPO2,
                       or any other value (which falls back to DQN).
                       Currently intended to be set to PPO2.
    :return: the model after running 1000 training cycles
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    print("Current Env is " + model_name)

    if model_name == "HER":
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    elif model_name == "DDPG":
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    elif model_name == "PPO2":
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",              # policy
            env=env,                  # environment
            double_q=True,            # enable double Q-learning
            prioritized_replay=True,  # enable prioritized replay buffer
            verbose=0                 # suppress log output
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i), env)
            elif model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i), env)
            elif model_name == "PPO2":
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i), env)

        model.learn(total_timesteps=12500)  # minimum step count observed when FPS stays above 130
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))
        model.save("detailedmap_" + model_name + "_" + str(i + 1))

    return model
def run_sonobuoy_training(exp_name, exp_path, basicdate,
                          model_type='PPO2',
                          n_eval_episodes=10,
                          training_intervals=100,
                          max_steps=10000,
                          reward_margin=10,
                          log_to_tb=False,
                          pelican_agent_filepath=False):
    # set up logging
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'sonobuoy_training'
    else:
        writer = None
        tb_log_name = None

    env = gym.make('plark-env-v0',
                   panther_agent_filepath='/data/agents/models/PPO2_20200429_073132_panther/')

    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)
        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)
    else:
        # Instantiate the env and model
        model = PPO2('CnnPolicy', env)

    # Start training
    train_agent(exp_path, model, env, training_intervals, max_steps,
                model_type, basicdate, writer, tb_log_name, reward_margin)

    # Evaluate
    mean_reward, n_steps = evaluate_policy(model, env,
                                           n_eval_episodes=n_eval_episodes,
                                           deterministic=False,
                                           render=False,
                                           callback=None,
                                           reward_threshold=None,
                                           return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean Reward is ' + str(mean_reward))
    logger.info('Number of steps is ' + str(n_steps))
def run_illegal_move_training(exp_name, exp_path, basicdate,
                              model_type='PPO2',
                              n_eval_episodes=10,
                              training_intervals=100,
                              max_steps=10000,
                              reward_margin=10,
                              log_to_tb=False,
                              pelican_agent_filepath=False):
    # set up logging
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'Illegal_move_prevention_training'
    else:
        writer = None
        tb_log_name = None

    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)
        env = gym.make('plark-env-illegal-move-v0')
        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)
    else:
        # Instantiate the env and model
        env = gym.make('plark-env-illegal-move-v0')
        model = PPO2('CnnPolicy', env)

    # Start training
    train_agent(exp_path, model, env, training_intervals, max_steps,
                model_type, basicdate, writer, tb_log_name, reward_margin)

    # Evaluate
    mean_reward, n_steps = evaluate_policy(model, env,
                                           n_eval_episodes=n_eval_episodes,
                                           deterministic=False,
                                           render=False,
                                           callback=None,
                                           reward_threshold=None,
                                           return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean Reward is ' + str(mean_reward))
    logger.info('Number of steps is ' + str(n_steps))
def run():
    env = DummyVecEnv([lambda: DemoEnv()])
    model = DQN.load("deepq_DemoEnv", env)

    obs = env.reset()
    sum_rew = 0
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        sum_rew += rewards[0]
        if dones[0]:
            print("Total reward: ", sum_rew)
            break
def __init__(self, path, deterministic=False, tau=0.05):
    self.turn_model = DQN.load(path + "model_turn")
    self.speed_model = DQN.load(path + "model_speed")
    dic = loadConfig(path + "parameters.json")

    self.tau = tau
    self.deterministic = deterministic
    self.turn_bins = np.linspace(-dic["max_turn"], dic["max_turn"], dic["turn_bins"])
    self.speed_bins = np.linspace(dic["min_speed"], dic["max_speed"], dic["speed_bins"])
    self.raycast_options = {
        "n_fish_bins": dic["num_bins_rays"],
        "n_wall_raycasts": dic["num_bins_rays"],
        "fov_angle_fish_bins": np.radians(dic["degrees"]),
        "fov_angle_wall_raycasts": np.radians(dic["degrees"]),
        "world_bounds": ([-50, -50], [50, 50]),
    }
def loadAgent(self, filepath, algorithm_type):
    try:
        if algorithm_type.lower() == 'dqn':
            self.model = DQN.load(filepath)
        elif algorithm_type.lower() == 'ppo2':
            self.model = PPO2.load(filepath)
        elif algorithm_type.lower() == 'a2c':
            self.model = A2C.load(filepath)
        elif algorithm_type.lower() == 'acktr':
            self.model = ACKTR.load(filepath)
    except Exception:
        raise ValueError('Error loading pelican agent. File: "' + filepath + '" does not exist')
def main():
    # create the environment
    env = gym.make("gym_balanceBot-v0")

    if not os.path.isfile("trained_model/dqn_balanceBot.zip"):
        # Instantiate the agent
        model = DQN('MlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, verbose=1)
        # Train the agent
        model.learn(total_timesteps=int(2e5))
        # Save the agent
        model.save("trained_model/dqn_balanceBot")
        del model  # delete trained model to demonstrate loading

        # Load the trained agent
        model = DQN.load("trained_model/dqn_balanceBot")
        # Evaluate the agent
        mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    else:
        # Load the trained agent
        model = DQN.load("trained_model/dqn_balanceBot")

    # Enjoy the trained agent
    obs = env.reset()
    for i in range(3000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        sleep(1. / 240.)

    env.close()