def LunarLander_v2_DQN():  # TODO: raised an error (see note at the load call below)
    # Create environment
    env = gym.make('LunarLander-v2')

    # Instantiate the agent
    model = DQN('MlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, verbose=1)

    # Train the agent
    model.learn(total_timesteps=100000)

    # Save the agent
    model.save("dqn_lunar")
    del model  # delete trained model to demonstrate loading

    # Load the trained agent. Passing env= here avoids the original error:
    # a model loaded without an environment has no env attached, so
    # model.get_env() returns None and evaluate_policy fails.
    model = DQN.load("dqn_lunar", env=env)

    # Evaluate the agent
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    print(mean_reward, std_reward)

    # Enjoy the trained agent
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()

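# The snippet above assumes roughly these imports (stable-baselines 2.x, the
# TensorFlow-based library, whose DQN still accepts prioritized_replay):
import gym
from stable_baselines import DQN
from stable_baselines.common.evaluation import evaluate_policy
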
def run_train(self):
    # self.event (not a bare "event") selects the scenario below
    env = CustomEnv(self.path_planner, self.behavior_planner, self.event)
    env = make_vec_env(lambda: env, n_envs=1)
    model = None
    if self.event == Scenario.LANE_CHANGE:
        model = DQN(CustomLaneChangePolicy, env, verbose=1,
                    learning_starts=256, batch_size=256,
                    exploration_fraction=0.9, target_network_update_freq=100,
                    tensorboard_log=dir_path + '/Logs/')
    if self.event == Scenario.PEDESTRIAN:
        model = DQN(CustomPedestrianPolicy, env, verbose=1,
                    learning_starts=256, batch_size=256,
                    exploration_fraction=0.9, target_network_update_freq=100,
                    tensorboard_log=dir_path + '/Logs/Ped',
                    gamma=0.93, learning_rate=0.0001)
    model.learn(total_timesteps=20000)
    model.save(MODEL_SAVE_PATH)

def train(log_dir, model_dir, env_name, train_timesteps=2500):
    # Make sure the directories exist
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    # Create and wrap the environment; logs will be saved in log_dir/monitor.csv
    env = gym.make(env_name)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = DQN(MlpPolicy, env, verbose=1)

    # Train the agent
    model.learn(total_timesteps=train_timesteps)

    # Save the agent
    if not model_dir.endswith("/"):
        model_dir += "/"
    model.save(str(model_dir) + "dqn_" + str(env_name) + "_trained_timesteps_" + str(train_timesteps))

    # Delete the trained model
    del model

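# A hypothetical invocation of the train() helper above; the paths and the
# environment name are placeholders, not from the original project:
if __name__ == "__main__":
    train(log_dir="./logs/", model_dir="./models/", env_name="CartPole-v1",
          train_timesteps=2500)
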
def train():
    # Load model and generate expert trajectories
    env = gym.make('roundabout-v0')
    model = DQN(MlpPolicy, env, verbose=1)
    generate_expert_traj(model, 'expert_roundabout', n_timesteps=1000, n_episodes=10)

    # Data augmentation
    expert_data = dict(np.load('expert_roundabout.npz'))
    print("my keys are: " + str(expert_data.keys()))
    obs = expert_data['obs']
    expert_data['obs'] = obs.ravel()  # convert to a 1D array
    print("my keys are: " + str(expert_data.keys()))
    # Unpack the dict so its keys are preserved in the archive;
    # np.savez('...', expert_data) would store everything under 'arr_0'
    np.savez('expert_roundabout.npz', **expert_data)

    dataset = ExpertDataset(expert_path='expert_roundabout.npz', traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env, dataset, verbose=1)
    model.learn(total_timesteps=1000)
    model.save("gail_roundabout")
    env.close()
    del env

def train_multiple(cfg, version, trained_model, double_agent=False):
    # double_agent refers to both agents having learned in the multi-agent environment
    if double_agent:
        gym_wrapper = MultiAgentCustomEnv(cfg)
    else:
        gym_wrapper = CustomEnv(cfg)
    model_trained = DQN.load("{0}models/{1}".format(cfg["study_results"], trained_model),
                             env=gym_wrapper)
    gym_wrapper = MultiAgentCustomEnv(cfg, model_trained, single=not double_agent)
    model = DQN(MlpPolicy, gym_wrapper, verbose=1,
                double_q=cfg["double-dqn"],
                prioritized_replay=cfg["prioritized"],
                policy_kwargs=dict(dueling=cfg["dueling"]),
                exploration_fraction=cfg["exploration_frac"],
                tensorboard_log=cfg["study_results"] + "tensorboard/experiments/")
    model.learn(total_timesteps=cfg["timesteps"], tb_log_name=cfg["experiment_name"])
    model.save("{0}models/{2}-v{1}".format(cfg["study_results"], version,
                                           cfg["experiment_name"]))

def train_DQN(env_train, model_name, timesteps=50000):
    start = time.time()
    model = DQN('MlpPolicy', env_train, verbose=1)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DQN): ', (end - start) / 60, ' minutes')
    return model

def train():
    # DQN does not support parallelization through SubprocVecEnv
    env = DummyVecEnv([lambda: DemoEnv()])
    model = DQN(MlpPolicy, env, verbose=1, policy_kwargs={'layers': [4]})
    model.learn(total_timesteps=int(2e5))
    model.save("deepq_DemoEnv")
    env.close()
    del model

def train():
    machine = StateMachine()
    machine.initialize(headless=True)
    camera = Camera(machine)
    env = CustomEnv(machine, camera, state="vision")
    model = DQN(CnnPolicy, env, verbose=1,
                learning_starts=32, batch_size=32,
                exploration_fraction=0.3, target_network_update_freq=32,
                tensorboard_log=dir_path + '/Logs/')
    model.learn(total_timesteps=1000, log_interval=1000000)
    model.save("Grasp_Model_1")

def run(model_name, iteration, world, stage):
    world_stage = 'SuperMarioBros-{}-{}-v2'.format(world, stage)
    env = gym_super_mario_bros.make(world_stage)
    env = JoypadSpace(env, RIGHT_ONLY)
    env = WarpFrame(env)
    env = FrameStack(env, n_frames=4)
    env = EpisodicLifeEnv(env)
    # env = MaxAndSkipEnv(env)

    # Save a checkpoint every 5000 steps
    checkpoint_callback = CheckpointCallback(save_freq=5000, save_path='./logs/',
                                             name_prefix=model_name)
    eval_callback = EvalCallback(env, best_model_save_path='./logs/',
                                 log_path='./logs/', eval_freq=10000,
                                 deterministic=True, render=False)

    print("Compiling model...")
    steps = 10000
    if iteration > 0:
        model = DQN.load('models/{}'.format(model_name), env=env, verbose=1,
                         learning_starts=2500, learning_rate=1e-4,
                         exploration_final_eps=0.01,
                         prioritized_replay=True, prioritized_replay_alpha=0.6,
                         train_freq=4, tensorboard_log="./mario_tensorboard/")
    else:
        model = DQN(CnnPolicy, env, verbose=1,
                    learning_starts=2500, learning_rate=1e-4,
                    exploration_final_eps=0.01,
                    prioritized_replay=True, prioritized_replay_alpha=0.6,
                    train_freq=4, tensorboard_log="./mario_tensorboard/")

    print("Training starting...")
    with ProgressBarManager(steps) as progress_callback:
        # eval_callback and checkpoint_callback could be appended to this list
        model.learn(total_timesteps=steps,
                    callback=[progress_callback],
                    tb_log_name=model_name)
    print("Finished training model on env...\n")
    model.save("models/{}".format(model_name))

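# "ProgressBarManager" above is not part of stable_baselines. A minimal sketch,
# assuming it is the tqdm-based helper from the Stable Baselines callback
# tutorial: a context manager that yields a callback updating a progress bar.
from tqdm.auto import tqdm
from stable_baselines.common.callbacks import BaseCallback

class ProgressBarCallback(BaseCallback):
    """Updates a tqdm progress bar with the current number of timesteps."""
    def __init__(self, pbar):
        super(ProgressBarCallback, self).__init__()
        self._pbar = pbar

    def _on_step(self):
        self._pbar.n = self.num_timesteps
        self._pbar.update(0)
        return True

class ProgressBarManager(object):
    """Owns the tqdm bar for a fixed total number of timesteps."""
    def __init__(self, total_timesteps):
        self.pbar = None
        self.total_timesteps = total_timesteps

    def __enter__(self):
        self.pbar = tqdm(total=self.total_timesteps)
        return ProgressBarCallback(self.pbar)

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pbar.close()
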
def launchAgent():
    from stable_baselines import DQN
    import Reinforcement_AI.env.c_seperate_env as sep_env
    from queue import Queue
    from threading import Thread

    minimap_env = sep_env.MinimapEnv()
    allenv = sep_env.AllEnv()

    minimap_model = DQN(
        "CnnPolicy",              # policy
        minimap_env,              # environment
        double_q=True,            # enable Double Q-learning
        prioritized_replay=True,  # enable the prioritized replay buffer
        verbose=0                 # suppress log output
    )
    allenv_model = DQN(
        "MlpPolicy",
        allenv,
        double_q=True,
        prioritized_replay=True,
        verbose=0
    )

    for i in range(100):
        if i != 0:
            minimap_model = DQN.load("KR_minimap_" + str(i))
            allenv_model = DQN.load("KR_allenv_" + str(i))

        que = Queue()
        minimap_model.set_env(minimap_env)
        allenv_model.set_env(allenv)

        # Train the all-environment model on a worker thread (collecting the
        # trained model through the queue) while the minimap model trains on
        # the main thread.
        allenv_thread = Thread(target=lambda q, arg1: q.put(allenv_model.learn(arg1)),
                               args=(que, 50000))
        allenv_thread.start()
        minimap_model.learn(total_timesteps=50000)
        allenv_thread.join()
        allenv_model = que.get()

        minimap_model.save("KR_minimap_" + str(i + 1))
        allenv_model.save("KR_allenv_" + str(i + 1))

def train_dqn(timesteps, name):
    # DummyVecEnv expects a list of callables that return a gym.Env,
    # so datares_roulette must be an environment factory
    env = datares_roulette
    env = DummyVecEnv([env])
    model = DQN(
        stable_baselines.deepq.policies.MlpPolicy,
        env,
        verbose=1,
    )
    model.learn(total_timesteps=timesteps)
    model.save(name)
    return model

def main(log_dir=None, name_results_root_folder="results"):
    args = parseArgs()
    time_steps = TIME_STEPS

    # If no log_dir is given, use a default one containing the starting time
    # of the training (or, when restarting, the latest existing folder).
    if log_dir is None:
        if args.restart_training:
            # find the latest training folder
            latest_log_dir = os.path.join(
                name_results_root_folder,
                sorted(os.listdir(name_results_root_folder))[-1])
            logdir = latest_log_dir
        else:
            default_log_dir = os.path.join(name_results_root_folder,
                                           "DQN_" + getTimeStr())
            os.makedirs(default_log_dir, exist_ok=True)
            logdir = default_log_dir
    else:
        logdir = log_dir
    reward_bound = REWARD_BOUND

    # Get the arena environment and the custom callback
    env = Monitor(Arena2dEnvWrapper(0, True), os.path.join(logdir, "arena_env0"))
    call_back = SaveOnBestTrainingRewardCallback(500, logdir, 1, reward_bound)

    # Temporary model path: if training is interrupted from the keyboard,
    # the current model parameters are saved there.
    path_temp_model = os.path.join(logdir, "DQN_TEMP")
    if not args.restart_training:
        model = DQN(MlpPolicy, env, gamma=GAMMA, learning_rate=LEARNING_RATE,
                    buffer_size=BUFFER_SIZE,
                    target_network_update_freq=SYNC_TARGET_STEPS,
                    tensorboard_log=logdir, verbose=1)
        reset_num_timesteps = True
    else:
        if os.path.exists(path_temp_model + ".zip"):
            print("continue training the model...")
            model = DQN.load(path_temp_model, env=env)
            reset_num_timesteps = False
        else:
            print("Can't load the model with the path: {}, please check again!"
                  .format(path_temp_model))
            env.close()
            exit(-1)

    model.learn(time_steps, log_interval=200, callback=call_back,
                reset_num_timesteps=reset_num_timesteps)
    model.save(os.path.join(logdir, "DQN_final"))

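# SaveOnBestTrainingRewardCallback above is project-specific. A minimal sketch
# of such a callback, assuming the (check_freq, log_dir, verbose, reward_bound)
# signature used above and Monitor-style log files in log_dir; it saves the
# best model so far and stops training once the mean reward reaches the bound:
import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq, log_dir, verbose=1, reward_bound=None):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.reward_bound = reward_bound
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])  # mean over the last 100 episodes
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(os.path.join(self.log_dir, 'best_model'))
                # Returning False stops training once the bound is reached
                if self.reward_bound is not None and mean_reward >= self.reward_bound:
                    return False
        return True
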
def traindqn(args):
    # with tf.device('/device:GPU:1'):
    with tf.device('/gpu:0'):
        env = gym.make('python_1p-v0')
        # env = Monitor(env, filename=None, allow_early_resets=True)
        env = DummyVecEnv([lambda: env])
        model = DQN(DqnCnnPolicy, env, verbose=1, learning_rate=0.0001,
                    exploration_fraction=0.4, train_freq=10)
        model.learn(5000000)
        # Note: stable_baselines saves a .zip archive regardless of the extension
        model.save("dqnwithcnn.pth")

def train_agent(agent):
    # Get the common parameters from the recipe config
    environment_var = get_recipe_config()['environment']
    agent_var = get_recipe_config()['agent']
    policy_var = get_recipe_config()['policy']
    gamma_var = get_recipe_config()['gamma']
    lr_var = get_recipe_config()['dqn_learning_rate']
    training_episodes_var = 5000

    # Create the JSON file and dump it into the output folder
    training_infos = {
        'name': environment_var,
        'agent': agent_var,
        'type': 'OpenAI Gym',
        'num_episodes': training_episodes_var,
        'lr': lr_var,
        'gamma': gamma_var,
        'policy': policy_var,
        'training_date': str(datetime.datetime.now())
    }
    saved_models = dataiku.Folder(get_output_names_for_role('main_output')[0])
    saved_models_info = saved_models.get_info()
    saved_models_path = saved_models.get_path()
    with open(saved_models_path + '/training_infos.json', 'w') as fp:
        json.dump(training_infos, fp)

    # Choose the agent
    if agent == "dqn":
        from stable_baselines.common.vec_env import DummyVecEnv
        from stable_baselines.deepq.policies import MlpPolicy
        from stable_baselines.deepq.policies import CnnPolicy
        from stable_baselines import DQN
        model = DQN(policy=policy_var, env=environment_var,
                    gamma=gamma_var, learning_rate=lr_var)

    # Start the training and dump the model into the output folder
    print("========================== Start Training ==========================")
    model.learn(training_episodes_var)
    model_name = agent_var + "_" + environment_var
    model.save(saved_models_path + "/" + model_name)
    print("Model Saved")

def run_model(algorithm, training_timesteps, testing_timesteps,
              training_iterations, testing_iterations, learning_rate, batch_size):
    # DataFrame for the mean and standard deviation of the rewards
    # (missing in the original, which used df before defining it)
    columns = ['Mean Rewards', 'Standard deviation']
    df = pd.DataFrame(columns=columns)

    model = DQN(CustomPolicy, env, learning_rate=learning_rate, batch_size=batch_size)
    for k in range(training_iterations):
        # Train and save the agent
        model.learn(total_timesteps=int(training_timesteps))
        model.save("{}_{}_{}_{}".format("rcrs_wgts", k, algorithm, hostname))
        subprocess.Popen(path_for_kill_file, shell=True)

    for j in range(testing_iterations):
        # Load the trained agent
        model = DQN.load("{}_{}_{}_{}".format("rcrs_wgts", j, algorithm, hostname))
        # Reset the environment
        obs = env.reset()
        # Collect the final reward of each episode
        final_rewards = []
        for _ in range(testing_timesteps):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones == True:
                final_rewards.append(rewards)
        # Print the mean and standard deviation of the rewards
        print(np.mean(final_rewards))
        print(np.std(final_rewards))
        # Append them to the DataFrame and save it
        # (sep and index belong to to_csv, not to str.format)
        df = df.append({'Mean Rewards': np.mean(final_rewards),
                        'Standard deviation': np.std(final_rewards)},
                       ignore_index=True)
        df.to_csv("{}_{}_{}".format(1, algorithm, "MeanAndStdReward.csv"),
                  sep=',', index=True)
        subprocess.Popen(path_for_kill_file, shell=True)
    subprocess.Popen(path_for_kill_file, shell=True)

class DqnController:
    """Implements an RL (DQN) controller."""

    def __init__(self, env):
        """
        :param env: a thermostat environment
        """
        self.env = env
        self.model = DQN(MlpPolicy, env, verbose=1,
                         tensorboard_log="./dqn_thermostat_tensorboard/")

    @staticmethod
    def name():
        return "Dqn"

    def train(self):
        self.model.learn(total_timesteps=50000)

    def save(self):
        self.model.save("dqn.pk")

    def load(self):
        self.model = None
        self.model = DQN.load("dqn.pk")

    def simulate(self):
        state = self.env.reset()
        cumulative_reward = 0.0
        P_consumed = []
        done = False
        while not done:
            action, _state = self.model.predict(state)
            state, reward, done, info = self.env.step(action)
            cumulative_reward += reward
            P_consumed.append(action)
        print("MSE setpoint - realized: %.3f - Energy consumed: %.2f"
              % (cumulative_reward, sum(P_consumed)))
        result_folder = ("results/" + self.name() + "/"
                         + self.env.start_date.strftime("%m-%d-%Y")
                         + "_to_" + self.env.end_date.strftime("%m-%d-%Y"))
        self.env.store_and_plot(result_folder)

    def set_env(self, env):
        self.env = env

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--algorithm")
    parser.add_argument("--env")
    parser.add_argument("--steps")
    parser.add_argument("--alpha", type=float)  # the learning rate must be a float, not a string
    parser.add_argument("--grid_search")
    args = parser.parse_args()

    algorithm = args.algorithm
    env = gym.make(args.env)
    grid_search = args.grid_search
    alpha = args.alpha

    if algorithm == "ppo1":
        from stable_baselines import PPO1
        from stable_baselines.common.policies import MlpPolicy
        model = PPO1(MlpPolicy, env, verbose=1)
    else:
        from stable_baselines import DQN
        from stable_baselines.deepq.policies import MlpPolicy
        model = DQN(MlpPolicy, env, learning_rate=alpha, verbose=1)

    model.learn(total_timesteps=int(args.steps), log_interval=10)
    model.save(f"{algorithm}_cartpole")
    del model  # remove to demonstrate saving and loading

    if algorithm == "ppo1":
        model = PPO1.load(f"{algorithm}_cartpole")
    else:
        model = DQN.load(f"{algorithm}_cartpole")

    mean_reward = evaluate(model, env, num_steps=10000)
    hparams_str = f" algorithm={algorithm} env={args.env} steps={args.steps} alpha={alpha}"
    if grid_search:
        with open("grid_search_results.txt", "a") as myfile:
            myfile.write(str(mean_reward) + hparams_str)
    else:
        print(str(mean_reward) + hparams_str)

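# "evaluate" above is not a stable_baselines function. A minimal sketch with
# the assumed (model, env, num_steps) signature, returning the mean
# per-episode reward over a fixed number of steps:
import numpy as np

def evaluate(model, env, num_steps=10000):
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, reward, done, _info = env.step(action)
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
    return np.mean(episode_rewards)
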
def trainAgent(env):
    model = DQN(
        env=env,
        policy=MlpPolicy,
        verbose=1,
        # alpha: if the learning rate is too low, training progresses very
        # slowly (tiny weight updates); too high, and the loss can diverge.
        learning_rate=0.05,
        # gamma controls the importance of future rewards versus immediate ones.
        gamma=0.95,
        exploration_initial_eps=1.0,
        exploration_fraction=0.9,
        exploration_final_eps=0.01,
        buffer_size=56,
        batch_size=50)
    model.learn(total_timesteps=700)
    model.save('./trained-agents/C1')
    print('Model trained and saved.')

def train(params):
    # Set up the config
    if params.get("policy") == 'mlp':
        policy = MlpPolicy
        env = gym.make(params.get("environment"))
    else:
        policy = CnnPolicy
        env = gym.make(params.get("environment"))
        env.configure(CNN_config)
        env.reset()

    exp_name = "{0}_{1}_{2}".format(params.get("model_name"),
                                    params.get("policy"),
                                    params.get("environment"))
    log_dir = './logs/' + exp_name

    # Create the model (policy_kwargs is assumed to be defined at module level)
    model = DQN(
        policy,
        env,
        verbose=1,
        tensorboard_log=log_dir,
        buffer_size=params.get("buffer_size"),
        learning_rate=params.get("learning_rate"),
        gamma=params.get("gamma"),
        target_network_update_freq=params.get("target_update_interval"),
        exploration_fraction=params.get("exploration_fraction"),
        exploration_final_eps=params.get("exploration_final_eps"),
        learning_starts=params.get("learning_starts"),
        batch_size=params.get("batch_size"),
        exploration_initial_eps=params.get("exploration_initial_eps"),
        double_q=True,
        prioritized_replay=True,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-06,
        train_freq=params.get("train_freq"),
        policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=params.get("train_steps"), log_interval=10)
    model.save(exp_name)
    env.close()
    del env

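# A hypothetical params dict for the train() function above, with the keys
# inferred from how params.get() is used (all values are placeholders):
params = {
    "model_name": "dqn_experiment",
    "policy": "mlp",
    "environment": "CartPole-v1",
    "buffer_size": 50000,
    "learning_rate": 1e-4,
    "gamma": 0.99,
    "target_update_interval": 1000,
    "exploration_fraction": 0.1,
    "exploration_initial_eps": 1.0,
    "exploration_final_eps": 0.02,
    "learning_starts": 1000,
    "batch_size": 32,
    "train_freq": 4,
    "train_steps": 100000,
}
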
def AirRaid_main():
    env = retro.make('AirRaid-Atari2600', use_restricted_actions=retro.Actions.DISCRETE)
    model = DQN(CnnPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("AirRaid_Model")
    del model

    model = DQN.load("AirRaid_Model")
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rew, done, info = env.step(action)
        # env.render()
        if done:
            obs = env.reset()
    env.close()

def train(algorithm='dqn', timesteps=2e5):
    # env = gym.make('LunarLander-v2')  # uses the library version of the Lunar Lander env
    print('algorithm: ', algorithm)
    print('timesteps: ', timesteps)
    learning_rate = 0.001

    if algorithm.lower() == 'dqn':
        env = LunarLander()
        model = DQN('MlpPolicy', env, learning_rate=learning_rate,
                    prioritized_replay=True, verbose=1)
    elif algorithm.lower() == 'ppo2':
        n_envs = 4
        env = SubprocVecEnv([lambda: LunarLander() for i in range(n_envs)])
        schedule = LinearSchedule(int(float(timesteps)), 0.00001, 0.1).value
        model = PPO2('MlpPolicy', env, learning_rate=schedule, verbose=1)
    else:
        raise RuntimeError("Unknown algorithm: %s" % algorithm)

    # Train the agent
    model.learn(total_timesteps=int(float(timesteps)), log_interval=10)

    # Save the agent, both as "latest" and under a timestamped name
    model.save("trained_models/latest")
    now = datetime.now()
    dt_string = now.strftime("%Y-%m-%d_%H-%M-%S")
    model.save("trained_models/lunar_climber_%s-%s" % (algorithm.lower(), dt_string))

    # Plot training progress
    # plt.plot(env.all_rewards)
    # plt.ylabel('Reward')
    # plt.xlabel('Timesteps')
    # plt.savefig('figures/stats-%s.png' % dt_string)
    print("Model trained!")

def sb_model_train(rl_manager):
    env = CustomEnv(rl_manager)
    env = make_vec_env(lambda: env, n_envs=1)
    model = DQN(CustomPolicy, env, verbose=1,
                learning_starts=256, batch_size=256,
                exploration_fraction=0.5, target_network_update_freq=10,
                tensorboard_log='./Logs/')
    # Alternatives that were tried:
    # model = DQN(MlpPolicy, env, verbose=1, learning_starts=64,
    #             target_network_update_freq=50, tensorboard_log='./Logs/')
    # model = DQN.load("DQN_Model_SimpleSim_30k", env=env,
    #                  exploration_fraction=0.1, tensorboard_log='./Logs/')
    model.learn(total_timesteps=10000)
    # model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log="./Logs/")
    # model.learn(total_timesteps=20000)
    model.save(dir_path + "/DQN_Model_SimpleSim")
    # sb_model_test(rl_manager)
    return

def run_model(algorithm, training_timesteps, testing_timesteps,
              training_iterations, testing_iterations, learning_rate, batch_size):
    columns = ['Mean Rewards', 'Standard deviation']
    df = pd.DataFrame(columns=columns)

    if algorithm == "PPO2":
        from stable_baselines.common.policies import MlpPolicy
        model = PPO2(MlpPolicy, env, verbose=1, learning_rate=learning_rate,
                     tensorboard_log="./{}_rcrs_tensorboard/".format(hostname),
                     n_steps=batch_size)
    else:
        from stable_baselines.deepq.policies import MlpPolicy
        model = DQN(MlpPolicy, env, verbose=1, learning_rate=learning_rate,
                    tensorboard_log="./{}_rcrs_tensorboard/".format(hostname),
                    batch_size=batch_size)

    for k in range(training_iterations):
        # Train and save the agent
        model.learn(total_timesteps=int(training_timesteps))
        model.save("{}_{}_{}_{}".format("rcrs_wgts", k, algorithm, hostname))
        subprocess.Popen(path_for_kill_file, shell=True)

    for j in range(testing_iterations):
        # Load the trained agent
        if algorithm == "PPO2":
            model = PPO2.load("{}_{}_{}_{}".format("rcrs_wgts", j, algorithm, hostname))
        else:
            model = DQN.load("{}_{}_{}_{}".format("rcrs_wgts", j, algorithm, hostname))
        # Reset the environment
        obs = env.reset()
        # Collect the final reward of each episode
        final_rewards = []
        for _ in range(testing_timesteps):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones == True:
                final_rewards.append(rewards)
        # Print the mean and standard deviation of the rewards
        print(np.mean(final_rewards))
        print(np.std(final_rewards))
        # Append them to the DataFrame and save it
        # (sep and index belong to to_csv, not to str.format)
        df = df.append({'Mean Rewards': np.mean(final_rewards),
                        'Standard deviation': np.std(final_rewards)},
                       ignore_index=True)
        df.to_csv("{}_{}_{}".format(algorithm, hostname, "MeanAndStdReward.csv"),
                  sep=',', index=True)
        subprocess.Popen(path_for_kill_file, shell=True)
    subprocess.Popen(path_for_kill_file, shell=True)

def trainAgent(env, agent):
    model = DQN(
        env=env,
        policy=MlpPolicy,
        verbose=1,
        # alpha: if the learning rate is too low, training progresses very
        # slowly (tiny weight updates); too high, and the loss can diverge.
        learning_rate=0.1,
        # gamma controls the importance of future rewards versus immediate ones.
        gamma=0.95,
        exploration_initial_eps=1.0,
        exploration_fraction=0.8,
        exploration_final_eps=0.1,
        buffer_size=56,
        batch_size=50)
    # Training with 5 flows of 300M
    agent_string = 'DQN-flow-byte-count-' + agent
    model.learn(total_timesteps=10000)  # 5000
    model.save('./trained-agents/' + agent_string)
    print('Model trained and saved: ', agent_string)

def traindqn(args):
    """An example with an agent which requires a single-agent setup."""
    with tf.device('/gpu:0'):
        env = gym.make('python_1p-v0')
        env = SAhandler(env)
        model = DQN(DqnCnnPolicy, env, verbose=1,
                    learning_rate=5e-4,
                    exploration_fraction=0.1, exploration_final_eps=0.01,
                    buffer_size=50000, train_freq=1,
                    prioritized_replay=True, target_network_update_freq=1000)
        model.learn(int(1e6))
        model.save("dqnwithcnn", cloudpickle=True)

def train_single(cfg, version, load_model=None):
    gym_wrapper = CustomEnv(cfg)
    if load_model is None:
        model = DQN(MlpPolicy, gym_wrapper, verbose=1,
                    double_q=cfg["double-dqn"],
                    prioritized_replay=cfg["prioritized"],
                    policy_kwargs=dict(dueling=cfg["dueling"]),
                    exploration_fraction=cfg["exploration_frac"])
        # tensorboard_log=cfg["study_results"] + "tensorboard/experiments/")
    else:
        model = DQN.load("{}models/single_dqn_transport".format(cfg["study_results"]),
                         env=gym_wrapper)
    model.learn(total_timesteps=cfg["timesteps"], tb_log_name=cfg["experiment_name"])
    model.save("{0}models/{2}-v{1}".format(cfg["study_results"], version,
                                           cfg["experiment_name"]))

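# A hypothetical cfg dict for train_single() and train_multiple() above, with
# the keys inferred from how cfg is indexed (all values are placeholders):
cfg = {
    "study_results": "./study/",
    "double-dqn": True,
    "prioritized": True,
    "dueling": True,
    "exploration_frac": 0.1,
    "timesteps": 100000,
    "experiment_name": "dqn_transport",
}
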
def run():
    # Hyperparameters
    gamma = 0.99                   # discount factor
    learning_rate = 0.00025        # learning rate for the Adam optimizer
    buffer_size = 50000            # size of the replay buffer
    exploration_fraction = 0.1     # fraction of the training period over which exploration is annealed
    exploration_final_eps = 0.02   # final value of the random-action probability
    exploration_initial_eps = 1.0  # initial value of the random-action probability
    train_freq = 1                 # update the model every train_freq steps
    batch_size = 32                # size of a batch sampled from the replay buffer for training
    double_q = True                # whether to enable Double Q-learning
    learning_starts = 100          # how many steps to collect transitions for before learning starts
    timesteps = 1000               # 2000
    verbose = 1

    env = gym.make('Boxoban-Train-v1')
    model = DQN(MlpPolicy, env, gamma=gamma, learning_rate=learning_rate,
                buffer_size=buffer_size,
                exploration_fraction=exploration_fraction,
                exploration_final_eps=exploration_final_eps,
                exploration_initial_eps=exploration_initial_eps,
                train_freq=train_freq, batch_size=batch_size,
                double_q=double_q, learning_starts=learning_starts,
                verbose=verbose)
    model.learn(total_timesteps=timesteps)
    model.save("trained_models/dqn_sokoban_model")

    # Enjoy the trained agent
    obs = env.reset()
    print(model.action_probability(obs))
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()

def main():
    # Create the environment
    env = gym.make("gym_balanceBot-v0")

    if os.path.isfile("trained_model/dqn_balanceBot.zip") == False:
        # Instantiate the agent
        model = DQN('MlpPolicy', env, learning_rate=1e-3,
                    prioritized_replay=True, verbose=1)
        # Train the agent
        model.learn(total_timesteps=int(2e5))
        # Save the agent
        model.save("trained_model/dqn_balanceBot")
        del model  # delete trained model to demonstrate loading

        # Load the trained agent and evaluate it. Evaluating on env directly
        # avoids the pitfall that a model loaded without env= has no
        # environment attached (model.get_env() would return None).
        model = DQN.load("trained_model/dqn_balanceBot")
        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
    else:
        # Load the trained agent
        model = DQN.load("trained_model/dqn_balanceBot")

    # Enjoy the trained agent
    obs = env.reset()
    for i in range(3000):
        action, states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        sleep(1. / 240.)
    env.close()

def leveltrain(self, from_level, to_level, env, timesteps, level_modelpath,
               tensorboard_logs_path):
    model = DQN('MlpPolicy', env, verbose=1, policy_kwargs=self.policy_kwargs,
                prioritized_replay=True, buffer_size=100000,
                learning_rate=0.0003, exploration_final_eps=0,
                tensorboard_log=tensorboard_logs_path)
    model.save(level_modelpath)
    # Train the model on increasingly difficult levels
    for current_level in range(from_level, to_level + 1):
        env = gym.make('DeepWellEnvSpherlevel' + str(current_level) + '-v0')
        # Load the previous model and continue training it on the new level
        model = self.load(level_modelpath, tensorboard_logs_path)
        env_str = self.get_env_str(env)
        model.set_env(make_vec_env(env_str, n_envs=1))
        model.learn(total_timesteps=timesteps, reset_num_timesteps=False,
                    tb_log_name="TB_" + datetime.now().strftime('%d%m%y-%H%M'))
        # Save the newly trained model under a name ending in the level number
        level_modelpath = level_modelpath[0:-1] + str(current_level)
        model.save(level_modelpath)
        print("====================== Level " + str(current_level)
              + " finished with " + str(timesteps)
              + " timesteps ==========================")
    return model

def train_DQN():
    simulation_start_time = time.time()
    model = DQN(MlpPolicy, env, verbose=1,
                tensorboard_log="./gym_jobshop_tensorboard_logs/")
    custom_callback = CustomCallback()
    # View the Tensorboard logs from a terminal in the "masterarbeit" folder
    # (the root folder of the project):
    # tensorboard --logdir ReinforcementLearning/gym_jobshop_tensorboard_logs/DQN_1
    #
    # Keyboard input: what should the user be able to do?
    # a) train for x steps  # 10000
    # b) print current values:
    #    dqn proba_step(latest state from the environment as observation + 2 fixed observations)
    #    dqn step(see above)
    model.learn(total_timesteps=10000, callback=custom_callback)
    model.save("deepq_jobshop")
    print("Training finished after "
          + str(round(time.time() - simulation_start_time, 4)) + " seconds")
    return
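
# CustomCallback above is project-specific. A minimal placeholder sketch,
# assuming it only needs to be a valid stable_baselines callback:
from stable_baselines.common.callbacks import BaseCallback

class CustomCallback(BaseCallback):
    def _on_step(self):
        # project-specific logging/inspection would go here
        return True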