# Imports assumed by this snippet (stable-baselines v2 API; `prioritized_replay`
# is a v2 DQN argument and does not exist in stable-baselines3).
import gym
from stable_baselines import DQN
from stable_baselines.common.evaluation import evaluate_policy


def LunarLander_v2_DQN():  # TODO: currently raises an error
    # Create environment
    env = gym.make('LunarLander-v2')
    # Instantiate the agent
    model = DQN('MlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, verbose=1)
    # Train the agent
    model.learn(total_timesteps=100000)
    # Save the agent
    model.save("dqn_lunar")
    del model  # delete trained model to demonstrate loading
    # Load the trained agent
    model = DQN.load("dqn_lunar")
    # Evaluate the agent
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    print(mean_reward, std_reward)
    # Enjoy trained agent
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
def main():
    retro.data.Integrations.add_custom_path(
        os.path.join(SCRIPT_DIR, "custom_integrations"))
    print("PokemonRed-GameBoy" in retro.data.list_games(
        inttype=retro.data.Integrations.ALL))
    env = retro.make("PokemonRed-GameBoy", inttype=retro.data.Integrations.ALL)
    print(env)
    print(env.action_space)
    time.sleep(3)
    # Note: the lambda captures the name `env`; make_vec_env calls it immediately,
    # before the rebinding below completes, so this works but is fragile.
    env = make_vec_env(lambda: env, n_envs=1)
    # check_env(env, warn=True)
    time.sleep(3)
    model = DQN(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
    env.close()  # unreachable: the loop above never breaks
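# Hedged note: gym-retro custom integrations are conventionally laid out as
# custom_integrations/PokemonRed-GameBoy/ containing rom.sha, data.json and
# scenario.json alongside the ROM itself. This layout is an assumption based on
# gym-retro conventions; the snippet above does not show it.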
def run_model(algorithm, training_timesteps, testing_timesteps,
              training_iterations, testing_iterations, learning_rate, batch_size):
    # DataFrame to collect evaluation statistics (missing in the original snippet)
    df = pd.DataFrame(columns=['Mean Rewards', 'Standard deviation'])
    model = DQN(CustomPolicy, env, learning_rate=learning_rate, batch_size=batch_size)
    for k in range(training_iterations):
        model.learn(total_timesteps=int(training_timesteps))
        model.save("{}_{}_{}_{}".format("rcrs_wgts", k, algorithm, hostname))
        subprocess.Popen(path_for_kill_file, shell=True)
    for j in range(testing_iterations):
        # Load the trained agent
        model = DQN.load("{}_{}_{}_{}".format("rcrs_wgts", j, algorithm, hostname))
        # Reset the environment
        obs = env.reset()
        # Create an empty list to store reward values
        final_rewards = []
        for _ in range(testing_timesteps):
            # Predict the action
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                final_rewards.append(rewards)
        # Print the mean reward
        print(np.mean(final_rewards))
        # Print the standard deviation of reward
        print(np.std(final_rewards))
        # Append the mean and standard deviation to the DataFrame
        df = df.append(
            {
                'Mean Rewards': np.mean(final_rewards),
                'Standard deviation': np.std(final_rewards)
            },
            ignore_index=True)
        # The original passed sep/index into str.format by mistake; they belong to to_csv.
        df.to_csv("{}_{}_{}".format(1, algorithm, "MeanAndStdReward.csv"),
                  sep=',', index=True)
        subprocess.Popen(path_for_kill_file, shell=True)
    subprocess.Popen(path_for_kill_file, shell=True)
def main():
    retro.data.Integrations.add_custom_path(
        os.path.join(SCRIPT_DIR, "custom_integrations"))
    print("PokemonRed-GameBoy" in retro.data.list_games(
        inttype=retro.data.Integrations.ALL))
    env = retro.make("PokemonRed-GameBoy",
                     inttype=retro.data.Integrations.ALL,
                     use_restricted_actions=retro.Actions.DISCRETE)
    print(env)
    # print(env.action_space)
    # time.sleep(3)
    # env = make_vec_env(lambda: env, n_envs=1)
    # check_env(env, warn=True)
    # time.sleep(3)
    model = DQN(MlpPolicy, env, verbose=1)
    print("STARTING Training!!!")
    start_time = time.time()
    model.learn(total_timesteps=50000)
    print("TRAINING COMPLETE! Time elapsed: ", str(time.time() - start_time))
    print("Attempting to get first pokemon!")
    start_time = time.time()
    printed_done = False
    sampled_info = False
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        if not sampled_info:
            print("Here's the info that the AI uses:\n")
            print("obs:\n", obs, "\n</obs>\n")
            print("rewards:\n", rewards, "\n</rewards>\n")
            print("dones:\n", dones, "\n</dones>\n")
            print("Info:\n", info, "\n</info>\n")
            sampled_info = True
        if dones and not printed_done:
            print("Success! time elapsed: ", str(time.time() - start_time))
            printed_done = True
    env.close()
def optimize_agent(trial):
    env = wds(wds_name=hparams['env']['waterNet'] + '_master',
              speed_increment=hparams['env']['speedIncrement'],
              episode_len=hparams['env']['episodeLen'],
              pump_groups=hparams['env']['pumpGroups'],
              total_demand_lo=hparams['env']['totalDemandLo'],
              total_demand_hi=hparams['env']['totalDemandHi'],
              reset_orig_pump_speeds=hparams['env']['resetOrigPumpSpeeds'],
              reset_orig_demands=hparams['env']['resetOrigDemands'])
    model_params = optimize_dqn(trial)
    dict_layers = optimize_arch(trial)
    model = DQN(policy=CustomPolicy,
                policy_kwargs=dict_layers,
                env=env,
                verbose=0,
                train_freq=1,
                learning_starts=10000,
                buffer_size=350000,
                exploration_fraction=.95,
                exploration_final_eps=.0,
                param_noise=False,
                prioritized_replay=False,
                tensorboard_log=None,
                n_cpu_tf_sess=1,
                **model_params)
    model.learn(total_timesteps=1000000)
    rewards = []
    n_episodes, reward_sum = 0, 0.0
    env.randomize_demands()
    obs = env.reset(training=False)
    while n_episodes < 50:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action, training=False)
        if done:
            rewards.append(reward)
            n_episodes += 1
            env.randomize_demands()
            obs = env.reset(training=False)
    mean_reward = np.mean(rewards)
    trial.report(-1 * mean_reward)
    del env, model
    gc.collect()
    return -1 * mean_reward
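# Hedged usage sketch: optimize_agent above is an Optuna objective (it takes a
# trial, reports progress, and returns the negated mean reward), so a study
# could drive it as below; the trial count is an illustrative assumption.
import optuna

study = optuna.create_study()  # minimizes by default, matching the negated reward
study.optimize(optimize_agent, n_trials=100)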
def test_baselineEnv():
    try:
        import gym
        from stable_baselines.common.vec_env import DummyVecEnv
        from stable_baselines.deepq.policies import MlpPolicy
        from stable_baselines import DQN
        env = gym.make('CartPole-v1')
        model = DQN(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=20)
        action, _ = model.predict(env.reset())
        env.step(action)
        return True
    except Exception as er:
        assert False, er
def test_action_mask_run_dqn(vec_env, policy, env_class):
    env = vec_env([env_class])
    model = DQN(policy, env, verbose=0)
    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)
        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)
    env.close()
class DqnController:
    """ Implements an RL (DQN) controller """

    def __init__(self, env):
        """
        :param env: a thermostat environment
        """
        self.env = env
        self.model = DQN(MlpPolicy, env, verbose=1,
                         tensorboard_log="./dqn_thermostat_tensorboard/")

    @staticmethod
    def name():
        return "Dqn"

    def train(self):
        self.model.learn(total_timesteps=50000)

    def save(self):
        self.model.save("dqn.pk")

    def load(self):
        self.model = None
        self.model = DQN.load("dqn.pk")

    def simulate(self):
        state = self.env.reset()
        cumulative_reward = 0.0
        P_consumed = []
        done = False
        while not done:
            action, _state = self.model.predict(state)
            state, reward, done, info = self.env.step(action)
            cumulative_reward += reward
            P_consumed.append(action)
        print("MSE Setpoint- realized: %.3f - Energy consumed: %.2f" %
              (cumulative_reward, sum(P_consumed)))
        result_folder = "results/" + self.name() + "/" + \
            self.env.start_date.strftime("%m-%d-%Y") + "_to_" + \
            self.env.end_date.strftime("%m-%d-%Y")
        self.env.store_and_plot(result_folder)

    def set_env(self, env):
        self.env = env
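# Hedged usage sketch of the controller above; `ThermostatEnv` is a placeholder
# name for the (unshown) environment class this controller expects.
env = ThermostatEnv()  # hypothetical environment constructor
controller = DqnController(env)
controller.train()
controller.save()
controller.simulate()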
def AirRaid_main():
    env = retro.make('AirRaid-Atari2600', use_restricted_actions=retro.Actions.DISCRETE)
    model = DQN(CnnPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("AirRaid_Model")
    del model
    model = DQN.load("AirRaid_Model")
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rew, done, info = env.step(action)
        # env.render()
        if done:
            obs = env.reset()
    env.close()
def run_model(algorithm, training_timesteps, testing_timesteps,
              training_iterations, testing_iterations, learning_rate, batch_size):
    columns = ['Mean Rewards', 'Standard deviation']
    df = pd.DataFrame(columns=columns)
    if algorithm == "PPO2":
        from stable_baselines.common.policies import MlpPolicy
        model = PPO2(MlpPolicy, env, verbose=1, learning_rate=learning_rate,
                     tensorboard_log="./{}_rcrs_tensorboard/".format(hostname),
                     n_steps=batch_size)
    else:
        from stable_baselines.deepq.policies import MlpPolicy
        model = DQN(MlpPolicy, env, verbose=1, learning_rate=learning_rate,
                    tensorboard_log="./{}_rcrs_tensorboard/".format(hostname),
                    batch_size=batch_size)
    for k in range(training_iterations):
        # Train the agent
        model.learn(total_timesteps=int(training_timesteps))
        # Save the model
        model.save("{}_{}_{}_{}".format("rcrs_wgts", k, algorithm, hostname))
        subprocess.Popen(path_for_kill_file, shell=True)
    for j in range(testing_iterations):
        # Load the trained agent
        if algorithm == "PPO2":
            model = PPO2.load("{}_{}_{}_{}".format("rcrs_wgts", j, algorithm, hostname))
        else:
            model = DQN.load("{}_{}_{}_{}".format("rcrs_wgts", j, algorithm, hostname))
        # Reset the environment
        obs = env.reset()
        # Create an empty list to store reward values
        final_rewards = []
        for _ in range(testing_timesteps):
            # Predict the action
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                final_rewards.append(rewards)
        # Print the mean reward
        print(np.mean(final_rewards))
        # Print the standard deviation of reward
        print(np.std(final_rewards))
        # Append the mean and standard deviation to the DataFrame
        df = df.append({'Mean Rewards': np.mean(final_rewards),
                        'Standard deviation': np.std(final_rewards)},
                       ignore_index=True)
        # sep/index were mistakenly passed to str.format in the original; they belong to to_csv.
        df.to_csv("{}_{}_{}".format(algorithm, hostname, "MeanAndStdReward.csv"),
                  sep=',', index=True)
        subprocess.Popen(path_for_kill_file, shell=True)
    subprocess.Popen(path_for_kill_file, shell=True)
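# Hedged note: DataFrame.append was removed in pandas 2.0; on current pandas the
# equivalent of the append call above would be:
# df = pd.concat([df, pd.DataFrame([{'Mean Rewards': np.mean(final_rewards),
#                                    'Standard deviation': np.std(final_rewards)}])],
#                ignore_index=True)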
def run():
    # hyperparameters
    gamma = 0.99  # discount factor
    learning_rate = 0.00025  # learning rate for the Adam optimizer
    buffer_size = 50000  # size of the replay buffer
    exploration_fraction = 0.1  # fraction of the training period over which exploration is annealed
    exploration_final_eps = 0.02  # final value of the random-action probability
    exploration_initial_eps = 1.0  # initial value of the random-action probability
    train_freq = 1  # update the model every train_freq steps
    batch_size = 32  # size of a batch sampled from the replay buffer for training
    double_q = True  # whether to enable Double-Q learning
    learning_starts = 100  # how many steps to collect transitions for before learning starts
    timesteps = 1000  # 2000
    verbose = 1

    env = gym.make('Boxoban-Train-v1')
    model = DQN(MlpPolicy, env, gamma=gamma, learning_rate=learning_rate,
                buffer_size=buffer_size, exploration_fraction=exploration_fraction,
                exploration_final_eps=exploration_final_eps,
                exploration_initial_eps=exploration_initial_eps,
                train_freq=train_freq, batch_size=batch_size, double_q=double_q,
                learning_starts=learning_starts, verbose=verbose)
    model.learn(total_timesteps=timesteps)
    model.save("trained_models/dqn_sokoban_model")

    # Enjoy trained agent
    obs = env.reset()
    print(model.action_probability(obs))
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
def recieve(sid, data):  # name kept as-is; likely bound to a socket.io event elsewhere
    global done
    global reward
    global maxactions
    jsonInput = json.loads(data)
    maxactions = jsonInput['maxactions']
    trainepisodes = jsonInput['trainepisodes']
    evalepisodes = jsonInput['evalepisodes']
    totalepisodes = trainepisodes + evalepisodes
    env = UnrealEnvWrap()
    # wrap it
    env = make_vec_env(lambda: env, n_envs=1)

    # Train the agent with different algorithms from stable baselines
    # model = DQN(MlpPolicy, env, verbose=1, tensorboard_log="./DQN_newobservations/")
    model = DQN(MlpPolicy, env, verbose=1)
    # model = A2C(MlpPolicy, env, verbose=1, tensorboard_log="./A2C_newobservations/")
    # model = A2C(MlpPolicy, env, verbose=1)
    print("Agent training in process...")
    model.learn(total_timesteps=trainepisodes)

    # Test the trained agent (currently not needed, all testing occurs in Unreal itself)
    env.render(mode='console')
    # env.render()
    obs = env.reset()
    print("Training complete, Starting Evaluation of Trained Model:")
    intaction = 0
    # Begin strategic behavior
    for step in range(evalepisodes):
        action, _ = model.predict(obs, deterministic=True)
        intaction = action[0]
        print("Action: ", intaction)
        obs, reward, done, info = env.step(action)
        print('obs=', obs, 'reward=', reward, 'done=', done)
    sio.disconnect(sid)
def main():
    # create the environment
    env = gym.make("gym_balanceBot-v0")

    if not os.path.isfile("trained_model/dqn_balanceBot.zip"):
        # Instantiate the agent
        model = DQN('MlpPolicy', env, learning_rate=1e-3,
                    prioritized_replay=True, verbose=1)
        # Train the agent
        model.learn(total_timesteps=int(2e5))
        # Save the agent
        model.save("trained_model/dqn_balanceBot")
        del model  # delete trained model to demonstrate loading
        # Load the trained agent
        model = DQN.load("trained_model/dqn_balanceBot")
        # Evaluate the agent
        mean_reward, std_reward = evaluate_policy(model, model.get_env(),
                                                  n_eval_episodes=10)
    else:
        # Load the trained agent
        model = DQN.load("trained_model/dqn_balanceBot")

    # Enjoy trained agent
    obs = env.reset()
    for i in range(3000):
        action, states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        sleep(1. / 240.)  # step at PyBullet's default 240 Hz
    env.close()
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_dqn(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = DQN("MlpPolicy", env, verbose=0, **model_params)
    print("DOING LEARNING dqn")
    original_env.force_progression = False
    # Note: learn() only accepted a seed argument in early stable-baselines
    # releases; newer versions take seed in the model constructor instead.
    model.learn(int(2e4), seed=seed)
    print("DONE LEARNING dqn")
    original_env.max_invalid_tries = -1
    rewards = []
    n_episodes, reward_sum = 0, 0.0
    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()
    last_reward = np.mean(rewards)
    trial.report(last_reward)
    return last_reward
# Excerpt: body of a loop over rows indexed by j
symptomatic_infection = array[j][5] / total_symptomatic_infection
asymptomatic_infection = array[j][4] / total_asymptomatic_infection
recovered = array[j][6] / (total_recovered + 0.01)
pathogen = array[j][7] / total_pathogen
env = DistributionEnv(1, 1, 1000000, susceptible, exposed, symptomatic_infection,
                      asymptomatic_infection, recovered, pathogen, 200)
nn_model = DQN('MlpPolicy', env, learning_rate=1e-3,
               prioritized_replay=True, verbose=1)
nn_model.learn(total_timesteps=int(1e4), log_interval=10000)
observation = env.reset()
action, states = nn_model.predict(observation)
# The custom DistributionEnv apparently returns (observation, done, reward);
# a standard gym env would return (obs, reward, done, info) instead.
observation, done, reward = env.step(action)
day_actions.append(action)
day_rewards.append(reward)
actions.append(day_actions)
rewards.append(day_rewards)

distributions = []
for action in actions:
    s = sum(action)
    l = []
    for a in action:
        l.append(a / s)
    distributions.append(l)
print(actions)
print(distributions)
print(rewards)
class DQNAgentBaseline(BaseAgent):
    def __init__(self, state_size, action_size, agent_settings, is_agent_to_load,
                 env, signal_done, signal_episode, statistic: StatisticsBaseline,
                 game_settings, game_type, agent_to_load_directory, game_name):
        """
        :param state_size:
        :param action_size:
        :param agent_settings:
        :param is_agent_to_load:
        :param env:
        :param signal_done:
        :param signal_episode:
        :param statistic:
        :param game_settings:
        :param game_type: can be box or atari
        """
        super().__init__(state_size, action_size, agent_settings, is_agent_to_load, game_name)
        self.env = env
        self.is_baseline = True
        self.signal_done = signal_done
        self.signal_episode = signal_episode
        self.statistic = statistic
        self.game_settings = game_settings
        self.gamma = agent_settings.gamma
        self.learning_rate = agent_settings.learning_rate
        self.epsilon_decay = agent_settings.exploration_decay
        self.epsilon_min = agent_settings.mnimal_exploration  # attribute name kept as defined on agent_settings
        self.batch_size = agent_settings.mini_batch
        self.replay_size = agent_settings.replay_size
        self.last_episode_emitted = 0
        self.game_type = game_type
        self.start_time = time.time()
        self.last_save_time = time.time()
        if is_agent_to_load:
            self.load_model(agent_to_load_directory)
        else:
            self.build_model()

    def build_model(self):
        if self.game_type == "box":
            self.env = DummyVecEnv([lambda: self.env])
            self.model = DQN(MlpPolicy, self.env, verbose=0, gamma=self.gamma,
                             exploration_fraction=self.epsilon_decay,
                             exploration_final_eps=self.epsilon_min,
                             learning_rate=self.learning_rate,
                             buffer_size=self.replay_size,
                             batch_size=self.batch_size)
        if self.game_type == "atari":
            self.model = DQN(CnnPolicy, self.env, verbose=1, gamma=self.gamma,
                             exploration_fraction=self.epsilon_decay,
                             exploration_final_eps=self.epsilon_min,
                             learning_rate=self.learning_rate,
                             buffer_size=self.replay_size,
                             batch_size=self.batch_size)

    def update_target_model(self):
        super().update_target_model()

    def get_action(self, state):
        action, _states = self.model.predict(state)
        return action

    def append_sample(self, state, action, reward, next_state, done):
        super().append_sample(state, action, reward, next_state, done)

    def save_model(self, file_name="./models/agentDQN"):
        self.model.save(file_name)
        out = open(file_name + ".txt", "w")
        out.write(self.game_name)
        out.close()

    def load_model(self, agent_to_load_directory):
        if agent_to_load_directory == "":
            self.model = DQN.load("./models/agentDQN.pkl", env=self.env)
        else:
            self.model = DQN.load(agent_to_load_directory, env=self.env)

    def train_model(self):
        self.model.learn(total_timesteps=self.game_settings.max_steps_number,
                         callback=self.callback)

    def callback(self, _locals, _globals):
        episodes = len(_locals['episode_rewards'])
        self.statistic.append_score(_locals['episode_rewards'], episodes)
        if episodes != self.last_episode_emitted and episodes > 1:
            self.signal_episode.emit(episodes - 1,
                                     self.statistic.get_current_mean_score(),
                                     _locals['episode_rewards'][-2],
                                     _locals['_'])
            self.last_episode_emitted = episodes
        if self.statistic.get_current_mean_score() >= self.game_settings.target_accuracy \
                or _locals['_'] + 1 >= self.game_settings.max_steps_number:
            self.signal_done.emit(episodes, self.statistic.get_current_mean_score())
            self.done = True
            output = open("./models/trenningResults.txt", "w")
            output.write("training time:" + str((time.time() - self.start_time) / 3600) + "h \n")
            output.write("number of episodes:" + str(episodes) + "\n")
            output.write("number of steps:" + str(_locals['_']) + "\n")
            output.close()
            return False
        if time.time() - self.last_save_time > 60 * 10:
            output = open("./models/trenningResults.txt", "w")
            output.write("training time:" + str((time.time() - self.start_time) / 3600) + "h \n")
            output.write("number of episodes:" + str(episodes) + "\n")
            output.write("number of steps:" + str(_locals['_']) + "\n")
            output.close()
            self.last_save_time = time.time()
            self.save_model("./models/agentDQNtemp")
        return True
model = DQN.load('../model/DQN_without_prioritized', env=env)

result = {}
mean_reward = []
scores = []
episodes = 1000
with open("../result/DQN_without_prioritized.txt", "w") as txtfile:
    for episode in range(1, episodes + 1):
        print(f"episode: {episode}")
        state = env.reset()
        done = False
        temp_result = {}
        score = 0
        while not done:
            action, _states = model.predict(state)
            n_state, reward, done, info = env.step(action)
            score += reward
            state = n_state  # advance the state (missing in the original, which predicted from the initial state forever)
        mean_reward.append(score)
        scores.append(info[0]['score'])
        temp = str(episode) + "," + str(score[0]) + "," + str(info[0]['score']) + "\n"
        txtfile.write(temp)

mean = sum(mean_reward) / len(mean_reward)
mean_score = sum(scores) / len(scores)
print(f"The mean reward is {mean}")
print(f"The mean score reward is {mean_score}")
print(f"The max score is {max(scores)}")
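# Hedged note: the indexing above (info[0]['score'], score[0]) implies `env` is a
# vectorized environment with a single sub-env; a plausible construction, with
# the env id as a placeholder, would be:
# from stable_baselines.common.vec_env import DummyVecEnv
# env = DummyVecEnv([lambda: gym.make("YourGame-v0")])  # hypothetical id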
def main():
    # parameters for the gym_carla environment
    params = {
        'number_of_vehicles': 25,
        'number_of_walkers': 0,
        'display_size': 256,  # screen size of bird-eye render
        'max_past_step': 1,  # the number of past steps to draw
        'dt': 0.1,  # time interval between two frames
        'discrete': True,  # whether to use discrete control space
        'continuous_accel_range': [-3.0, 3.0],  # continuous acceleration range
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'town': 'Town06',  # which town to simulate
        'task_mode': 'acc_1',  # mode of the task, [random, roundabout (only for Town03)]
        'max_time_episode': 1000,  # maximum timesteps per episode
        'max_waypt': 12,  # maximum number of waypoints
        'obs_range': 32,  # observation range (meter)
        'lidar_bin': 0.125,  # bin size of lidar sensor (meter)
        'd_behind': 12,  # distance behind the ego vehicle (meter)
        'out_lane_thres': 2.0,  # threshold for out of lane
        'desired_speed': 16.67,  # desired speed (m/s)
        'max_ego_spawn_times': 200,  # maximum times to spawn ego vehicle
        'display_route': True,  # whether to render the desired route
        'pixor_size': 64,  # size of the pixor labels
        'pixor': False,  # whether to output PIXOR observation
        'RGB_cam': True,  # whether to use RGB camera sensor
    }

    solver_params = {
        'layers': [64, 64, 64],
        'alpha': 0.001,
        'gamma': 0.99,
        'epsilon': 0.1,
        'replay_memory_size': 500000,
        'update_target_estimator_every': 10000,
        'batch_size': 64,
    }

    # Set gym-carla environment
    env = gym.make('carla-v0', params=params)
    # check_env(env)
    obs = env.reset()

    checkpoint_callback = CheckpointCallback(save_freq=5000,
                                             save_path='./dqn_checkpoint/',
                                             name_prefix='dqn_check')
    # model = DQN.load("./dqn_checkpoint/dqn_check_200_steps.zip", env=env, tensorboard_log="./dqn")
    model = DQN('LnMlpPolicy', env, learning_rate=1e-3,
                prioritized_replay=True, verbose=1, tensorboard_log="./dqn")
    model.learn(total_timesteps=35000, tb_log_name="35k-with_checkpoint",
                callback=checkpoint_callback)
    model.save("deepq_carla")
    del model  # remove to demonstrate saving and loading
    model = DQN.load("deepq_carla")

    obs = env.reset()
    for i in range(100):
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                obs = env.reset()
                break
    # Excerpt: the if-branch that creates the model is not shown
    model.learn(total_timesteps=1000000)
    model.save("./models/dqn_snake_multi_player")
else:
    model = DQN.load("./models/dqn_snake_multi_player")

print("finished training, now use the trained model and render the env")
n_episodes = 1
turn = 0
done_running = n_snakes
while done_running > 0:
    env.render()
    # Both branches are identical apart from the turn flag; the single model
    # acts for each snake alternately.
    if turn == 0:
        turn = 1
        action, state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        if done:
            done_running -= 1
        env.render()
    elif turn == 1:
        turn = 0
        action, state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        if done:
            done_running -= 1
        env.render()
env.close()
# Excerpt: tail of a DQN(...) constructor call
            gamma=0,
            exploration_fraction=0.6,
            exploration_final_eps=0,
            learning_rate=5e-4)
# model = DQN.load("VSL_iter9600ver2.zip", env=env)

start = time.time()
model.learn(total_timesteps=time_steps, callback=callback)
end = time.time()
model.save(env_id + "_iter" + str(time_steps) + "_lane" + str(num_lanes))
print("Training time: ", end - start)

# Results plot
# results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "Speed Limit Manager")
# plt.show()

# Additional logs: heatmap of the greedy action over the discretized state space
for k in range(3):
    array = np.zeros(shape=(10, 10))
    for i in range(10):
        for j in range(10):
            obs = [i, j, k + 1]
            array[i][j] = model.predict(obs, deterministic=True)[0]
    ax = sns.heatmap(array, linewidth=0.5)
    plt.show()

# Run simulation after training
# obs = env.reset()
# for _ in range(1000):
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = env.step(action)
#     env.render()
actions = []
# noisy conditions
S_n = [noisyObs[0]]
I_n = [noisyObs[1]]
R_n = [noisyObs[2]]
# max steps (days) for test
max_steps = 100
n_steps = 0  # for tracking number of steps taken
for step in range(max_steps):
    # increment
    n_steps += 1
    noisy_obs = env.noisy_state
    action, _ = model.predict(noisy_obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    # save data to be plotted
    S.append(obs[0])
    I.append(obs[1])
    R.append(obs[2])
    actions.append(action)
    # print update
    print("Step {}".format(step + 1))
    print("Action: ", action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    if done:
        print("Done.", "reward=", reward)
import gym
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

env = gym.make('ModuleSelect-v1')
model = DQN(
    env=env,
    policy=MlpPolicy,
    verbose=1,
)
print("> start train test")
model.learn(total_timesteps=1000)
env.close()
print("save the model")
model.save("test_dqn_model.pkl")
del model

model = DQN.load("test_dqn_model.pkl")
obs = env.reset()
print("> start load test")
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
env.reset()
env.close()
totalSteps = 0
totalRewSum = 0.0
bestSolution = -1
solvedDifferences = 0
for i_episode in range(1000):
    obs = env.reset()
    t = 0
    rewSum = 0
    useless = 0
    sameStep = 0
    badGood = 0
    while True:
        act, _states = model.predict(obs)
        obs, reward, done, info = env.step(act)
        print(obs)
        env.render()
        if info is not None:
            print()
            print("=== DEBUG INFO ===")
            print("Step: {0}/{1}".format(info[0]["current_step"], info[0]["max_step"]))
            print("Reward: ", info[0]["reward"])
            totalRewSum += float(info[0]["reward"])
            print("==== ACT INFO ====")
class CustomDQNPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomDQNPolicy, self).__init__(*args, **kwargs,
                                              layers=[16, 16],
                                              layer_norm=False,
                                              feature_extraction="mlp")


model = DQN(CustomDQNPolicy, env, verbose=1)
# model.learn(total_timesteps=25000)
# generate_expert_traj(model, r"I:\Code\BachelorThesis\cartpole\data\expert_cartpole", n_episodes=10)

# test it
reward_sum = 0.0
obs = env.reset()
for i in range(10):
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        env.render()
    print(reward_sum)
    reward_sum = 0.0
    obs = env.reset()
env.close()
def train_once(graph: nx.Graph,
               clusters: list,
               pos: dict,
               env_name: str = 'Controller-Select-v0',
               compute_optimal: bool = True,
               trained_model: DQN = None,
               steps: int = int(2e5),
               logdir: str = 'train_log_compare',
               env_kwargs: dict = {}) -> (DQN, float, float):
    """
    Main training loop. Initializes RL environment, performs training, and outputs results

    Args:
        graph (nx.Graph): NetworkX graph to train on
        clusters (list): List of lists of nodes in each cluster
        pos (dict): Graph rendering positions
        env_name (str): Name of Gym environment
        compute_optimal (bool): Whether to compute optimal set of controllers by brute-force
        trained_model (DQN): Provide starting model to train on
    Return:
        Trained model
    """
    # Selecting controllers one-at-a-time environment
    env = gym.make(env_name, graph=graph, clusters=clusters, pos=pos, **env_kwargs)
    heuristic_controllers, heuristic_distance = env.compute_greedy_heuristic()
    print("WMSCP Greedy Heuristic: {}, {}".format(heuristic_controllers, heuristic_distance))
    # for i in range(1000):
    #     env.reset()
    #     print(env.graph.size(weight='weight'))
    orig_graph = env.original_graph
    optimal_controllers = None
    if compute_optimal:
        print("Computing optimal!")
        optimal_controllers = env.calculateOptimal()

    # Generate custom replay buffer full of valid experiences to speed up exploration of training
    def add_wrapper(replay_buffer):
        # Replay buffer maxsize is by default 50000. Should this be lowered?
        # valid_controllers_set = [env._random_valid_controllers()
        #                          for i in range(int(replay_buffer._maxsize * 0.5 / len(clusters)))]
        # Uses heuristic controller set as initial 'random' controllers
        valid_controllers_set = env.graphCentroidAction()
        for valid_controllers in valid_controllers_set:
            # Really strange issue - obs_current follows the change in env.state,
            # making it equal to obs!
            obs_current = env.reset()
            for controller in valid_controllers:
                (obs, rew, done, _) = env.step(controller)
                replay_buffer.add(obs_current, controller, rew, obs, done)
                # For some reason, obs is a pointer which ends up being the very
                # last obs before reset, so need to copy
                obs_current = obs.copy()
        return replay_buffer

    # Agent
    model = None
    if trained_model is None:
        print("Creating new training model!")
        model = DQN(LnMlpPolicy, env, tensorboard_log=logdir, verbose=0,
                    full_tensorboard_log=True, exploration_initial_eps=0.5,
                    exploration_fraction=0.2, learning_starts=0,
                    target_network_update_freq=100, batch_size=32,
                    learning_rate=0.00025)
    else:
        print("Using provided training model!")
        model = trained_model
        model.set_env(env)
        model.tensorboard_log = logdir

    # Train the agent
    print("Training!")
    model.learn(total_timesteps=int(steps))  # , callback=callback)  # , replay_wrapper=add_wrapper)

    # Run a single run to evaluate the DQN
    obs = env.reset()
    reward = 0  # We want the last reward to be minimal (perhaps instead do cumulative?)
    reward_final = 0
    done = False
    action = None
    final_rl_actions = []
    while not done:
        action, states = model.predict(obs)
        (obs, rew, done, _) = env.step(action)
        final_rl_actions.append(action)
        reward += rew
        reward_final = rew

    # Show controllers chosen by the model
    env.render(mode='graph_end.png')
    print(env.controllers, reward_final)
    print("BEST EVER:")
    print(env.best_controllers, env.best_reward)
    best_reward = env.optimal_neighbors(graph, env.best_controllers)
    print(best_reward)
    average_graph = env.average_graph.copy()
    rl_controllers = env.controllers
    rl_best_controllers = env.best_controllers
    if env_name == 'Controller-Cluster-v0':
        rl_controllers.sort()
        rl_best_controllers.sort()
        cluster_len = len(clusters[0])
        for i in range(len(clusters)):
            rl_controllers[i] -= i * cluster_len
            rl_best_controllers[i] -= i * cluster_len
    env.reset(adjust=False, full=True)
    nx.write_gpickle(average_graph, 'average_graph.gpickle')
    env.graph = average_graph.copy()
    for cont in rl_controllers:
        (_, reward_final, _, _) = env.step(cont)
    print("RL Controllers on average change graph {} - {}".format(env.controllers, reward_final))
    env.reset(adjust=False, full=True)
    env.graph = average_graph.copy()
    for cont in rl_best_controllers:
        (_, reward_final, _, _) = env.step(cont)
    print("RL Best Controllers on average change graph {} - {}".format(env.best_controllers, reward_final))

    # Show controllers chosen using heuristic
    centroid_controllers, heuristic_distance = env.graphCentroidAction()
    # centroid_controllers, heuristic_distance = env.compute_greedy_heuristic()
    # Convert heuristic controllers to actual
    if env_name == 'Controller-Cluster-v0' or env_name == 'Controller-Cluster-Options-v0':
        # Assume all clusters same length
        centroid_controllers.sort()
        cluster_len = len(clusters[0])
        for i in range(len(clusters)):
            centroid_controllers[i] -= i * cluster_len
    env.reset(adjust=False, full=True)
    env.graph = average_graph.copy()
    for cont in centroid_controllers:
        (_, reward_final, _, _) = env.step(cont)
    env.render(mode='graph_heuristic.png')
    best_heuristic = reward_final
    print("Heuristic on average change graph {} - {}".format(env.controllers, reward_final))
    # print("Heuristic optimal {} - {}".format(*env.optimal_neighbors(graph, env.controllers)))
    heuristic_controllers = env.controllers

    rl_rewards = []
    heuristic_rewards = []
    rl_best_rewards = []
    NUM_GRAPHS = 100
    for i in range(NUM_GRAPHS):
        rl_reward = None
        heuristic_reward = None
        rl_best_reward = None
        env.reset()
        nx.write_gpickle(env.graph, '100Graphs/graph_{}.gpickle'.format(i))
        for cont in final_rl_actions:
            (_, rl_reward, _, _) = env.step(cont)
        env.reset(adjust=False, full=False)
        for cont in centroid_controllers:
            (_, heuristic_reward, _, _) = env.step(cont)
        env.reset(adjust=False, full=False)
        for cont in rl_best_controllers:
            (_, rl_best_reward, _, _) = env.step(cont)
        print("RL REWARD, RL BEST REWARD, HEURISTIC: {}\t{}\t{}".format(
            rl_reward, rl_best_reward, heuristic_reward))
        rl_rewards.append(rl_reward)
        heuristic_rewards.append(heuristic_reward)
        rl_best_rewards.append(rl_best_reward)

    def create_hist(fig, data, title=None, color=None):
        bins = np.arange(min(data) - 100, max(data) + 100, 100)
        plt.xlim([min(data) - 100, max(data) + 100])
        fig.hist(data, bins=bins, alpha=0.5, color=color)
        if title:
            fig.title(title)
        plt.xlabel('Controller Distances')
        plt.ylabel('Count')

    fig = plt.figure()
    ax1 = fig.add_subplot(2, 1, 1)
    create_hist(ax1, rl_rewards, color='blue')
    create_hist(ax1, heuristic_rewards, color='red')
    create_hist(ax1, rl_best_rewards, color='green')
    ax2 = fig.add_subplot(2, 1, 2)
    ax2.plot(np.arange(0, NUM_GRAPHS, 1), rl_rewards, c='blue')
    ax2.plot(np.arange(0, NUM_GRAPHS, 1), heuristic_rewards, c='red')
    ax2.plot(np.arange(0, NUM_GRAPHS, 1), rl_best_rewards, c='green')
    plt.show()

    # Show optimal
    if optimal_controllers is not None:
        env.reset()
        for cont in optimal_controllers[0]:
            (_, reward_final, _, _) = env.step(cont)
        env.render(mode='graph_optimal.png')
        print(env.controllers, reward_final)
        print(optimal_controllers)

    return model, best_reward, best_heuristic
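# Hedged usage sketch: invoking train_once on a small synthetic graph. The
# graph, clustering, and layout below are illustrative assumptions, not taken
# from the original code.
import networkx as nx

graph = nx.connected_watts_strogatz_graph(12, 4, 0.2, seed=0)
clusters = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]
pos = nx.spring_layout(graph)
model, best_reward, best_heuristic = train_once(
    graph, clusters, pos, compute_optimal=False, steps=int(1e4))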
tensorboard_folder = './tensorboard/Bomberman/base/'
model_folder = './models/Bomberman/base/'
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = 'Cnn'
model_tag = 'Cnn'
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv()])
env = VecFrameStack(env, 2)
# Note: `policy` is read from argv above, but CustomCnnPolicy is hardcoded here.
model = DQN(CustomCnnPolicy, env, verbose=0, tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='DQN' + model_tag)
model.save(model_folder + "DQN" + model_tag)
del model

model = DQN.load(model_folder + "DQN" + model_tag)
done = False
states = None
obs = env.reset()
while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
import gym
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

env = gym.make('Atlantis-ram-v4')

model = DQN(MlpPolicy, env, verbose=3)
model.learn(total_timesteps=1000000, log_interval=1)

observation = env.reset()
total_reward = 0
for i in range(18000):
    action, _states = model.predict(observation)
    observation, reward, done, info = env.step(action)
    env.render()
    total_reward += reward
    if done:
        break
print(total_reward)
def launchAgent(env_name: int, model_name: str, test_mode=False, filepath=None):
    """
    :param test_mode: whether to load the agent in test mode and just drive it.
        In that case no training takes place; only driving.
    :param env_name: id of the environment to load.
        1: environment without the minimap image, using distances between points.
        2: environment using the minimap image, with an updated reward.
        other values (default): the model currently in use; uses the minimap
        image with the reward updated once more.
    :param model_name: which model to use.
        DQN: loads a DQN model.
        HER: loads a HER model.
        other values (default): loads a PPO2 model.
    :return: the model that ran the last episode.
    """
    from stable_baselines import DQN, HER, PPO2

    if env_name == 1:
        from Reinforcement_AI.env.a_env import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    elif env_name == 2:
        from Reinforcement_AI.env.d_image_env import DetailedMiniMapEnv as DetailedMiniMapEnv1
        kart_env = DetailedMiniMapEnv1()
        policy = "CnnPolicy"
    elif env_name == 3:
        from Reinforcement_AI.env.a_env2 import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    elif env_name == 4:
        from Reinforcement_AI.env.a_env3 import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    else:  # env_name == "detailed_minimap_enhanced"
        from Reinforcement_AI.env.e_enhanced_image_env import DetailedMiniMapEnv as DetailedMiniMapEnv2
        kart_env = DetailedMiniMapEnv2()
        policy = "CnnPolicy"

    if model_name == "DQN":
        model = DQN(policy=policy, env=kart_env, double_q=True,
                    prioritized_replay=True, verbose=1)
    elif model_name == "HER":
        model = HER(policy=policy, env=kart_env, model_class=DQN, verbose=1)
    else:  # model_name == "PPO2"
        model = PPO2(policy=policy, learning_rate=0.0001, env=kart_env, verbose=1)

    if test_mode:
        # In test mode, load the agent and let it drive.
        # load() is a classmethod returning a new model; the original call
        # discarded its result.
        model = model.load(filepath)
        kart_env.set_continuos(True)
        while True:
            observation = kart_env.reset()
            while True:
                action, _states = model.predict(observation)
                observation, rewards, dones, info = kart_env.step(action)
                if dones:
                    break
    else:
        for i in range(1000):
            model.learn(total_timesteps=12500)
            model.save(str(env_name) + "_" + model_name + "_" + str(i + 1))
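# Hedged usage sketch, based only on the signature above: train a DQN agent on
# the minimap-image environment (env_name=2).
launchAgent(env_name=2, model_name="DQN")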
import os

import gym
import gym_donkeycar
import numpy as np
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

# SET UP ENVIRONMENT (os and gym imports added above; the snippet uses both)
os.environ['DONKEY_SIM_PATH'] = "./DonkeySimMac/donkey_sim.app/Contents/MacOS/donkey_sim"
os.environ['DONKEY_SIM_PORT'] = str(9091)
os.environ['DONKEY_SIM_HEADLESS'] = str(1)  # "1" is headless

env = gym.make("donkey-warehouse-v0")  # gym.make("donkey-generated-roads-v0")

timesteps = 100000  # Set this to a reasonable number
model_name = "dqn_model"  # Change the model name to your preferences
training = True  # Change this to test or use the model

if training:
    model = DQN(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=timesteps)
    model.save(model_name)
else:
    model = DQN.load(model_name)
    obv = env.reset()
    for t in range(10000):
        action, _states = model.predict(obv)  # drive straight with small speed
        # execute the action
        obv, reward, done, info = env.step(action)
class Defense:
    def __init__(self, method, K, P, adverse_set_prob=0.0, disj_supp_prob=0.0,
                 model_state=np.array([])):
        self.method = method
        self.K = K
        self.state_size = 2 * (self.K + 1)
        self.action_size = 2
        self.reward = []
        self.adverse_set_prob = adverse_set_prob
        self.disj_supp_prob = disj_supp_prob
        env_name = 'ErdosDefender-v0'
        self.log_dir = "/tmp/gym/"
        os.makedirs(self.log_dir, exist_ok=True)
        env = gym.make(env_name)
        env.init_params(K, P, adverse_set_prob, disj_supp_prob, model_state)
        env = Monitor(env, self.log_dir, allow_early_resets=True)
        self.envs = DummyVecEnv([lambda: env])
        if method == 'PPO':
            self.model = PPO2(MLP_PPO, self.envs, verbose=0)
        elif method == 'DQN':
            self.model = DQN(MLP_DQN, self.envs, verbose=0)
        elif method == 'A2C':
            self.model = A2C(MLP_A2C, self.envs, verbose=0)
        else:
            raise Exception("Error! method must be 'PPO', 'DQN' or 'A2C'")
        print("Model Initialized !")
        self.best_mean_reward, self.n_steps = -np.inf, 0

    def callback(self, _locals, _globals):
        """
        Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
        :param _locals: (dict)
        :param _globals: (dict)
        """
        # Print stats every 1000 calls
        if (self.n_steps + 1) % 1000 == 0:
            # Evaluate policy performance
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                print(x[-1], 'timesteps')
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                      .format(self.best_mean_reward, mean_reward))
                # New best model, you could save the agent here
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
        self.n_steps += 1
        return True

    def learn(self, timesteps=10000):
        self.model.learn(total_timesteps=timesteps, callback=self.callback)
        print("======\n{} LEARNING DONE DEFENSE\n======".format(self.method))

    def printViz(self, viz, k, N):
        plt.figure(figsize=(12, 8))
        for el in range(k + 1):
            plt.axhline(y=el - 0.5, linestyle='-')
        for el in range(N + 1):
            plt.axvline(x=el - 0.5, linestyle='-')
        plt.xticks(np.arange(N + 1))
        plt.yticks(np.arange(k + 1))
        plt.imshow(viz, origin='lower', cmap='gray', interpolation="none")
        plt.show()

    def simulate_trainedDefender(self):
        initial_state = self.envs.reset()
        A = initial_state[0][:self.K + 1]
        B = initial_state[0][self.K + 1:]
        N = np.sum(initial_state)
        viz = np.zeros((self.K + 1, N))
        for ind, el in enumerate((A + B).reshape(-1, 1)):
            viz[ind, :int(el)] = np.ones(int(el))
        print("Start..")
        print("Initial state:", A + B)
        self.printViz(viz, self.K, N)
        time.sleep(2)
        state = np.reshape(np.array(initial_state), [1, self.state_size])
        done = False
        while not done:
            clear_output(wait=True)
            print("Attacker turn..")
            partitionA = state[0][:self.K + 1]
            partitionB = state[0][self.K + 1:]
            print("Partitions : ", partitionA, partitionB)
            viz = np.zeros((self.K + 1, N))
            for i in range(self.K):
                ind1 = int(partitionA[i])
                ind2 = int(partitionB[i])
                viz[i, ind1:(ind1 + ind2)] = np.ones(ind2) * 0.3
                viz[i, :ind1] = np.ones(ind1) * 0.2
            self.printViz(viz, self.K, N)
            time.sleep(2)
            viz = np.zeros((self.K + 1, N))
            clear_output(wait=True)
            print("Defender turn..")
            action, _states = self.model.predict(state)
            state, reward, done, _ = self.envs.step(action)
            if len(_[0]) != 0:
                state = _[0]['terminal_observation']
                state = np.reshape(np.array(state), [1, self.state_size])
            state = np.reshape(np.array(state), [1, self.state_size])
            A = state[0][:self.K + 1]
            B = state[0][self.K + 1:]
            if action[0] == 1:
                print("Defender keeps:", partitionA)
                if partitionA[-1] > 0 or np.sum(partitionA) == 0:
                    done = True
            else:
                print("Defender keeps:", partitionB)
                if partitionB[-1] > 0 or np.sum(partitionB) == 0:
                    done = True
            for ind, el in enumerate((A + B).reshape(-1, 1)):
                if ind > 0:
                    viz[ind, :int(el)] = np.ones(int(el))
            self.printViz(viz, self.K, N)
            time.sleep(2)
            if done:
                if reward == 1:
                    print("Defender wins!!")
                else:
                    print("Attacker wins!!")
            else:
                partitionA = A
                partitionB = B

    def run(self, nb_episodes=1000):
        self.reward = []
        self.nb_episodes = nb_episodes
        for index_episode in range(nb_episodes):
            state = self.envs.reset()
            state = np.reshape(np.array(state), [1, self.state_size])
            done = False
            steps = 0
            while not done:
                action, _states = self.model.predict(state)
                next_state, reward, done, _ = self.envs.step(action)
                next_state = np.reshape(np.array(next_state), [1, self.state_size])
                state = next_state
                steps += 1
            if index_episode % 100 == 0:
                print("Episode {}#; \t Nb of steps: {}; \t Reward: {}.".format(
                    index_episode, steps + 1, reward))
            if index_episode > 0:
                self.reward += [((self.reward[-1] * len(self.reward)) + reward) /
                                (len(self.reward) + 1)]
            else:
                self.reward += [reward]