def test_deepq():
    """
    test DeepQ on atari
    """
    logger.configure()
    set_global_seeds(SEED)
    env = make_atari(ENV_ID)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)

    model = DQN(env=env, policy=CnnPolicy, learning_rate=1e-4, buffer_size=10000,
                exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4,
                learning_starts=10000, target_network_update_freq=1000, gamma=0.99,
                prioritized_replay=True, prioritized_replay_alpha=0.6, checkpoint_freq=10000)
    model.learn(total_timesteps=NUM_TIMESTEPS)

    env.close()
    del model, env
def train_DQN(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    global n_steps, best_mean_reward
    best_mean_reward, n_steps = -np.inf, 0

    policy = kwargs['policy']
    n_timesteps = kwargs['n_timesteps']
    del kwargs['policy']
    del kwargs['n_timesteps']

    model = DQN(policy, env, verbose=1, tensorboard_log=os.path.join(log_dir, 'tb'),
                full_tensorboard_log=True, checkpoint_path=log_dir, seed=seed, **kwargs)
    model.learn(total_timesteps=n_timesteps, callback=log_callback)

    return model
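# Usage sketch for train_DQN (illustrative only): the environment, output
# directory and hyperparameter values below are assumptions, and a module-level
# `log_callback` must already be defined, as the call above expects.
import gym

env = gym.make('CartPole-v1')
model = train_DQN(env, './runs/cartpole', seed=0,
                  policy='MlpPolicy',
                  n_timesteps=100000,
                  learning_rate=1e-3,
                  buffer_size=50000)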
def main(args):
    """
    Run a trained model for the vrep

    :param args: (ArgumentParser) the input arguments
    """
    env = gym.make('vrep-v0')
    model = DQN.load("./model/cartpole_model3000.pkl", env)
    model_index = [x for x in range(3000, 54001, 3000)]
    # model = DQN.load("./model/bk1/cartpole_model3000.pkl", env, tensorboard_log='./log')
    for index in model_index:
        model = DQN.load("./model/cartpole_model" + str(index) + ".pkl", env)
        all_episode_rewards = []
        all_try_steps = []
        all_success = []
        for i in range(100):
            obs, done = env.reset(), False
            episode_rew = 0
            try_steps = 0
            while not done:
                action, _ = model.predict(obs)
                obs, rew, done, _ = env.step(action)
                # print('action: %d, reward: %f, counts: %d' % (action, rew, env.counts))
                episode_rew += rew
                try_steps += 1
            # An episode counts as a success if it ends with a positive reward
            if done and rew > 0:
                success = 1
            else:
                success = 0
            print("Episode reward: %d, try_steps: %d, success: %d" % (episode_rew, try_steps, success))
            all_episode_rewards.append(episode_rew)
            if success:
                all_try_steps.append(try_steps)
            else:
                all_try_steps.append(30)
            all_success.append(success)
            # No render is only used for automatic testing
            if args.no_render:
                break
        # Average reward, steps and success rate for this checkpoint
        ave_rew = sum(all_episode_rewards) / len(all_episode_rewards)
        ave_step = sum(all_try_steps) / len(all_try_steps)
        success_rate = sum(all_success) / len(all_success)
        print("ave_rew: %f, ave_step: %f, success_rate: %f" % (ave_rew, ave_step, success_rate))
        res = ','.join([str(ave_rew), str(ave_step), str(success_rate)])
        with open('./log/test_results.txt', 'a') as f:
            f.write(res + '\n')
    env.reset_simulation()
    env.close()
def main():
    setupOdrive()

    # sanity check
    if drive is None:
        print("Failed to initialize Odrive. Exiting...")
        exit()

    _setCurrent(0)

    nnBal = DQN.load("deepq_policy_bal.zip.quiet")
    nnUp = DQN.load("deepq_policy_up.zip.quiet")

    initObs()
    obs = getObs()
    reward = 0

    i = 0
    while i < LOOP_COUNT:
        i += 1
        mark = time.time()
        action = 0
        if checkArmVel(obs) and checkPoleVel(obs):
            policy = None
            if abs(obs[2]) > cm.deg2Rad(10):
                policy = "spin-up"
                action, _ = nnUp.predict(np.array(obs))
            else:
                policy = "balance"
                action, _ = nnBal.predict(np.array(obs))
            current = computeCurrent(action)
            print("%s\taction: %d\tcurrent: %.2f" % (policy, action, current))
            setCurrent(current, obs)
        diff = time.time() - mark
        render = True
        while diff < LOOP_COUNT:
            obs = getObs(render)
            cur = getCurrent()
            render = False
            buf.append({"target": current_target, "obs": obs, "i": cur})
            diff = time.time() - mark
        reward += math.cos(abs(obs[2])) - abs(current / MAX_CURRENT) * 0.001

    print("Episode reward: %.1f\tdata len: %d" % (reward, len(buf_current)))
    _setCurrent(0)

    stamp = int(time.time() / 1)
    fname = "data/current_data_3_b_" + str(stamp) + ".json"
    with open(fname, 'w') as f:
        json.dump(buf, f)
    print("Wrote behavioral data to: " + fname)
def main(args):
    """
    Run a trained model for the mountain car problem

    :param args: (ArgumentParser) the input arguments
    """
    env = gym.make("MountainCar-v0")
    model = DQN.load("mountaincar_model.zip", env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if not args.no_render:
                env.render()
            # Epsilon-greedy
            if np.random.random() < 0.02:
                action = env.action_space.sample()
            else:
                action, _ = model.predict(obs, deterministic=True)
            obs, rew, done, _ = env.step(action)
            episode_rew += rew
        print("Episode reward", episode_rew)
        # No render is only used for automatic testing
        if args.no_render:
            break
def main():
    """
    Run the atari test
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)
    args = parser.parse_args()

    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)
    policy = partial(CnnPolicy, dueling=args.dueling == 1)

    model = DQN(
        env=env,
        policy=policy,
        learning_rate=1e-4,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )
    model.learn(total_timesteps=args.num_timesteps)

    env.close()
def mainHybrid():
    env = fed.FurutaEnvTorqueDeepq(cm.RUN, render=True)
    # env.setRender(True)
    modelBal = DQN.load(POLICY_PATH + "deepq_policy_bal_nn.zip", env)
    modelUp = DQN.load(POLICY_PATH + "deepq_policy_up_nn.zip", env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if abs(obs[2]) > cm.deg2Rad(cm.ANGLE_TERMINAL_MIN_D):
                action, _ = modelUp.predict(obs)
            else:
                action, _ = modelBal.predict(obs)
            obs, rew, done, _ = env.step(action)
            speedCheck(obs)
            episode_rew += rew
        print("Episode reward: %.3f" % episode_rew)
def __init__(self, env=None):
    """Return only every `skip`-th frame"""
    super(FireOtherSkipEnv, self).__init__(env)
    # most recent raw observations (for max pooling across time steps)
    self._obs_buffer = deque(maxlen=2)
    self.env = wrap_atari_dqn(self.env)
    model_output = '/home/jingjia16/stable-baselines/scripts/deepq_pong.zip'
    if os.path.exists(model_output):
        self.model = DQN.load(model_output)
    else:
        print("failed to load the model")
def test_DQN(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)

    model = DQN.load(os.path.join(out_dir, 'best_model.pkl'))

    # Evaluate the trained agent
    mean_reward = evaluate(env, model, num_steps=10000)

    return mean_reward
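# `evaluate` is not defined in this snippet. A minimal helper matching the
# evaluate(env, model, num_steps) call above might look like the sketch below;
# the exact metric used by the original code is an assumption.
import numpy as np

def evaluate(env, model, num_steps=10000):
    # Run the greedy policy for num_steps environment steps and report the
    # mean reward over completed episodes.
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, _ = env.step(action)
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
    completed = episode_rewards[:-1] if len(episode_rewards) > 1 else episode_rewards
    mean_reward = float(np.mean(completed))
    print("Mean reward: %.1f over %d episode(s)" % (mean_reward, len(completed)))
    return mean_reward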
def main(self, args):
    """
    Train and save the DQN model, for the cartpole problem

    :param args: (ArgumentParser) the input arguments
    """
    # env = gym.make('CartPole-v1')
    # model = DQN(MlpPolicy, env, verbose=1)
    # model.load("cartpole_model.pkl")
    # NOTE: `env` is not created here (the gym.make line above is commented out),
    # so it must be defined elsewhere, e.g. as an attribute or module-level variable.
    model = DQN(env=env, policy=CustomPolicy, learning_rate=1e-3, buffer_size=50000,
                exploration_fraction=0.01, exploration_final_eps=0.02, verbose=1)
    model.learn(total_timesteps=args.max_timesteps, callback=self.callback)

    print("Saving model to cartpole_model.pkl")
    model.save("cartpole_model.pkl")

# if __name__ == '__main__':
#     parser = argparse.ArgumentParser(description="Train DQN on cartpole")
#     parser.add_argument('--max-timesteps', default=100000000, type=int, help="Maximum number of timesteps")
#     args = parser.parse_args()
#     main(args)
def test(env, agent, filepath):
    model = DQN.load(filepath)
    obs = env.reset()
    episode_count = 0
    while True:
        if episode_count == settings.testing_nb_episodes_per_model:
            exit(0)
        else:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                env.reset()
                episode_count += 1
def load_model(nw_type, log_dir, env_name, log_name, save_name, model=None):
    log_dir = log_dir + env_name + '/'
    if log_name is not None:
        log_dir = log_dir + log_name + '/'

    if nw_type.lower() in ('deepq', 'deep', 'dqn'):
        return DQN.load(log_dir + 'DeepQ/' + save_name + '/' + save_name + '.pkl')
    elif nw_type.lower() in ('ppo', 'ppo2', 'ppo1'):
        return PPO2.load(log_dir + 'PPO/' + save_name + '/' + save_name + '.pkl')
    elif nw_type.lower() == 'ddqn':
        model.load_weights(log_dir + 'DoubleDQN/' + save_name + '/' + save_name + '.h5f')
        return model
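# Illustrative call (directory layout and names are assumptions): loads a DQN
# agent saved under ./Logs/CartPole-v1/run1/DeepQ/deep_5E-04/deep_5E-04.pkl
agent = load_model('dqn', './Logs/', 'CartPole-v1', 'run1', 'deep_5E-04')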
def setup(difficulty_level='default', env_name="AirSimEnv-v42"):
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.6
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    env = gym.make(env_name)
    env.init_again(getattr(settings, difficulty_level + "_range_dic"))

    # Vectorized environments make it easy to multiprocess training;
    # the algorithms require a vectorized environment to run.
    vec_env = DummyVecEnv([lambda: env])

    agent = DQN(MlpPolicy, vec_env, verbose=1)
    env.set_model(agent)
    return env, agent
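# Illustrative usage (training budget and file name are assumptions): build the
# AirSim environment and DQN agent, then train and save the policy.
env, agent = setup(difficulty_level='default')
agent.learn(total_timesteps=100000)
agent.save('airsim_dqn_default.pkl')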
def mainUp():
    env = fed.FurutaEnvTorqueDeepqUp(cm.RUN, render=True)
    # env.setRender(True)
    model = DQN.load(POLICY_PATH + "deepq_policy_up.zip", env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            speedCheck(obs)
            episode_rew += rew
            count += 1
        print("Episode average reward: %.3f" % (episode_rew / count))
def main():
    """
    Run a trained model for the pong problem
    """
    env = gym.make("PongNoFrameskip-v4")
    env = deepq.wrap_atari_dqn(env)
    model = DQN.load("pong_model.pkl", env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            episode_rew += rew
        print("Episode reward", episode_rew)
def main(args):
    """
    Train and save the DQN model for the slab-installing task in the V-REP environment

    :param args: (ArgumentParser) the input arguments
    """
    # env = gym.make("CartPole-v0")
    # model = DQN(
    #     env=env,
    #     policy=MlpPolicy,
    #     verbose=1,
    #     learning_rate=1e-3,
    #     buffer_size=50000,
    #     exploration_fraction=0.1,
    #     exploration_final_eps=0.02,
    #     tensorboard_log='./log',
    # )
    # model.learn(total_timesteps=args.max_timesteps, callback=callback)
    # print("Saving model to cartpole_model.pkl")
    # model.save("cartpole_model.pkl")

    # env = Vrep_Env()
    env = gym.make('vrep-v0')
    model = DQN(
        env=env,
        gamma=0.95,
        policy=MlpPolicy,
        # policy=CustomPolicy,
        verbose=1,
        learning_rate=1e-4,
        buffer_size=50000,  # 5000
        train_freq=1,
        learning_starts=100,
        batch_size=64,  # 32
        checkpoint_freq=3000,
        checkpoint_path='./model/',
        target_network_update_freq=300,
        prioritized_replay=True,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        tensorboard_log='./log',
    )
    # path = './model/'
    # model = DQN.load(path + 'bk2_16/cartpole_model6000.pkl', env, tensorboard_log='./log')
    model.learn(total_timesteps=args.max_timesteps, callback=callback, log_interval=30)

    print("Saving model to slab_installing_model.pkl")
    model.save("slab_installing_model.pkl")
def main(args):
    """
    Run a trained model for the cartpole problem

    :param args: (ArgumentParser) the input arguments
    """
    env = gym.make("CartPole-v0")
    model = DQN.load("cartpole_model.zip", env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if not args.no_render:
                env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            episode_rew += rew
        print("Episode reward", episode_rew)
        # No render is only used for automatic testing
        if args.no_render:
            break
def train(env, fname):
    env.setRender(False)
    env.reset()
    start = time.time()

    model = DQN(
        env=env,
        policy=CustomPolicy,
        learning_rate=1e-3,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02
    )
    model.learn(total_timesteps=STEPS, callback=callback)

    # save trained model
    model.save(fname)
    print("Duration: %.1f" % ((time.time() - start) / 60))
def main(args):
    """
    Train and save the DQN model, for the cartpole problem

    :param args: (ArgumentParser) the input arguments
    """
    env = gym.make("CartPole-v0")
    model = DQN(
        env=env,
        policy=MlpPolicy,
        learning_rate=1e-3,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
    )
    model.learn(total_timesteps=args.max_timesteps, callback=callback)

    print("Saving model to cartpole_model.pkl")
    model.save("cartpole_model.pkl")
def main(args):
    """
    Train and save the DQN model, for the mountain car problem

    :param args: (ArgumentParser) the input arguments
    """
    env = gym.make("MountainCar-v0")

    # using layer norm policy here is important for parameter space noise!
    model = DQN(policy=CustomPolicy, env=env, learning_rate=1e-3, buffer_size=50000,
                exploration_fraction=0.1, exploration_final_eps=0.1, param_noise=True)
    model.learn(total_timesteps=args.max_timesteps)

    print("Saving model to mountaincar_model.pkl")
    model.save("mountaincar_model.pkl")
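# CustomPolicy is defined elsewhere in the original script. Following the
# stable-baselines custom-policy pattern, a layer-normalized MLP Q-network
# (needed for param_noise=True to work well) could look like the sketch below;
# the single 64-unit hidden layer is an assumption.
from stable_baselines.deepq.policies import FeedForwardPolicy

class CustomPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy, self).__init__(*args, **kwargs,
                                           layers=[64],
                                           layer_norm=True,
                                           feature_extraction="mlp")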
if __name__ == '__main__':
    env = SumoEnvironment(net_file='nets/2way-single-intersection/single-intersection.net.xml',
                          route_file='nets/2way-single-intersection/single-intersection-vhvh.rou.xml',
                          out_csv_name='outputs/2way-single-intersection/dqn-vhvh2-stable-mlp-bs',
                          single_agent=True,
                          use_gui=True,
                          num_seconds=100000,
                          time_to_load_vehicles=120,
                          max_depart_delay=0,
                          phases=[
                              traci.trafficlight.Phase(32000, 32000, 32000, "GGrrrrGGrrrr"),
                              traci.trafficlight.Phase(2000, 2000, 2000, "yyrrrryyrrrr"),
                              traci.trafficlight.Phase(32000, 32000, 32000, "rrGrrrrrGrrr"),
                              traci.trafficlight.Phase(2000, 2000, 2000, "rryrrrrryrrr"),
                              traci.trafficlight.Phase(32000, 32000, 32000, "rrrGGrrrrGGr"),
                              traci.trafficlight.Phase(2000, 2000, 2000, "rrryyrrrryyr"),
                              traci.trafficlight.Phase(32000, 32000, 32000, "rrrrrGrrrrrG"),
                              traci.trafficlight.Phase(2000, 2000, 2000, "rrrrryrrrrry")
                          ])

    model = DQN(
        env=env,
        policy=MlpPolicy,
        learning_rate=1e-3,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02
    )
    model.learn(total_timesteps=100000)
def test(env, agent, filepath):
    model = DQN.load(filepath)
    obs = env.reset()
    for i in range(settings.testing_nb_episodes_per_model):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
df = df.reset_index()

total_timesteps = 20_000
iterations = 50_000

env = DummyVecEnv([lambda: SepsisEnv(df)])

models = [
    PPO2(MlpPolicy, env, verbose=0),
    PPO2(MlpLstmPolicy, env, nminibatches=1, verbose=0),
    PPO2(MlpLnLstmPolicy, env, nminibatches=1, verbose=0),
    A2C(MlpPolicy, env, lr_schedule='constant'),
    A2C(MlpLstmPolicy, env, lr_schedule='constant'),
    DQN(
        env=env,
        policy=DQN_MlpPolicy,
        learning_rate=1e-3,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
    ),
    DQN(
        env=env,
        policy=LnMlpPolicy,
        learning_rate=1e-3,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
    )
]

for model in models:
    env = DummyVecEnv([lambda: SepsisEnv(df)])
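    # The original loop body is truncated after rebuilding the environment.
    # A plausible continuation (set_env/learn/save are standard stable-baselines
    # API; the training budget and save names below are assumptions):
    model.set_env(env)
    model.learn(total_timesteps=total_timesteps)
    model.save("sepsis_" + type(model).__name__.lower())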
env = gym.make('CarRacing-v0')
env = DiscreteCarRacing(env)
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run

"""DDPG Algorithm"""
# Add some param noise for exploration
# param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2)
# n_actions = env.action_space.shape[-1]
# action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
# model1 = DDPG(policy=LnMlpPolicy, gamma=0.995, actor_lr=1e-4, critic_lr=1e-3, env=env, param_noise=param_noise, verbose=1)
# model1 = DDPG(policy=LnMlpPolicy, gamma=0.995, actor_lr=1e-4, critic_lr=1e-3, env=env, action_noise=action_noise, verbose=1)

"""DQN Algorithm"""
model2 = DQN(env=env, policy=MlpPolicy, learning_rate=1e-3, buffer_size=50000,
             exploration_fraction=0.1, exploration_final_eps=0.02, verbose=1)

"""PPO Algorithm"""
# model3 = PPO1(policy=MlpPolicy, gamma=0.995, optim_batchsize=32, env=env, verbose=0)

"""Get result"""
# Note: the models share the same policy class name but come from different algorithms, so the three models cannot be run at once.
# ddpg_pr_result = evaluate(model1, num_steps=10000)
# ddpg_ou_result = evaluate(model1.5, num_steps=10000)
dqn_result = evaluate(model2, num_steps=10000)
# ppo_result = evaluate(model3, num_steps=10000)
def train_deep(env_name='CartPole-v1', steps=10000, lr=5e-4, exploration_fraction=0.1,
               exploration_final_eps=0.02, log_dir='./Logs/', log_name=None):
    """
    Wrapper for training a network with DQN

    :param env_name: The name of the environment to load [String]
    :param steps: The number of time-steps to train for [Int]
    :param exploration_fraction: The exploration rate for the algorithm [float]
    :param exploration_final_eps: The final exploration rate after decay [float]
    :param lr: The learning rate for the algorithm [float]
    :param log_dir: The base log folder [String]
    :param log_name: Puts the logs in a subdir of this name [String]
    """
    # Generates a folder hierarchy for the logging:
    if log_name is None:
        log_dir = log_dir + env_name + '/' + 'DeepQ/deep_{0:.0E}'.format(lr) + '/'
    else:
        log_dir = log_dir + env_name + '/' + log_name + '/' + 'DeepQ/deep_{0:.0E}'.format(lr) + '/'
    init_logging(log_dir)

    # Generates an environment for the algorithm to train against
    env = DummyVecEnv([
        lambda: Monitor(gym.make(env_name), log_dir, allow_early_resets=True)
    ])

    # Sets up a modified callback function to be able to handle saving etc. (not strictly needed)
    best_mean_reward, n_steps, hist_rew = -np.inf, 0, 0

    def callback(_locals, _globals):
        """
        Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

        :param _locals: (dict)
        :param _globals: (dict)
        """
        nonlocal n_steps, best_mean_reward, hist_rew
        # Evaluate policy performance every 5 calls
        if (n_steps + 1) % 5 == 0:
            x, y = ts2xy(load_results(log_dir), 'timesteps')
            if len(x) > 0:
                # mean_rew_plot(y, len(x))
                hist_rew = y.copy()
                mean_reward = np.mean(y[-100:])
                # Print stats every 100 calls
                if (n_steps + 1) % 100 == 0:
                    print(x[-1], 'timesteps')
                    print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
                        best_mean_reward, mean_reward))
                # New best model, save the agent here
                if mean_reward > best_mean_reward:
                    best_mean_reward = mean_reward
                    # Example for saving best model
                    print("Saving new best model")
                    _locals['self'].save(log_dir + "/deep_{0:.0E}.pkl".format(lr))
        n_steps += 1
        return True  # returning False would stop training early

    # Creates the training model etc.
    dqn_nw = DQN('MlpPolicy', env,
                 learning_rate=lr,
                 exploration_fraction=exploration_fraction,
                 exploration_final_eps=exploration_final_eps,
                 checkpoint_freq=2000,
                 learning_starts=1000,
                 target_network_update_freq=500)

    # Starts the training:
    dqn_nw.learn(total_timesteps=steps, callback=callback)
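# Illustrative call (values are assumptions): trains DQN on CartPole-v1 for
# 50k steps and writes logs/checkpoints under ./Logs/CartPole-v1/DeepQ/deep_5E-04/
train_deep(env_name='CartPole-v1', steps=50000, lr=5e-4)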