log_dir = "tmp/" os.makedirs(log_dir, exist_ok=True) env = gym.make("OffWorldDockerMonolithDiscreteSim-v0", channel_type=Channels.RGB_ONLY) time_steps = 200000 name = "Offworld_DQN4" env = Monitor(env, log_dir) callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir) model = DQN("MlpPolicy", env, gamma=0.95, learning_rate=1e-3, verbose=0, buffer_size=1000, batch_size=16, exploration_fraction=0.9, exploration_final_eps=0.1, exploration_initial_eps=1.0, train_freq=1) print(type(callback)) #, exploration_fraction=0.1, exploration_final_eps=0.02, exploration_initial_eps=1.0, train_freq=1 model.learn(total_timesteps=int(time_steps), callback=callback) results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, name) plt.savefig(name + '.png') model.save(name) model = DQN.load(name)
# Effective code with DQN package: Atari Assault with a CNN policy
import gym
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3 import DQN

env = make_atari_env('Assault-v0', n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=1)

model = DQN('CnnPolicy', env, verbose=1, tensorboard_log="./DQN_log/")
model.learn(total_timesteps=int(4e4))

# SB3's predict() handles image layout itself, so the manual
# obs.transpose(3, 0, 1, 2) from the original is unnecessary (it was also only
# applied once, outside the loop, so stale observations were fed to predict)
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
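# Design note (hedged): n_stack=1 above effectively disables frame stacking.
# Atari DQN setups conventionally stack four consecutive frames so the CNN can
# infer motion; a minimal variant of the env construction, assuming the same
# 'Assault-v0' id is available:
env = make_atari_env('Assault-v0', n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=4)  # 4 stacked frames, the usual Atari choice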
# Run a pre-trained DQN agent on CartPole
import gym
from stable_baselines3 import DQN

env = gym.make('CartPole-v0')
model = DQN.load("dqn_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()
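# A minimal sketch for scoring the loaded model quantitatively instead of the
# endless render loop above, using SB3's evaluate_policy helper (the episode
# count of 10 is an arbitrary assumption):
from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"mean reward: {mean_reward:.1f} +/- {std_reward:.1f}")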
# DQN on a custom rock-paper-scissors env (install the custom env with pip first)
import os
import gym
from stable_baselines3 import DQN
from stable_baselines3.dqn import MlpPolicy
import gym_rock_paper_scissors  # registers the RockPaperScissors env ids
from gym_rock_paper_scissors.utils.eval import eval_rock_paper_scissors_agent

sequence_env = gym.make("RockPaperScissorsSequencePolicy2Env-v0", other_sequence=True)
random_env = gym.make("RockPaperScissorsRandomPolicyEnv-v0")
biased_env = gym.make("RockPaperScissorsBiasedPolicyEnv-v0")

agent = DQN(MlpPolicy, sequence_env, verbose=1)
agent.learn(total_timesteps=80000, log_interval=4)
agent.save("dqn_rps")
del agent

agent = DQN.load("dqn_rps")
score = eval_rock_paper_scissors_agent(agent, sequence_env)
print(score)
os.remove("dqn_rps.zip")
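# random_env and biased_env are constructed above but never used; presumably
# they are meant for cross-evaluation against the other opponent policies. A
# hedged sketch, reusing the eval helper with the same (agent, env) signature:
for opponent_name, opponent_env in [("random", random_env), ("biased", biased_env)]:
    print(opponent_name, eval_rock_paper_scissors_agent(agent, opponent_env))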
# Train DQN on CartPole with timestamped checkpoints and a best-model callback
import os
from datetime import datetime
import gym
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common import results_plotter
from stable_baselines3.common.results_plotter import plot_results
from utils import SaveOnBestTrainingRewardCallback

# setup
CHECKPOINT_STR = datetime.now().strftime("%Y.%m.%d-%H:%M:%S")
CHECKPOINT_DIR = "checkpoints/" + CHECKPOINT_STR
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

env_name = "CartPole-v1"
env = gym.make(env_name)
# Monitor logs training rewards for later inspection
env = Monitor(env, os.path.join(CHECKPOINT_DIR, "training_progress"))

# "MlpPolicy" is predefined for DQN, see stable_baselines3/dqn/policies.py
policy_name = "MlpPolicy"
model = DQN(policy_name, env, verbose=1)

# callback for model training: saves a checkpoint whenever the current model
# is better than all previous ones
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=CHECKPOINT_DIR,
                                            save_path=CHECKPOINT_DIR)

# training
total_timesteps = 150000
model.learn(total_timesteps=total_timesteps, callback=callback)

# no manual save needed when the callback is passed to model.learn:
# model.save(os.path.join(CHECKPOINT_DIR, "mlp_dqn_cartpole"))
plot_results([CHECKPOINT_DIR], total_timesteps, results_plotter.X_TIMESTEPS, env_name)
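# Two snippets above import SaveOnBestTrainingRewardCallback from a local utils
# module that is not shown. A minimal sketch of what such a callback could look
# like, modeled on the custom-callback example in the Stable-Baselines3 docs
# (the optional save_path parameter is an assumption made to match both call
# sites):
import os
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq, log_dir, save_path=None, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        # save into save_path if given, else next to the Monitor logs
        self.save_path = os.path.join(save_path or log_dir, "best_model")
        self.best_mean_reward = -np.inf

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            # read the Monitor log and average the last 100 episode rewards
            x, y = ts2xy(load_results(self.log_dir), "timesteps")
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True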
# Unit test: DQN with a custom optimizer and a small Q-network
from stable_baselines3 import DQN
from stable_baselines3.common.sb2_compat.rmsprop_tf_like import RMSpropTFLike

def test_dqn_custom_policy():
    policy_kwargs = dict(optimizer_class=RMSpropTFLike, net_arch=[32])
    _ = DQN("MlpPolicy", "CartPole-v1", policy_kwargs=policy_kwargs,
            learning_starts=100).learn(300)
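# The same policy_kwargs mechanism also controls network depth and activation.
# A hedged variant, assuming torch is installed alongside SB3:
import torch.nn as nn

def test_dqn_wider_policy():
    policy_kwargs = dict(net_arch=[64, 64], activation_fn=nn.ReLU)
    _ = DQN("MlpPolicy", "CartPole-v1", policy_kwargs=policy_kwargs,
            learning_starts=100).learn(300)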
# DQN on highway-env for autonomous driving
import gym
import highway_env  # registers highway-v0
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3 import DQN

# Parallel environments
# env = gym.make("highway-v0")
env = make_vec_env("highway-v0", n_envs=1)

#############################AGENT############################################
# model = DQN.load("MyAutonomousDrivingAgent")  # use an existing model, if available
model = DQN("MlpPolicy", env, verbose=2)
model.learn(total_timesteps=10, log_interval=1)

# from stable_baselines3 import PPO
# model = PPO(MlpPolicy, env, verbose=1)
# model.learn(total_timesteps=100, log_interval=10)
# model.save("MyAutonomousDrivingAgent")
# del model  # remove the model

##############################OBSERVATION######################################
observation = env.reset()
done = False
while not done:
    action, _states = model.predict(observation)
    # the original loop never stepped the env, so it could not terminate;
    # step the vec env and read back the (single-element) done flag
    observation, reward, done, info = env.step(action)