def test_performance_her(online_sampling, n_bits):
    """
    Check that DQN+HER can solve BitFlippingEnv.
    It should not work when n_sampled_goal=0 (DQN alone).
    """
    env = BitFlippingEnv(n_bits=n_bits, continuous=False)

    model = DQN(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            n_sampled_goal=5,
            goal_selection_strategy="future",
            online_sampling=online_sampling,
            max_episode_length=n_bits,
        ),
        verbose=1,
        learning_rate=5e-4,
        train_freq=1,
        learning_starts=100,
        exploration_final_eps=0.02,
        target_update_interval=500,
        seed=0,
        batch_size=32,
        buffer_size=int(1e5),
    )

    model.learn(total_timesteps=5000, log_interval=50)

    # 90% training success
    assert np.mean(model.ep_success_buffer) > 0.90
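# A minimal sketch (not part of the original test) of rolling out the trained
# HER agent on the dict-observation BitFlippingEnv defined above. The names
# `model` and `env` are assumed from the snippet; this is illustrative only.
def rollout_her_agent(model, env, n_episodes=5):
    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)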
def test_dqn_train_with_batch_norm():
    model = DQN(
        "MlpPolicy",
        "CartPole-v1",
        policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor),
        learning_starts=0,
        seed=1,
        tau=0,  # do not clone the target
    )

    (
        q_net_bias_before,
        q_net_running_mean_before,
        q_net_target_bias_before,
        q_net_target_running_mean_before,
    ) = clone_dqn_batch_norm_stats(model)

    model.learn(total_timesteps=200)

    (
        q_net_bias_after,
        q_net_running_mean_after,
        q_net_target_bias_after,
        q_net_target_running_mean_after,
    ) = clone_dqn_batch_norm_stats(model)

    assert ~th.isclose(q_net_bias_before, q_net_bias_after).all()
    assert ~th.isclose(q_net_running_mean_before, q_net_running_mean_after).all()

    assert th.isclose(q_net_target_bias_before, q_net_target_bias_after).all()
    assert th.isclose(q_net_target_running_mean_before, q_net_target_running_mean_after).all()
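# `clone_dqn_batch_norm_stats` is referenced above but not defined in this
# snippet. A plausible sketch, assuming FlattenBatchNormDropoutExtractor
# exposes a `batch_norm` module on both the online and target Q-networks:
def clone_dqn_batch_norm_stats(model):
    q_net_bn = model.policy.q_net.features_extractor.batch_norm
    q_net_target_bn = model.policy.q_net_target.features_extractor.batch_norm
    return (
        q_net_bn.bias.clone(),
        q_net_bn.running_mean.clone(),
        q_net_target_bn.bias.clone(),
        q_net_target_bn.running_mean.clone(),
    )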
def basic_usage_example():
    # Basic usage: training, saving, loading.

    # Create environment
    env = gym.make("LunarLander-v2")

    # Instantiate the agent
    model = DQN("MlpPolicy", env, verbose=1)
    # Train the agent
    model.learn(total_timesteps=int(2e5))
    # Save the agent
    model.save("dqn_lunar")
    del model  # delete trained model to demonstrate loading

    # Load the trained agent
    # NOTE: if you have loading issues, you can pass `print_system_info=True`
    # to compare the system on which the model was trained vs the current one.
    # model = DQN.load("dqn_lunar", env=env, print_system_info=True)
    model = DQN.load("dqn_lunar", env=env)

    # Evaluate the agent
    # NOTE: if you use wrappers with your environment that modify rewards,
    # this will be reflected here. To evaluate with original rewards,
    # wrap the environment in a Monitor wrapper before other wrappers.
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

    # Enjoy the trained agent
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
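# The NOTE above suggests wrapping in Monitor before any reward-modifying
# wrapper so evaluation sees the original rewards. A minimal sketch; the
# reward-shaping wrapper named here is a hypothetical placeholder:
from stable_baselines3.common.monitor import Monitor

env = Monitor(gym.make("LunarLander-v2"))  # record original episode rewards first
# env = SomeRewardShapingWrapper(env)      # hypothetical reward-modifying wrapper goes after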
def train(time_steps, save=False, **params):
    verbose = params.get('verbose', 1)
    buffer_size = params.get('buffer_size', 10000)
    learning_starts = params.get('learning_starts', 1024)

    env = DQNAgent.create_env(1)
    model = DQN('CnnPolicy', env, verbose=verbose, buffer_size=buffer_size,
                learning_starts=learning_starts, tensorboard_log=TB_LOGS)
    model.learn(time_steps)
    if save:
        model.save(MODEL_PATH)
def ai_playing():
    env = Snake_Env(server=False)
    # env = make_vec_env(lambda: env, n_envs=4, monitor_dir="./vec")
    env = Monitor(env, "1e7_bw_dqn")
    obs = env.reset()
    model = DQN("CnnPolicy", env, verbose=1, optimize_memory_usage=True, buffer_size=500000)
    model.learn(total_timesteps=int(1e7))
    model.save("1e7_bw_dqn")
def test_dqn():
    model = DQN(
        "MlpPolicy",
        "CartPole-v1",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=100,
        buffer_size=500,
        learning_rate=3e-4,
        verbose=1,
        create_eval_env=True,
    )
    model.learn(total_timesteps=500, eval_freq=250)
def train_dqn(itr=0, timesteps=1e7, use_dummy_video=True):
    env = flappy_env.FlappyEnv(use_dummy_video)
    env = Monitor(env, f"flappy_dqn_{itr}")
    obs = env.reset()
    model = DQN(
        "CnnPolicy",
        env,
        verbose=1,
        optimize_memory_usage=True,
        buffer_size=500000,
        learning_rate=1e-5,
        tensorboard_log=f"./dqn_flappy_tensorboard_{itr}/")
    model.learn(total_timesteps=int(timesteps))
    model.save(f"dqn_flappy_{itr}")
def init_and_train_rl_classification_model(
        timesteps, path='data/rl_rps.pth', save=True, n=2000):
    dm, y_oracle = init_dm(CONFIG)
    env = ClassificationEnv(dm, y_oracle)
    # env = MonitorWrapper(env, autolog=True)
    model = DQN(CnnPolicy, env, verbose=1)

    idxs = list(range(n))
    dm.label_samples(idxs, y_oracle[idxs])
    model.learn(total_timesteps=timesteps)
    if save:
        model.save(path)

    env.enable_evaluating(True)
    evaluate(model, env)
    env.enable_evaluating(False)
    return model
def train_dqn():
    log_dir = "model_save/"
    env = ENV_DISCRETE(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                    clip_obs=10.)
    model = DQN('MlpPolicy', env, verbose=1, batch_size=2048, seed=1)
    callback = SaveOnBestTrainingRewardCallback(check_freq=100, log_dir=log_dir)
    model.learn(total_timesteps=100000, callback=callback, log_interval=100)
    model.save('model_save/dqn')
def train_sqil(env, n=0):
    venv = gym.make(env)
    expert_data = make_sa_dataset(env, max_trajs=5)

    for i in range(n):
        if isinstance(venv.action_space, Discrete):
            model = DQN(SQLPolicy, venv, verbose=1,
                        policy_kwargs=dict(net_arch=[64, 64]), learning_starts=1)
        else:
            model = SAC('MlpPolicy', venv, verbose=1,
                        policy_kwargs=dict(net_arch=[256, 256]), ent_coef='auto',
                        learning_rate=linear_schedule(7.3e-4), train_freq=64,
                        gradient_steps=64, gamma=0.98, tau=0.02)
        model.replay_buffer = SQILReplayBuffer(model.buffer_size, model.observation_space,
                                               model.action_space, model.device, 1,
                                               model.optimize_memory_usage,
                                               expert_data=expert_data)

        mean_rewards = []
        std_rewards = []
        for train_steps in range(20):
            if train_steps > 0:
                if 'Bullet' in env:
                    model.learn(total_timesteps=25000, log_interval=1)
                else:
                    model.learn(total_timesteps=16384, log_interval=1)

            mean_reward, std_reward = evaluate_policy(model, model.env, n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Steps: {1}".format(train_steps, mean_reward))
            np.savez(os.path.join("learners", env, "sqil_rewards_{0}".format(i)),
                     means=mean_rewards, stds=std_rewards)
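# `linear_schedule` is used above but not defined in this snippet. A minimal
# sketch following the convention from the SB3 docs, where a schedule receives
# the remaining training progress (1.0 at the start, 0.0 at the end):
def linear_schedule(initial_value):
    def schedule(progress_remaining):
        # progress_remaining decays from 1.0 to 0.0 over training
        return progress_remaining * initial_value
    return schedule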
def test_dqn():
    env = gym.make("fishing-v0")
    check_env(env)
    model = DQN("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=200)

    # Simulate a run with the trained model, visualize result
    df = env.simulate(model)
    env.plot(df, "dqn-test.png")

    # Evaluate model
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
    df = env.policyfn(model)
    env.plot_policy(df, "policy-test.png")
def train_dqn_growspace(save_model=False):
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)
    env = gym.make(config.env_name)
    model = DQN("CnnPolicy", env, verbose=1, gradient_steps=20, optimize_memory_usage=True)
    model.learn(total_timesteps=config.num_updates, log_interval=1,
                callback=WandbStableBaselines3Callback())
    if save_model:
        model.save(f"dqn_{config.env_name}")
def main(cfg: DictConfig):
    env = get_env(None, cfg.env)
    model = DQN(MlpPolicy, env, **cfg.model, tensorboard_log='logs/', verbose=1)

    callbacks = [TensorboardCallback()]
    if cfg.self_play:
        self_play = EveryNTimesteps(cfg.n_update_selfplay, callback=SelfPlay('ckpts/', cfg.env))
        callbacks.append(self_play)
    if cfg.ckpt_freq:
        ckpt_cb = CheckpointCallback(save_freq=cfg.ckpt_freq, save_path='ckpts/')
        callbacks.append(ckpt_cb)

    model.learn(total_timesteps=cfg.n_total_steps, callback=callbacks, tb_log_name=cfg.log_name)
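# `main` expects a Hydra DictConfig; a hypothetical config.yaml covering the
# fields accessed above (all values illustrative, not from the source):
#
#   env: my_env                # passed to get_env and SelfPlay
#   model:                     # kwargs forwarded to DQN(**cfg.model)
#     learning_rate: 1e-4
#     buffer_size: 100000
#   self_play: true
#   n_update_selfplay: 10000
#   ckpt_freq: 50000
#   n_total_steps: 1000000
#   log_name: dqn_selfplay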
def train(env, type, timesteps):
    env.reset()
    check_env(env)  # check_env returns None, so printing its result is not informative
    env = FlattenObservation(env)
    print(env.reward_range)
    print(env.action_space)
    if type == "DQN":
        model = DQN('MlpPolicy', exploration_fraction=0.999, env=env, verbose=1)
    elif type == "A2C":
        model = A2C('MlpPolicy', env=env, verbose=1)
    elif type == "PPO":
        model = PPO('MlpPolicy', env=env, verbose=1)
    model.learn(total_timesteps=timesteps)
    model.save("model_cups")
def train(params):
    model = DQN(params.get("policy"),
                env,
                verbose=1,
                buffer_size=params.get("buffer_size"),
                learning_rate=params.get("learning_rate"),
                tensorboard_log=log_dir,
                gamma=params.get("gamma"),
                target_update_interval=params.get("target_update_interval"),
                train_freq=params.get("train_freq"),
                gradient_steps=params.get("gradient_steps"),
                exploration_fraction=params.get("exploration_fraction"),
                exploration_final_eps=params.get("exploration_final_eps"),
                learning_starts=params.get("learning_starts"),
                batch_size=params.get("batch_size"),
                policy_kwargs=policy_kwargs)
    # Train for the configured number of steps
    model.learn(total_timesteps=params.get("train_steps"))
    # Save the trained agent
    model.save(exp_name)
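# A hypothetical `params` dict covering the keys read by `train` above;
# all values are illustrative defaults, not tuned settings:
params = {
    "policy": "MlpPolicy",
    "buffer_size": 100000,
    "learning_rate": 1e-4,
    "gamma": 0.99,
    "target_update_interval": 1000,
    "train_freq": 4,
    "gradient_steps": 1,
    "exploration_fraction": 0.1,
    "exploration_final_eps": 0.05,
    "learning_starts": 10000,
    "batch_size": 32,
    "train_steps": int(1e5),
}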
def run_dqn_baseline():
    env = make_atari_env('BreakoutNoFrameskip-v4', n_envs=1, seed=0)
    env = VecFrameStack(env, n_stack=4)

    tensorboard_log = os.path.join(os.path.dirname(__file__), 'runs_baseline')
    buffer_size = 100000
    num_training_steps = 1000000

    model = DQN('CnnPolicy', env, verbose=0, buffer_size=buffer_size,
                learning_starts=50000, optimize_memory_usage=False,
                tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=num_training_steps)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
def run(experiment: Experiment, params: argparse.Namespace):
    sb3_utils.set_random_seed(params.seed, using_cuda=use_cuda)
    env = helper.make_env(params, 'env')
    # Logs will be saved in log_dir/monitor.csv
    env = Monitor(env)

    with experiment.train():
        callback = SaveOnBestTrainingRewardCallback(experiment, check_freq=1000)
        # Deactivate all the DQN extensions to have the original version.
        # In practice, it is recommended to have them activated.
        model = DQN(CnnPolicy, env,
                    learning_rate=params.learning_rate,
                    gamma=params.gamma,
                    seed=params.seed,
                    max_grad_norm=params.max_grad_norm,
                    verbose=1,
                    device=device,
                    policy_kwargs={'features_extractor_class': ColoringCNN})
        model.learn(total_timesteps=params.max_ts, callback=callback)
def test_eval_success_logging(tmp_path):
    n_bits = 2
    env = BitFlippingEnv(n_bits=n_bits)
    eval_env = DummyVecEnv([lambda: BitFlippingEnv(n_bits=n_bits)])
    eval_callback = EvalCallback(
        eval_env,
        eval_freq=250,
        log_path=tmp_path,
        warn=False,
    )
    model = DQN(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        learning_starts=100,
        seed=0,
        replay_buffer_kwargs=dict(max_episode_length=n_bits),
    )
    model.learn(500, callback=eval_callback)
    assert len(eval_callback._is_success_buffer) > 0
    # More than 50% success rate
    assert np.mean(eval_callback._is_success_buffer) > 0.5
def test_eval_callback_logs_are_written_with_the_correct_timestep(tmp_path):
    # Skip if tensorboard is not installed
    pytest.importorskip("tensorboard")
    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

    env_name = select_env(DQN)
    model = DQN(
        "MlpPolicy",
        env_name,
        policy_kwargs=dict(net_arch=[32]),
        tensorboard_log=tmp_path,
        verbose=1,
        seed=1,
    )
    eval_env = gym.make(env_name)
    eval_freq = 101
    eval_callback = EvalCallback(eval_env, eval_freq=eval_freq, warn=False)
    model.learn(500, callback=eval_callback)

    acc = EventAccumulator(str(tmp_path / "DQN_1"))
    acc.Reload()
    for event in acc.scalars.Items("eval/mean_reward"):
        assert event.step % eval_freq == 0
import os
from time import time

import gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
# TrajReplayBuffer and plot_results are project-specific helpers whose
# imports were not included in the original snippet.

# env = gym.make('CartPole-v1')
env = gym.make('FrozenLake-v0')

log_dir = "/tmp/gym/"
os.makedirs(log_dir, exist_ok=True)
env = Monitor(env, log_dir)

model = DQN('MlpPolicy', env, verbose=1, batch_size=32, learning_starts=1000)
# prioritized_replay=True
model.replay_buffer = TrajReplayBuffer(model.buffer_size,
                                       model.observation_space,
                                       model.action_space,
                                       model.device,
                                       trajectory=True,
                                       seq_num=1)

initial_time = round(time(), 2)
model.learn(total_timesteps=100000)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, deterministic=True)
finish_time = round(time(), 2)
total_time = round(finish_time - initial_time, 2)
print("this run took total time of {0} seconds".format(total_time))

plot_results(log_dir)
tensorboard_log="./dqn_drone_tensorboard2/", policy_kwargs=policy_kwargs, exploration_fraction=0.4) #env_eval = Monitor(env, './logs/') eval_callback = EvalCallback(env, best_model_save_path='./logs/', log_path='./logs/', eval_freq=1000, deterministic=True, render=False) #Deeper NN #model = DQN.load("DQN", env=env) model.learn(total_timesteps=5_000_000, callback=eval_callback) # Typically not enough model.save("DQN") #model = DQN.load("DQN", env=env) model = DQN.load("logs/best_model", env=env) #model = PPO.load("PPO_discrete", env=env) logger = Logger(logging_freq_hz=int(env.SIM_FREQ / env.AGGR_PHY_STEPS), num_drones=ARGS.num_drones) obs = env.reset() start = time.time() n_trial = 0 for i in range(ARGS.duration_sec * env.SIM_FREQ): if ARGS.duration_sec * env.SIM_FREQ % AGGR_PHY_STEPS == 0: action, _states = model.predict( obs,
# game = 'Zaxxon-ram-v0'
# env = gym.make('Pong-v0')
env = gym.make(game)
# save_file = 'dqn_pong'
save_file = 'dqn_' + game
print(env.action_space)
print(env.get_action_meanings())

model = DQN(MlpPolicy, env, verbose=1)
# model = DQN.load(save_file)
model.set_env(env)
# model = DQN(CnnPolicy, env, verbose=1)
model.learn(total_timesteps=50000, log_interval=10)
# model.save(save_file)

obs = env.reset()
score = 0
rewards_sum = 0
while True:
    # print(score)
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    score = score + 1
    rewards_sum += reward
    if reward > 0:
# Set up tensorboard logger
if args.tensorboard:
    log_callback = LoggerCallback(sinergym_logger=bool(args.logger))
    callbacks.append(log_callback)
    # Let's change the default dir for the TensorboardFormatLogger only
    tb_path = args.tensorboard + '/' + name
    new_logger = configure(tb_path, ["tensorboard"])
    model.set_logger(new_logger)

callback = CallbackList(callbacks)

# ---------------------------------------------------------------------------- #
#                                   TRAINING                                    #
# ---------------------------------------------------------------------------- #
model.learn(total_timesteps=timesteps,
            callback=callback,
            log_interval=args.log_interval)
model.save(env.simulator._env_working_dir_parent + '/' + name)

# If the algorithm didn't reset or close the environment, this script does it
# here in order to correctly log all the simulation data (EnergyPlus + Sinergym logs)
if env.simulator._episode_existed:
    env.close()

# ---------------------------------------------------------------------------- #
#                           Mlflow artifacts storage                            #
# ---------------------------------------------------------------------------- #
if args.mlflow_store:
    # Send output and tensorboard logs to mlflow artifacts.
    mlflow.log_artifacts(local_dir=env.simulator._env_working_dir_parent,
def key_handler(event):
    """
    Accepts a key event and makes an appropriate decision.
    :param event: Key event
    :return: void
    """
    global _root
    global _routing_canvas
    global _rl_model
    global _is_first_step
    global _rl_env
    global _rl_target_cell
    global _step_count
    global LEARN_RATE
    global EXPLORE_INIT
    global EXPLORE_FINAL
    global GAMMA
    global TRAIN_TIME_STEPS
    global LOAD_MODEL_NAME

    e_char = event.char
    if e_char == 'l':
        # RL agent learning
        # AI Gym environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()
        # RL agent
        _rl_model = DQN('MlpPolicy', _rl_env, verbose=1,
                        learning_rate=LEARN_RATE,
                        exploration_initial_eps=EXPLORE_INIT,
                        exploration_final_eps=EXPLORE_FINAL,
                        gamma=GAMMA)
        print("Beginning RL training")
        _rl_model.learn(total_timesteps=TRAIN_TIME_STEPS)
        print("Finished RL training")
        print("Saving trained model")
        _rl_model.save("agent_" + time.strftime("%d-%m-%YT%H-%M-%S"))
    elif e_char == 't':
        # RL agent testing
        # AI Gym environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()
        print("Loading trained model")
        if _rl_model is None:
            _rl_model = DQN.load(LOAD_MODEL_NAME)
        obs = _rl_env.reset()
        done = False
        while not done:
            rl_action, states = _rl_model.predict(obs, deterministic=True)
            print("Action " + str(rl_action))
            obs, rewards, done, info = _rl_env.step(rl_action)
    elif e_char == 'r':
        # RL flow debugging (no agent involved, emulate actions randomly)
        if _is_first_step:
            _rl_env.reset()
            _is_first_step = False
        else:
            rand_action = random.randrange(1)  # NOTE: randrange(1) always returns 0
            rl_action_step(rand_action)
    else:
        pass
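# A hypothetical wiring of `key_handler` to the Tk root window, assuming
# `_root` is a tkinter.Tk instance as the globals above suggest
# (illustrative only, not from the source):
import tkinter as tk

_root = tk.Tk()
_root.bind("<Key>", key_handler)  # dispatch 'l' / 't' / 'r' key presses to the handler
_root.mainloop()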
    buffer_size=500000,
    max_grad_norm=10,
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    device="cuda",
    tensorboard_log="./tb_logs/",
)

# Create an evaluation callback with the same env, called every 10000 iterations
callbacks = []
eval_callback = EvalCallback(
    env,
    callback_on_new_best=None,
    n_eval_episodes=5,
    best_model_save_path=".",
    log_path=".",
    eval_freq=10000,
)
callbacks.append(eval_callback)

kwargs = {}
kwargs["callback"] = callbacks

# Train for a certain number of timesteps
model.learn(
    total_timesteps=int(5e5),
    tb_log_name="dqn_airsim_car_run_" + str(time.time()),
    **kwargs,
)

# Save policy weights
model.save("dqn_airsim_car_policy")
dm, y_oracle = init_dm(CONFIG)
print(dm)
env = ClassificationEnv(dm, y_oracle)
sys.path.insert(0, 'dral')

if new:
    model = DQN(CnnPolicy, env, verbose=1, learning_rate=2e-4,
                gamma=0.98, batch_size=32, learning_starts=3000)
if load:
    model = DQN.load("data/rl_query_rps.pth")
if test:
    model = init_and_train_rl_classification_model(
        timesteps=100000, path='data/rl_query_dogs_cats.pth')

# show_grid_imgs(dm.test.get_x(list(range(9))), dm.test.get_y(list(range(9))), (3, 3))

n_episodes = 5
for k in range(n_episodes):
    # label images
    y_oracle = label_samples(dm, y_oracle, n=100, random=True)
    dm.train.shuffle()
    print(dm)
    model.learn(total_timesteps=6000, log_interval=30)

    # evaluation
    env.enable_evaluating(True)
    evaluate(model, env)
    env.enable_evaluating(False)
    policy_kwargs = dict(activation_fn=torch.nn.Tanh,
                         net_arch=[dict(pi=[1024, 1024], vf=[1024, 1024])])
    if args.dqn:
        args.name = 'DQN_' + args.name
        model = DQN('MlpPolicy', gym.make('Trading-v2'), verbose=1,
                    device=torch.device('cpu'), tensorboard_log='./runs/')
    else:
        model = PPO('MlpPolicy', make_vec_env('Trading-v2', 8), verbose=1,
                    device=torch.device('cpu'), tensorboard_log='./runs/')
    model.learn(total_timesteps=int(20e6), tb_log_name=args.name,
                callback=CheckpointCallback(save_freq=10000,
                                            save_path="./trained_models",
                                            name_prefix=args.name))
    model.save('{}_trading_sb'.format('dqn' if args.dqn else 'ppo'))
else:
    print('Loading agent')
    if args.dqn:
        model = DQN.load('dqn_trading_sb')
    else:
        model = PPO.load('ppo_trading_sb')

# model = PPO('MlpPolicy', env, verbose = 1)
eval_eps = 100
pbar = tqdm(total=eval_eps)
env = gym.make('Trading-v0')
rewards = []
class TradingAgent:
    def __init__(self, model='a2c', use_gp=False, gp_params=None, **kwargs):
        # Wrapper around stable_baselines RL implementations
        assert model in ACCEPTED_MODELS, 'Unknown RL model, must be in {}'.format(ACCEPTED_MODELS)
        if model == 'a2c':
            self.rl = A2C(**kwargs)
        elif model == 'ppo':
            self.rl = PPO(**kwargs)
        elif model == 'dqn':
            self.rl = DQN(**kwargs)
        elif model == 'td3':
            self.rl = TD3(**kwargs)

        self.use_gp = use_gp
        if self.use_gp:
            assert gp_params is not None, 'Must provide parameters such as training data, number of iterations, etc. for GPR'
            self.n_train = gp_params['n_train']
            self.retraining_iter = gp_params['training_iter']
            self.cvar_limit = gp_params['cvar_limit']
            self.gp_limit = gp_params['gp_limit']
            self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
            if 'data' in gp_params.keys():
                self.X_train = gp_params['data']['X_train']
                self.y_train = gp_params['data']['y_train']
            else:
                # Hard coded to match dimensions of features
                self.X_train = torch.zeros(self.n_train, kwargs['env'].num_features)
                self.y_train = torch.zeros(self.n_train)
            self.gp = ExactGPModel(self.X_train, self.y_train, self.likelihood)
            self.mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.gp)
            self.opt = torch.optim.Adam(self.gp.parameters(), lr=0.1)

        self.shares = 0
        self.cash = 0
        self.obs = []  # holds up to 2 past observations, helps in keeping X, y aligned

        # For plotting
        self.pred_return = 0
        self.pred_lower = 0
        self.pred_upper = 0

        # For debugging
        self.goal_num_shares = 0

    def learn(self, n_steps):
        # When using the GP, load a pretrained RL agent - no need to train
        if self.use_gp:
            # Train GP using a fixed number of steps
            self.__train_gp(100)
        else:
            # Train RL agent
            self.rl.learn(n_steps)

    def predict(self, obs, deterministic):
        action, state = self.rl.predict(obs, deterministic=deterministic)
        if self.use_gp:
            # Slightly retrain
            self.__train_gp(self.retraining_iter, retrain=True)

            # Predict next-step returns and CI using the GP
            with torch.no_grad(), gpytorch.settings.fast_pred_var():
                output = self.gp(torch.Tensor(obs[2:])[None])
                obs_pred = self.likelihood(output)
                f_mean = output.mean.detach().numpy()[0]
                self.pred_return = f_mean.item()
                f_samples = output.sample(sample_shape=torch.Size((10000,))).detach().numpy()
                lower, upper = obs_pred.confidence_region()
                self.pred_lower = lower.item()
                self.pred_upper = upper.item()

            rl_action = action
            action -= ACTION_OFFSET  # adjust from the env action to get the actual trade

            # Adjust trade size given prediction
            # if self.shares > 0:  # long position
            if f_mean > self.gp_limit:  # predict positive return over a certain threshold
                tail_samples = f_samples[f_samples < lower.item()]
                ps_cvar = np.mean(tail_samples) if len(tail_samples) > 0 else lower.item()  # CVaR per share
                if ps_cvar < 0:
                    goal_num_shares = self.cvar_limit // ps_cvar
                else:
                    goal_num_shares = self.shares + action  # positive return for long - no adjustment needed
                action = min(10, max(0, goal_num_shares - self.shares))
            elif f_mean < -self.gp_limit:
                tail_samples = f_samples[f_samples > upper.item()]
                ps_cvar = np.mean(tail_samples) if len(tail_samples) > 0 else upper.item()  # CVaR per share
                if ps_cvar < 0:
                    goal_num_shares = self.shares + action  # negative return for short - no adjustment needed
                else:
                    goal_num_shares = self.cvar_limit // ps_cvar
                action = max(-10, min(0, goal_num_shares - self.shares))
            else:
                goal_num_shares = self.shares + action
            # print(ps_cvar, lower.item(), upper.item())

            # if not np.isnan(goal_num_shares):
            self.goal_num_shares = goal_num_shares

            # if action > 0:  # buy order
            #     action = min(10, max(0, goal_num_shares - self.shares))  # restrict same size trades as original, maintain same direction
            #     # print(goal_num_shares - self.shares, action)
            # elif action < 0:  # sell order
            #     action = max(-10, min(0, goal_num_shares - self.shares))  # restrict same size trades as original, maintain same direction

            action += ACTION_OFFSET  # adjust for env actions being 1 to N rather than -N/2 to N/2
            # print(f_mean, ps_cvar, self.shares, goal_num_shares, rl_action-ACTION_OFFSET, action-ACTION_OFFSET)
        return action, state

    def update(self, obs, reward=None):
        self.obs.append(obs)
        self.shares, self.cash = obs[:2]
        if reward is not None:
            self.X_train = torch.cat((self.X_train, torch.Tensor(self.obs.pop(0)[2:])[None]))[1:]  # self.X_train[1:]
            self.y_train = torch.cat((self.y_train, torch.Tensor([reward])))[1:]
            # print(self.X_train, self.y_train)
            self.gp.set_train_data(self.X_train, self.y_train)

    def save(self, rl_path, gp_path=None):
        self.rl.save(rl_path)
        if gp_path is not None:
            torch.save(self.gp.state_dict(), gp_path)

    def load(self, rl_path, gp_path=None):
        self.rl = A2C.load(rl_path)
        if gp_path is not None:
            state_dict = torch.load(gp_path)
            self.gp.load_state_dict(state_dict)

    def __train_gp(self, n_iter, retrain=False):
        # Train GP using a fixed number of steps
        self.gp.train()
        self.likelihood.train()
        for i in range(n_iter):
            output = self.gp(self.X_train)
            loss = -self.mll(output, self.y_train)
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()
        self.gp.eval()
        self.likelihood.eval()
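# A hypothetical instantiation of TradingAgent with the gp_params keys read in
# __init__ above; `env` and all hyperparameter values are illustrative
# assumptions, not settings from the source:
gp_params = {
    'n_train': 200,       # size of the rolling GP training window
    'training_iter': 10,  # GP retraining iterations per predict() call
    'cvar_limit': -100,   # portfolio-level CVaR budget (negative: tolerated loss)
    'gp_limit': 0.001,    # return threshold before trade-size adjustment kicks in
}
agent = TradingAgent(model='dqn', use_gp=True, gp_params=gp_params,
                     policy='MlpPolicy', env=env, verbose=1)
agent.learn(n_steps=10000)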
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make('CartPole-v0')
model = DQN(MlpPolicy, env, verbose=1)
# model = DQN(MlpPolicy, env, seed=1423, target_update_interval=5, batch_size=16,
#             train_freq=128, buffer_size=256, gamma=0.95, learning_rate=1e-3, verbose=1)

print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=4)
print("end model learning !")
print("-> model saved !!")
model.save("dqn_cartpole")

print("start model evaluation with learning !")
mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

print("-> model evaluation without learning")
print(
    f"mean_reward:{mean_reward_before:.2f} +/- std_reward:{std_reward_before:.2f}"
)
import sys

from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.dqn.policies import MlpPolicy
from stable_baselines3 import DQN

from gym_sudoku.envs.sudoku_env import SudokuEnv

env = SudokuEnv()

if "--train" in sys.argv:
    model = DQN(MlpPolicy, env, verbose=1, learning_starts=100)
    model.learn(total_timesteps=10000)
    model.save("dqn_sudoku")
else:
    model = DQN.load("dqn_sudoku")

obs = env.reset()
env.render()
for _ in range(20):
    action, _states = model.predict(obs, deterministic=True)
    print("Action", action)
    print("States", _states)
    print("Coordinates", env.fill_pointer)
    obs, rewards, done, info = env.step(action)
    env.render()
    if done:
        print("Resetting ==============================================>")
        obs = env.reset()