def _make_agents(env, model_dir, agent_name, op_name=None):
    # Models
    if agent_name is None:
        model = SimpleRuleBasedAgent(env)
    else:
        model = DQN.load(model_dir + agent_name)
    if op_name is None:
        op = SimpleRuleBasedAgent(env)
    else:
        op = DQN.load(model_dir + op_name)
    return model, op
def basic_usage_example():
    # Basic usage: training, saving, loading.
    # Create environment.
    env = gym.make("LunarLander-v2")
    # Instantiate the agent.
    model = DQN("MlpPolicy", env, verbose=1)
    # Train the agent.
    model.learn(total_timesteps=int(2e5))
    # Save the agent.
    model.save("dqn_lunar")
    del model  # Delete trained model to demonstrate loading.

    # Load the trained agent.
    # NOTE: if you have loading issues, you can pass `print_system_info=True`
    # to compare the system on which the model was trained vs the current one.
    # model = DQN.load("dqn_lunar", env=env, print_system_info=True)
    model = DQN.load("dqn_lunar", env=env)

    # Evaluate the agent.
    # NOTE: if you use wrappers with your environment that modify rewards,
    # this will be reflected here. To evaluate with original rewards,
    # wrap the environment in a "Monitor" wrapper before other wrappers.
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

    # Enjoy the trained agent.
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
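The NOTE above about reward-modifying wrappers can be made concrete. A minimal sketch, assuming the same LunarLander setup; SomeRewardWrapper is a hypothetical reward-shaping wrapper, not part of the original code:

import gym
from stable_baselines3.common.monitor import Monitor

env = gym.make("LunarLander-v2")
env = Monitor(env)              # records the original, unshaped episode rewards
# env = SomeRewardWrapper(env)  # hypothetical shaping wrapper; applied after Monitor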
def run_experiment(env, experiment_id, episodes=EPISODES, visualise=False):
    evaluation_results, histories = [], []
    models = [
        NoneAgent(env),
        RandomAgent(env),
        HeuristicAgent(env),
        DQN.load(MODEL_PATH)
    ]
    model_names = ['baseline_1', 'baseline_2', 'baseline_3', 'dqn_model']
    env.experiment = experiment_id
    for model, model_id in zip(models, model_names):
        print(f'EXPERIMENT: {experiment_id}, MODEL: {model_id}')
        evaluation_result, history = run_evaluation(env, model, episodes, visualise)
        evaluation_result['model'] = model_id
        evaluation_result['experiment'] = experiment_id
        evaluation_results.append(evaluation_result)
        history['model'] = model_id
        history['experiment'] = experiment_id
        histories.append(history)
    return pd.concat(evaluation_results), pd.concat(histories)
def main(save_video=False, num_eps=1, render=False, attack=None, save_perturbed_img=False):
    pong_duel.AGENT_COLORS[1] = 'red'
    # Initialize environment
    env = gym.make('PongDuel-v0')
    env = RewardZeroToNegativeBiAgentWrapper(env)
    if save_video:
        env = Monitor(env, './output/recordings', video_callable=lambda episode_id: True, force=True)
    # env = RewardZeroToNegativeBiAgentWrapper(env)
    # env = ObservationVectorToImage(env, 'both')
    env = ObserveOpponent(env, 'both')
    env = MAGymCompatibilityWrapper(env, image_observations='none')
    model_dir = '../../output/gcp-models/'
    # Models
    op_name = 'gcp-feature-based-op-obs7.out'
    model = WhiteBoxMonteCarloAgent(env, num_sims=10, sim_max_steps=2000)
    op = DQN.load(model_dir + op_name)
    env.set_opponent(op)
    avg_reward, total_steps = evaluate(model, env, attack=attack, slowness=0.05, num_eps=num_eps,
                                       render=render, save_perturbed_img=save_perturbed_img)
    print(avg_reward)
    print(total_steps)
def main():
    policy = DQN.load('experiments/multirun/grid_sweep/2021-03-07/14-13-11/1/ckpts/rl_model_5000000_steps.zip')
    # Load the environment using the yaml that was used for training:
    cfg = OmegaConf.load('experiments/multirun/grid_sweep/2021-03-07/14-13-11/1/.hydra/config.yaml')
    env_cfg = cfg.env
    env_cfg.render = True
    env = Expando(**env_cfg)
    obs_0 = env.reset()
    for i in range(10000):
        # predict() returns (actions, state); take the first action of the batch.
        action_0 = policy.predict(obs_0)[0][0]
        obs_0, reward, done, info = env.step(action_0)
        env.render()
def run_sensitivity():
    model = DQN.load(MODEL_PATH)
    results = []
    for policy in [0.1, 0.25, 0.5, 1, 2]:
        config = {
            'simulation_frequency': 15,
            'demand_amplitude': 15000,
            'total_steps': 100,
            'policy_frequency': policy
        }
        env = gym.make('highway-v0', **config)
        result, history = run_episode(env, model, False)
        results.append(result)
    results_df = pd.DataFrame(results)
    results_df.to_csv(SENSITIVITY_FILENAME)
def test_dqn():
    log_dir = "model_save/best_model_dqn"
    env = ENV_DISCRETE(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = DQN.load(log_dir)
    plot_results("model_save/")
    for i in range(10):
        state = env.reset()
        day = 0
        while True:
            # predict() returns (action, hidden_state); unpack the action.
            action, _ = model.predict(state)
            next_state, reward, done, info = env.step(action)
            state = next_state
            # print("trying:", day, "reward:", reward, "now profit:", env.profit)
            day += 1
            if done:
                print('stock', i, ' total profit=', env.profit, ' buy hold=', env.buy_hold)
                break
def evaluate(params):
    # Load saved model
    model = DQN.load(exp_name, env=env)
    results = []
    obs = env.reset()
    # Evaluate the agent
    episode_reward = 0.0
    for _ in range(params.get("test_episodes")):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        if done or info.get('is_success', False):
            # Record the episode result before resetting the reward counter.
            results.append(("Reward:", episode_reward, "Success?", info.get('is_success', False)))
            episode_reward = 0.0
            obs = env.reset()
    return results
def main(save_video=False, num_eps=1, render=True, attack=None, save_perturbed_img=False):
    # Initialize environment
    env = gym.make('PongDuel-v0')
    if save_video:
        env = Monitor(env, './output/recordings', video_callable=lambda episode_id: True, force=True)
    # env = RewardZeroToNegativeBiAgentWrapper(env)
    env = ObserveOpponent(env, 'both')
    env = MAGymCompatibilityWrapper(env, image_observations='none')
    model_dir = '../../output/gcp-models/'
    # Models
    agent_name = "gcp-feature-based-op-obs7.out"
    victim = DQN.load(model_dir + agent_name)
    adv = WhiteBoxAdversarialAgent(env, victim, victim_type='sb3')
    env.set_opponent(victim)
    avg_reward, _ = evaluate(adv, env, attack=attack, slowness=0.05, num_eps=num_eps,
                             render=render, save_perturbed_img=save_perturbed_img)
    print(avg_reward)
        screen.fill((0, 0, 0))
        screen.blit(pygame.surfarray.make_surface(resize(reconstruct[0].transpose(1, 0, 2), 600)), (0, 0))
        pygame.display.flip()
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
        # real speed
        state[32] += args.safety
        # abs 1-4
        state[34:38] += args.safety
        # action = np.argmax(model.forward(state))
        action, _states = model.predict(state, deterministic=True)
        state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)
    env = make_env()()
    if args.load_from is not None:
        print("loading model", args.load_from)
        model = DQN.load(args.load_from)
    evaluate(model, env)
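The fragment above references a parser and make_env defined elsewhere; a plausible reconstruction of those missing pieces, with the flag names taken from the usage above and everything else an assumption:

import argparse
import gym

parser = argparse.ArgumentParser()
parser.add_argument("--load_from", type=str, default=None,
                    help="path to a saved DQN model (.zip)")
parser.add_argument("--safety", type=float, default=0.0,
                    help="offset added to the speed-related observation features")

def make_env():
    # Hypothetical factory matching the make_env()() call style above;
    # "CartPole-v1" is a stand-in for the project-specific env id.
    return lambda: gym.make("CartPole-v1")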
# by frank tian, 2021-1-14
from stable_baselines3 import DQN
import gym_flappy_bird
import gym
import os

env = gym.make("FlappyBird-v0", is_demo=True)
obs = env.reset()
model = DQN.load(os.path.join(os.path.dirname(__file__), 'logs/best_model.zip'))

if __name__ == "__main__":
    rewards = 0
    time_steps = 0
    while True:
        # action = env.action_space.sample()
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        rewards += reward
        time_steps += 1
        env.render()
        if done:
            obs = env.reset()
            print("rewards: {}, of {} steps".format(rewards, time_steps))
            rewards = 0
            time_steps = 0
env = VecFrameStack(env, n_stack=custom_params['FRAME_STACK'])  # Use 1 for now because we use image
if not custom_params['USING_VAE']:
    env = VecTransposeImage(env)  # Uncomment if using 3d obs
if custom_params['USING_NORMALIZATION']:
    env = VecNormalize.load(osp.join(results_dir, "vec_normalization.pkl"), env)

# Load the agent
if custom_params['algo'] == 'sac':
    model = SAC.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'a2c':
    model = A2C.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'dqn':
    model = DQN.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'ppo':
    model = PPO.load(osp.join(results_dir, "best_model", "best_model.zip"))
else:
    raise ValueError(f"Unknown algorithm: {custom_params['algo']}")

# Use the loaded normalization statistics; do not update them at test time
env.training = False
# reward normalization is not needed at test time
env.norm_reward = False

obs = env.reset()
steps = 0
rewards = 0
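The vec_normalization.pkl restored above has to be written at training time. A minimal sketch of the saving side, assuming `env` is the VecNormalize-wrapped training env and `results_dir` is the same directory used above:

import os.path as osp

# VecNormalize.save() persists the running observation/reward statistics
# so VecNormalize.load() can restore them at test time.
env.save(osp.join(results_dir, "vec_normalization.pkl"))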
def __init__(self):
    self.env = DQNAgent.create_env(1)
    self.model = DQN.load(MODEL_PATH)
import sys

from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.dqn.policies import MlpPolicy
from stable_baselines3 import DQN

from gym_sudoku.envs.sudoku_env import SudokuEnv

env = SudokuEnv()

if "--train" in sys.argv:
    model = DQN(MlpPolicy, env, verbose=1, learning_starts=100)
    model.learn(total_timesteps=10000)
    model.save("dqn_sudoku")
else:
    model = DQN.load("dqn_sudoku")
    obs = env.reset()
    env.render()
    for _ in range(20):
        action, _states = model.predict(obs, deterministic=True)
        print("Action", action)
        print("States", _states)
        print("Coordinates", env.fill_pointer)
        obs, rewards, done, info = env.step(action)
        env.render()
        if done:
            print("Resetting ==============================================>")
            obs = env.reset()
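Since the script toggles on a bare sys.argv membership check, usage looks like the following (the file name is hypothetical):

python sudoku.py --train   # train for 10000 steps and save dqn_sudoku.zip
python sudoku.py           # load dqn_sudoku and roll out 20 greedy steps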
                    verbose=1,
                    device=torch.device('cpu'),
                    tensorboard_log='./runs/')
    else:
        model = PPO('MlpPolicy', make_vec_env('Trading-v2', 8),
                    verbose=1,
                    device=torch.device('cpu'),
                    tensorboard_log='./runs/')
    model.learn(total_timesteps=int(20e6),
                tb_log_name=args.name,
                callback=CheckpointCallback(save_freq=10000,
                                            save_path="./trained_models",
                                            name_prefix=args.name))
    model.save('{}_trading_sb'.format('dqn' if args.dqn else 'ppo'))
else:
    print('Loading agent')
    if args.dqn:
        model = DQN.load('dqn_trading_sb')
    else:
        model = PPO.load('ppo_trading_sb')
    # model = PPO('MlpPolicy', env, verbose = 1)

eval_eps = 100
pbar = tqdm(total=eval_eps)
env = gym.make('Trading-v0')
rewards = []
baseline_diff = []
for ep in range(eval_eps):
    done = False
    ep_reward = 0
    s = env.reset()
    while not done:
        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.linear(self.cnn(observations))


policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=128),
)

if os.path.isfile(agentPath):
    print(f"Load agent from {agentPath}")
    # model = PPO.load(agentPath)
    model = DQN.load(agentPath)
    model.set_env(env)
else:
    print(f"Instantiate new agent and save in {agentPath}")
    # model = PPO("CnnPolicy", env_vec, policy_kwargs=policy_kwargs, verbose=1)
    # model = DQN("CnnPolicy", env_vec, policy_kwargs=policy_kwargs, verbose=1)
    model = DQN("CnnPolicy", env,
                target_update_interval=1000,
                batch_size=512,
                exploration_final_eps=0.2,
                policy_kwargs=policy_kwargs,
                verbose=1)
    model.save(agentPath)

# Record gif of trained agent
model.learn(total_timesteps=training_timesteps, log_interval=1, callback=[callback])

npz = np.load(train_log_dir + '/log.npz')
df = pd.DataFrame.from_dict({item: npz[item] for item in npz.files})
print('Train Profit Factor:',
      df['Index'].loc[(df['State'] != 'Flat') & (df['Profit'] > 0)].count()
      / df['Index'].loc[(df['State'] != 'Flat') & (df['Profit'] < 0)].count())
df.to_csv(train_log_dir + '/Train_log.csv')

if TEST:
    model = DDQN.load(log_dir + '/best_model/' + MODEL_NAME)
    env = gym.make('rl_stocks-v0')
    env._reset(actions=N_DISCRETE_ACTIONS,
               observation_space=OBSERVATION_SPACE_TEST,
               data=test_df,
               trade_amount=TRADE_AMOUNT,
               key=KEY,
               wallet=WALLET,
               window=WINDOW,
               interest_rate=INTEREST_RATE,
               log_dir=test_log_dir)
    for step in range(testing_timesteps):
        obs = env.reset()
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        model.save(model_name)
    if ALGORITHM == 'PPO':
        from stable_baselines3.ppo import MlpPolicy
        model = PPO(MlpPolicy, env, tensorboard_log=log_dir, verbose=2)
        model.learn(total_timesteps=NUM_EPISODES * MAX_STEPS, tb_log_name=log_name, log_interval=1)
        model.save(model_name)
    else:
        print('!ERROR: incorrect algorithm selection!')
    del model
else:
    model_name = '02-08-2021_' + ALGORITHM + '_scarecrow'
    if ALGORITHM == 'DQN':
        model = DQN.load(model_name)
    if ALGORITHM == 'PPO':
        model = PPO.load(model_name)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            obs = env.reset()
env = DummyVecEnv([
    lambda: Monitor(
        gym.make(
            "airgym:airsim-drone-sample-v0",
            ip_address="127.0.0.1",
            step_length=1,
            image_shape=(84, 84, 1),
            destination=np.array([300, 0, -40]),
        ))
])

# Wrap env as VecTransposeImage to allow SB to handle frame observations
env = VecTransposeImage(env)

model = DQN.load("model/dqn_airsim_drone_policy_4actions_30000_steps_cont.zip", env=env)

# Initialize RL algorithm type and parameters
# model = DQN(
#     "CnnPolicy",
#     env,
#     learning_rate=0.00025,
#     verbose=1,
#     batch_size=32,
#     train_freq=4,
#     target_update_interval=200,
#     learning_starts=200,
#     buffer_size=10000,
#     max_grad_norm=10,
#     exploration_fraction=0.1,
#     exploration_final_eps=0.01,
def key_handler(event):
    """
    Accepts a key event and makes an appropriate decision.
    :param event: Key event
    :return: void
    """
    global _root
    global _routing_canvas
    global _rl_model
    global _is_first_step
    global _rl_env
    global _rl_target_cell
    global _step_count
    global LEARN_RATE
    global EXPLORE_INIT
    global EXPLORE_FINAL
    global GAMMA
    global TRAIN_TIME_STEPS
    global LOAD_MODEL_NAME

    e_char = event.char
    if e_char == 'l':
        # RL Agent Learning
        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()
        # RL Agent
        _rl_model = DQN('MlpPolicy', _rl_env, verbose=1,
                        learning_rate=LEARN_RATE,
                        exploration_initial_eps=EXPLORE_INIT,
                        exploration_final_eps=EXPLORE_FINAL,
                        gamma=GAMMA)
        print("Beginning RL training")
        _rl_model.learn(total_timesteps=TRAIN_TIME_STEPS)
        print("Finished RL training")
        print("Saving trained model")
        _rl_model.save("agent_" + time.strftime("%d-%m-%YT%H-%M-%S"))
    elif e_char == 't':
        # RL Agent Testing
        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()
        print("Loading trained model")
        if _rl_model is None:
            _rl_model = DQN.load(LOAD_MODEL_NAME)
        obs = _rl_env.reset()
        done = False
        while not done:
            rl_action, states = _rl_model.predict(obs, deterministic=True)
            print("Action " + str(rl_action))
            obs, rewards, done, info = _rl_env.step(rl_action)
    elif e_char == 'r':
        # RL flow debugging (no agent involved, emulate actions randomly)
        if _is_first_step:
            _rl_env.reset()
            _is_first_step = False
        else:
            rand_action = random.randrange(1)
            rl_action_step(rand_action)
    else:
        pass
import numpy as np
import gym
import gym_fishing
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env

env = gym.make('fishing-v0')
check_env(env)

model = DQN('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=200)

## Simulate a run with the trained model, visualize result
df = env.simulate(model)
env.plot(df, "dqn.png")

## Evaluate model
from stable_baselines3.common.evaluation import evaluate_policy
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print("mean reward:", mean_reward, "std:", std_reward)

## Save and reload the model
model.save("dqn")
model = DQN.load("dqn")
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.cmd_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

# Instantiate the env
env = ABCEnv()
# Wrap it
env = make_vec_env(lambda: env, n_envs=1)

# Train the agent
"""
Something you might want to play around with: learning_rate, total timesteps, etc.
Always choose a sample-efficient algorithm.
"""
total_timesteps = 200
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log="./CSC2547_tensorboard/")
model.learn(total_timesteps)
model_name = "DQN_timesteps_" + str(total_timesteps)
model.save(model_name)
# load() is a classmethod that returns a new model, so rebind the name.
model = DQN.load(model_name, env=env)

mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=2)
print("mean_reward is: ", mean_reward)
print("std_reward is: ", std_reward)
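If the intent was instead to load the saved weights into the existing model instance rather than rebind the name, SB3's set_parameters accepts the path of a model saved with save(); a minimal sketch under that assumption:

# Load parameters in place instead of constructing a new model.
model.set_parameters(model_name)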
new = 1
load = 0
test = 0
if new + load + test != 1:
    raise Exception('Initialize new, train or load a model')

dm, y_oracle = init_dm(CONFIG)
print(dm)
env = ClassificationEnv(dm, y_oracle)
sys.path.insert(0, 'dral')

if new:
    model = DQN(CnnPolicy, env, verbose=1, learning_rate=2e-4,
                gamma=0.98, batch_size=32, learning_starts=3000)
if load:
    model = DQN.load("data/rl_query_rps.pth")
if test:
    model = init_and_train_rl_classification_model(
        timesteps=100000, path='data/rl_query_dogs_cats.pth')

# show_grid_imgs(dm.test.get_x(list(range(9))), dm.test.get_y(list(range(9))), (3, 3))
n_episodes = 5
for k in range(n_episodes):
    # label images
    y_oracle = label_samples(dm, y_oracle, n=100, random=True)
    dm.train.shuffle()
    print(dm)
    model.learn(total_timesteps=6000, log_interval=30)
    lambda: Monitor(
        gym.make(
            "airgym:airsim-drone-sample-v0",
            ip_address="127.0.0.1",
            step_length=1,
            image_shape=(84, 84, 1),
            destination=np.array([70, -5, -20]),
        ))
])

# Wrap env as VecTransposeImage to allow SB to handle frame observations
env = VecTransposeImage(env)

# model = DQN.load("model/dqn_airsim_drone_policy")
model = DQN.load(
    "checkpoint/v18_dqn_cnnPolicy_4actions_imageObs_100000_steps/dqn_policy_65000_steps.zip"
)

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=1, deterministic=True)
print(f"mean_reward = [{mean_reward:.2f}] +/- {std_reward}")

# obs = env.reset()
# while True:
#     action, _states = model.predict(obs, deterministic=True)
#     obs, reward, done, info = env.step(action)
#     if done:
#         obs = env.reset()
print('Environment Setup...')
env = gym.make('Desktop-v0', debug=False, show=True, steplimit=100)
outdir = '/tmp/random-agent-results'
env = Monitor(env, directory=outdir, force=True)
episodes = 10

# Setup Agent
print('Agent Setup...')
model = DQN(MlpPolicy, env, verbose=0, buffer_size=500)
print('Returning Trained Model...')
model.learn(total_timesteps=1000, log_interval=4)
print('Saving Trained Model...')
model.save("deepq_desktop")
del model  # remove to demonstrate saving and loading
print('Loading Trained Model...')
model = DQN.load("deepq_desktop")


def unique_reward(last_state, current_state):
    # Rewards a current state that is different from the last state
    return np.sum(last_state) - np.sum(current_state)


if __name__ == '__main__':
    try:
        print('Running Environment')
        last_state = None
        # Run Environment
        for episode in range(episodes):
            print('Episode:', episode)
            obs = env.reset()
def learn_with_selfplay(max_agents, num_learn_steps, num_learn_steps_pre_training, num_eval_eps,
                        num_skip_steps=0, model_name='dqn', only_rule_based_op=False, patience=5,
                        image_observations=True, output_folder="output", fine_tune_on=None,
                        opponent_pred_obs=False, adversarial_training=None, save_freq=None):
    """
    Train an agent with regular self-play. If there are checkpoints of previous training, continue
    training from those checkpoints.

    :param max_agents: Stop after max_agents intermediate agents have been trained. An intermediate
        agent is saved when training successfully created an improved agent.
    :param num_learn_steps: Number of frames / steps for every learning iteration
    :param num_learn_steps_pre_training: Number of frames / steps for pre-training on the rule-based agent
    :param num_eval_eps: Number of episodes for intermediate evaluation. Intermediate evaluation
        determines whether the trained agent improved compared to the previous version
    :param num_skip_steps: Skip num_skip_steps frames, performing the action from the previous step
    :param model_name: Name for saving the model. If there are already checkpoints with this name,
        training is continued. Checkpoints will be saved as model_name + i, where i is the training iteration.
    :param only_rule_based_op: If set to True, training is only performed against the rule-based agent.
    :param patience: Patience parameter for evaluation
    :param image_observations: Use image instead of feature observations
    :param output_folder: Root folder for outputs
    :param fine_tune_on: If not None, instead of self-play training, train an adversarial policy
        against the victim specified as a string in this parameter
    :param opponent_pred_obs: If set to True, the predictions of the opponent in the current state
        will be concatenated to the observations for the main agent. This was an attempt to create a
        stronger adversarial policy which could use this information; however, in our experiments it
        did not improve the adversarial policy.
    :param adversarial_training: If set to True, perform adversarial training using FGSM during training.
    :param save_freq: If not None, save intermediate checkpoints during training with the given frequency
    :return:
    """
    eval_env, eval_env_rule_based, eval_op, train_env, train_env_rule_based = _init_envs(
        image_observations, num_skip_steps, opponent_pred_obs, adversarial_training)

    # If fine-tuning, load the model to fine-tune against from its path
    if fine_tune_on is not None:
        path = Path(output_folder) / 'models' / fine_tune_on
        fine_tune_model = DQN.load(path)
        fine_tune_model.tensorboard_log = None
        if opponent_pred_obs:
            # We can't eval on agents that don't have a q_net, so we change eval_op to the original
            # model that is being fine-tuned against, instead of the rule-based agent
            eval_op = fine_tune_model
            eval_env_rule_based.set_opponent(eval_op)
            eval_env_rule_based = OpponentPredictionObs(eval_env_rule_based)
            eval_env.set_opponent(eval_op)
            eval_env = OpponentPredictionObs(eval_env)
    else:
        fine_tune_model = None

    # Initialize first agent
    pre_train_agent = SimpleRuleBasedAgent(train_env_rule_based)
    previous_models = [pre_train_agent]

    # Load potentially saved previous models
    for opponent_id in range(1, max_agents):
        path = _make_model_path(output_folder, model_name, opponent_id)
        if os.path.isfile(path):
            model = DQN.load(path)
            previous_models.append(model)
        else:
            break

    # Initialize first round
    last_agent_id = len(previous_models) - 1
    prev_num_steps = 0
    patience_counter = 0
    tb_path = Path(output_folder) / "tb-log"
    if last_agent_id == 0:
        # main_model = A2C('MlpPolicy', policy_kwargs=dict(optimizer_class=RMSpropTFLike, optimizer_kwargs=dict(eps=1e-5)),
        #                  env=train_env, verbose=0, tensorboard_log="output/tb-log")
        # main_model = A2C('MlpPolicy', train_env, verbose=0, tensorboard_log="output/tb-log")
        main_model = DQN('MlpPolicy', train_env_rule_based, verbose=0, tensorboard_log=tb_path)  # , exploration_fraction=0.3)
    else:
        main_model = copy.deepcopy(previous_models[last_agent_id])
        main_model.set_env(train_env)
        main_model.tensorboard_log = tb_path

    # Start training with self-play over several rounds
    opponent_id = last_agent_id
    while opponent_id < max_agents - 1:
        print(f"Running training round {opponent_id + 1}")
        if fine_tune_on is None:
            # Choose opponent based on setting
            if only_rule_based_op:
                # Use rule-based as opponent
                current_train_env = train_env_rule_based
                current_train_env.set_opponent(SimpleRuleBasedAgent(current_train_env))
            else:
                if opponent_id == 0:
                    current_train_env = train_env_rule_based
                else:
                    current_train_env = train_env
                # Take opponent from the previous version of the model
                current_train_env.set_opponent(previous_models[opponent_id])
        else:
            # Use passed fine-tune agent as opponent
            current_train_env = train_env
            current_train_env.set_opponent(fine_tune_model)

        # Train the model
        current_train_env.set_opponent_right_side(True)
        chosen_n_steps = num_learn_steps_pre_training if opponent_id == 0 else num_learn_steps  # Iteration 0 is pre-training
        # In order to generate adversarial examples, the adversarial training wrapper needs a
        # reference to the model that is currently being trained
        if adversarial_training is not None:
            current_train_env.env.victim_model = main_model

        # Optionally add a callback to save intermediate checkpoints
        if save_freq is not None:
            checkpoint_callback = CheckpointCallback(save_freq=save_freq, save_path='./output/intermediate/',
                                                     name_prefix=model_name + str(opponent_id + 1) + '_interm')
        else:
            checkpoint_callback = None

        # === LEARNING ===
        main_model.learn(total_timesteps=chosen_n_steps, tb_log_name=model_name, callback=checkpoint_callback)

        # Do evaluation for this training round
        eval_env_rule_based.set_opponent(eval_op)
        avg_round_reward, num_steps = evaluate(main_model, eval_env_rule_based, num_eps=num_eval_eps)
        print(model_name)
        print(f"Average round reward after training: {avg_round_reward}")
        print(f"Average number of steps per episode: {num_steps / num_eval_eps}")

        # Check if there was improvement
        if num_steps > prev_num_steps:
            # Model improved compared to the last round
            print('Model improved')
            prev_num_steps = num_steps
            # Reset patience counter
            patience_counter = 0
            # Save the further-trained model to disk
            main_model.save(_make_model_path(output_folder, model_name, opponent_id + 1))
            # Make a copy of the just-saved model by loading it
            copy_of_model = DQN.load(_make_model_path(output_folder, model_name, opponent_id + 1))
            # Save the copy to the list
            previous_models.append(copy_of_model)
            # From here we continue training the same main_model against itself
            opponent_id += 1
        else:
            print('Model did not improve')
            patience_counter += 1
            # Do not save the model
            if patience_counter > patience:
                print('Stopping early due to patience')
                break
            # Because our model did not improve compared to the previous one, reset main_model to the previous one
            main_model = DQN.load(_make_model_path(output_folder, model_name, opponent_id))
            main_model.set_env(train_env)
            # Opponent does not change

    if not opponent_pred_obs:
        # Evaluate the last model against each of its previous iterations
        # evaluate_against_predecessors(previous_models, env_rule_based=eval_env_rule_based,
        #                               env_normal=eval_env, num_eval_eps=num_eval_eps)
        pass  # Not useful right now
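_make_model_path is defined elsewhere in the project; a plausible stand-in consistent with the docstring's "checkpoints saved as model_name + i" convention (the real naming scheme may differ):

from pathlib import Path

def _make_model_path(output_folder, model_name, i):
    # Hypothetical reconstruction: e.g. output/models/dqn3.zip for model_name='dqn', i=3.
    return Path(output_folder) / 'models' / f'{model_name}{i}.zip'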
else:
    model_path = ''
    if 'gs://' in args.model:
        # Download from given bucket (gcloud configured with privileges)
        client = gcloud.init_storage_client()
        bucket_name = args.model.split('/')[2]
        model_path = args.model.split(bucket_name + '/')[-1]
        gcloud.read_from_bucket(client, bucket_name, model_path)
        model_path = './' + model_path
    else:
        model_path = args.model

    model = None
    if args.algorithm == 'DQN':
        model = DQN.load(model_path, tensorboard_log=args.tensorboard)
    elif args.algorithm == 'DDPG':
        model = DDPG.load(model_path, tensorboard_log=args.tensorboard)
    elif args.algorithm == 'A2C':
        model = A2C.load(model_path, tensorboard_log=args.tensorboard)
    elif args.algorithm == 'PPO':
        model = PPO.load(model_path, tensorboard_log=args.tensorboard)
    elif args.algorithm == 'SAC':
        model = SAC.load(model_path, tensorboard_log=args.tensorboard)
    elif args.algorithm == 'TD3':
        model = TD3.load(model_path, tensorboard_log=args.tensorboard)
    else:
        raise RuntimeError('Algorithm specified is not registered.')

    model.set_env(env)
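The if/elif ladder above maps an algorithm name to its loader; an equivalent minimal sketch using a lookup table, assuming the same stable_baselines3 imports:

ALGORITHMS = {'DQN': DQN, 'DDPG': DDPG, 'A2C': A2C,
              'PPO': PPO, 'SAC': SAC, 'TD3': TD3}
try:
    model = ALGORITHMS[args.algorithm].load(model_path, tensorboard_log=args.tensorboard)
except KeyError:
    raise RuntimeError('Algorithm specified is not registered.')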
    model = PPO('MlpPolicy', env=env, verbose=1)
    model.learn(total_timesteps=timesteps)
    model.save("model_cups")


def act(env, model):
    # env is deterministic: if I say "go right" the gripper will go right every time.
    obs = env.reset()
    for i in range(100):
        env.render()
        action, _states = model.predict(obs, deterministic=True)
        # print(action)
        obs, reward, done, info = env.step(action)
        if done:
            print('[FINAL] obs=', obs, 'reward=', reward, 'done=', done)
            break


algo = "DQN"
TIME_STEPS = 50000
env = gym.make('CupsWorld-v0')
# train(env, algo, TIME_STEPS)
if algo == "A2C":
    model = A2C.load('model_cups')
elif algo == "DQN":
    model = DQN.load('model_cups')
elif algo == "PPO":
    model = PPO.load('model_cups')
act(env, model)
env.close()
# env_eval = Monitor(env, './logs/')
eval_callback = EvalCallback(env, best_model_save_path='./logs/',
                             log_path='./logs/', eval_freq=1000,
                             deterministic=True, render=False)

# Deeper NN
# model = DQN.load("DQN", env=env)
model.learn(total_timesteps=5_000_000, callback=eval_callback)  # Typically not enough
model.save("DQN")
# model = DQN.load("DQN", env=env)
model = DQN.load("logs/best_model", env=env)
# model = PPO.load("PPO_discrete", env=env)

logger = Logger(logging_freq_hz=int(env.SIM_FREQ / env.AGGR_PHY_STEPS),
                num_drones=ARGS.num_drones)
obs = env.reset()
start = time.time()
n_trial = 0
for i in range(ARGS.duration_sec * env.SIM_FREQ):
    # Only query the policy every AGGR_PHY_STEPS physics steps.
    if i % env.AGGR_PHY_STEPS == 0:
        action, _states = model.predict(
            obs,
            deterministic=True,
        )
    # else:
    #     action = np.array([1, 0, 0])  # No Turn
    },
    "policy_frequency": 2,
    "duration": 40,
})
env.reset()
model = DQN('CnnPolicy', env,
            gamma=0.8,
            learning_rate=5e-4,
            buffer_size=40 * 1000,
            learning_starts=200,
            exploration_fraction=0.6,
            target_update_interval=256,
            batch_size=32,
            verbose=1,
            tensorboard_log="logs/")
model.learn(total_timesteps=int(2e5))
model.save("dqn_highway")

# Record video
model = DQN.load("dqn_highway")
env.configure({"policy_frequency": 15, "duration": 20 * 15})
video_length = 2 * env.config["duration"]
env = VecVideoRecorder(env, "videos/",
                       record_video_trigger=lambda x: x == 0,
                       video_length=video_length,
                       name_prefix="dqn-agent")
obs = env.reset()
for _ in range(video_length + 1):
    action, _ = model.predict(obs)
    obs, _, _, _ = env.step(action)
env.close()
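One caveat, stated as an assumption: VecVideoRecorder is a VecEnv wrapper, so if env at the recording step above is still a plain gym.Env, it would need to be vectorized before being passed to VecVideoRecorder, e.g.:

from stable_baselines3.common.vec_env import DummyVecEnv

raw_env = env
env = DummyVecEnv([lambda: raw_env])  # vectorize before wrapping with VecVideoRecorder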