    env = DummyVecEnv([lambda: env])  # create the vectorized environment
    print('Action space: ', env.action_space)
    print('Observation space: ', env.observation_space)
    return env

env = make_env()
custom_callback = CustomCallback(env, render=True)
model = PPO2(policy=CnnPolicy, env=env, verbose=0, learning_rate=0.000025, tensorboard_log=log_dir)
model = PPO2.load('./agents/best_mario_ppo2model', env=env, verbose=0)

state = env.reset()
total_reward = 0
while True:
    # render the environment
    env.render()
    # sleep
    time.sleep(1 / 25)
    # model inference
    action, _ = model.predict(state)
    # execute one step
    state, reward, done, info = env.step(action)
import gym
import gym_LoL
from stable_baselines.common.policies import MlpLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2
from notify_run import Notify

if __name__ == "__main__":
    notify = Notify(endpoint='https://notify.run/1GQn88vSML1rmxdz')
    model = None
    model_path = 'ppo_lol'
    try:
        env = DummyVecEnv([lambda: gym.make('LoL-v0')])
        try:
            model = PPO2.load(model_path, env)
        except ValueError:
            model = PPO2(MlpLstmPolicy, env, verbose=1, nminibatches=1)
        for i in range(100):
            model.learn(total_timesteps=2500)
            model.save(model_path)
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        model.save(model_path)
        notify.send('Training Failed for LoL-v0')
        raise
    else:
        notify.send('Training Completed for LoL-v0')
def run_experiment(
    not_save=False,
    folder='experiments',
    weights_location=None,
    tag=None,
    env='Base',
    env_num=4,
    n=0,
    save_interval=10000,
    train_steps=int(1e6),
    description=None,
    weights=None,
    n_steps=200,
    gamma=0.99,
):
    env_name = env
    if weights is not None and not os.path.isfile(weights):
        raise ValueError("Weights do not exist")

    # Saving args
    args = deepcopy(locals())

    # Get env
    env = getattr(environments, env)
    envs = DummyVecEnv([lambda: env() for i in range(env_num)])
    args['env_config'] = ""  # str(env.get_org_config())

    # Check if folder exists and if is a valid name
    if not not_save:
        id, logger, logs_folder, experiment_csv, experiment_folder = \
            create_experiment_folder(folder=folder, tag=tag, args=args)
    else:
        id = -1
        logs_folder = None
        logger = None
        experiment_folder = None

    if weights is not None:
        model = PPO2.load(
            weights,
            verbose=0,
            tensorboard_log=logs_folder,
            max_grad_norm=100,
            n_steps=n_steps,
            gamma=gamma,
        )
        model.set_env(envs)
    else:
        model = PPO2(
            CnnPolicy,
            envs,
            verbose=0,
            tensorboard_log=logs_folder,
            max_grad_norm=100,
            n_steps=n_steps,
        )

    if 'interrupting' in env_name:
        for env in envs.envs:
            env.set_interrupting_params(ppo=model)

    # set bar
    callback = Callback(
        not_save=not_save,
        logger=logger,
        train_steps=train_steps,
        n=n,
        experiment_folder=experiment_folder,
        save_interval=save_interval,
        id=id,
    )

    # Start running experiment
    # Creating nice table
    _width = 40
    del args['env_config']
    max_k_width = max([len(k) for k in args])
    print("\n{}".format("#" * _width))
    print("# {1:^{0}} #".format(_width - 4, "RUNNING EXPERIMENT"))
    print("# {1:^{0}} #".format(_width - 4, ""))
    print("# {1:<{0}} #".format(_width - 4, "{0:{2}s}: {1:03d}".format("ID", id, max_k_width)))
    for k, v in args.items():
        if type(v) is int:
            print("# {1:<{0}} #".format(_width - 4, "{0:{2}s}: {1:0d}".format(k, v, max_k_width)))
        elif type(v) is float:
            print("# {1:<{0}} #".format(_width - 4, "{0:{2}s}: {1:0.3f}".format(k, v, max_k_width)))
        else:
            print("# {1:<{0}} #".format(_width - 4, "{0:{2}s}: {1:s}".format(k, str(v), max_k_width)))
    print("{}".format("#" * _width))
    del args

    print("\n############ STARTING TRAINING ###########\n")
    try:
        with tqdm.tqdm(total=train_steps, leave=True) as bar:
            callback.set_bars(bar)
            model.learn(
                total_timesteps=train_steps,
                callback=callback,
            )
            if not not_save:
                model.save(experiment_folder + "/weights_final")
    except KeyboardInterrupt:
        if not not_save and input("Do you want to DELETE this experiment? (Yes/n) ") == "Yes":
            remove_experiment(experiment_folder, folder, experiment_csv, id)
        else:
            if not not_save:
                model.save(experiment_folder + "/weights_final")
resume = False

if __name__ == "__main__":
    # multiprocess environment
    # for now, it doesn't make sense to have multiple environments
    n_cpu = 8
    env = SubprocVecEnv([
        lambda: SwimmerLocomotionEnv(path=fixed_path,
                                     random_path=use_random_path,
                                     use_hard_path=False,
                                     robot_link_length=robot_link_length)
        for i in range(n_cpu)
    ])

    if resume:
        model = PPO2.load("model/reynolds_forward/reynolds_ppo_weight_99", env=env,
                          gamma=gamma, verbose=1,
                          tensorboard_log='./tf_logs/traj_follow/')
    else:
        model = PPO2(MlpPolicy, env, gamma=gamma, verbose=1,
                     tensorboard_log='./tf_logs/reynolds_forward')

    for i in range(100):
        model.learn(total_timesteps=250000, reset_num_timesteps=False)
        model.save("model/reynolds_forward/reynolds_ppo_weight_" + str(i))
def run_matchup(drafter1: str, drafter2: str, battler: str, games: int,
                seed: int, concurrency: int) \
        -> Tuple[Tuple[float, float], Tuple[list, list], Tuple[list, list],
                 List[List[Tuple]], Tuple[list, list], List[float]]:
    """
    Run the match-up between `drafter1` and `drafter2` using `battler`.
    :param drafter1: drafter to play as first player
    :param drafter2: drafter to play as second player
    :param battler: battler to simulate the matches
    :param games: amount of matches to simulate
    :param seed: seed used to generate the matches
    :param concurrency: amount of matches executed at the same time
    :return: a tuple containing (i) a tuple containing the win rate of the first
        and second players, (ii) a tuple containing the average mana curves of the
        first and second players, (iii) a tuple containing the `30 * games`
        individual draft choices of the first and second players; (iv) a tuple of
        3-uples containing the card alternatives presented to the players at each
        of the `games` episodes; and (v) a tuple containing the `games` decks
        built by the first and second players.
    """
    # parse the battle agent
    battler = agents.parse_battle_agent(battler)

    # initialize envs
    env = [lambda: LOCMDraftEnv(battle_agents=(battler(), battler()))
           for _ in range(concurrency)]

    # wrap envs in a vectorized env
    env = DummyVecEnv(env)

    for i in range(concurrency):
        # no overlap between episodes at each process
        current_seed = seed + (games // concurrency) * i
        current_seed -= 1  # resetting the env increases the seed by 1

        # set seed to env
        env.env_method('seed', current_seed, indices=[i])

    # reset the env
    env.reset()

    # initialize first player
    if drafter1.endswith('zip'):
        current_drafter = agents.RLDraftAgent(PPO2.load(drafter1))
        current_drafter.use_history = "history" in drafter1
    else:
        current_drafter = agents.parse_draft_agent(drafter1)()

    current_drafter.seed(seed)
    current_drafter.name = drafter1
    drafter1 = current_drafter

    # initialize second player
    if drafter2.endswith('zip'):
        other_drafter = agents.RLDraftAgent(PPO2.load(drafter2))
        other_drafter.use_history = "history" in drafter2
    else:
        other_drafter = agents.parse_draft_agent(drafter2)()

    other_drafter.seed(seed)
    other_drafter.name = drafter2
    drafter2 = other_drafter

    # initialize metrics
    episodes_so_far = 0
    episode_rewards = [[0.0] for _ in range(env.num_envs)]
    drafter1.mana_curve = [0 for _ in range(13)]
    drafter2.mana_curve = [0 for _ in range(13)]
    drafter1.choices = [[] for _ in range(env.num_envs)]
    drafter2.choices = [[] for _ in range(env.num_envs)]
    drafter1.decks = [[[]] for _ in range(env.num_envs)]
    drafter2.decks = [[[]] for _ in range(env.num_envs)]
    alternatives = [[] for _ in range(env.num_envs)]

    # run the episodes
    while True:
        observations = env.get_attr('state')

        # get the current agent's action for all concurrent envs
        if isinstance(current_drafter, agents.RLDraftAgent):
            all_past_choices = env.get_attr('choices')
            new_observations = []

            for i, observation in enumerate(observations):
                new_observation = encode_state_draft(
                    observation,
                    use_history=current_drafter.use_history,
                    past_choices=all_past_choices[i][observation.current_player.id]
                )
                new_observations.append(new_observation)

            actions = current_drafter.act(new_observations)
        else:
            actions = [current_drafter.act(observation)
                       for observation in observations]

        # log chosen cards into current agent's mana curve
        for i, (action, observation) in enumerate(zip(actions, observations)):
            # get chosen index
            try:
                chosen_index = action.origin
            except AttributeError:
                chosen_index = action

            # save choice
            current_drafter.choices[i].append(chosen_index)

            # get chosen card
            chosen_card = observation.current_player.hand[chosen_index]

            # increase amount of cards chosen with the chosen card's cost
            current_drafter.mana_curve[chosen_card.cost] += 1

            # add chosen card to this episode's deck
            current_drafter.decks[i][-1].append(chosen_card.id)

            # save card alternatives
            if observation.current_player.id == PlayerOrder.FIRST:
                alternatives[i].append(tuple(map(lambda c: c.id, observation.current_player.hand)))

        # perform the action and get the outcome
        _, rewards, dones, _ = env.step(actions)

        if isinstance(current_drafter, agents.RLDraftAgent):
            current_drafter.dones = dones

        # update metrics
        for i in range(env.num_envs):
            episode_rewards[i][-1] += rewards[i]

            if dones[i]:
                episode_rewards[i].append(0.0)
                current_drafter.decks[i].append([])
                other_drafter.decks[i].append([])
                episodes_so_far += 1

        # check exiting condition
        if episodes_so_far >= games:
            break

        # swap drafters
        current_drafter, other_drafter = other_drafter, current_drafter

    # normalize mana curves
    total_choices = sum(drafter1.mana_curve)
    drafter1.mana_curve = [freq / total_choices for freq in drafter1.mana_curve]
    drafter2.mana_curve = [freq / total_choices for freq in drafter2.mana_curve]

    # join all parallel rewards
    all_rewards = [reward for rewards in episode_rewards
                   for reward in rewards[:-1]]

    # join all parallel choices
    drafter1.choices = [c for choices in drafter1.choices for c in choices]
    drafter2.choices = [c for choices in drafter2.choices for c in choices]

    # join all parallel decks
    drafter1.decks = [deck for decks in drafter1.decks for deck in decks if deck]
    drafter2.decks = [deck for decks in drafter2.decks for deck in decks if deck]

    # join all parallel alternatives
    alternatives = [turn for env in alternatives for turn in env]

    # cap any unsolicited data from additional episodes
    all_rewards = all_rewards[:games]
    drafter1.choices = drafter1.choices[:30 * games]
    drafter2.choices = drafter2.choices[:30 * games]
    drafter1.decks = drafter1.decks[:games]
    drafter2.decks = drafter2.decks[:games]
    alternatives = alternatives[:30 * games]

    # convert the list of rewards to the first player's win rate
    win_rate = (mean(all_rewards) + 1) * 50

    return (win_rate, 100 - win_rate), \
        (drafter1.mana_curve, drafter2.mana_curve), \
        (drafter1.choices, drafter2.choices), \
        alternatives, \
        (drafter1.decks, drafter2.decks), \
        all_rewards
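# Hedged usage sketch (an assumption, not from the original file): how
# run_matchup might be called and its six return values unpacked, per the
# docstring above. The drafter/battler names, game count, and seed below are
# hypothetical placeholders.
win_rates, mana_curves, choices, alternatives, decks, rewards = run_matchup(
    drafter1='random',               # placeholder built-in drafter name
    drafter2='trained-drafter.zip',  # placeholder PPO2 checkpoint (loaded via PPO2.load)
    battler='greedy',                # placeholder battle agent name
    games=100,
    seed=42,
    concurrency=4,
)
print('First player win rate: {:.1f}%'.format(win_rates[0]))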
envparameters = f.read()
envparameters = envparameters.strip('[')
envparameters = envparameters.strip(']')
f_list = [float(i) for i in envparameters.split(",")]
print("envparameters: " + str(f_list))
my_step_limit = int(f_list[0])
my_step_size = float(f_list[1])
my_maxspeed = float(f_list[2])

# Initialize environment with signal parameters:
env = CustomEnv(step_limit=my_step_limit, step_size=my_step_size, maxspeed=my_maxspeed)  # 0.01745*5

# Load trained model and execute it forever:
model = PPO2.load("../Models/" + filename)

while True:
    #obs = env.reset()
    obs = env.reset()
    #obs = obs.reshape((1,4))
    #print(env.observation_space.shape)
    #obs, rewards, dones, info = env.step([0,0])
    for i in range(1000000):  #my_step_limit
        action, _states = model.predict(obs)
        print(action)
        obs, rewards, dones, info = env.step(action)
        #obs = np.array(obs).reshape((1,4))
        env.renderSlow(1000)
parser.add_argument('--name', default=None, required=True,
                    help='Name of the model (required)')
parser.add_argument(
    '--normalize', type=bool, default=False,
    help='Normalize the environment for training (default: False)')

args = parser.parse_args()
name_resume = args.name
normalize = args.normalize

commands = [[1, 0], [2, 0], [3, 0]]

if name_resume != None:
    model = PPO2.load(workDirectory + "/resultats/" + name_resume + "/" + name_resume + ".zip")
    env = DummyVecEnv(
        [lambda: e.AidaBulletEnv(
            commands,
            render=False,
            on_rack=False,
        )])
    if normalize:
        env = VecNormalize(env, clip_obs=1000.0, clip_reward=1000.0,
                           training=False)
        env.load_running_average(workDirectory + "/resultats/" + name_resume + "/normalizeData")
env = MultiAgentSelectObservation(env, DISTRICTS_GROUP_IDS, maac=True)
env = MultiAgentSelectAction(env, DISTRICTS_GROUP_IDS, 1)

no_closures = [1] * n_weeks
weekends = False
(baseline_pd, baseline_ar, inf) = run_model(env.unwrapped._model, n_weeks,
                                            weekends, args.district_name,
                                            no_closures)

ppo_paths = []
for p in args.paths:
    ppo_paths.append(p)

models = {}
# Order of paths and DISTRICTS_GROUP are assumed to be the same.
for district_name, p in zip(DISTRICTS_GROUP, ppo_paths):
    model = PPO2.load(p / "params.zip")
    models[district_name] = model

# Check that we have all the districts
if set(models.keys()) != set(DISTRICTS_GROUP):
    print("set(models.keys())" + str(set(models.keys())))
    print("set(DISTRICTS_GROUP)" + str(set(DISTRICTS_GROUP)))
    assert set(models.keys()) == set(DISTRICTS_GROUP)

print("ar-improvement")
for run in range(args.runs):
    attack_rate = evaluate(env, models, DISTRICTS_GROUP_IDS, n_weeks)
    print(baseline_ar - attack_rate)

env.close()
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model_path", required=True, help="model path") parser.add_argument("--time_steps", required=True, help="time steps") args = vars(parser.parse_args()) time_steps = int(args["time_steps"]) model_path = str(args["model_path"]) policy_path = os.path.join( model_path, "model_" + str(time_steps) + "_steps" ) model = PPO2.load(policy_path) # define a method for the policy fn of your trained model def policy_fn(obs): return model.predict(obs, deterministic=True)[0] # we create the same env as we used for training in train_pushing_ppo.py, # such that action and observation space remain coherent with the policy. # however, unlike during the training, we set the initialization to the the # same as in the standard CubeEnv, since this is what the policy will be # evaluated on eventually. initializer = cube_env.RandomInitializer(difficulty=2) # difficulty one means pushing env = ExamplePushingTrainingEnv(initializer=initializer, frameskip=3, visualization=True) env = FlatObservationWrapper(env)
import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv
import argparse
import numpy as np

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('model_zip')
    args = parser.parse_args()

    env = gym.make('simpleEnv:simpleEnv-v0')
    vec_env = DummyVecEnv([lambda: env])
    # model = PPO2.load('simpleEnv-full5x5', vec_env, verbose=0, tensorboard_log='learning-ppo')
    model = PPO2.load(args.model_zip, vec_env, verbose=0,
                      tensorboard_log='learning-ppo')

    obs = env.reset()
    rewards = []
    for i in range(1000):
        env.render(rewards=rewards)
        if i == 20:
            env.reference_trajectory = np.random.normal(
                1, 1, env.obs_dimension)
        action, _ = model.predict(obs)
        s, r, _, _ = env.step(action)
        rewards.append(r)
color = "#56EEF4" headType = 'silly' tailType = 'sharp' return start_response(color, headType, tailType) BOARD_WIDTH = 11 BOARD_HEIGHT = 11 NUM_ENVS = 1 NUM_LAYERS = 6 LAYER_WIDTH = 39 LAYER_HEIGHT = 39 model = PPO2.load('/snake/model.pkl') def prepareObservations(you, snakes, food, orientation): head = you['body'][0] hx = head['x'] hy = head['y'] yourLength = len(you['body']) observations = [0] * NUM_ENVS * LAYER_HEIGHT * LAYER_WIDTH * NUM_LAYERS def assign(point, layer, value): x = point['x'] y = point['y'] x = (x - hx) * (-1 if orientation & 1 != 0 else 1) y = (x - hy) * (-1 if orientation & 2 != 0 else 1) x += LAYER_WIDTH / 2 y += LAYER_HEIGHT / 2
import time

import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.gail import ExpertDataset, generate_expert_traj
from baselines.common.atari_wrappers import *

env = gym.make('BowlingNoFrameskip-v0')
env = MaxAndSkipEnv(env, skip=4)
env = WarpFrame(env)
env = DummyVecEnv([lambda: env])

dataset = ExpertDataset(expert_path='bowling_demo.npz', verbose=1)

#model = PPO2('CnnPolicy', env, verbose=1)
model = PPO2.load('bowling_model', env=env)
#model.pretrain(dataset, n_epochs=1000)
model.learn(total_timesteps=256000)
model.save('bowling_model')

state = env.reset()
total_reward = 0
while True:
    env.render()
    time.sleep(1 / 60)
    action, _ = model.predict(state)
def main():
    args = arg_parser()

    # Create log dir
    tensorboard_log_dir = "./tensorboard_log/"
    os.makedirs(tensorboard_log_dir, exist_ok=True)
    # Create result tmp dir
    figdir = "./fig/"
    os.makedirs(figdir, exist_ok=True)
    # Create ndarray save dir
    nd_dir = "./data_each_term_of_rewardfunction_100episodes/" + str(args.agent) + "/"
    os.makedirs(nd_dir, exist_ok=True)

    # Create and wrap the environment
    env1 = gym.make(config['env'])
    broken_env = ChangeJointRangeEnv(env1)
    #env1 = NormalEnv(env1)  # reward custom
    if args.video:
        # for output video
        broken_env = wrappers.Monitor(broken_env,
                                      './videos/' + args.loaddir + "-" + datetime.datetime.now().isoformat(),
                                      force=True, video_callable=(lambda ep: ep % 1 == 0))
    # broken_env = DummyVecEnv([lambda: broken_env])  # create a simple vectorized wrapper for multiple environments, calling each environment in turn in the current Python process
    env1 = DummyVecEnv([lambda: env1])

    # when taking the agent name from argparse
    agentName = []
    agentName.append(args.agent)

    plainData = []
    brokenData = []
    perror = []
    berror = []

    plt.figure()
    sns.set()
    # fig, ax = plt.subplots()

    for agent in agentName:
        brokenSeedAveReward = []
        if "Curriculum" in agent:
            # specify the load directory
            load_dir = "./ISIS2020/trained_Curriculum/" + agent + "/"
        else:
            # specify the load directory
            load_dir = "./ISIS2020/trained_agent_dir/" + agent + "/"

        # obtain the average reward for each seed, range(1, 6)
        for seed in range(1, 6):
            if "range09-16million" in agentName and seed >= 4:
                continue
            # create the PPO2 model (the agent to be trained)
            trainedAnt = PPO2(MlpPolicy, env1, verbose=1, tensorboard_log=tensorboard_log_dir)
            # load the saved (trained) model: specify the path and zip file name (without extension), per seed
            trainedAnt = PPO2.load(load_dir + "trainedAnt" + "-seed" + str(seed))
            # set the seed
            trainedAnt.set_random_seed(seed + 100)
            print("loaddir:", load_dir + "trainedAnt" + "-seed" + str(seed))

            broken_obs = broken_env.reset()
            broken_total_rewards = []
            rewards = 0
            forwards = 0
            ctrls = 0
            contacts = 0
            survives = 0

            # vary k from 0 to 1 in steps of 0.01
            for k in tqdm(range(0, 100)):
                # loop to obtain the reward in the broken environment (100)
                for episode in range(100):
                    # iteration of time steps, default is 1000 time steps
                    for i in range(1000):
                        # predict phase
                        action, _states = trainedAnt.predict(broken_obs)
                        # step phase
                        # when evaluating in the broken environment
                        broken_obs, reward, done, info = broken_env.step(action, k)
                        rewards += reward
                        forwards += info['reward_forward']
                        ctrls += info['reward_ctrl']
                        contacts += info['reward_contact']
                        survives += info['reward_survive']
                        if done:
                            break

                    # store k and the reward at that k in k_gene
                    k_gene[seed-1][episode][k] = rewards
                    # store the value of each term of the reward function
                    reward_forward_map[seed-1][episode][k] = forwards
                    reward_ctrl_map[seed-1][episode][k] = ctrls
                    reward_contact_map[seed-1][episode][k] = contacts
                    reward_survive_map[seed-1][episode][k] = survives

                    # reset the environment
                    broken_obs = broken_env.reset()
                    # record and reset the total reward
                    broken_total_rewards.append(rewards)
                    rewards = 0
                    forwards = 0
                    ctrls = 0
                    contacts = 0
                    survives = 0

            broken_reward_average1 = sum(broken_total_rewards) / len(broken_total_rewards)
            brokenSeedAveReward.append(broken_reward_average1)
            del trainedAnt

        # the agent's average rewards in the plain and broken environments are stored here
        broken_ave = sum(brokenSeedAveReward) / len(brokenSeedAveReward)
        brokenData.append(broken_ave)
        broken_error = np.std(brokenSeedAveReward, ddof=1) / np.sqrt(len(brokenSeedAveReward))
        berror.append(broken_error)

    brokenData = np.array(brokenData).flatten()
    berror = np.array(berror)

    # print(k_gene)
    for seed in range(1, 6):
        seed_gene = k_gene[seed-1, :, :]
        seed_gene = np.sum(seed_gene, axis=0)
        seed_gene = seed_gene / 100  # average reward
        np.save(nd_dir + str(agentName[0]) + "_rewardForEachK" + "_seed=" + str(seed), seed_gene)

    # reshape the 2D array of each reward-function term into a 1D array and save it as npy
    save_reward_map(reward_forward_map, nd_dir, str(agentName[0]), "_rewardForward")
    save_reward_map(reward_ctrl_map, nd_dir, str(agentName[0]), "_rewardCtrl")
    save_reward_map(reward_contact_map, nd_dir, str(agentName[0]), "_rewardContact")
    save_reward_map(reward_survive_map, nd_dir, str(agentName[0]), "_rewardSurvive")
import gym
import pandas as pd
from qtrade_env import QtradeEnv

root_dir = '/Users/liuyehong/Dropbox/CICC/Algorithm_Trading/Platform2/OHLC/data/1Min/'

import pickle
from stable_baselines.common.policies import MlpPolicy, CnnPolicy, LstmPolicy, CnnLstmPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: QtradeEnv()])

model = PPO2(MlpLnLstmPolicy, env, verbose=1, nminibatches=1)
model.learn(total_timesteps=50000)
model.save('ppo2_mlplnlstm')

del model

model = PPO2.load('ppo2_mlplnlstm', env=env)

obs = env.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
from stable_baselines import TRPO
from stable_baselines import PPO2
from snake_env.reynolds_swimmer_forward_vel import SwimmerLocomotionEnv
import numpy as np

fixed_path = [(-0.2 * i, 0) for i in range(30)]
use_random_path = True
robot_k = 1.0
robot_link_length = 0.3

model = PPO2.load("model/reynolds_forward/reynolds_ppo_weight_24")
env = SwimmerLocomotionEnv(path=fixed_path, random_path=use_random_path,
                           use_hard_path=False,
                           robot_link_length=robot_link_length,
                           robot_k=robot_k, record_trajectory=True)

obs = env.reset()
total_reward = 0
x_list = []
for i in range(10000):
    action, _states = model.predict(obs)
    #step_time = 0.5
    #action = [-0.8*np.sin(step_time*i), 0.8*np.cos(step_time*i)]
    # print("start of step")
    #print(action)
    x_list.append(action[1])
    obs, rewards, dones, info = env.step(action)
    # print(obs)
    def __init__(self, obs_shape, action_space, base=None, base_kwargs=None,
                 load_expert=None, env_name=None, rl_baseline_zoo_dir=None,
                 expert_algo=None, normalize=True):
        super(Policy, self).__init__()

        #TODO: Pass these parameters in
        self.epsilon = 0.1
        self.dril = True

        if base_kwargs is None:
            base_kwargs = {}

        if base is None:
            if env_name in ['duckietown']:
                base = DuckieTownCNN
            elif len(obs_shape) == 3:
                print('CNN base check passed')
                base = CNNBase
            elif len(obs_shape) == 1:
                base = MLPBase
            else:
                raise NotImplementedError

        self.base = base(obs_shape[0], normalize=normalize, **base_kwargs)

        self.action_space = None
        if action_space.__class__.__name__ == "Discrete":
            num_outputs = action_space.n
            self.dist = Categorical(self.base.output_size, num_outputs)
            self.action_space = "Discrete"
        elif action_space.__class__.__name__ == "Box":
            num_outputs = action_space.shape[0]
            self.dist = DiagGaussian(self.base.output_size, num_outputs)
            self.action_space = "Box"
        elif action_space.__class__.__name__ == "MultiBinary":
            raise Exception('Error')
        else:
            raise NotImplementedError

        if load_expert == True and env_name not in ['duckietown', 'highway-v0']:
            print('[Loading Expert --- Base]')
            model_path = os.path.join(rl_baseline_zoo_dir, 'trained_agents', f'{expert_algo}')
            try:
                import mpi4py
                from stable_baselines import TRPO
            except ImportError:
                mpi4py = None
                DDPG, TRPO = None, None
            from stable_baselines import PPO2

            model_path = f'{model_path}/{env_name}.pkl'
            if env_name in ['AntBulletEnv-v0']:
                baselines_model = TRPO.load(model_path)
            else:
                baselines_model = PPO2.load(model_path)
            for key, value in baselines_model.get_parameters().items():
                print(key, value.shape)

            if base.__name__ == 'CNNBase':
                print(['Loading CNNBase expert model'])
                params = copy_cnn_weights(baselines_model)
            elif load_expert == True and base.__name__ == 'MLPBase':
                print(['Loading MLPBase expert model'])
                params = copy_mlp_weights(baselines_model)

            #TODO: I am not sure what this is doing
            try:
                self.load_state_dict(params)
                self.obs_shape = obs_shape[0]
            except:
                self.base = base(obs_shape[0] + 1, **base_kwargs)
                self.load_state_dict(params)
                self.obs_shape = obs_shape[0] + 1
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: NegativeRewardEnv()])

model = PPO2(get_policy(policy), env, verbose=0, nminibatches=1,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=25000, tb_log_name='PPO2' + model_tag)
model.save(model_folder + "PPO2" + model_tag)

del model

model = PPO2.load(model_folder + "PPO2" + model_tag)

done = False
states = None
obs = env.reset()
while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
import gym
import time

env = gym.make('InvertedPendulum-v0')

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import PPO2

if __name__ == '__main__':
    # multiprocess environment
    n_cpu = 4
    env = SubprocVecEnv([lambda: env for i in range(n_cpu)])
    model = PPO2.load("PPO2_cartpole_tensorboard/ppo2_cartpole_5")
    print(env)

    # Enjoy trained agent
    obs = env.reset()
    cumul = 0
    # Passing state=None to the predict function means
    # it is the initial state
    state = None
    # When using VecEnv, done is a vector
    done = [False for _ in range(env.num_envs)]
    while True:
        a = time.time()
        action, states = model.predict(obs, state=state, mask=done)
        # print(time.time()-a)
        obs, rewards, dones, info = env.step(action)
        cumul = rewards[0] + cumul
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv(map_name='map1')])

model = PPO2(get_policy(policy), env, verbose=0, nminibatches=1,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=2500000, tb_log_name='PPO2_map1' + model_tag)
model.save(model_folder + "PPO2_map1" + model_tag)

del model

model = PPO2.load(model_folder + "PPO2_map1" + model_tag)

done = False
states = None
obs = env.reset()
while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}

if curr_idx == -1:
    model = PPO2(MlpLnLstmPolicy, train_env, verbose=0, nminibatches=1,
                 tensorboard_log="./tensorboard", **model_params)
else:
    model = PPO2.load('./agents/ppo2_' + reward_strategy + '_' + str(curr_idx) + '.pkl', env=train_env)

for idx in range(curr_idx + 1, 10):
    print('[', idx, '] Training for: ', train_len, ' time steps')

    model.learn(total_timesteps=train_len)

    obs = test_env.reset()
    done, reward_sum = False, 0

    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = test_env.step(action)
        reward_sum += reward

    print('[', idx, '] Total reward: ', reward_sum, ' (' + reward_strategy + ')')
from stable_baselines import PPO2


def env_create():
    env = ClientDapr("ActorUnity")
    env.create("CartPole-v1")
    print(f"[Client] Created Actor {env.actor_id}", flush=True)
    return env


print("===============================================")
print("INFERRING")
print("===============================================")
model = PPO2.load("baselines_ppo_cartpole")
env_local = env_create()

# Start monitoring
print("[Client] Starting to monitor", flush=True)
env_local.monitor_start(1)

# Run Experiment
obs = env_local.reset()
is_done = False

while not (is_done):
    action, _states = model.predict(obs)
    obs, rewards, is_done, info = env_local.step(action)

# Stop Monitoring
if mi == 0:
    # base policy
    model = PPO2('MlpPolicy', env, tensorboard_log='./' + model_names[mi] + '_tb/')
    print('Learning Base PPO2 model:', model_names[mi])
    # learning
    model.learn(total_timesteps=BASE_TRAIN_STEPS, tb_log_name=model_names[mi])
    model.save(model_names[mi])
else:
    print('Learning PPO2 model:', model_names[mi])
    model = PPO2.load(model_names[0], env=env, tensorboard_log='./' + model_names[0] + '_tb/')
    model.learn(total_timesteps=TOTAL_TRAIN_STEPS - BASE_TRAIN_STEPS,
                tb_log_name=model_names[mi], reset_num_timesteps=False)
    model.save(model_names[mi])

total_rewards = 0.
#-------- run the model -------#
for e in range(NUM_EPISODES):
    obs = env.reset()  # env.reset()
    epi_rewards = 0.
    for i in range(1000):
def cb(a, b):
    global last_time
    t = datetime.now().timestamp()
    if t - last_time > 60:
        last_time = t
        print("SAVING===" * 10)
        model.save(net_name)


# multiprocess environment
env = make_vec_env('MinitaurBulletEnv-v0', n_envs=4)
# env = gym.make('MinitaurBulletEnv-v0', render=True)

try:
    model = PPO2.load(
        net_name,
        policy_kwargs=policy_kwargs,
        env=env
    )
except ValueError:
    model = PPO2(
        MlpPolicy,
        env,
        verbose=1,
        tensorboard_log='./tensorboard',
    )

while True:
    model.learn(
        total_timesteps=2000000,
        callback=cb,
        tb_log_name=net_name
# This is an example of a single-episode rollout, using the QVM trained agent
# on the MaxCut test set.
import os

import gym
import gym_forest

from stable_baselines import PPO2

MODEL_FILE = os.path.join(os.path.dirname(__file__), '..', 'models', 'qvm.p')
ENV_NAME = 'forest-maxcut-test-v0'
MAX_STEPS = 25

env = gym.make(ENV_NAME)
agent = PPO2.load(MODEL_FILE)

obs = env.reset()
best_reward = 0
eps_reward = 0
for i in range(MAX_STEPS):
    action, _ = agent.predict(obs)
    obs, reward, done, info = env.step(action)
    eps_reward += reward
    if done:
        # early termination returns the remaining episode reward,
        # assuming that we do just as well on the remaining steps
        # here we get the corresponding single-step reward
        single_step_reward = reward / (MAX_STEPS - i)
        print('[{}]\t {}\t reward {:.3f}'.format(i, info['instr'], single_step_reward))
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = T4HistoryEnv(dir)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init


dir = '/home/dan/serpent/market_history/sp500'

# env = T4HistoryEnv(dir, continuous_action=True)
# env2 = T4HistoryEnv(dir)
# file_count = len(env.files)
# print(len(env.files))
# env = DummyVecEnv([lambda: env])
# env = Monitor(env, log_dir, allow_early_resets=True)
# env2 = Monitor(env2, log_dir, allow_early_resets=True)

param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)

# env = DummyVecEnv([lambda: env])
env = SubprocVecEnv([make_env('', i) for i in range(8)])

# model = PPO2('CnnPolicy', env, verbose=1, tensorboard_log="./tmp/gym/board/")
model = PPO2.load('sp500_ppo2_pretrain')
model.set_env(env)
model.learn(total_timesteps=int(10e6), log_interval=10, callback=callback)
model.save('sp500_ppo2_pretrain')
import os
import argparse

import numpy as np

from stable_baselines import PPO2, logger
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines.gail import generate_expert_traj

parser = argparse.ArgumentParser()
parser.add_argument('expert', type=str, help='Expert path (*.zip)')
parser.add_argument('--seed', type=int, default=0, help='Random seed for env.')
parser.add_argument('--note', type=str, default='test', help='Logging directory')
parser.add_argument('--env', type=str, default='PongNoFrameskip-v4', help='Environment ID')
args = parser.parse_args()

logdir = os.path.join('logs', args.env, args.note)
logger.configure(logdir)
logger.info(args)

env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
model = PPO2.load(args.expert)
generate_expert_traj(model, save_path=os.path.join(logdir, 'expert'), env=env)
def main(args):
    envconfig_string = args.envconfig
    custom_envconfig = _preprocess_custom_envconfig(
        args.envconfig) if args.envconfig is not None else {}
    env_id = 'gym_auv:' + args.env
    env_name = env_id.split(':')[-1] if ':' in env_id else env_id
    envconfig = gym_auv.SCENARIOS[env_name][
        'config'] if env_name in gym_auv.SCENARIOS else {}
    envconfig.update(custom_envconfig)

    NUM_CPU = 8
    EXPERIMENT_ID = str(int(time())) + args.algo.lower()
    model = {
        'ppo': PPO2,
        'ddpg': DDPG,
        'td3': TD3,
        'a2c': A2C,
        'acer': ACER,
        'acktr': ACKTR
    }[args.algo.lower()]

    if args.mode == 'play':
        agent = model.load(args.agent) if args.agent is not None else None
        envconfig_play = envconfig.copy()
        envconfig_play['show_indicators'] = True
        #envconfig_play['autocamera3d'] = False
        env = create_env(env_id, envconfig_play, test_mode=True,
                         render_mode=args.render, pilot=args.pilot, verbose=True)
        print('Created environment instance')

        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(
            vec_env,
            args.video_dir,
            record_video_trigger=lambda x: x == 0,
            video_length=args.recording_length,
            name_prefix=(args.env if args.video_name == 'auto' else args.video_name))
        print(args.video_dir, args.video_name)
        play_scenario(env, recorded_env, args, agent=agent)
        recorded_env.env.close()

    elif (args.mode == 'enjoy'):
        agent = model.load(args.agent)
        # params = agent.get_parameters()
        # policy_weights = [
        #     params['model/pi_fc0/w:0'],
        #     params['model/pi_fc1/w:0'],
        #     params['model/pi/w:0']
        # ]
        # policy_biases = [
        #     params['model/pi_fc0/b:0'],
        #     params['model/pi_fc1/b:0'],
        #     params['model/pi/b:0']
        # ]
        # for param in params:
        #     print(param, params[param].shape)
        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env, EXPERIMENT_ID)
        os.makedirs(video_folder, exist_ok=True)

        env = create_env(env_id, envconfig, test_mode=True,
                         render_mode=args.render, pilot=args.pilot)
        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(
            vec_env,
            video_folder,
            record_video_trigger=lambda x: x == 0,
            video_length=args.recording_length,
            name_prefix=(args.env if args.video_name == 'auto' else args.video_name))
        obs = recorded_env.reset()
        state = None
        done = [False for _ in range(vec_env.num_envs)]
        for t_step in range(args.recording_length):
            if args.recurrent:
                action, _states = agent.predict(
                    observation=obs,
                    state=state,
                    mask=done,
                    deterministic=not args.stochastic)
                state = _states
            else:
                action, _states = agent.predict(
                    obs, deterministic=not args.stochastic)
            obs, reward, done, info = recorded_env.step(action)
            recorded_env.render()
            if args.env == 'PathGeneration-v0':
                sleep(1)
        recorded_env.env.close()

    elif (args.mode == 'train'):
        figure_folder = os.path.join(DIR_PATH, 'logs', 'figures', args.env, EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        os.makedirs(scenario_folder, exist_ok=True)
        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env, EXPERIMENT_ID)
        recording_length = 8000
        os.makedirs(video_folder, exist_ok=True)
        agent_folder = os.path.join(DIR_PATH, 'logs', 'agents', args.env, EXPERIMENT_ID)
        os.makedirs(agent_folder, exist_ok=True)
        tensorboard_log = os.path.join(DIR_PATH, 'logs', 'tensorboard', args.env, EXPERIMENT_ID)
        tensorboard_port = 6006

        if (args.nomp or model == DDPG or model == TD3):
            num_cpu = 1
            vec_env = DummyVecEnv(
                [lambda: create_env(env_id, envconfig, pilot=args.pilot)])
        else:
            num_cpu = NUM_CPU
            vec_env = SubprocVecEnv([
                make_mp_env(env_id, i, envconfig, pilot=args.pilot)
                for i in range(num_cpu)
            ])

        if (args.agent is not None):
            agent = model.load(args.agent)
            agent.set_env(vec_env)
        else:
            if (model == PPO2):
                if args.recurrent:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 1,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-3,
                    }

                    class CustomLSTMPolicy(MlpLstmPolicy):
                        def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                     n_lstm=256, reuse=False, **_kwargs):
                            super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                             n_lstm, reuse,
                                             net_arch=[256, 256, 'lstm', dict(vf=[64], pi=[64])],
                                             **_kwargs)

                    agent = PPO2(CustomLSTMPolicy, vec_env, verbose=True,
                                 tensorboard_log=tensorboard_log, **hyperparams)
                else:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 32,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-4,
                    }
                    #policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[64, 64, 64])
                    #policy_kwargs = dict(net_arch=[64, 64, 64])
                    #layers = [256, 128, 64]
                    layers = [64, 64]
                    policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                    agent = PPO2(MlpPolicy, vec_env, verbose=True,
                                 tensorboard_log=tensorboard_log, **hyperparams,
                                 policy_kwargs=policy_kwargs)
            elif (model == DDPG):
                hyperparams = {
                    'memory_limit': 1000000,
                    'normalize_observations': True,
                    'normalize_returns': False,
                    'gamma': 0.98,
                    'actor_lr': 0.00156,
                    'critic_lr': 0.00156,
                    'batch_size': 256,
                    'param_noise': AdaptiveParamNoiseSpec(initial_stddev=0.287, desired_action_stddev=0.287)
                }
                agent = DDPG(LnMlpPolicy, vec_env, verbose=True,
                             tensorboard_log=tensorboard_log, **hyperparams)
            elif (model == TD3):
                action_noise = NormalActionNoise(mean=np.zeros(2), sigma=0.1 * np.ones(2))
                agent = TD3(stable_baselines.td3.MlpPolicy, vec_env, verbose=True,
                            tensorboard_log=tensorboard_log, action_noise=action_noise)
            elif model == A2C:
                hyperparams = {
                    'n_steps': 5,
                    'gamma': 0.995,
                    'ent_coef': 0.00001,
                    'learning_rate': 2e-4,
                }
                layers = [64, 64]
                policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                agent = A2C(MlpPolicy, vec_env, verbose=True,
                            tensorboard_log=tensorboard_log, **hyperparams,
                            policy_kwargs=policy_kwargs)
            elif model == ACER:
                agent = ACER(MlpPolicy, vec_env, verbose=True,
                             tensorboard_log=tensorboard_log)
            elif model == ACKTR:
                agent = ACKTR(MlpPolicy, vec_env, verbose=True,
                              tensorboard_log=tensorboard_log)

        print('Training {} agent on "{}"'.format(args.algo.upper(), env_id))

        n_updates = 0
        n_episodes = 0

        def callback(_locals, _globals):
            nonlocal n_updates
            nonlocal n_episodes

            sys.stdout.write('Training update: {}\r'.format(n_updates))
            sys.stdout.flush()
            _self = _locals['self']
            vec_env = _self.get_env()

            class Struct(object):
                pass
            report_env = Struct()
            report_env.history = []
            report_env.config = envconfig
            report_env.nsensors = report_env.config[
                "n_sensors_per_sector"] * report_env.config["n_sectors"]
            report_env.sensor_angle = 2 * np.pi / (report_env.nsensors + 1)
            report_env.last_episode = vec_env.get_attr('last_episode')[0]
            report_env.config = vec_env.get_attr('config')[0]
            report_env.obstacles = vec_env.get_attr('obstacles')[0]

            env_histories = vec_env.get_attr('history')
            for episode in range(max(map(len, env_histories))):
                for env_idx in range(len(env_histories)):
                    if (episode < len(env_histories[env_idx])):
                        report_env.history.append(env_histories[env_idx][episode])
            report_env.episode = len(report_env.history) + 1

            total_t_steps = _self.get_env().get_attr('total_t_steps')[0] * num_cpu
            agent_filepath = os.path.join(agent_folder, str(total_t_steps) + '.pkl')

            if model == PPO2:
                recording_criteria = n_updates % 70 == 0
                report_criteria = True
                _self.save(agent_filepath)
            elif model == A2C or model == ACER or model == ACKTR:
                save_criteria = n_updates % 100 == 0
                recording_criteria = n_updates % 1000 == 0
                report_criteria = True
                if save_criteria:
                    _self.save(agent_filepath)
            elif model == DDPG or model == TD3:
                save_criteria = n_updates % 10000 == 0
                recording_criteria = n_updates % 50000 == 0
                report_criteria = report_env.episode > n_episodes
                if save_criteria:
                    _self.save(agent_filepath)

            if report_env.last_episode is not None and len(report_env.history) > 0 and report_criteria:
                try:
                    #gym_auv.reporting.plot_trajectory(report_env, fig_dir=scenario_folder, fig_prefix=args.env + '_ep_{}'.format(report_env.episode))
                    gym_auv.reporting.report(report_env, report_dir=figure_folder)
                    #vec_env.env_method('save', os.path.join(scenario_folder, '_ep_{}'.format(report_env.episode)))
                except OSError as e:
                    print("Ignoring reporting OSError:")
                    print(repr(e))

            if recording_criteria:
                if args.pilot:
                    cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --pilot {} --envconfig {}{}'.format(
                        args.env, agent_filepath, video_folder,
                        args.env + '-' + str(total_t_steps), recording_length,
                        args.algo, args.pilot, envconfig_string,
                        ' --recurrent' if args.recurrent else '')
                else:
                    cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --envconfig {}{}'.format(
                        args.env, agent_filepath, video_folder,
                        args.env + '-' + str(total_t_steps), recording_length,
                        args.algo, envconfig_string,
                        ' --recurrent' if args.recurrent else '')
                subprocess.Popen(cmd)

            n_episodes = report_env.episode
            n_updates += 1

        agent.learn(
            total_timesteps=1500000000000000000000000000000000000000000,
            tb_log_name='log',
            callback=callback)

    elif (args.mode in ['policyplot', 'vectorfieldplot', 'streamlinesplot']):
        figure_folder = os.path.join(DIR_PATH, 'logs', 'plots', args.env, EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        agent = PPO2.load(args.agent)

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))
            for valuedict in valuegrid:
                customconfig = envconfig.copy()
                customconfig.update(valuedict)
                env = create_env(env_id, envconfig, test_mode=True, pilot=args.pilot)
                valuedict_str = '_'.join(
                    (key + '-' + str(val) for key, val in valuedict.items()))
                print('Running {} test for {}...'.format(args.mode, valuedict_str))

                if args.mode == 'policyplot':
                    gym_auv.reporting.plot_actions(env, agent,
                                                   fig_dir=figure_folder,
                                                   fig_prefix=valuedict_str)
                elif args.mode == 'vectorfieldplot':
                    gym_auv.reporting.plot_vector_field(env, agent,
                                                        fig_dir=figure_folder,
                                                        fig_prefix=valuedict_str)
                elif args.mode == 'streamlinesplot':
                    gym_auv.reporting.plot_streamlines(env, agent,
                                                       fig_dir=figure_folder,
                                                       fig_prefix=valuedict_str)
        else:
            env = create_env(env_id, envconfig, test_mode=True, pilot=args.pilot)
            with open(os.path.join(figure_folder, 'config.json'), 'w') as f:
                json.dump(env.config, f)

            if args.mode == 'policyplot':
                gym_auv.reporting.plot_actions(env, agent, fig_dir=figure_folder)
            elif args.mode == 'vectorfieldplot':
                gym_auv.reporting.plot_vector_field(env, agent, fig_dir=figure_folder)
            elif args.mode == 'streamlinesplot':
                gym_auv.reporting.plot_streamlines(env, agent, fig_dir=figure_folder)

        print('Output folder: ', figure_folder)

    elif args.mode == 'test':
        figure_folder = os.path.join(DIR_PATH, 'logs', 'tests', args.env, EXPERIMENT_ID)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        video_folder = os.path.join(figure_folder, 'videos')
        os.makedirs(figure_folder, exist_ok=True)
        os.makedirs(scenario_folder, exist_ok=True)
        os.makedirs(video_folder, exist_ok=True)

        if not args.onlyplot:
            agent = model.load(args.agent)

        def create_test_env(video_name_prefix, envconfig=envconfig):
            print('Creating test environment: ' + env_id)
            env = create_env(env_id, envconfig, test_mode=True,
                             render_mode=args.render if args.video else None,
                             pilot=args.pilot)
            vec_env = DummyVecEnv([lambda: env])
            if args.video:
                video_length = min(500, args.recording_length)
                recorded_env = VecVideoRecorder(vec_env, video_folder,
                                                record_video_trigger=lambda x: (x % video_length) == 0,
                                                video_length=video_length,
                                                name_prefix=video_name_prefix)
            active_env = recorded_env if args.video else vec_env
            return env, active_env

        failed_tests = []

        def run_test(id, reset=True, report_dir=figure_folder, scenario=None,
                     max_t_steps=None, env=None, active_env=None):
            nonlocal failed_tests

            if env is None or active_env is None:
                env, active_env = create_test_env(video_name_prefix=args.env + '_' + id)

            if scenario is not None:
                obs = active_env.reset()
                env.load(args.scenario)
                print('Loaded', args.scenario)
            else:
                if reset:
                    obs = active_env.reset()
                else:
                    obs = env.observe()

            gym_auv.reporting.plot_scenario(env, fig_dir=scenario_folder,
                                            fig_postfix=id, show=args.onlyplot)
            if args.onlyplot:
                return
            cumulative_reward = 0
            t_steps = 0
            if max_t_steps is None:
                done = False
            else:
                done = t_steps > max_t_steps

            while not done:
                action, _states = agent.predict(obs, deterministic=not args.stochastic)
                obs, reward, done, info = active_env.step(action)
                if args.video:
                    active_env.render()
                t_steps += 1
                cumulative_reward += reward[0]
                report_msg = '{:<20}{:<20}{:<20.2f}{:<20.2%}\r'.format(
                    id, t_steps, cumulative_reward, info[0]['progress'])
                sys.stdout.write(report_msg)
                sys.stdout.flush()

                if args.save_snapshots and t_steps % 30 == 0 and not done:
                    env.save_latest_episode(save_history=False)
                    for size in (20, 50, 100, 200, 300, 400, 500):
                        gym_auv.reporting.plot_trajectory(
                            env,
                            fig_dir=scenario_folder,
                            fig_prefix=(args.env + '_t_step_' + str(t_steps) + '_' + str(size) + '_' + id),
                            local=True,
                            size=size)
                elif done:
                    gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder,
                                                      fig_prefix=(args.env + '_' + id))
            env.close()

            gym_auv.reporting.report(env, report_dir=report_dir, lastn=-1)
            #gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder, fig_prefix=(args.env + '_' + id))
            #env.save(os.path.join(scenario_folder, id))
            if env.collision:
                failed_tests.append(id)
                with open(os.path.join(figure_folder, 'failures.txt'), 'w') as f:
                    f.write(', '.join(map(str, failed_tests)))

            return copy.deepcopy(env.last_episode)

        print('Testing scenario "{}" for {} episodes.\n '.format(args.env, args.episodes))
        report_msg_header = '{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}'.format(
            'Episode', 'Timesteps', 'Cum. Reward', 'Progress', 'Collisions',
            'CT-Error [m]', 'H-Error [deg]')
        print(report_msg_header)
        print('-' * len(report_msg_header))

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))

        if args.scenario:
            if args.testvals:
                episode_dict = {}
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join(
                        (key + '-' + str(val) for key, val in valuedict.items()))
                    colorval = -np.log10(valuedict['reward_lambda'])  #should be general

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' + str(episode),
                                                report_dir=rep_subfolder)
                    episode_dict[valuedict_str] = [last_episode, colorval]
                print('Plotting all')
                gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder,
                                                  fig_prefix=(args.env + '_all_agents'),
                                                  episode_dict=episode_dict)
            else:
                run_test("ep0", reset=True, scenario=args.scenario)
        else:
            if args.testvals:
                episode_dict = {}
                agent_index = 1
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join(
                        (key + '-' + str(val) for key, val in valuedict.items()))
                    colorval = np.log10(valuedict['reward_lambda'])  #should be general

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' + str(episode),
                                                report_dir=rep_subfolder)
                    episode_dict['Agent ' + str(agent_index)] = [last_episode, colorval]
                    agent_index += 1
                gym_auv.reporting.plot_trajectory(env, fig_dir=figure_folder,
                                                  fig_prefix=(args.env + '_all_agents'),
                                                  episode_dict=episode_dict)
            else:
                env, active_env = create_test_env(video_name_prefix=args.env)
                for episode in range(args.episodes):
                    run_test('ep' + str(episode), env=env, active_env=active_env)

        if args.video and active_env:
            active_env.close()