def train_agent_with_a2c(load=False):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import SubprocVecEnv
    from stable_baselines import A2C

    # Multiprocess environment
    n_cpu = 4
    env = SubprocVecEnv([lambda: gym.make('F16GCAS-v0') for _ in range(n_cpu)])

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs, layers=[128, 128])

    if not load:
        model = A2C(env=env, verbose=1, policy=CustomPolicy)
        # model.learn(total_timesteps=1000000)
        # Pretrain from LQR expert trajectories instead of learning from scratch
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
    else:
        model = A2C.load(ROOT + "/trained_models/TDRL/f16/a2c/128_128", env=env)
        with model.graph.as_default():
            for var in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
                print(var)
    return model
def train(game, num_timesteps, num_envs, dir_name, model_name, prev_model_name):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    log_dir = f"logs/{dir_name}/{model_name}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    env = make_vec_envs(game, False, num_envs)
    prev_model_path = f"{model_dir}/{prev_model_name}.zip"
    if prev_model_name is not None and os.path.exists(prev_model_path):
        model = A2C.load(prev_model_path, env=env)
        model.tensorboard_log = log_dir
    else:
        model = A2C(policy="MlpPolicy", env=env, gamma=0.8, n_steps=64,
                    learning_rate=0.00025, verbose=1, tensorboard_log=log_dir)
    model.learn(num_timesteps)
    model.save(f"{model_dir}/{model_name}.zip")
    env.close()
def train_a2c(seed):
    """
    Test A2C on the uav_env (cartesian, discrete).

    :param seed: (int) random seed for A2C
    """
    # Signature for reference:
    # A2C(policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.01,
    #     max_grad_norm=0.5, learning_rate=0.0007, alpha=0.99, epsilon=1e-05,
    #     lr_schedule='linear', verbose=0, tensorboard_log=None,
    #     _init_setup_model=True)
    algo = 'A2C'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = A2C(policy=MlpPolicy, env=env, gamma=0.99, n_steps=5, vf_coef=0.25,
                ent_coef=0.01, max_grad_norm=0.5, learning_rate=0.0007,
                alpha=0.99, epsilon=1e-05, lr_schedule='linear', verbose=0,
                tensorboard_log="./logs/{}/tensorboard/{}/".format(
                    EXPERIMENT_NATURE, algo))

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = A2C.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)
    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo),
                exist_ok=True)
    os.rename('/tmp/gym/monitor.csv',
              "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
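# `train_a2c` above references a module-level `callback` and `log_dir` that are
# not shown in this excerpt. Below is a minimal sketch of the usual
# stable-baselines best-model callback, assuming a Monitor-wrapped env writing
# to `log_dir`; the 1000-step check interval is an assumption, not taken from
# the source.
from stable_baselines.results_plotter import load_results, ts2xy

def callback(_locals, _globals):
    """Save the model whenever the mean reward of the last 100 episodes improves."""
    global best_mean_reward, n_steps
    n_steps += 1
    if n_steps % 1000 == 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # `self` in _locals is the model being trained
                _locals['self'].save(log_dir + 'best_model.pkl')
    return True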
def _train(env_id, agent, model_params, total_steps, is_evaluation=False):
    if is_evaluation:
        # evaluate_policy() must only take one environment
        envs = SubprocVecEnv([make_env(env_id)])
    else:
        envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
    # Normalize the envs during training and evaluation
    envs = VecNormalize(envs)

    # Load a pretrained model during training.
    if not is_evaluation and os.path.exists(agent + '_' + env_id):
        if agent == 'ppo2':
            model = PPO2.load(agent + '_' + env_id)
        elif agent == 'a2c':
            model = A2C.load(agent + '_' + env_id)
        # A loaded model has no environment attached; set it before learning
        model.set_env(envs)
    else:
        if agent == 'ppo2':
            model = PPO2(MlpLstmPolicy, envs, nminibatches=1, verbose=1,
                         **model_params)
        elif agent == 'a2c':
            model = A2C(MlpLstmPolicy, envs, verbose=1, **model_params)
    model.learn(total_timesteps=total_steps)
    return envs, model
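# `make_env` used by _train above is not defined in this excerpt. The standard
# stable-baselines pattern returns a thunk so SubprocVecEnv can construct each
# environment in its own worker process. This is a sketch: the seeding scheme
# is an assumption, and other snippets in this file use a different signature.
def make_env(env_id, rank=0, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _init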
def attention_render(model_name, env_name, num_cpu, log_dir):
    if not os.path.exists(log_dir):
        raise FileNotFoundError('log_dir does not exist')
    env_id = env_name + 'NoFrameskip-v4'
    env = SubprocVecEnv([make_env(env_id, i, log_dir) for i in range(num_cpu)])
    # env = Monitor(env, log_dir, allow_early_resets=True)
    if model_name == 'A2C_Attention':
        model = A2C(AttentionPolicy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention2':
        model = A2C(Attention2Policy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C':
        model = A2C(LstmPolicy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    else:
        raise ValueError('Unknown model name: {}'.format(model_name))
    model = model.load(log_dir + model_name + '_' + env_name, env=env)

    obs = env.reset()
    # print(env.observation_space)
    # cv2.imshow('test', RGB2BGR(obs[0]))
    # cv2.waitKey(0)
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        attentions = model.get_attention(obs, _states, done)[0]
        attentions_img = []
        for i, attention in enumerate(attentions):
            # Upscale the 1/10-resolution attention map back to frame size
            attention = np.array(attention)
            attention = np.reshape(attention, [
                env.observation_space.shape[0] // 10,
                env.observation_space.shape[1] // 10, 1
            ])
            attention = np.repeat(attention, [10] * attention.shape[0], axis=0)
            attention = np.repeat(attention, [10] * attention.shape[1], axis=1)
            attention = attention * 255
            attentions_img.append(attention)
        attentions = tile_images(attentions_img)
        cv2.imshow('attention', attentions)
        cv2.waitKey(1)
        env.render()
    return model
def train_A2C(start_time_tests=[31*24*3600, 304*24*3600],
              episode_length_test=14*24*3600, load=False):
    '''Method to train (or load a pre-trained) A2C agent. Testing periods
    have to be introduced already here so that they are not used during
    training.

    Parameters
    ----------
    start_time_tests : list of integers
        Time in seconds from the beginning of the year that will be used
        for testing. These periods should be excluded from the training
        process. By default, the first day of February and the first day
        of November are used.
    episode_length_test : integer
        Number of seconds indicating the length of the testing periods. By
        default, two weeks are reserved for testing.
    load : boolean
        Boolean indicating whether the algorithm is loaded (True) or needs
        to be trained (False).

    '''
    excluding_periods = []
    for start_time_test in start_time_tests:
        excluding_periods.append((start_time_test,
                                  start_time_test + episode_length_test))
    # Summer period (from June 21st till September 22nd).
    # Excluded since there is no heating during this period (nothing to learn).
    excluding_periods.append((173*24*3600, 266*24*3600))

    env = BoptestGymEnvRewardWeightCost(url=url,
                                        actions=['oveHeaPumY_u'],
                                        observations={'reaTZon_y': (280., 310.)},
                                        random_start_time=True,
                                        excluding_periods=excluding_periods,
                                        max_episode_length=1*24*3600,
                                        warmup_period=3*3600,
                                        Ts=900)
    env = NormalizedObservationWrapper(env)
    env = NormalizedActionWrapper(env)

    model = A2C('MlpPolicy', env, verbose=1, gamma=0.99, seed=seed,
                tensorboard_log=os.path.join('results'))

    if not load:
        model.learn(total_timesteps=int(1e5))
        # Save the agent
        model.save(os.path.join(utilities.get_root_path(), 'examples',
                                'agents', 'a2c_bestest_hydronic_heatpump'))
    else:
        # Load the trained agent
        model = A2C.load(os.path.join(utilities.get_root_path(), 'examples',
                                      'agents', 'a2c_bestest_hydronic_heatpump'))

    return env, model, start_time_tests
def build_model(self):
    if self.is_stack:
        if self.game_type == "box":
            self.env = DummyVecEnv([lambda: self.env])
            self.model = A2C(MlpPolicy, self.env, verbose=0, gamma=self.gamma,
                             learning_rate=self.actor_lr, ent_coef=self.c2,
                             vf_coef=self.critic_lr)
        if self.game_type == "atari":
            self.model = A2C(CnnPolicy, self.env, verbose=0, gamma=self.gamma,
                             learning_rate=self.actor_lr, ent_coef=self.c2,
                             vf_coef=self.critic_lr)
    else:
        if self.game_type == "box":
            self.env = DummyVecEnv([lambda: self.env])
            self.model = A2C(MlpPolicy, self.env, verbose=0, gamma=self.gamma,
                             learning_rate=self.actor_lr, ent_coef=self.c2,
                             vf_coef=self.critic_lr)
        if self.game_type == "atari":
            self.model = A2C(CnnLstmPolicy, self.env, verbose=0,
                             gamma=self.gamma, learning_rate=self.actor_lr,
                             ent_coef=self.c2, vf_coef=self.critic_lr)
def run_baseline(params, LOAD_POLICY, VARIABLE_EVAL):
    # Evaluate the agent
    env = env_fun(animate=params["animate"],
                  max_steps=params["max_steps"],
                  action_input=False,
                  latent_input=False,
                  is_variable=VARIABLE_EVAL)
    policy = A2C('MlpPolicy', env)
    if LOAD_POLICY:
        policy_dir = "agents/xxx.zip"
        policy = A2C.load(policy_dir)
    regressor = PyTorchMlpCst(env.obs_dim + env.act_dim, 24, env.obs_dim)
    return evaluate_model(params, env, policy, regressor)
def get_model(model_name, env, log_dir):
    if model_name == "A2C_DualAttention":
        model = A2C(DualAttentionLstmPolicy, env, verbose=1)
    elif model_name == "A2C_SelfAttention_Cin":
        model = A2C(SelfAttentionCinLstmPolicy, env, verbose=1)
    elif model_name == "A2C_SelfAttention":
        model = A2C(SelfAttentionLstmPolicy, env, verbose=1)
    elif model_name == 'A2C_Attention':
        model = A2C(AttentionPolicy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention2':
        model = A2C(Attention2Policy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention3':
        model = A2C(Attention3Policy, env, verbose=1)
    elif model_name == 'A2C_Attention4':
        model = A2C(Attention4Policy, env, verbose=1)
    elif model_name == 'A2C':
        model = A2C(CnnLstmPolicy, env, verbose=1)
    else:
        raise ValueError('{} does not exist'.format(model_name))
    return model
def load_a2c_model(env, learning_rate, batch_size, algorithm):
    from stable_baselines.common.policies import MlpPolicy
    model = None
    existing_pickle_files = get_files_with_pattern(pickle_dir,
                                                   'ppo2_recent_model.pkl')
    for file_name in existing_pickle_files:
        search = re.search('ppo2_recent_model.pkl', file_name)
        if search:
            model = A2C.load(file_name, env=env, verbose=0,
                             tensorboard_log=log_dir)
            logger.info("Loading existing pickle file for environment {} with "
                        "algorithm {} and policy '{}'.".format(
                            env, algorithm, model.policy))
            return model
    logger.debug("No pickle was found for environment {}. Creating new model "
                 "with algorithm {} and policy 'MlpPolicy'...".format(
                     env, algorithm))
    model = A2C(policy='MlpPolicy', env=env, verbose=0, tensorboard_log=log_dir,
                learning_rate=learning_rate, n_steps=batch_size)
    return model
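# `get_files_with_pattern` is not defined in this excerpt; a plausible glob
# helper is sketched below. The name is inferred from the usage above and the
# exact matching behavior is an assumption.
import glob

def get_files_with_pattern(directory, pattern):
    """Return paths under `directory` whose file names contain `pattern`."""
    return glob.glob(os.path.join(directory, '*{}*'.format(pattern)))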
def train():
    """Trains an A2C policy."""
    env = create_env()
    model = A2C(
        policy=CnnPolicy,
        env=env,
        gamma=0.99,
        n_steps=5,
        vf_coef=0.25,
        ent_coef=0.01,
        max_grad_norm=0.5,
        learning_rate=7e-4,
        alpha=0.99,
        epsilon=1e-05,
        lr_schedule='constant',
        verbose=1,
        tensorboard_log="./tb"
    )
    model.learn(
        total_timesteps=int(1e7),
        callback=callback,
        tb_log_name="a2c"
    )
    model.save("models/pacman_a2c.pkl")
def load_model(tickers):
    '''Load the pretrained model from the trained models folder.'''
    # model = run_model(tickers, start="2020-01-01T09:30:00-04:00",
    #                   end="2020-12-31T09:30:00-04:00")
    model = A2C.load(
        "trained_models/2021-03-22 18:25:09.528982/A2C_30k_dow_120.zip")
    return model
def run_agent(envs, parameters):
    '''Train an agent.'''
    alg = parameters['alg']
    learning_rate = parameters['learning_rate']
    gamma = parameters['gamma']
    model_path = parameters['model_path']
    set_global_seeds(parameters.get('seed'))
    dummy_env = OptVecEnv(envs)
    if alg == 'PPO':
        model = PPO2(MlpPolicy, dummy_env, gamma=gamma,
                     learning_rate=learning_rate, verbose=1,
                     nminibatches=dummy_env.num_envs)
    elif alg == 'A2C':
        model = A2C(MlpPolicy, dummy_env, gamma=gamma,
                    learning_rate=learning_rate, verbose=1)
    else:
        model = DDPG(ddpg.MlpPolicy, dummy_env, gamma=gamma, verbose=1,
                     actor_lr=learning_rate / 10, critic_lr=learning_rate)
    try:
        model.learn(total_timesteps=parameters.get('total_timesteps', 10**6))
    except tf.errors.InvalidArgumentError:
        LOGGER.error('Possible NaN, %s', str((alg, learning_rate, gamma)))
    finally:
        dummy_env.close()
        model.save(str(model_path))
def define_model(env, log_dir):
    if DEFAULT:
        policy_kwargs = dict()
    else:
        policy_kwargs = dict(act_fun=ACT_FUN, net_arch=NET_ARCH)

    if ALGORITHM == 'ppo2':
        model = PPO2(policy=MlpPolicy, env=env, policy_kwargs=policy_kwargs,
                     verbose=0, tensorboard_log=log_dir)
    elif ALGORITHM == 'a2c':
        model = A2C(policy=MlpPolicy, env=env, policy_kwargs=policy_kwargs,
                    verbose=0, tensorboard_log=log_dir)
    else:
        raise Exception('Specify proper algorithm')

    model_arch = model.get_parameter_list()
    print('\n--------------- Summary of archs ---------------')
    for model_param in model_arch:
        print(model_param)
    print('\n')
    return model
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or
        'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

    model = A2C(policy_fn, env, lr_schedule=lr_schedule, seed=seed)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
def main():
    alg_input = input("Select algorithm (PPO2 or A2C only): ")
    # Keep asking until a supported algorithm is entered
    while alg_input.lower() not in ("ppo2", "a2c"):
        print("Not an option (PPO2 or A2C only)!")
        alg_input = input("Select algorithm (PPO2 or A2C only): ")
    model_input = "trained_agents\\" + input(
        "Select model to test (input filename, e.g. a2c_wf_2): ")

    env = gym.make("WARFLEET-v0")
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: env])
    log_dir = "./logs/"
    done = False
    stage_reward = 0
    turns = 0

    if alg_input.lower() == "ppo2":
        model = PPO2.load(model_input, env=env, tensorboard_log=log_dir)
    elif alg_input.lower() == "a2c":
        model = A2C.load(model_input, env=env, tensorboard_log=log_dir)

    obs = env.reset()
    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        stage_reward += reward
        turns = turns + 1
        # env.render()
    print("Reward: {} /42".format(stage_reward))
    print("Turns: {}".format(turns))
    env.close()
def test_a2c_update_n_batch_on_load(tmp_path):
    env = make_vec_env("CartPole-v1", n_envs=2)
    model = A2C("MlpPolicy", env, n_steps=10)
    model.learn(total_timesteps=100)
    model.save(os.path.join(str(tmp_path), "a2c_cartpole.zip"))
    del model

    model = A2C.load(os.path.join(str(tmp_path), "a2c_cartpole.zip"))
    test_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    model.set_env(test_env)
    # n_batch = n_envs * n_steps, so it must shrink to 1 * 10 after set_env
    assert model.n_batch == 10
    model.learn(100)
    os.remove(os.path.join(str(tmp_path), "a2c_cartpole.zip"))
def run_agent(envs, parameters):
    '''Run a trained agent and yield its portfolio weights.'''
    path = Path(parameters['path'])
    dummy_env = OptVecEnv(envs)
    set_global_seeds(parameters.setdefault('seed'))
    save_path = str(path / 'model.pkl')
    alg = parameters['alg']
    if alg == 'PPO':
        with open(save_path, 'rb') as pkl:
            model = PPO2.load(pkl, env=dummy_env)
    elif alg == 'A2C':
        with open(save_path, 'rb') as pkl:
            model = A2C.load(pkl, env=dummy_env)
    try:
        done = False
        observations = dummy_env.reset()
        while not done:
            action = model.predict(observations)
            print(action[0].ravel().tolist())
            observations, rewards, dones, infos = dummy_env.step(action[0])
            done = any(dones)
            info = infos[0]
            yield info['weights']
    finally:
        dummy_env.close()
def evaluate(modelname, env):
    n_envs = 4
    obs = env.reset()
    model = A2C.load(modelname)
    wr = 0
    win = 0
    total_health_diff = 0
    loss = 0
    episodes = 0
    total_episodes = 100
    while episodes < total_episodes:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        time.sleep(.04)
        env.render(mode="human")
        for i in range(n_envs):
            if dones[i]:
                if info[i]["p1_health"] < info[i]["p2_health"]:
                    loss += 1
                else:
                    win += 1
                total_health_diff += info[i]["p1_health"] - info[i]["p2_health"]
                wr = win / (win + loss)
                episodes += 1
    return wr, total_health_diff / total_episodes
def model_training_learning(env_train, model_name, timesteps=100000):
    # Train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train", model_name, "model with MlpPolicy:")
    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)
    print("Learning", model_name, "time steps:", timesteps)
    model.learn(total_timesteps=timesteps)
    print(model_name, "learning completed")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = model_name + timestamp
    model.save(model_file_name)
    print("-", model_name, "saved")
    print("Training time", model_name, ":", (end - start) / 60, "minutes")
    os.chdir("./..")
    os.chdir("./..")
    return model
def get_a2c(vec_env=None, policy='CnnPolicy', learning_rate=7e-4,
            momentum=0.0, alpha=0.99, epsilon=1e-5, max_grad_norm=0.5,
            lr_schedule='constant') -> A2C:
    """Parameter default values are taken from stable_baselines.a2c.a2c."""
    if vec_env is None:
        vec_env = create_training_env(1)
    return A2C(policy=policy,
               env=vec_env,
               gamma=0.99,
               n_steps=5,
               vf_coef=0.25,
               ent_coef=0.01,
               max_grad_norm=max_grad_norm,
               learning_rate=learning_rate,
               alpha=alpha,
               momentum=momentum,
               epsilon=epsilon,
               lr_schedule=lr_schedule,
               verbose=2)
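# A brief usage sketch for get_a2c: train for a short budget and save. The
# timestep count and output path are illustrative assumptions, not values
# taken from the source.
def demo_get_a2c():
    model = get_a2c(learning_rate=2.5e-4, lr_schedule='linear')
    model.learn(total_timesteps=100000)
    model.save("a2c_demo")  # hypothetical output path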
def test_evaluate_policy():
    model = A2C('MlpPolicy', 'Pendulum-v0', seed=0)
    n_steps_per_episode, n_eval_episodes = 200, 2
    model.n_callback_calls = 0

    def dummy_callback(locals_, _globals):
        locals_['model'].n_callback_calls += 1

    _, episode_lengths = evaluate_policy(model, model.get_env(),
                                         n_eval_episodes, deterministic=True,
                                         render=False, callback=dummy_callback,
                                         reward_threshold=None,
                                         return_episode_rewards=True)
    n_steps = sum(episode_lengths)
    assert n_steps == n_steps_per_episode * n_eval_episodes
    assert n_steps == model.n_callback_calls

    # Reaching a mean reward of zero is impossible with the Pendulum env
    with pytest.raises(AssertionError):
        evaluate_policy(model, model.get_env(), n_eval_episodes,
                        reward_threshold=0.0)

    episode_rewards, _ = evaluate_policy(model, model.get_env(),
                                         n_eval_episodes,
                                         return_episode_rewards=True)
    assert len(episode_rewards) == n_eval_episodes
def train_agent(train, pickle_file, agent_type, env_kwargs, parms):
    bin_path = "bin/" + pickle_file
    if path.exists(bin_path):
        if agent_type == "a2c":
            print("Loading A2C Agent")
            RL_model = A2C.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ddpg":
            print("Loading DDPG Agent")
            RL_model = DDPG.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ppo":
            print("Loading PPO2 Agent")
            RL_model = PPO2.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
    else:
        e_train_gym = ipenv.PortfolioAllocEnv(df=train, **env_kwargs)
        env_train, _ = e_train_gym.get_sb_env()
        agent = ipagent.IPRLAgent(env=env_train)
        model = agent.get_model(model_name=agent_type, model_kwargs=parms)
        RL_model = agent.train_model(model=model,
                                     tb_log_name=agent_type,
                                     total_timesteps=1000000)
        RL_model.save(bin_path)
    return RL_model
def __init__(self, method, K=5, P=0.95):
    self.method = method
    self.K = K
    self.state_size = self.K + 1
    self.action_size = self.K + 1
    self.reward = []
    env_name = 'ErdosAttack-v0'
    self.log_dir = "/tmp/gym_attack/"
    os.makedirs(self.log_dir, exist_ok=True)

    env = gym.make(env_name)
    env.init_params(K, P)
    env = Monitor(env, self.log_dir, allow_early_resets=True)
    self.envs = DummyVecEnv([lambda: env])

    if method == 'PPO':
        self.model = PPO2(MLP_PPO, self.envs, verbose=0)
    elif method == 'DQN':
        self.model = DQN(MLP_DQN, self.envs, verbose=0)
    elif method == 'A2C':
        self.model = A2C(MLP_A2C, self.envs, verbose=0)
    else:
        raise ValueError("Invalid method: expected 'PPO', 'DQN' or 'A2C'")
    print("Model initialized!")

    self.best_mean_reward, self.n_steps = -np.inf, 0
def test_save_callback(self):
    '''Test that the model performance can be monitored and results can be
    checked and saved as the model improves. This test trains an agent for
    a short period of time, without loading a pre-trained model. Therefore,
    this test also checks that an RL agent from stable-baselines can be
    trained.

    '''
    # Define logging directory. Monitoring data and agent model will be stored here
    log_dir = os.path.join(utilities.get_root_path(), 'examples',
                           'agents', 'monitored_A2C')
    # Perform a short training example with callback
    env, _, _ = run_save_callback.train_A2C_with_callback(
        log_dir=log_dir, tensorboard_log=None)
    # Load the trained agent
    model = A2C.load(os.path.join(log_dir, 'best_model'))
    # Test one step with the trained model
    obs = env.reset()
    df = pd.DataFrame([model.predict(obs)[0][0]], columns=['value'])
    df.index.name = 'keys'
    ref_filepath = os.path.join(utilities.get_root_path(), 'testing',
                                'references', 'save_callback.csv')
    self.compare_ref_values_df(df, ref_filepath)
    # Remove the stored model so subsequent test runs start clean
    shutil.rmtree(log_dir, ignore_errors=True)
def load(config, agent, epoch, from_disk=True):
    config = config['ai']
    if not config['enabled']:
        logging.info("ai disabled")
        return False

    logging.info("[ai] bootstrapping dependencies ...")

    from stable_baselines import A2C
    from stable_baselines.common.policies import MlpLstmPolicy
    from stable_baselines.common.vec_env import DummyVecEnv

    import pwnagotchi.ai.gym as wrappers

    env = wrappers.Environment(agent, epoch)
    env = DummyVecEnv([lambda: env])

    logging.info("[ai] bootstrapping model ...")

    a2c = A2C(MlpLstmPolicy, env, **config['params'])
    if from_disk and os.path.exists(config['path']):
        logging.info("[ai] loading %s ..." % config['path'])
        # A2C.load is a classmethod returning a new model; keep the result
        a2c = A2C.load(config['path'], env)
    else:
        logging.info("[ai] model created:")
        for key, value in config['params'].items():
            logging.info("  %s: %s" % (key, value))

    return a2c
def load_a2c():
    loaded_model = A2C.load(save_dir + "/A2C_tutorial")
    print("loaded", loaded_model.predict(obs, deterministic=True))
    print("load gamma=", loaded_model.gamma, ", n_steps=", loaded_model.n_steps)
    # Saving stores the model's hyperparameters and network weights, but not
    # the environment; after loading, the env must be set again.
    loaded_model.set_env(DummyVecEnv([lambda: gym.make("Pendulum-v0")]))
    loaded_model.learn(8000)
def test_monitor():
    env = Monitor(gym.make('Pendulum-v0'), filename=None,
                  allow_early_resets=True)
    normalized_env = NormalizeActionWrapper(env)
    normalized_env = DummyVecEnv([lambda: normalized_env])
    # Model
    model_2 = A2C('MlpPolicy', normalized_env, verbose=1).learn(1000)
def NewPotential(current_window, algorithm='PPO'):
    # Determine the pretrained agent
    if algorithm == 'A2C':
        model = A2C.load("pretrained_A2C")
    elif algorithm == 'PPO':
        model = PPO2.load("pretrained_PPO")
    elif algorithm == 'ACKTR':
        model = ACKTR.load("pretrained_ACKTR")
    elif algorithm == 'ACER':
        model = ACER.load("pretrained_ACER")
    else:
        raise ValueError("%s is not a valid algorithm." % algorithm)

    if len(current_window) != model.observation_space.shape[0]:
        raise ValueError("Window size %s does not match the model's "
                         "observation space." % len(current_window))

    action, _states = model.predict(current_window, deterministic=False)
    # Map the discrete action index to an evenly spaced voltage in [0, 1]
    voltages = np.linspace(0, 1, num=model.action_space.n)
    if 0 <= action <= model.action_space.n - 1:
        voltage = voltages[action]
    else:
        raise ValueError(
            "Received invalid action={} which is not part of the action space"
            .format(action))
    return voltage
def a2c(env, seed):
    return A2C('MlpPolicy', env, learning_rate=0.001, verbose=1,
               tensorboard_log="./data/runs", seed=seed)
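# A brief usage sketch for the a2c() factory above; the CartPole environment
# and timestep budget are illustrative assumptions.
def demo_a2c():
    env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
    model = a2c(env, seed=0)
    model.learn(total_timesteps=50000)
    return model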