def _init_environment(self, datapath, window_size):
    df = pd.read_csv(datapath)
    bid_price_columns = [i for i in range(1, len(df.columns), 20)]
    print(bid_price_columns)
    ask_price_columns = [i for i in range(3, len(df.columns), 20)]
    bidPrices = df[df.columns[bid_price_columns]]
    askPrices = df[df.columns[ask_price_columns]]  # ask side (the original reused the bid columns here)
    df_concat = pd.concat([bidPrices, askPrices])
    midPrices = df_concat.groupby(df_concat.index).mean().transpose().values[-len(self.securities):]
    print(midPrices[:, 0])

    self.env = DummyVecEnv([lambda: securities_trading_env(np.array(midPrices).T)])
    self.env = VecCheckNan(self.env, raise_exception=True)

    n_actions = self.env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.5 * np.ones(n_actions))
    print(n_actions)

    if self.policy == "DDPG":
        self.model = DDPG(ddpgMlpPolicy, self.env, verbose=int(self.verbose),
                          param_noise=param_noise, action_noise=action_noise)
    elif self.policy == "TD3":
        self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
    elif self.policy == "GAIL":
        # NOTE: this branch falls back to TD3; no GAIL model is instantiated
        self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
    else:
        self.model = PPO2(MlpLnLstmPolicy, self.env, verbose=int(self.verbose))

    if self.load:  # load model (modelpath is expected from the enclosing scope)
        self.model = self.model.load("save/" + modelpath + ".h5")

    # init model class
    self.gym_model = Agent(market_event_securities, market_event_queue, securities,
                           queue, host, policy, strategy, cash_balance,
                           self.model, self.env, window_size, self.inventory)
def td3(env_id, timesteps, policy="MlpPolicy", log_interval=None,
        tensorboard_log=None, seed=None, load_weights=None):
    from stable_baselines.ddpg.noise import NormalActionNoise

    env = gym.make(env_id)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    if load_weights is not None:
        model = TD3.load(load_weights, env, verbose=0)
    else:
        model = TD3(policy, env, action_noise=action_noise, verbose=1,
                    tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="td3", env_name=env_id)
    model.learn(total_timesteps=timesteps, log_interval=log_interval, callback=callback)
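# A minimal invocation sketch for the helper above. "Pendulum-v0" is an assumed
# example environment, and the W&B callback inside td3() requires a configured
# wandb run; log_interval=10 is an arbitrary choice.
if __name__ == "__main__":
    td3("Pendulum-v0", timesteps=50000, log_interval=10)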
def get_TD3_model(model_settings, model_path, ckpt_path, ckpt_step, tb_path):
    policy_kwargs = dict(layers=model_settings['NET_LAYERS'])
    env = get_single_process_env(model_settings, model_path, ckpt_step)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    if ckpt_path is not None:
        print("Loading model from checkpoint '{}'".format(ckpt_path))
        model = TD3.load(ckpt_path, env=env, _init_setup_model=True,
                         policy_kwargs=policy_kwargs,
                         **model_settings['train_configs'],
                         action_noise=action_noise, verbose=1,
                         tensorboard_log=tb_path)
        model.num_timesteps = ckpt_step
    else:
        model = TD3(TD3MlpPolicy, env, _init_setup_model=True,
                    policy_kwargs=policy_kwargs,
                    action_noise=action_noise,
                    **model_settings['train_configs'],
                    verbose=1, tensorboard_log=tb_path)
    return model, env
def train_TD3(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir, log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    policy = kwargs['policy']
    n_timesteps = kwargs['n_timesteps']
    noise_type = kwargs['noise_type']
    del kwargs['policy']
    del kwargs['n_timesteps']
    del kwargs['noise_type']

    # Parameter space noise: injects randomness directly into the parameters of
    # the agent, altering the types of decisions it makes such that they always
    # fully depend on what the agent currently senses.

    # the noise objects for TD3
    nb_actions = env.action_space.shape[-1]
    action_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    if kwargs.get('continue', False) is True:
        # Continue training; the saved model's policy should not be changed,
        # so no policy is passed to load()
        print("Loading pretrained agent")
        del kwargs['continue']
        model = TD3.load(os.path.join(out_dir, 'final_model.pkl'), env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1, **kwargs)
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        model = TD3(policy, env, action_noise=action_noise, seed=seed, verbose=1,
                    tensorboard_log=os.path.join(log_dir, 'tb'),
                    full_tensorboard_log=False, **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)
    return model
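# Usage sketch for train_TD3 above: noise_type is a comma-separated string of
# "<kind>_<stddev>" tokens, e.g. "normal_0.1" or "ou_0.2". The environment and
# output directory here are assumed placeholders.
model = train_TD3(gym.make("Pendulum-v0"), "output/td3_run",
                  policy="MlpPolicy", n_timesteps=100000, noise_type="normal_0.1")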
def model_training_learning(env_train, model_name, timesteps=100000):
    # train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")

    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)
    else:
        raise ValueError("Unknown model name: " + model_name)

    print("Learning ", model_name, " time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print(model_name, " learning completed")

    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = model_name + timestamp
    model.save(model_file_name)
    print("- ", model_name, " save finished")
    print("Training time ", model_name, " : ", (end - start) / 60, " minutes")

    os.chdir("./..")
    os.chdir("./..")
    return model
def __call__(self):
    policy_kwargs = dict(layers=[400, 300, 200, 100])
    n_actions = self.env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    # check_env(self.env)
    model = TD3(MlpPolicy, self.env,
                policy_kwargs=policy_kwargs,
                action_noise=action_noise,
                buffer_size=50000,  # TD3 takes buffer_size; memory_limit is a DDPG-only kwarg
                tensorboard_log="/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log",
                verbose=1)

    time_steps = 3e4
    model.learn(total_timesteps=int(time_steps), log_interval=50,
                tb_log_name="td3_Docker_" + self.expt_name)
    model.save("/home/dfki.uni-bremen.de/mpatil/Documents/td3_stable_baselines_" + self.expt_name)

    print("Closing environment")
    self.env.close()
def load_model(path: str, env, desc: str):
    """
    Loads a model from a Stable Baselines checkpoint file into memory

    Args:
        path (str)   : Path to the Stable Baselines checkpoint file
        env (SB Env) : Environment to attach the loaded model to
        desc (str)   : Text description of which algorithm this is

    Returns:
        The loaded model
    """
    if desc == "ddpg":
        return DDPG.load(path, env)
    elif desc == "ppo":
        env = DummyVecEnv([lambda: env])
        return PPO2.load(path, env)
    elif desc == "trpo":
        env = DummyVecEnv([lambda: env])
        return TRPO.load(path, env)
    elif desc == "td3":
        return TD3.load(path, env)
    elif desc == "sac":
        return SAC.load(path, env)
    else:
        raise RuntimeError(f"Model Name {desc} not supported")
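# Usage sketch for load_model above: attach a saved TD3 checkpoint to a fresh
# environment. The checkpoint path and "Pendulum-v0" are assumed placeholders.
env = gym.make("Pendulum-v0")
model = load_model("checkpoints/td3_pendulum.zip", env, "td3")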
def run_stable(num_steps, save_dir):
    # NOTE: num_steps is unused; total_steps below is hardcoded
    env = make_vec_env(BBall3Env, n_envs=1, monitor_dir=save_dir, env_kwargs=env_config)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.5 * np.ones(n_actions))

    model = TD3(
        MlpPolicy,
        env,
        action_noise=action_noise,
        verbose=1,
        gamma=0.99,
        buffer_size=1000000,
        learning_starts=10000,
        batch_size=100,
        learning_rate=1e-3,
        train_freq=1000,
        gradient_steps=1000,
        policy_kwargs={"layers": [64, 64]},
        n_cpu_tf_sess=1,
    )

    num_epochs = 1
    total_steps = 5e5
    for epoch in range(num_epochs):
        model.learn(total_timesteps=int(total_steps / num_epochs))
        model.save(save_dir + "/model.zip")
def deploy_trained_model(self):
    # Load neural network policy
    from stable_baselines3 import TD3

    src_file = os.path.split(
        os.path.split(
            os.path.join(os.path.dirname(os.path.realpath(__file__))))[0])[0]
    try:
        model = TD3.load(os.path.join(src_file, "algos/SB/agents/QUAD_TD3_OPTUNA_policy"))
    except Exception:
        model = None
        print("Failed to load nn.")

    obs = self.reset()
    while True:
        velocity_target = self.get_input_target()
        if self.config["controller_source"] == "nn":
            if model is None:
                act = np.random.rand(self.act_dim) * 2 - 1  # random fallback action in [-1, 1]
            else:
                act, _states = model.predict(obs, deterministic=True)
        else:
            act = self.calculate_stabilization_action(obs[3:7], obs[10:13], velocity_target)
        obs, r, done, _ = self.step(act)
        if done:
            obs = self.reset()
def train_TD3(env_train, model_name, model=None, timesteps=30000, save_path=None):
    """TD3 model"""
    # add the noise objects for TD3
    n_actions = env_train.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    start = time.time()
    if model is None:
        model = TD3('MlpPolicy', env_train, action_noise=action_noise)
    else:
        model.set_env(env_train)
        model.verbose = config.VERBOSE
    model.learn(total_timesteps=timesteps)
    end = time.time()

    if save_path is None:
        save_path = f"{config.TRAINED_MODEL_DIR}/{model_name}"
    model.save(save_path)
    print('Training time (TD3): ', (end - start) / 60, ' minutes')
    return model
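# Usage sketch for train_TD3 above: train from scratch, then keep training the
# returned model on another environment. env_train and env_retrain are assumed,
# pre-built trading environments.
model = train_TD3(env_train, "td3_initial", timesteps=30000)
model = train_TD3(env_retrain, "td3_finetuned", model=model, timesteps=10000)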
def train_TD3(self, model_name, model_params=config.TD3_PARAMS):
    """TD3 model"""
    from stable_baselines import TD3
    from stable_baselines.common.noise import NormalActionNoise

    env_train = self.env
    n_actions = env_train.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    start = time.time()
    model = TD3('MlpPolicy', env_train,
                batch_size=model_params['batch_size'],
                buffer_size=model_params['buffer_size'],
                learning_rate=model_params['learning_rate'],
                action_noise=action_noise,
                verbose=model_params['verbose'])
    model.learn(total_timesteps=model_params['timesteps'])
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (TD3): ', (end - start) / 60, ' minutes')
    return model
def test_TD3(env, out_dir, seed=None, **kwargs):
    model = TD3.load(os.path.join(out_dir, 'final_model.pkl'), env=env)
    # model.learn(total_timesteps=10000)

    # Evaluate the trained agent
    mean_reward = evaluate(env, model, num_steps=5000)
    return mean_reward
def test_td3():
    # assumed definition: the snippet used action_noise without defining it;
    # Pendulum-v0 has a single action dimension
    action_noise = NormalActionNoise(mean=np.zeros(1), sigma=0.1 * np.ones(1))
    model = TD3('MlpPolicy', 'Pendulum-v0',
                policy_kwargs=dict(net_arch=[64, 64]),
                seed=0, learning_starts=100, verbose=1,
                create_eval_env=True, action_noise=action_noise)
    model.learn(total_timesteps=10000, eval_freq=5000)
def train_TD3(env_train, model_name, timesteps=50000):
    """TD3 model"""
    start = time.time()
    model = TD3('MlpPolicy', env_train)
    model.learn(total_timesteps=timesteps, log_interval=10)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (TD3): ', (end - start) / 60, ' minutes')
    return model
def read_model(model_type):
    if model_type == "A2C":
        model = A2C.load(
            "./model_saved/Selected/A2C_ModelMar-05-2021_0815/A2C_ModelMar-05-2021_0815")
    elif model_type == "TD3":
        model = TD3.load(
            "./model_saved/Selected/TD3_ModelMar-05-2021_1442/TD3_ModelMar-05-2021_1442")
    else:
        raise ValueError(f"Unknown model type: {model_type}")
    return model
def load_model(config):
    model = None
    # NOTE: args is read from the enclosing scope, not from config
    if config["algo_name"] == "TD3":
        model = TD3.load("agents/{}".format(args["test_agent_path"]))
    elif config["algo_name"] == "A2C":
        model = A2C.load("agents/{}".format(args["test_agent_path"]))
    elif config["algo_name"] == "SAC":
        model = SAC.load("agents/{}".format(args["test_agent_path"]))
    elif config["algo_name"] == "PPO2":
        model = PPO2.load("agents/{}".format(args["test_agent_path"]))
    assert model is not None, "Algo name not found, cannot load model, exiting."
    return model
def f_checkpoints_range_2_mean_performance(
        self, checkpoints: range) -> Tuple[np.ndarray, np.ndarray]:
    logging.debug(
        f"[f_checkpoints_range_2_mean_performance]: checkpoints={checkpoints}")
    rewards = np.zeros(len(checkpoints))
    s_rates = np.zeros(len(checkpoints))

    # Intent
    # - Iterate over this range, loading the associated Stable Baselines model checkpoint
    # - Pass that model to the `mean_eval` evaluation function, which evaluates the model on
    #   - a certain number of episodes
    #   - a certain env
    #   - a continuous or non-continuous space
    #   and returns a reward and an average success rate
    #
    # Evaluating N checkpoints on M queries and then averaging over M finally
    # yields N rewards and N success rates.

    # NOTE: i can range in any way while j iterates over the numpy arrays
    j = 0
    for i in checkpoints:
        path = f"{self.args.training_base_path}/models/quadcopter-{i}{self.args.suffix}"
        logging.debug(f"Evaluating model at {path}")
        if self.args.model['name'] == "ddpg":
            model = DDPG.load(path)
        elif self.args.model['name'] == "ppo":
            model = PPO2.load(path)
        elif self.args.model['name'] == "trpo":
            model = TRPO.load(path)
        elif self.args.model['name'] == "td3":
            model = TD3.load(path)
        elif self.args.model['name'] == "sac":
            model = SAC.load(path)

        logging.debug(
            f"Evaluating Model {self.args.model['name']} for {self.args.n_episodes} episodes "
            f"in {self.args.env} environment with continuous={str(self.args.continuous)}")
        rewards_list, success_rates_list = mean_eval(
            num_episodes=self.args.n_episodes,
            checkpoint_id=i,
            model=model,
            env=self.env,
            v=True,
            continuous=self.args.continuous,
            plots_dir=self.args.plots_dir)
        rewards_mean = np.mean(rewards_list)
        success_rates_mean = np.mean(success_rates_list)
        logging.debug(
            f"Evaluation Checkpoint={i} --> Average Reward = {rewards_mean}, "
            f"Average Success Rate = {success_rates_mean}")
        rewards[j] = rewards_mean
        s_rates[j] = success_rates_mean
        j += 1
    return rewards, s_rates
def td3(env, seed):
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.1 * np.ones(n_actions))
    return TD3('MlpPolicy', env,
               learning_rate=0.001,
               action_noise=action_noise,
               verbose=1,
               tensorboard_log="./data/runs",
               seed=seed)
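# Usage sketch for the factory above; "Pendulum-v0" is an assumed example env
# and 50000 timesteps is an arbitrary budget.
model = td3(gym.make("Pendulum-v0"), seed=0)
model.learn(total_timesteps=50000)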
def explore(app, emulator, appium, timesteps, timer, save_policy, policy_dir,
            cycle, train_freq=10, random_exploration=0.8):
    try:
        env = TimeFeatureWrapper(app)
        model = TD3(MlpPolicy, env, verbose=1, train_freq=train_freq,
                    random_exploration=random_exploration)
        callback = TimerCallback(timer=timer)
        model.learn(total_timesteps=timesteps, callback=callback)
        if save_policy:
            model.save(f'{policy_dir}{os.sep}{cycle}')
        return True
    except Exception:
        appium.restart_appium()
        if emulator is not None:
            emulator.restart_emulator()
        return False
def __call__(self, trial):
    # Calculate an objective value by using the extra arguments.
    env_id = 'gym_custom:fooCont-v0'
    env = gym.make(env_id, data=self.train_data)
    env = DummyVecEnv([lambda: env])

    algo = trial.suggest_categorical('algo', ['TD3'])
    model = None
    if algo == 'PPO2':
        policy_choice = trial.suggest_categorical('policy', [False, True])
        policy = commonMlp if policy_choice else commonMlpLstm
        model_params = optimize_ppo2(trial)
        model = PPO2(policy, env, verbose=0, nminibatches=1, **model_params)
        model.learn(276 * 7000)
    elif algo == 'DDPG':
        policy_choice = trial.suggest_categorical('policy', [False, True])
        policy = ddpgLnMlp
        model_params = sample_ddpg_params(trial)
        model = DDPG(policy, env, verbose=0, **model_params)
        model.learn(276 * 7000)
    elif algo == 'TD3':
        policy_choice = trial.suggest_categorical('policy', [False, True])
        policy = td3MLP if policy_choice else td3LnMlp
        model_params = sample_td3_params(trial)
        model = TD3(policy, env, verbose=0, **model_params)
        model.learn(276 * 7000 * 3)

    # evaluate on the held-out data
    rewards = []
    reward_sum = 0.0
    env = gym.make(env_id, data=self.test_data)
    env = DummyVecEnv([lambda: env])
    obs = env.reset()
    for ep in range(1000):
        for step in range(276):
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            reward_sum += reward
            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                obs = env.reset()
    # assumed completion: the snippet ended without a return, but an Optuna
    # objective must return a scalar, so return the mean episodic reward
    return float(np.mean(rewards))
def do_rollout_stable(init_point=None):
    env = gym.make(env_name, **config)
    td3_model = TD3.load(
        script_path +
        "../rl-baselines-zoo/baseline_log2/td3/su_acrobot_cdc-v0_2/su_acrobot_cdc-v0.zip")
    if init_point is not None:
        obs = env.reset(init_point)
    else:
        obs = env.reset()
    obs = torch.as_tensor(obs, dtype=torch.float32)

    acts_list = []
    obs1_list = []
    rews_list = []
    dtype = torch.float32
    act_size = env.action_space.shape[0]
    obs_size = env.observation_space.shape[0]

    done = False
    cur_step = 0
    while not done:
        acts = td3_model.predict(obs.reshape(-1, obs_size))[0]
        for _ in range(20):  # hold each action for 20 sim steps
            obs, rew, done, out = env.step(acts)
            # env.render()
        obs1_list.append(obs)
        obs = torch.as_tensor(obs, dtype=dtype)
        acts_list.append(torch.as_tensor(acts))
        rews_list.append(torch.as_tensor(rew, dtype=dtype))
        cur_step += 1

    ep_obs1 = torch.tensor(obs1_list).reshape(-1, 4)
    ep_acts = torch.stack(acts_list).reshape(-1, act_size)
    ep_rews = torch.stack(rews_list).reshape(-1, 1)
    return ep_obs1, ep_acts, ep_rews, None, ep_obs1
def test_deterministic_td3():
    results = [[], []]
    rewards = [[], []]
    kwargs = {'n_cpu_tf_sess': 1}
    env_id = 'Pendulum-v0'
    kwargs.update({'action_noise': NormalActionNoise(0.0, 0.1)})

    for i in range(2):
        model = TD3('MlpPolicy', env_id, seed=SEED, **kwargs)
        model.learn(N_STEPS_TRAINING)
        env = model.get_env()
        obs = env.reset()
        for _ in range(20):
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, _, _ = env.step(action)
            results[i].append(action)
            rewards[i].append(reward)

    # without the extended tolerance, the test fails for unknown reasons on GitHub...
    assert np.allclose(results[0], results[1], rtol=1e-2), results
    assert np.allclose(rewards[0], rewards[1], rtol=1e-2), rewards
def optimize_agent(trial):
    """ Train the model and optimise.
        Optuna minimises the objective by default,
        so we negate the reward here.
    """
    model_params = optimize_TD3(trial)
    env = SubprocVecEnv([
        lambda: NormalizeActionWrapper(LearningRocket(visualize=False))
        for i in range(n_cpu)
    ])
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    model = TD3(MlpPolicy, env,
                action_noise=action_noise,
                policy_kwargs=dict(layers=[400, 300]),
                **model_params)  # sampled hyperparameters from optimize_TD3
    model.learn(50000)

    rewards = []
    n_episodes, reward_sum = 0, 0.0
    obs = env.reset()
    step = 0
    while n_episodes < 4:
        step += 1
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(-1 * last_reward, step)
    return -1 * last_reward
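# Usage sketch: run the objective above in an Optuna study. direction="minimize"
# matches the negated reward returned by optimize_agent; n_trials=20 is an
# arbitrary choice.
import optuna

study = optuna.create_study(direction="minimize")
study.optimize(optimize_agent, n_trials=20)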
def train_TD3(env_train, model_name, timesteps=100000):
    # train TD3 model
    os.chdir("./model_saved/")
    start = time.time()
    print("Train TD3 Model with MlpPolicy: ")
    model = TD3('MlpPolicy', env_train, verbose=0)

    print("TD3 Learning time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print("TD3 Model learning completed")

    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = model_name + timestamp
    model.save(model_file_name)
    print("TD3 Model save finished")
    print('Training time TD3: ', (end - start) / 60, ' minutes')
    os.chdir("./..")
    return model
def td3(env_id, timesteps, policy="MlpPolicy", log_interval=None,
        tensorboard_log=None, seed=None):
    from stable_baselines.ddpg.noise import NormalActionNoise

    env = gym.make(env_id)
    # The noise objects for TD3
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = TD3(policy, env, action_noise=action_noise, verbose=1,
                tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)
    save_model_weights(model, "td3", env_id, policy, seed)
arg.LR_STOP = 0.1
arg.lr_gamma = 0.95
arg.PI_STD = 1
arg.goal_radius_range = [0.15, 0.3]
arg.std_range = [0.02, 0.3, 0.02, 0.3]
# terminal velocity: norm(action) below this is treated as a signal to stop
arg.TERMINAL_VEL = 0.025
arg.DELTA_T = 0.2
arg.EPISODE_LEN = 35
number_updates = 100

# convert the agent to a torch model
import policy_torch
baselines_mlp_model = TD3.load('trained_agent/accac_final_1000000_9_11_20_25.zip')
agent = policy_torch.copy_mlp_weights(baselines_mlp_model, layers=[512, 512], n_inputs=32)

# loading environment, same as training
env = firefly_accac.FireflyAccAc(arg)

# ---setting the env for inverse----
# TODO: move it to a function of env
env.agent_knows_phi = False

for i in range(10):
    filename = (str(time.localtime().tm_mday) + '_' + str(time.localtime().tm_hour) +
                '_' + str(time.localtime().tm_min))
    single_theta_inverse(arg, env, agent, filename,
                         number_updates=number_updates,
                         true_theta=None, phi=None,
stepIdx, currIt = 0, 0

try:
    # model = PPO2.load(f'rsu_agents/{scenario_name}_agents/'
    #                   f'PPO2_ns3_online_{scenario_name}_cars={num_of_vehicles}')

    # model = PPO2.load(
    #     f'rsu_agents/single_lane_highway_agents/optimized_interval/PPO2_ns3_single_lane_highway_cars=25_optimized')
    # model = SAC.load(
    #     f'rsu_agents/single_lane_highway_agents/optimized_interval/SAC_ns3_single_lane_highway_cars=25_optimized')
    model = TD3.load(
        f'rsu_agents/single_lane_highway_agents/optimized_interval/TD3_ns3_single_lane_highway_cars=25_optimized')

    # model = PPO2.load(
    #     f'rsu_agents/square_agents/optimized_interval/PPO2_ns3_square_cars=25_optimized')
    # model = SAC.load(
    #     f'rsu_agents/square_agents/optimized_interval/SAC_ns3_square_cars=25_optimized')
    # model = TD3.load(
    #     f'rsu_agents/square_agents/optimized_interval/TD3_ns3_square_cars=25_optimized')

    while True:
        print("Start iteration: ", currIt)
        obs = env.reset()
        reward = 0
def train_initial_policy(model_name, algo=ALGO, env_name=ENV_NAME, time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ",
          "data/models/" + algo.__name__ + "_initial_policy_" + env_name + "_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE > 0:
        env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__ == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda: env])

    if NORMALIZE:
        env = VecNormalize(env, training=True, norm_obs=True, norm_reward=False,
                           clip_reward=1e6)

    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp",
                                                   layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED)
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) * np.ones(n_actions))

        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                    feature_extraction="mlp",
                                                    layers=[400, 300])

        model = TD3(CustomPolicy2, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    gamma=args['gamma'],
                    gradient_steps=args['gradient_steps'],
                    learning_rate=args['learning_rate'],
                    learning_starts=args['learning_starts'],
                    action_noise=action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED)
    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     timesteps_per_batch=args['timesteps_per_batch'],
                     lam=args['lam'],
                     max_kl=args['max_kl'],
                     gamma=args['gamma'],
                     vf_iters=args['vf_iters'],
                     vf_stepsize=args['vf_stepsize'],
                     entcoeff=args['entcoeff'],
                     cg_damping=args['cg_damping'],
                     cg_iters=args['cg_iters'],
                     seed=SEED)
    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard, env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)
    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard, env,
                     n_steps=int(args['n_steps'] / env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED)
    else:
        print('No algorithm matched. Using SAC .. ')
        # NOTE: this fallback assumes CustomPolicy from the SAC branch above is defined
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED)

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')
    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1)
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)
        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/' + env_name + '.pkl')

    print('done :: ', model_name)
    exit()
arg.NUM_SAMPLES = 2
arg.NUM_EP = 200
arg.NUM_IT = 2  # number of iterations for gradient descent
arg.NUM_thetas = 1
arg.ADAM_LR = 0.1
arg.LR_STEP = 2
arg.LR_STOP = 0.003
arg.lr_gamma = 0.95
arg.PI_STD = 1
arg.goal_radius_range = [0.1, 0.3]
arg.TERMINAL_VEL = 0.025
number_updates = 100

# convert the agent to a torch model
import policy_torch
baselines_mlp_model = TD3.load('trained_agent//acc_retrain_1000000_2_18_21_4.zip')
agent = policy_torch.copy_mlp_weights(baselines_mlp_model, layers=[128, 128], n_inputs=30)

# loading environment, same as training
env = firefly_acc.FireflyAcc(arg)

# ---setting the env for inverse----
# TODO: move it to a function of env
env.agent_knows_phi = False

for i in range(10):
    filename = ("test_acc_EP" + str(arg.NUM_EP) + "updates" + str(number_updates) +
                "lr" + str(arg.ADAM_LR) + 'step' + str(arg.LR_STEP) +
                str(time.localtime().tm_mday) + '_' + str(time.localtime().tm_hour) + '_' +
class CustomTD3Policy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomTD3Policy, self).__init__(*args, **kwargs,
                                              layers=[400, 400],
                                              layer_norm=True,
                                              feature_extraction="mlp")


model = TD3(CustomTD3Policy, env,
            verbose=1,
            action_noise=action_noise,
            learning_rate=0.001,
            gamma=0.99,
            buffer_size=1000000,
            batch_size=100,
            train_freq=1000,
            tensorboard_log="./gait2d_td3_tensorboard/")

if args.train:
    model.learn(total_timesteps=args.steps, callback=eval_callback)
    model.save(args.model)
else:
    model = TD3.load(args.model, env=env)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        if args.visualize: