def _init_environment(self, datapath, window_size):
    df = pd.read_csv(datapath)
    # Bid/ask price columns repeat every 20 columns in the raw data.
    bid_price_columns = [i for i in range(1, len(df.columns), 20)]
    ask_price_columns = [i for i in range(3, len(df.columns), 20)]
    bidPrices = df[df.columns[bid_price_columns]]
    askPrices = df[df.columns[ask_price_columns]]  # was bid_price_columns: bug fix
    df_concat = pd.concat([bidPrices, askPrices])
    midPrices = df_concat.groupby(df_concat.index).mean().transpose().values[-len(self.securities):]

    self.env = DummyVecEnv([lambda: securities_trading_env(np.array(midPrices).T)])
    self.env = VecCheckNan(self.env, raise_exception=True)

    # the noise objects for DDPG
    n_actions = self.env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    if self.policy == "DDPG":
        self.model = DDPG(ddpgMlpPolicy, self.env, verbose=int(self.verbose),
                          param_noise=param_noise, action_noise=action_noise)
    elif self.policy == "TD3":
        self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
    elif self.policy == "GAIL":
        # Note: the GAIL branch currently falls back to a TD3 model.
        self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
    else:
        self.model = PPO2(MlpLnLstmPolicy, self.env, verbose=int(self.verbose))

    if self.load:  # load pretrained model weights
        self.model = self.model.load("save/" + modelpath + ".h5")

    # init model class
    self.gym_model = Agent(market_event_securities, market_event_queue, securities,
                           queue, host, policy, strategy, cash_balance,
                           self.model, self.env, window_size, self.inventory)
def train_policy_ddpg(env, policy, policy_args, total_timesteps, verbose=0,
                      actor_lr=.5, critic_lr=.001):
    """
    Parameters
    ----------
    env : vectorized set of EncoderWrapper of a TimeLimit wrapper of a restartable env.
    policy : DDPG policy class
    policy_args : dict of keyword arguments for the policy class
    total_timesteps : int, how many timesteps to train the policy (e.g. 200000)
    """
    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))
    model = DDPG(policy, env, verbose=verbose, param_noise=param_noise,
                 action_noise=action_noise, policy_kwargs=policy_args,
                 actor_lr=actor_lr, critic_lr=critic_lr)
    model.learn(total_timesteps)
    return model
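# Usage sketch (hypothetical, not from the source): assumes a Box-action gym env
# such as Pendulum-v0 and stable-baselines' DDPG MlpPolicy; the layer sizes and
# learning rates below are illustrative only.
def _example_train_policy_ddpg():
    import gym
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.policies import MlpPolicy

    env = DummyVecEnv([lambda: gym.make("Pendulum-v0")])
    model = train_policy_ddpg(env, MlpPolicy,
                              policy_args={"layers": [64, 64]},
                              total_timesteps=200000,
                              actor_lr=1e-4, critic_lr=1e-3)
    model.save("ddpg_pendulum_example")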
def main():
    # create environment
    env = iCubPushGymEnv(urdfRoot=robot_data.getDataPath(), renders=False, useIK=1,
                         isDiscrete=0, rnd_obj_pose=0, maxSteps=2000, reward_type=0)

    # set seed
    seed = 1
    tf.reset_default_graph()
    set_global_seed(seed)
    env.seed(seed)

    # set up log monitor
    monitor_dir = os.path.join(log_dir, 'log')
    os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, monitor_dir + '/', allow_early_resets=True)

    # create agent model
    nb_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                     sigma=float(0.5373) * np.ones(nb_actions))

    model = DDPG('LnMlpPolicy', env, action_noise=action_noise, gamma=0.99,
                 batch_size=16, normalize_observations=True, normalize_returns=False,
                 memory_limit=100000, verbose=1,
                 tensorboard_log=os.path.join(log_dir, 'tb'),
                 full_tensorboard_log=False)

    # start learning
    model.learn(total_timesteps=500000, seed=seed, callback=callback)

    # save model (was `act.save`, but `act` is undefined here)
    print("Saving model.pkl to ", log_dir)
    model.save(log_dir + "/final_model.pkl")
def ddpg(env_id, timesteps, policy="MlpPolicy", log_interval=None,
         tensorboard_log=None, seed=None, load_weights=None):
    from stable_baselines import DDPG

    env = gym.make(env_id)

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    if load_weights is not None:
        model = DDPG.load(load_weights, env=env)
    else:
        model = DDPG(policy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="ddpg", env_name=env_id)

    model.learn(total_timesteps=timesteps, log_interval=log_interval, callback=callback)
    save_model_weights(model, "ddpg", env_id, policy, seed=seed, path=".")
def test_identity_ddpg():
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    std = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(std),
                                         desired_action_stddev=float(std))

    model = DDPG("MlpPolicy", env, gamma=0.0, param_noise=param_noise,
                 memory_limit=int(1e6))
    model.learn(total_timesteps=20000, seed=0)

    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward

    assert reward_sum > 0.9 * n_trials
    # Free memory
    del model, env
def run_test(config):
    """Stable baselines test

    Mandatory configuration settings:
        - 'continuous' agent
        - camera_settings enabled
        - stable_baselines enabled
    """
    env = None
    try:
        # Create environment
        env = make_env(config)
        env = DummyVecEnv([lambda: env])

        # Initialize DDPG and start learning
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
        model = DDPG(CnnPolicy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, random_exploration=0.8)
        model.learn(total_timesteps=10000)
    finally:
        if env:
            env.close()
        else:
            clear_carla(config.host, config.port)
        print("-----Carla Environment is closed-----")
def launchAgent(model_name: str):
    """
    :param model_name: type of model to run; must be HER, DDPG, PPO2,
                       or any other value (which falls back to DQN).
                       Currently intended to be set to PPO2.
    :return: the model after 1000 training cycles
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    print("Current Env is " + model_name)

    # An elif chain is required here: with separate `if`s, the final `else`
    # belonged only to the PPO2 check and overwrote the HER/DDPG models with DQN.
    if model_name == "HER":
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    elif model_name == "DDPG":
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    elif model_name == "PPO2":
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",              # policy
            env=env,                  # environment
            double_q=True,            # enable double Q-learning
            prioritized_replay=True,  # enable the prioritized replay buffer
            verbose=0                 # log printing
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i), env)
            elif model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i), env)
            elif model_name == "PPO2":
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i), env)

        model.learn(total_timesteps=12500)  # lowest step count at which FPS stays above 130
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))
        model.save("detailedmap_" + model_name + "_" + str(i + 1))
        if i < 999:
            del model  # free memory; the next iteration reloads from disk
                       # (deleting on the last pass made `return model` a NameError)

    return model
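# Usage sketch: per the docstring, the intended configuration is PPO2, e.g.
#   final_model = launchAgent("PPO2")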
def test_ddpg_eval_env():
    """
    Additional test to check that everything is working when passing
    an eval env.
    """
    eval_env = gym.make("Pendulum-v0")
    model = DDPG("MlpPolicy", "Pendulum-v0", nb_rollout_steps=5,
                 nb_train_steps=2, nb_eval_steps=10,
                 eval_env=eval_env, verbose=0)
    model.learn(1000)
def run(self):
    self._init()
    env = self.env
    model = self.model
    objective = self.objective

    if objective == "infogain":
        wenv = InfogainEnv(env, model)
    elif objective == "prederr":
        wenv = PrederrEnv(env, model)
    else:
        raise AttributeError(
            "Objective '{}' is unknown. Needs to be 'infogain' or 'prederr'."
            .format(objective))

    wenv.max_episode_len = self.horizon
    wenv.end_episode_callback = self._end_episode
    dvenv = DummyVecEnv([lambda: wenv])

    if self.rl_algo == "ddpg":
        self.logger.info("Setting up DDPG as model-free RL algorithm.")
        pn = AdaptiveParamNoiseSpec()
        an = NormalActionNoise(np.array([0]), np.array([1]))
        rl_model = DDPG(DDPGMlpPolicy, dvenv, verbose=1, render=False,
                        action_noise=an, param_noise=pn,
                        nb_rollout_steps=self.horizon,
                        nb_train_steps=self.horizon)
    elif self.rl_algo == "sac":
        self.logger.info("Setting up SAC as model-free RL algorithm.")
        rl_model = SAC(SACMlpPolicy, dvenv, verbose=1,
                       learning_starts=self.horizon)
    else:
        raise AttributeError(
            "Model-free RL algorithm '{}' is unknown.".format(self.rl_algo))

    # Train the agent
    max_steps_total = self.horizon * self.n_episodes * 100
    try:
        self.logger.info("Start the agent")
        rl_model.learn(total_timesteps=max_steps_total, seed=self.seed)
    except MaxEpisodesReachedException:
        print("Exploration finished.")
def main(output_folder_path: Path):
    # Set up the gym-carla environment
    agent_config = AgentConfig.parse_file(Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(Path("configurations/carla_configuration.json"))

    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLPIDAgent,
        "max_collision": 5
    }

    env = gym.make('roar-pid-v0', params=params)
    env.reset()

    model_params: dict = {
        "verbose": 1,
        "render": True,
        # a full tensorboard log can take up space quickly
        "tensorboard_log": (output_folder_path / "tensorboard").as_posix()
    }
    latest_model_path = find_latest_model(output_folder_path)
    if latest_model_path is None:
        model = DDPG(LnMlpPolicy, env=env, **model_params)
    else:
        model = DDPG.load(latest_model_path, env=env, **model_params)
        model.render = True
        model.tensorboard_log = (output_folder_path / "tensorboard").as_posix()

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(
        save_freq=1000, verbose=2,
        save_path=(output_folder_path / "checkpoints").as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    callbacks = CallbackList([checkpoint_callback, event_callback, logging_callback])

    model = model.learn(total_timesteps=int(1e10), callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"pid_ddpg_{datetime.now()}")
def my_compute_data(self, args, env, params, n_episodes):
    env = gym.make('gym_quadcopter:quadcopter-v' + str(args.env))
    # Accumulators live outside the loop so every algorithm's results are kept
    # (the original reset them on each iteration, discarding all but the last).
    re_d = []
    sr_d = []
    for alg, start_index, end_index, step, suffix in params:
        rewards, s_rates = [], []
        for i in range(start_index, end_index, step):
            print("")
            print(f"Working on alg={alg}, start_index={start_index}, "
                  f"end_index={end_index}, step={step}, suffix={suffix}, i={i}")
            path = f"{self.base_dir}models/{alg}/quadcopter-v{args.env}-{i}{suffix}.pkl"
            print(f"Evaluating model at {path}")
            if not os.path.exists(path):
                print(f"WARNING: File {path} does not exist --> SKIPPING")
                continue
            if alg == "ddpg":
                model = DDPG.load(path)
            elif alg == "ppo":
                model = PPO2.load(path)
            else:
                model = TRPO.load(path)
            r, su = mean_eval(n_episodes, model, env, False, False)
            print(f"Average Success Rate: {su}")
            rewards.append(r)
            s_rates.append(su[0])
        i_max = np.argmax(s_rates)  # index of the best checkpoint (currently unused)
        re_d.append(rewards)
        sr_d.append(s_rates)
    return re_d, sr_d
def load_model(path: str, env, desc: str):
    """
    Loads a model from a Stable Baselines checkpoint file into a memory representation.

    Args:
        path (str)   : Path to the Stable Baselines checkpoint file
        env (SB Env) : Environment to attach the loaded model to
        desc (str)   : Text description of which model this is

    Returns:
        The loaded model
    """
    if desc == "ddpg":
        return DDPG.load(path, env)
    elif desc == "ppo":
        env = DummyVecEnv([lambda: env])
        return PPO2.load(path, env)
    elif desc == "trpo":
        env = DummyVecEnv([lambda: env])
        return TRPO.load(path, env)
    elif desc == "td3":
        return TD3.load(path, env)
    elif desc == "sac":
        return SAC.load(path, env)
    else:
        raise RuntimeError(f"Model Name {desc} not supported")
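# Usage sketch (hypothetical): the checkpoint path and environment below are
# illustrative assumptions; `load_model` dispatches on the `desc` string as
# defined above.
def _example_load_model():
    import gym
    env = gym.make("Pendulum-v0")
    model = load_model("checkpoints/ddpg_pendulum.zip", env, desc="ddpg")
    obs = env.reset()
    action, _ = model.predict(obs, deterministic=True)
    return action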
def test_build_chain():
    chain_length = 1000
    f16_model = F16TFModel(chain_length)
    # f16_model.sess.run([f16_model.roll_state_assign, f16_model.pull_state_assign,
    #                     f16_model.done_state_assign],
    #                    feed_dict={f16_model.roll_state_placeholder: 2.00 - .001,
    #                               f16_model.pull_state_placeholder: 3.27 - .001,
    #                               f16_model.done_state_placeholder: 9.98 - .001})

    env = gym.make("F16GCAS-v0")
    ob = env.reset()
    x0 = env.states[-1]

    class CustomPolicy(FeedForwardPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    DVenv = DummyVecEnv([lambda: f16_model.env])
    model = DDPG.load(ROOT + "/trained_models/TDRL/f16/ddpg/128_128",
                      env=DVenv, policy=CustomPolicy)

    assign_x0 = tf.assign(f16_model.x0, x0)
    f16_model.sess.run(assign_x0)
    f16_model.update_change_points()
    x_out = f16_model.sess.run(f16_model.xt_list[-1])

    trace, reward = simulation_with_nn(env, chain_length, model, x0, mute=True)
    print("")
    print(x_out - env.states[-1])
def _get_weights(self):
    class CustomPolicy(FeedForwardPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    DVenv = DummyVecEnv([lambda: self.env])
    self.nn_model = DDPG.load(ROOT + "/trained_models/TDRL/f16/ddpg/128_128",
                              policy=CustomPolicy)

    with self.nn_model.graph.as_default():
        # Fetch the actor ("pi") variables from the loaded graph; they
        # alternate weight/bias in collection order.
        wb_list = self.nn_model.sess.run(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi'))
        self.w_list = []
        self.b_list = []
        count = 0
        with tf.name_scope("neural_controller"):
            for wb in wb_list:
                if count % 2 == 0:
                    self.w_list.append(tf.convert_to_tensor(wb, name="w"))
                else:
                    self.b_list.append(tf.convert_to_tensor(wb, name="b"))
                count += 1
def test(model_path: str, exp_config: dict):
    test_env, _ = init_env(exp_config)

    if ALG == 'ddpg':
        model = DDPG.load(model_path, env=test_env)
    elif ALG == 'trpo':
        model = TRPO.load(model_path, env=test_env)
    elif ALG == 'ppo2':
        model = PPO2.load(model_path, env=test_env)
    elif ALG == 'her':
        # model = HER.load(model_path, env=test_env)
        raise NotImplementedError()  # was `raise NotImplemented()`, which is not an exception
    else:
        raise ValueError(f'Unknown algorithm "{ALG}"!')

    monitor = test_env.envs[0]  # type: Monitor
    assert isinstance(monitor, Monitor)
    raw_env = monitor.unwrapped  # type: GaussianPendulumEnv
    assert isinstance(raw_env, GaussianPendulumEnv)

    raw_env.configure(
        seed=42,
        mass_mean=(0.05, 1.5),
        mass_stdev=(0.01, 0.15),
        embed_knowledge=exp_config.get('embed_knowledge', False),
        perfect_knowledge=exp_config.get('perfect_knowledge', False),
        gym_env=test_env)

    runs = np.zeros((TEST_RUNS, 4))
    fixed_masses = np.linspace(0.030, 1.600, TEST_RUNS)
    for test_ep in range(runs.shape[0]):
        obs = test_env.reset()
        if TEST_LINSPACE_MASS:
            p = raw_env.physical_props
            raw_env.physical_props = p[0], fixed_masses[test_ep], p[2]
        mass_distr_params = raw_env.mass_distr_params.copy()
        sampled_mass = raw_env.physical_props[1]
        while True:
            action, states = model.predict(obs, deterministic=True)
            obs, rewards, dones, info = test_env.step(action)
            rewards_by_episode = monitor.episode_rewards
            episode = len(rewards_by_episode)
            if episode != test_ep:
                break
        last_tot_reward = rewards_by_episode[-1]
        runs[test_ep, :] = (mass_distr_params[0], mass_distr_params[1],
                            sampled_mass, last_tot_reward)

    avg_reward = runs[:, 3].mean()
    print(f'Avg. test reward: {avg_reward}\n')
    return runs
def test_ddpg_normalization():
    """
    Test that observations and returns normalizations are properly saved and loaded.
    """
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=0.05)
    model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000,
                 normalize_observations=True, normalize_returns=True,
                 nb_rollout_steps=128, nb_train_steps=1,
                 batch_size=64, param_noise=param_noise)
    model.learn(1000)
    obs_rms_params = model.sess.run(model.obs_rms_params)
    ret_rms_params = model.sess.run(model.ret_rms_params)
    model.save('./test_ddpg.zip')

    loaded_model = DDPG.load('./test_ddpg.zip')
    obs_rms_params_2 = loaded_model.sess.run(loaded_model.obs_rms_params)
    ret_rms_params_2 = loaded_model.sess.run(loaded_model.ret_rms_params)

    for param, param_loaded in zip(obs_rms_params + ret_rms_params,
                                   obs_rms_params_2 + ret_rms_params_2):
        assert np.allclose(param, param_loaded)

    del model, loaded_model

    if os.path.exists("./test_ddpg.zip"):
        os.remove("./test_ddpg.zip")
def train_agent_with_ddpg(load):
    from stable_baselines.ddpg.policies import FeedForwardPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    # Create and wrap the environment
    env = gym.make('F16GCAS-v0')
    env = DummyVecEnv([lambda: env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.01) * np.ones(n_actions))

    # Custom MLP policy of two layers of size 128 each (the original comment said 16)
    class CustomPolicy(FeedForwardPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    model = DDPG(CustomPolicy, env, verbose=1, action_noise=action_noise)

    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
        model.save(ROOT + "/trained_models/TDRL/f16/ddpg/128_128")
    else:
        model = DDPG.load(ROOT + "/trained_models/TDRL/f16/ddpg/128_128",
                          policy=CustomPolicy, env=env)
    return model
def test_DDPG(env, out_dir, seed=None, **kwargs):
    model = DDPG.load(os.path.join(out_dir, 'final_model.pkl'), env=env)
    # model.learn(total_timesteps=10000)

    # Evaluate the trained agent
    mean_reward = evaluate(env, model, num_steps=5000)
    return
def plot_path_ddpg(modelname, env, num_episode=None):
    from stable_baselines import DDPG

    num_episode = 20 if num_episode is None else num_episode

    agent = DDPG.load(modelname, env=env)

    # create saving vars
    all_ep = []
    # for each episode
    for i in range(num_episode):
        ep_data = {}
        ep_statex = []
        ep_statey = []
        ep_beliefx = []
        ep_beliefy = []
        # get goal position at start
        decisioninfo = env.reset()
        ep_data['goalx'] = env.goalx
        ep_data['goaly'] = env.goaly
        # log the actions raw, v and w
        while not env.stop:
            action, _ = agent.predict(decisioninfo)
            decisioninfo, _, _, _ = env.step(action)
            ep_statex.append(env.s[0, 0])
            ep_statey.append(env.s[0, 1])
            ep_beliefx.append(env.b[0, 0])
            ep_beliefy.append(env.b[0, 1])
        ep_data['x'] = ep_statex
        ep_data['y'] = ep_statey
        ep_data['bx'] = ep_beliefx
        ep_data['by'] = ep_beliefy
        ep_data['theta'] = env.theta.tolist()
        # save episode data dict to all data
        all_ep.append(ep_data)

    for i in range(num_episode):
        plt.figure()  # was `plt.figure`, which never created the figure
        ep_xt = all_ep[i]['x']
        ep_yt = all_ep[i]['y']
        plt.title(str(['{:.2f}'.format(x) for x in all_ep[i]['theta']]))
        plt.plot(ep_xt, ep_yt, 'r-')
        plt.plot(all_ep[i]['bx'], all_ep[i]['by'], 'b-')
        # plt.scatter(all_ep[i]['goalx'], all_ep[i]['goaly'])
        circle = np.linspace(0, 2 * np.pi, 100)
        r = all_ep[i]['theta'][-1]
        x = r * np.cos(circle) + all_ep[i]['goalx'].item()
        y = r * np.sin(circle) + all_ep[i]['goaly'].item()
        plt.plot(x, y)

    plt.savefig('path.png')
def test_ddpg_popart():
    """
    Test DDPG with pop-art normalization
    """
    n_actions = 1
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000,
                 normalize_observations=True, normalize_returns=True,
                 nb_rollout_steps=128, nb_train_steps=1,
                 batch_size=64, action_noise=action_noise,
                 enable_popart=True)
    model.learn(1000)
def setup(model_params, output_folder_path):
    output_folder_path = Path(output_folder_path)  # accept str or Path
    latest_model_path = find_latest_model(output_folder_path)
    if latest_model_path is None:
        print("Creating model...")
        model = DDPG(CnnPolicy, **model_params)
    else:
        print("Loading model...")
        model = DDPG.load(latest_model_path, **model_params)

    tensorboard_dir = output_folder_path / "tensorboard"
    ckpt_dir = output_folder_path / "checkpoints"
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    checkpoint_callback = CheckpointCallback(save_freq=200, verbose=2,
                                             save_path=ckpt_dir.as_posix())
    # event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    logging_callback = LoggingCallback(model=model, verbose=1)
    callbacks = CallbackList([checkpoint_callback, logging_callback])
    return model, callbacks
def train_identity_ddpg():
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    std = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(std),
                                         desired_action_stddev=float(std))

    model = DDPG("MlpPolicy", env, gamma=0.0, param_noise=param_noise,
                 memory_limit=int(1e6))
    model.learn(total_timesteps=20000, seed=0)

    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward

    assert reward_sum > 0.9 * n_trials

    del model, env
def testing(env, name):
    model = DDPG.load("models\\ddpg_sbl_" + name)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        if done or env.steps > 1000:
            env.reset()
def get_policy(name="ddpg"): """ Note: ppo requires the NeuralShield package in the docker. :param name: pretrained policy name :return: stable baselines policy """ if name == "ppo": return PPO2.load(get_dir_root() + "/pretrained/ppo.pkl") elif name == "ddpg": return DDPG.load(get_dir_root() + "/pretrained/ddpg.pkl")
def DDPGAgent(multi_stock_env, num_episodes):
    models_folder = 'saved_models'
    rewards_folder = 'saved_rewards'

    env = DummyVecEnv([lambda: multi_stock_env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Hyperparameters
    GAMMA = 0.99
    TAU = 0.001
    BATCH_SIZE = 16
    ACTOR_LEARNING_RATE = 0.0001
    CRITIC_LEARNING_RATE = 0.001
    BUFFER_SIZE = 500

    print("\nRunning DDPG Agent...\n")
    model = DDPG(MlpPolicy, env, gamma=GAMMA, tau=TAU, batch_size=BATCH_SIZE,
                 actor_lr=ACTOR_LEARNING_RATE, critic_lr=CRITIC_LEARNING_RATE,
                 buffer_size=BUFFER_SIZE, verbose=1,
                 param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=50000)
    model.save(f'{models_folder}/rl/ddpg.h5')

    del model
    model = DDPG.load(f'{models_folder}/rl/ddpg.h5')

    obs = env.reset()
    portfolio_value = []

    for e in range(num_episodes):
        action, _states = model.predict(obs)
        next_state, reward, done, info = env.step(action)
        print(f"episode: {e + 1}/{num_episodes}, "
              f"episode end value: {info[0]['cur_val']:.2f}")
        portfolio_value.append(round(info[0]['cur_val'], 3))

    # save portfolio value for each episode
    np.save(f'{rewards_folder}/rl/ddpg.npy', portfolio_value)
    print("\nDDPG Agent run complete and saved!")

    a = np.load(f'{rewards_folder}/rl/ddpg.npy')
    print(f"\nCumulative Portfolio Value Average reward: {a.mean():.2f}, "
          f"Min: {a.min():.2f}, Max: {a.max():.2f}")

    plt.plot(a)
    plt.title("Portfolio Value Per Episode (DDPG)")
    plt.ylabel("Portfolio Value")
    plt.xlabel("Episodes")
    plt.show()
def __init__(self, agent: Agent, steering_boundary: Tuple[float, float],
             throttle_boundary: Tuple[float, float], **kwargs):
    super().__init__(agent, **kwargs)
    self.max_speed = self.agent.agent_settings.max_speed
    self.throttle_boundary = throttle_boundary
    self.steering_boundary = steering_boundary
    self.long_pid_controller = LongPIDController(
        agent=agent,
        throttle_boundary=throttle_boundary,
        max_speed=self.max_speed)
    self.lat_pid_controller = LatPIDController(
        agent=agent,
        steering_boundary=steering_boundary)
    self.logger = logging.getLogger(__name__)

    # Prefer the bundled weights; fall back to the path given in the agent kwargs.
    try:
        self.pid_rl_model = DDPG.load(
            Path("./ROAR_Sim/data/weights/rl_pid_model.zip"))
    except Exception:  # was a bare `except:`
        path = Path(self.agent.kwargs['kwargs']["rl_pid_model_file_path"])
        self.pid_rl_model = DDPG.load(load_path=path)
def main(argv):
    # Default values:
    fixed = False                    # -p
    numControlledJoints = 7          # -j
    policy_name = "pushing_policy"   # -n

    # COMMAND LINE PARAMS MANAGEMENT:
    try:
        opts, args = getopt.getopt(argv, "hj:p:n:", ["j=", "p=", "n="])
    except getopt.GetoptError:
        print('test.py -j <numJoints> -p <fixedPoseObject> -n <policy_name>')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print('------------------ Default values:')
            print('test.py -j <numJoints: 7> -p <fixedPoseObject: False> '
                  '-n <policy_name: "pushing_policy">')
            print('------------------')
            return 0
        elif opt in ("-j", "--j"):
            # parse first, then validate (the original compared the default value)
            numControlledJoints = int(arg)
            if numControlledJoints > 7:
                print("Check dimension state")
                return 0
        elif opt in ("-p", "--p"):
            # bool(arg) is True for any non-empty string, so parse explicitly
            fixed = arg.lower() in ("true", "1")
        elif opt in ("-n", "--n"):
            policy_name = str(arg)

    print(colored("-----Number Joints Controlled:", "red"))
    print(colored(numControlledJoints, "red"))
    print(colored("-----Object Position Fixed:", "red"))
    print(colored(fixed, "red"))
    print(colored("-----Policy Name:", "red"))
    print(colored(policy_name, "red"))
    print(colored("------", "red"))
    print(colored("Launch the script with -h for further info", "red"))

    model = DDPG.load(policy_name)

    pandaenv = pandaPushGymEnv(urdfRoot=robot_data.getDataPath(), renders=True,
                               useIK=0, numControlledJoints=numControlledJoints,
                               fixedPositionObj=fixed, includeVelObs=True)
    obs = pandaenv.reset()

    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = pandaenv.step(action)
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, ddpg_config, total_time_steps,
                 validate_every_timesteps, task_name):
    print("Using MPI for multiprocessing with {} workers".format(
        MPI.COMM_WORLD.Get_size()))
    rank = MPI.COMM_WORLD.Get_rank()
    print("Worker rank: {}".format(rank))

    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array(
                             [250, 0, 125, 0, 750, 0, 0, 0.005]),
                         fractional_reward_weight=1,
                         goal_height=0.15,
                         tool_block_mass=0.02)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=0,
                      max_episode_length=maximum_episode_length,
                      normalize_actions=False,
                      normalize_observations=False)

    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))
    policy_kwargs = dict(layers=[256, 256])

    checkpoint_callback = CheckpointCallback(
        save_freq=int(validate_every_timesteps / num_of_envs),
        save_path=log_relative_path,
        name_prefix='model')

    model = DDPG(MlpPolicy, env, verbose=2,
                 param_noise=param_noise, action_noise=action_noise,
                 policy_kwargs=policy_kwargs, **ddpg_config)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="ddpg",
                callback=checkpoint_callback)
    return
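# Launch sketch (assumption): stable-baselines' DDPG parallelizes over MPI and
# this trainer queries MPI.COMM_WORLD, so it is presumably started under mpirun;
# the script name and worker count below are illustrative, not from the source.
#
#   mpirun -np 4 python train_ddpg_causalworld.py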
def load_env(model_name='flexible_load_first', seed=9):
    # Available models: flexible_load_first, overnight, larger_margin_cost,
    # discount_06, flex50
    location = 'C:\\Users\\vegar\\Dropbox\\Master\\thesis.git\\scripts\\models\\'
    params_name = model_name + '_params.p'
    model = DDPG.load(location + model_name)

    env = ActiveEnv(seed=seed)
    with open(location + params_name, 'rb') as f:
        params = pickle.load(f)
    env.set_parameters(params)
    model.set_env(env)
    return model, env
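# Usage sketch (hypothetical): restore a trained agent together with its
# configured environment and roll out a single step.
def _example_load_env():
    model, env = load_env(model_name='flexible_load_first', seed=9)
    obs = env.reset()
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    return reward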
def main():
    # create environment
    env = iCubPushGymEnv(urdfRoot=robot_data.getDataPath(), renders=True, useIK=1,
                         isDiscrete=0, rnd_obj_pose=0, maxSteps=2000, reward_type=0)

    model = DDPG.load(os.path.join(log_dir, 'final_model.pkl'), env=env)
    # model.learn(total_timesteps=10000)

    # Evaluate the trained agent
    mean_reward = evaluate(env, model, num_steps=6000)