def train_agent_with_ddpg(load):
    """Pretrain a DDPG agent on ``F16GCAS-v0`` or load a previously saved one.

    Args:
        load: When falsy, a new model is pretrained from the expert dataset
            ``./lqr_export.npz`` and saved. When truthy, the saved model is
            loaded instead and no new model is built.

    Returns:
        The pretrained or loaded ``DDPG`` model, attached to the wrapped env.
    """
    from stable_baselines.ddpg.policies import FeedForwardPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    model_path = ROOT + "/trained_models/TDRL/f16/ddpg/128_128"

    # Create and wrap the environment. A separate name is used for the raw
    # env so the factory lambda does not close over a rebound variable.
    raw_env = gym.make('F16GCAS-v0')
    env = DummyVecEnv([lambda: raw_env])

    class CustomPolicy(FeedForwardPolicy):
        """Custom MLP policy with two hidden layers of 128 units each."""

        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    if load:
        # Loading needs neither the exploration noise nor a fresh model.
        return DDPG.load(model_path, policy=CustomPolicy, env=env)

    # The action-noise object for DDPG (no parameter noise is used).
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.01) * np.ones(n_actions))

    model = DDPG(CustomPolicy, env, verbose=1, action_noise=action_noise)
    expert_data = ExpertDataset("./lqr_export.npz")
    model.pretrain(expert_data, n_epochs=100)
    model.save(model_path)
    return model
def main(output_folder_path: Path):
    """Train (or resume training of) a DDPG agent on the ``roar-pid-v0`` env.

    The latest model found under ``output_folder_path`` is resumed if there
    is one; otherwise a new ``LnMlpPolicy`` model is created. Tensorboard
    logs go to ``<output_folder_path>/tensorboard`` and checkpoints to
    ``<output_folder_path>/checkpoints``.
    """
    # Set gym-carla environment
    env = gym.make(
        'roar-pid-v0',
        params={
            "agent_config": AgentConfig.parse_file(
                Path("configurations/agent_configuration.json")),
            "carla_config": CarlaConfig.parse_file(
                Path("configurations/carla_configuration.json")),
            "ego_agent_class": RLPIDAgent,
            "max_collision": 5,
        })
    env.reset()

    # full tensorboard log can take up space quickly
    tensorboard_log = (output_folder_path / "tensorboard").as_posix()
    model_params: dict = {
        "verbose": 1,
        "render": True,
        "tensorboard_log": tensorboard_log,
    }

    latest_model_path = find_latest_model(output_folder_path)
    if latest_model_path is not None:
        model = DDPG.load(latest_model_path, env=env, **model_params)
    else:
        model = DDPG(LnMlpPolicy, env=env, **model_params)
    model.render = True
    model.tensorboard_log = tensorboard_log

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(
        save_freq=1000,
        verbose=2,
        save_path=(output_folder_path / "checkpoints").as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    # NOTE(review): checkpoint_callback is registered both directly and via
    # event_callback, so it is stepped from two places -- confirm intent.
    callbacks = CallbackList(
        [checkpoint_callback, event_callback, logging_callback])

    model = model.learn(total_timesteps=int(1e10),
                        callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"pid_ddpg_{datetime.now()}")
def test_ddpg_normalization():
    """
    Test that observations and returns normalizations are properly saved and loaded.

    The temporary model file is removed in a ``finally`` clause so a failing
    assertion does not leave ``./test_ddpg.zip`` behind.
    """
    save_path = './test_ddpg.zip'
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                         desired_action_stddev=0.05)
    model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000,
                 normalize_observations=True, normalize_returns=True,
                 nb_rollout_steps=128, nb_train_steps=1, batch_size=64,
                 param_noise=param_noise)
    model.learn(1000)
    obs_rms_params = model.sess.run(model.obs_rms_params)
    ret_rms_params = model.sess.run(model.ret_rms_params)

    try:
        model.save(save_path)
        loaded_model = DDPG.load(save_path)
        obs_rms_params_2 = loaded_model.sess.run(loaded_model.obs_rms_params)
        ret_rms_params_2 = loaded_model.sess.run(loaded_model.ret_rms_params)

        # Compare every saved normalization parameter with its reloaded twin.
        for param, param_loaded in zip(obs_rms_params + ret_rms_params,
                                       obs_rms_params_2 + ret_rms_params_2):
            assert np.allclose(param, param_loaded)

        del model, loaded_model
    finally:
        if os.path.exists(save_path):
            os.remove(save_path)
def _get_weights(self):
    """Load the saved DDPG model and expose its ``model/pi`` variables as tensors.

    Sets ``self.nn_model`` to the loaded model, and fills ``self.w_list`` and
    ``self.b_list`` with tensors built from the values of the global
    variables under scope ``model/pi``. Variables at even positions go to
    ``w_list`` and odd positions to ``b_list``.
    """
    class CustomPolicy(FeedForwardPolicy):
        """MLP policy with two hidden layers of 128 units each."""

        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    self.nn_model = DDPG.load(ROOT + "/trained_models/TDRL/f16/ddpg/128_128",
                              policy=CustomPolicy)

    # The collection lookup must happen with the loaded model's graph as
    # default so that it sees the model's variables.
    with self.nn_model.graph.as_default():
        wb_list = self.nn_model.sess.run(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi'))

    self.w_list = []
    self.b_list = []
    # NOTE(review): the tensors below are created outside the loaded model's
    # graph context (i.e. in the current default graph) -- confirm this
    # matches the original nesting.
    # presumably the collection alternates weight, bias, weight, ... -- verify
    with tf.name_scope("neural_controller"):
        for index, wb in enumerate(wb_list):
            if index % 2 == 0:
                self.w_list.append(tf.convert_to_tensor(wb, name="w"))
            else:
                self.b_list.append(tf.convert_to_tensor(wb, name="b"))
def test(model_path: str, exp_config: dict):
    """Evaluate a saved model on the Gaussian pendulum test environment.

    Args:
        model_path: Path of the saved model, loaded with the class selected
            by the module-level ``ALG``.
        exp_config: Experiment configuration; passed to ``init_env`` and read
            for ``embed_knowledge`` / ``perfect_knowledge``.

    Returns:
        Array of shape ``(TEST_RUNS, 4)``; each row holds the two mass
        distribution parameters, the sampled mass and the episode's total
        reward.

    Raises:
        NotImplementedError: If ``ALG`` is ``'her'``.
        ValueError: If ``ALG`` is not a known algorithm.
    """
    test_env, _ = init_env(exp_config)

    if ALG == 'ddpg':
        model = DDPG.load(model_path, env=test_env)
    elif ALG == 'trpo':
        model = TRPO.load(model_path, env=test_env)
    elif ALG == 'ppo2':
        model = PPO2.load(model_path, env=test_env)
    elif ALG == 'her':
        # ``NotImplemented`` is a constant, not an exception; calling it
        # raised TypeError instead of signalling the missing feature.
        raise NotImplementedError()
    else:
        raise ValueError(f'Unknown algorithm "{ALG}"!')

    monitor = test_env.envs[0]  # type: Monitor
    assert isinstance(monitor, Monitor)
    raw_env = monitor.unwrapped  # type: GaussianPendulumEnv
    assert isinstance(raw_env, GaussianPendulumEnv)
    raw_env.configure(
        seed=42,
        mass_mean=(0.05, 1.5),
        mass_stdev=(0.01, 0.15),
        embed_knowledge=exp_config.get('embed_knowledge', False),
        perfect_knowledge=exp_config.get('perfect_knowledge', False),
        gym_env=test_env)

    runs = np.zeros((TEST_RUNS, 4))
    fixed_masses = np.linspace(0.030, 1.600, TEST_RUNS)

    for test_ep in range(runs.shape[0]):
        obs = test_env.reset()

        if TEST_LINSPACE_MASS:
            # Override only the mass (index 1) of the physical properties.
            p = raw_env.physical_props
            raw_env.physical_props = p[0], fixed_masses[test_ep], p[2]

        mass_distr_params = raw_env.mass_distr_params.copy()
        sampled_mass = raw_env.physical_props[1]

        # Step until the monitor has recorded one more finished episode.
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, _rewards, _dones, _info = test_env.step(action)
            rewards_by_episode = monitor.episode_rewards
            if len(rewards_by_episode) != test_ep:
                break

        last_tot_reward = rewards_by_episode[-1]
        runs[test_ep, :] = (mass_distr_params[0], mass_distr_params[1],
                            sampled_mass, last_tot_reward)

    avg_reward = runs[:, 3].mean()
    print(f'Avg. test reward: {avg_reward}\n')
    return runs
def test_build_chain():
    """Print the gap between the TF chain's final state and a simulated rollout.

    Builds an ``F16TFModel`` chain of 1000 steps, seeds it with the last
    state of a freshly reset ``F16GCAS-v0`` env, runs the saved DDPG
    controller through ``simulation_with_nn`` and prints the difference
    between the chain output and the env's last state.
    """
    n_steps = 1000
    f16_model = F16TFModel(n_steps)

    env = gym.make("F16GCAS-v0")
    env.reset()
    x0 = env.states[-1]

    class CustomPolicy(FeedForwardPolicy):
        """MLP policy with two hidden layers of 128 units each."""

        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    vec_env = DummyVecEnv([lambda: f16_model.env])
    controller = DDPG.load(ROOT + "/trained_models/TDRL/f16/ddpg/128_128",
                           env=vec_env, policy=CustomPolicy)

    # Seed the chain with the env's initial state before evaluating it.
    f16_model.sess.run(tf.assign(f16_model.x0, x0))
    f16_model.update_change_points()
    chain_final_state = f16_model.sess.run(f16_model.xt_list[-1])

    _trace, _reward = simulation_with_nn(env, n_steps, controller, x0,
                                         mute=True)
    print("")
    print(chain_final_state - env.states[-1])
def ddpg(env_id, timesteps, policy="MlpPolicy", log_interval=None,
         tensorboard_log=None, seed=None, load_weights=None):
    """Train a DDPG model on ``env_id`` and save its weights.

    When ``load_weights`` is given the model is loaded from it; otherwise a
    new model is created with Ornstein-Uhlenbeck action noise (sigma 0.5)
    and no parameter noise. Training is logged through
    ``WandbRenderEnvCallback`` and the result is stored with
    ``save_model_weights``.
    """
    from stable_baselines import DDPG

    env = gym.make(env_id)

    action_dim = env.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=float(0.5) * np.ones(action_dim))

    if load_weights is None:
        model = DDPG(policy, env, verbose=1, param_noise=None,
                     action_noise=ou_noise, tensorboard_log=tensorboard_log)
    else:
        model = DDPG.load(load_weights, env=env)

    model.learn(
        total_timesteps=timesteps,
        log_interval=log_interval,
        callback=WandbRenderEnvCallback(model_name="ddpg", env_name=env_id))
    save_model_weights(model, "ddpg", env_id, policy, seed=seed, path=".")
def load_model(path: str, env, desc: str):
    """
    Loads a model from a stable baseline checkpoint file into a memory representation

    Args:
        path (str) : Path to the Stable Baseline Checkpoint File
        env (SB Env) : Environment to attach to the loaded model; it is
            wrapped in a ``DummyVecEnv`` for "ppo" and "trpo"
        desc (str) : Name of the algorithm the checkpoint belongs to; one of
            "ddpg", "ppo", "trpo", "td3", "sac"

    Returns:
        The loaded model

    Raises:
        RuntimeError: If ``desc`` is not one of the supported names
    """
    if desc in ("ppo", "trpo"):
        # A distinct name keeps the factory lambda from closing over a
        # variable that is rebound by the very statement that creates it.
        vec_env = DummyVecEnv([lambda: env])
        algorithm = PPO2 if desc == "ppo" else TRPO
        return algorithm.load(path, vec_env)
    if desc == "ddpg":
        return DDPG.load(path, env)
    if desc == "td3":
        return TD3.load(path, env)
    if desc == "sac":
        return SAC.load(path, env)
    raise RuntimeError(f"Model Name {desc} not supported")
def my_compute_data(self, args, env, params, n_episodes):
    """Evaluate saved checkpoints for every algorithm listed in ``params``.

    Args:
        args: Object whose ``env`` attribute selects the quadcopter env
            version and appears in the checkpoint file names.
        env: Ignored; a new environment is always created from ``args.env``
            (kept for interface compatibility).
        params: Iterable of ``(alg, start_index, end_index, step, suffix)``.
            ``alg`` "ddpg" and "ppo" select DDPG and PPO2; anything else
            selects TRPO.
        n_episodes: Number of episodes passed to ``mean_eval``.

    Returns:
        ``(re_d, sr_d)``: two lists with one entry per element of ``params``;
        each entry is the list of rewards / success rates of the checkpoints
        found on disk. Missing checkpoint files are skipped with a warning.
    """
    env = gym.make('gym_quadcopter:quadcopter-v' + str(args.env))

    # Accumulators live outside the loop so results of every algorithm are
    # kept (and the return works when ``params`` is empty).
    re_d = []
    sr_d = []
    for alg, start_index, end_index, step, suffix in params:
        rewards, s_rates = [], []
        for i in range(start_index, end_index, step):
            print("")
            print(
                f"Working on alg={alg}, start_index={start_index}, end_index={end_index}, step={step}, suffix={suffix}, i={i}"
            )
            path = f"{self.base_dir}models/{alg}/quadcopter-v{args.env}-{i}{suffix}.pkl"
            print(f"Evaluating model at {path}")
            if not os.path.exists(path):
                print(f"WARNING: File {path} does not exist --> SKIPPING")
                continue

            if alg == "ddpg":
                model = DDPG.load(path)
            elif alg == "ppo":
                model = PPO2.load(path)
            else:
                model = TRPO.load(path)

            r, su = mean_eval(n_episodes, model, env, False, False)
            print(f"Average Success Rate: {su}")
            rewards.append(r)
            s_rates.append(su[0])

        re_d.append(rewards)
        sr_d.append(s_rates)
    return re_d, sr_d
def test_DDPG(env, out_dir, seed=None, **kwargs):
    """Load ``final_model.pkl`` from ``out_dir`` and evaluate it for 5000 steps.

    ``seed`` and ``kwargs`` are accepted but not used. Returns ``None``.
    """
    model_file = os.path.join(out_dir, 'final_model.pkl')
    agent = DDPG.load(model_file, env=env)

    # Evaluate the trained agent
    evaluate(env, agent, num_steps=5000)
    return None
def plot_path_ddpg(modelname, env, num_episode=None):
    """Roll out a saved DDPG agent and plot state and belief paths to ``path.png``.

    Args:
        modelname: Path of the saved DDPG model.
        env: Environment to roll out in; the code reads ``env.stop``,
            ``env.s``, ``env.b``, ``env.goalx``, ``env.goaly`` and
            ``env.theta``.
        num_episode: Number of episodes to run (20 when ``None``).

    All episodes are drawn on one figure: the state path in red, the belief
    path in blue and a circle around the goal whose radius is the last entry
    of ``theta``. The title shows the last episode's ``theta``.
    """
    from stable_baselines import DDPG

    num_episode = 20 if num_episode is None else num_episode
    agent = DDPG.load(modelname, env=env)

    # Collect one record per episode.
    all_ep = []
    for _ in range(num_episode):
        ep_statex, ep_statey = [], []
        ep_belifx, ep_belify = [], []

        decisioninfo = env.reset()
        while not env.stop:
            action, _ = agent.predict(decisioninfo)
            decisioninfo, _, _, _ = env.step(action)
            ep_statex.append(env.s[0, 0])
            ep_statey.append(env.s[0, 1])
            ep_belifx.append(env.b[0, 0])
            ep_belify.append(env.b[0, 1])

        all_ep.append({
            'goalx': env.goalx,
            'goaly': env.goaly,
            'x': ep_statex,
            'y': ep_statey,
            'bx': ep_belifx,
            'by': ep_belify,
            'theta': env.theta.tolist(),
        })

    # The original wrote ``plt.figure`` without calling it, which did
    # nothing: every episode landed on the same figure. That overlay is kept,
    # but the figure is now created explicitly.
    # NOTE(review): if one figure per episode was intended, the fixed output
    # name 'path.png' must change as well -- confirm with the author.
    plt.figure()
    circle = np.linspace(0, 2 * np.pi, 100)
    for ep in all_ep:
        plt.title(str(['{:.2f}'.format(x) for x in ep['theta']]))
        plt.plot(ep['x'], ep['y'], 'r-')
        plt.plot(ep['bx'], ep['by'], 'b-')
        r = ep['theta'][-1]
        x = r * np.cos(circle) + ep['goalx'].item()
        y = r * np.sin(circle) + ep['goaly'].item()
        plt.plot(x, y)
    plt.savefig('path.png')
def get_policy(name="ddpg"):
    """
    Load one of the pretrained policies stored under ``<dir root>/pretrained``.

    Note: ppo requires the NeuralShield package in the docker.

    :param name: pretrained policy name ("ppo" or "ddpg")
    :return: stable baselines policy, or None for any other name
    """
    pretrained_dir = "/pretrained/"
    if name == "ppo":
        return PPO2.load(get_dir_root() + pretrained_dir + "ppo.pkl")
    if name == "ddpg":
        return DDPG.load(get_dir_root() + pretrained_dir + "ddpg.pkl")
    return None
def __init__(self, agent: Agent, steering_boundary: Tuple[float, float],
             throttle_boundary: Tuple[float, float], **kwargs):
    """Set up the longitudinal/lateral PID controllers and the RL PID model.

    Args:
        agent: Agent passed to the parent class and both PID controllers.
        steering_boundary: Passed to the lateral PID controller.
        throttle_boundary: Passed to the longitudinal PID controller.
        **kwargs: Forwarded to the parent constructor.

    The DDPG model is first loaded from
    ``./ROAR_Sim/data/weights/rl_pid_model.zip``; if that fails, the path in
    ``agent.kwargs['kwargs']["rl_pid_model_file_path"]`` is used instead.
    """
    super().__init__(agent, **kwargs)
    self.max_speed = self.agent.agent_settings.max_speed
    self.throttle_boundary = throttle_boundary
    self.steering_boundary = steering_boundary
    self.long_pid_controller = LongPIDController(
        agent=agent,
        throttle_boundary=throttle_boundary,
        max_speed=self.max_speed)
    self.lat_pid_controller = LatPIDController(
        agent=agent, steering_boundary=steering_boundary)
    self.logger = logging.getLogger(__name__)

    default_weights = Path("./ROAR_Sim/data/weights/rl_pid_model.zip")
    try:
        self.pid_rl_model = DDPG.load(default_weights)
    except Exception as err:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt
        # and SystemExit. The fallback itself is unchanged.
        self.logger.debug("Could not load %s (%s); using configured path",
                          default_weights, err)
        path = Path(self.agent.kwargs['kwargs']["rl_pid_model_file_path"])
        self.pid_rl_model = DDPG.load(load_path=path)
def testing(env, name):
    """Run a saved DDPG model in ``env`` forever, rendering every step.

    Args:
        env: Environment exposing ``reset``, ``step``, ``render`` and a
            ``steps`` counter.
        name: Suffix of the model file ``models\\ddpg_sbl_<name>``.

    The environment is reset when an episode ends or when ``env.steps``
    exceeds 1000. This function never returns.
    """
    model = DDPG.load("models\\ddpg_sbl_" + name)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        if done or env.steps > 1000:
            # Keep the fresh observation: previously the reset result was
            # dropped and the next action came from the stale observation.
            obs = env.reset()
def main(argv):
    """Parse command-line options, load a DDPG policy and run it in the panda push env.

    Options:
        -h          print the default values and return 0
        -j <int>    number of controlled joints (default 7, at most 7)
        -p <bool>   whether the object pose is fixed (default False)
        -n <str>    policy file name (default "pushing_policy")

    Args:
        argv: Argument list without the program name.

    Returns:
        0 after ``-h`` or when ``-j`` exceeds the limit; otherwise the
        control loop runs forever. Invalid options exit with status 2.
    """
    max_controlled_joints = 7

    # -p
    fixed = False
    # -j
    numControlledJoints = 7
    # -n
    policy_name = "pushing_policy"

    # COMMAND LINE PARAMS MANAGEMENT:
    try:
        opts, args = getopt.getopt(argv, "hj:p:n:", ["j=", "p=", "n="])
    except getopt.GetoptError:
        print('test.py -j <numJoints> -p <fixedPoseObject> -n <policy_name> ')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print('------------------ Default values:')
            print(
                'test.py  -j <numJoints: 7> -p <fixedPoseObject: False> -n <policy_name:"pushing_policy"> '
            )
            print('------------------')
            return 0
        elif opt in ("-j", "--j"):
            # Validate the requested value; the old check looked at the
            # default before the argument was parsed and so never triggered.
            requested_joints = int(arg)
            if requested_joints > max_controlled_joints:
                print("Check dimension state")
                return 0
            numControlledJoints = requested_joints
        elif opt in ("-p", "--p"):
            # bool("False") is True, so the text is interpreted explicitly.
            fixed = arg.strip().lower() in ("1", "true", "t", "yes", "y")
        elif opt in ("-n", "--n"):
            policy_name = str(arg)

    print(colored("-----Number Joints Controlled:", "red"))
    print(colored(numControlledJoints, "red"))
    print(colored("-----Object Position Fixed:", "red"))
    print(colored(fixed, "red"))
    print(colored("-----Policy Name:", "red"))
    print(colored(policy_name, "red"))
    print(colored("------", "red"))
    print(colored("Launch the script with -h for further info", "red"))

    model = DDPG.load(policy_name)

    pandaenv = pandaPushGymEnv(urdfRoot=robot_data.getDataPath(),
                               renders=True,
                               useIK=0,
                               numControlledJoints=numControlledJoints,
                               fixedPositionObj=fixed,
                               includeVelObs=True)
    obs = pandaenv.reset()

    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = pandaenv.step(action)
def main():
    """Evaluate the saved ``final_model.pkl`` in a rendered iCub push env.

    The model is read from the module-level ``log_dir`` and evaluated for
    6000 steps.
    """
    # create Environment
    env_options = {
        "urdfRoot": robot_data.getDataPath(),
        "renders": True,
        "useIK": 1,
        "isDiscrete": 0,
        "rnd_obj_pose": 0,
        "maxSteps": 2000,
        "reward_type": 0,
    }
    env = iCubPushGymEnv(**env_options)

    model_file = os.path.join(log_dir, 'final_model.pkl')
    model = DDPG.load(model_file, env=env)

    # Evaluate the trained agent
    evaluate(env, model, num_steps=6000)
def f_checkpoints_range_2_mean_performance(
        self, checkpoints: range) -> Tuple[np.ndarray, np.ndarray]:
    """Evaluate every checkpoint in ``checkpoints`` and average the results.

    For each checkpoint id the matching Stable Baselines model is loaded and
    passed to ``mean_eval``; the rewards and success rates it returns are
    averaged, giving one mean reward and one mean success rate per
    checkpoint.

    Args:
        checkpoints: Checkpoint ids; each one is substituted into the model
            file name.

    Returns:
        ``(rewards, s_rates)``: two arrays of length ``len(checkpoints)``,
        in iteration order.

    Raises:
        ValueError: If ``self.args.model['name']`` is not one of "ddpg",
            "ppo", "trpo", "td3", "sac".
    """
    logging.debug(
        f"[f_checkpoints_range_2_mean_performance]: checkpoints={checkpoints}"
    )
    rewards = np.zeros(len(checkpoints))
    s_rates = np.zeros(len(checkpoints))

    # ``i`` is the checkpoint id (any range); ``j`` indexes the result arrays.
    for j, i in enumerate(checkpoints):
        path = f"{self.args.training_base_path}/models/quadcopter-{i}{self.args.suffix}"
        logging.debug(f"Evaluating model at {path}")

        model_name = self.args.model['name']
        if model_name == "ddpg":
            model = DDPG.load(path)
        elif model_name == "ppo":
            model = PPO2.load(path)
        elif model_name == "trpo":
            model = TRPO.load(path)
        elif model_name == "td3":
            model = TD3.load(path)
        elif model_name == "sac":
            model = SAC.load(path)
        else:
            # Previously an unknown name left ``model`` unbound, or silently
            # reused the model of the previous iteration.
            raise ValueError(f"Unsupported model name: {model_name!r}")

        logging.debug(
            f"Evaluating Model {self.args.model['name']} for {self.args.n_episodes} episodes in {self.args.env} environment with continuous={str(self.args.continuous)}"
        )
        rewards_list, success_rates_list = mean_eval(
            num_episodes=self.args.n_episodes,
            checkpoint_id=i,
            model=model,
            env=self.env,
            v=True,
            continuous=self.args.continuous,
            plots_dir=self.args.plots_dir)
        rewards_mean = np.mean(rewards_list)
        success_rates_mean = np.mean(success_rates_list)
        logging.debug(
            f"Evaluation Checkpoint={i} --> Average Reward = {rewards_mean}, Average Success Rate = {success_rates_mean}"
        )
        rewards[j] = rewards_mean
        s_rates[j] = success_rates_mean
    return rewards, s_rates
def launchAgent(model_name: str):
    """
    Train the selected agent for 1000 cycles, saving a checkpoint after each one.

    :param model_name: Kind of model to run. Must be HER, DDPG, PPO2, or any
        other value (which selects DQN). For now it is intended to be set to
        PPO2.
    :return: The model after 1000 training cycles
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    n_cycles = 1000
    checkpoint_prefix = "detailedmap_" + model_name + "_"

    print("Current Env is " + model_name)

    # An elif chain is required here: with independent ``if`` statements the
    # trailing ``else`` replaced HER and DDPG models with DQN.
    if model_name == "HER":
        model_class = HER
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    elif model_name == "DDPG":
        model_class = DDPG
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    elif model_name == "PPO2":
        model_class = PPO2
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        model_class = DQN
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",  # policy
            env=env,  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    for i in range(n_cycles):
        if i != 0:
            # Reload the checkpoint written at the end of the previous cycle.
            # The name comes from the same prefix used for saving, so the
            # DQN fallback reads back the file it actually wrote.
            model = model_class.load(checkpoint_prefix + str(i), env)

        # minimum value for which the FPS goes above 130
        model.learn(total_timesteps=12500)
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))
        model.save(checkpoint_prefix + str(i + 1))

        if i != n_cycles - 1:
            # Free the model before the next reload, but keep the final one:
            # deleting it unconditionally made ``return model`` fail.
            del model

    return model
def load_env(model_name='flexible_load_first', seed=9):
    """Load a saved DDPG model together with an ``ActiveEnv`` set up from its pickled parameters.

    Known model names: flexible_load_first, overnight, larger_margin_cost,
    discount_06, flex50.

    Returns:
        ``(model, env)``, with ``env`` already attached to ``model``.
    """
    models_dir = 'C:\\Users\\vegar\\Dropbox\\Master\\thesis.git\\scripts\\models\\'

    model = DDPG.load(models_dir + model_name)
    env = ActiveEnv(seed=seed)

    # NOTE: pickle.load must only be used on trusted files.
    with open(models_dir + model_name + '_params.p', 'rb') as params_file:
        env_params = pickle.load(params_file)

    env.set_parameters(env_params)
    model.set_env(env)
    return model, env
def DDPGAgent(multi_stock_env, num_episodes):
    """Train a DDPG agent on the stock env, replay it and plot the portfolio value.

    Args:
        multi_stock_env: Environment to trade in; it is wrapped in a
            ``DummyVecEnv``. Its step ``info`` must contain ``'cur_val'``.
        num_episodes: Number of environment steps taken during the replay
            (one portfolio value is recorded per step).

    The model is saved to ``saved_models/rl/ddpg.h5`` and the recorded values
    to ``saved_rewards/rl/ddpg.npy``.
    """
    models_folder = 'saved_models'
    rewards_folder = 'saved_rewards'
    model_file = f'{models_folder}/rl/ddpg.h5'
    rewards_file = f'{rewards_folder}/rl/ddpg.npy'

    env = DummyVecEnv([lambda: multi_stock_env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    # Hyper parameters
    GAMMA = 0.99
    TAU = 0.001
    BATCH_SIZE = 16
    ACTOR_LEARNING_RATE = 0.0001
    CRITIC_LEARNING_RATE = 0.001
    BUFFER_SIZE = 500

    print("\nRunning DDPG Agent...\n")
    model = DDPG(MlpPolicy, env, gamma=GAMMA, tau=TAU, batch_size=BATCH_SIZE,
                 actor_lr=ACTOR_LEARNING_RATE, critic_lr=CRITIC_LEARNING_RATE,
                 buffer_size=BUFFER_SIZE, verbose=1,
                 param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=50000)
    model.save(model_file)

    # Reload from disk so the replay uses exactly what was saved.
    del model
    model = DDPG.load(model_file)

    obs = env.reset()
    portfolio_value = []

    for e in range(num_episodes):
        action, _states = model.predict(obs)
        # Feed the new observation back in: previously the result of
        # ``step`` went to an unused name, so every action was predicted
        # from the initial observation.
        obs, reward, done, info = env.step(action)
        print(f"episode: {e + 1}/{num_episodes}, episode end value: {info[0]['cur_val']:.2f}")
        portfolio_value.append(round(info[0]['cur_val'], 3))

    # save portfolio value for each episode
    np.save(rewards_file, portfolio_value)

    print("\nDDPG Agent run complete and saved!")

    a = np.load(rewards_file)

    print(f"\nCumulative Portfolio Value Average reward: {a.mean():.2f}, Min: {a.min():.2f}, Max: {a.max():.2f}")
    plt.plot(a)
    plt.title("Portfolio Value Per Episode (DDPG)")
    plt.ylabel("Portfolio Value")
    plt.xlabel("Episodes")
    plt.show()
def main(output_folder_path: Path):
    """Train (or resume training of) a DDPG agent on ``roar-local-planner-v0``.

    The latest model found under ``output_folder_path`` is resumed if there
    is one; otherwise a new ``CnnPolicy`` model is created. The
    ``tensorboard`` and ``checkpoints`` sub-directories are created if
    missing.
    """
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(
        Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(
        Path("configurations/carla_configuration.json"))
    env = gym.make('roar-local-planner-v0',
                   params={
                       "agent_config": agent_config,
                       "carla_config": carla_config,
                       "ego_agent_class": RLLocalPlannerAgent,
                       "max_collision": 5,
                   })
    env.reset()

    model_params: dict = dict(
        verbose=1,
        render=True,
        env=env,
        n_cpu_tf_sess=None,
        buffer_size=1000,
        nb_train_steps=50,
        nb_rollout_steps=100,
        batch_size=32,
    )

    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is not None:
        model = DDPG.load(latest_model_path, **model_params)
    else:
        model = DDPG(CnnPolicy, **model_params)

    tensorboard_dir = output_folder_path / "tensorboard"
    ckpt_dir = output_folder_path / "checkpoints"
    for directory in (tensorboard_dir, ckpt_dir):
        directory.mkdir(parents=True, exist_ok=True)

    model.tensorboard_log = tensorboard_dir.as_posix()
    model.render = True

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    # NOTE(review): checkpoint_callback is registered both directly and via
    # event_callback, so it is stepped from two places -- confirm intent.
    callbacks = CallbackList(
        [checkpoint_callback, event_callback, logging_callback])

    model = model.learn(total_timesteps=int(1e10),
                        callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"local_planner_ddpg_{datetime.now()}")
def main(argv):
    """Parse command-line options, load a DDPG policy and run it in ``bioEnv``.

    Options:
        -h          print the default values and return 0
        -j <int>    number of controlled joints (default 12, at most 18)
        -p <bool>   whether the object pose is fixed (default False)
        -n <str>    policy file name

    Args:
        argv: Argument list without the program name.

    Returns:
        0 after ``-h`` or when ``-j`` exceeds the limit; otherwise the
        control loop runs forever. Invalid options exit with status 2.
    """
    max_controlled_joints = 18

    # -p
    fixed = False
    # -j
    numControlledJoints = 12
    # -n
    policy_name = "models/DDPG/DDPG_16batch_false-norm-ret-ob_12Actions"

    # COMMAND LINE PARAMS MANAGEMENT:
    try:
        opts, args = getopt.getopt(argv, "hj:p:n:", ["j=", "p=", "n="])
    except getopt.GetoptError:
        print('test.py -j <numJoints> -p <fixedPoseObject> -n <policy_name> ')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print('------------------ Default values:')
            print('test.py  -j <numJoints: 12> -p <fixedPoseObject: false> -n <policy_name:"DDPG_16batch_false-norm-ret-ob_12Actions"> ')
            print('------------------')
            return 0
        elif opt in ("-j", "--j"):
            # Validate the requested value; the old check looked at the
            # default before the argument was parsed and so never triggered.
            requested_joints = int(arg)
            if requested_joints > max_controlled_joints:
                print("Check dimension state")
                return 0
            numControlledJoints = requested_joints
        elif opt in ("-p", "--p"):
            # bool("false") is True, so the text is interpreted explicitly.
            fixed = arg.strip().lower() in ("1", "true", "t", "yes", "y")
        elif opt in ("-n", "--n"):
            policy_name = str(arg)

    print(colored("-----Number Joints Controlled:", "red"))
    print(colored(numControlledJoints, "red"))
    print(colored("-----Object Position Fixed:", "red"))
    print(colored(fixed, "red"))
    print(colored("-----Policy Name:", "red"))
    print(colored(policy_name, "red"))
    print(colored("------", "red"))
    print(colored("Launch the script with -h for further info", "red"))

    model = DDPG.load(policy_name)

    bioenv = bioEnv()
    obs = bioenv.reset()

    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = bioenv.step(action)
def load_env(model_name='flexible_load_first', seed=9):
    """Load a saved model together with an ``ActiveEnv`` set up from its pickled parameters.

    The model file ``<MODEL_PATH>/<model_name>`` is first loaded as DDPG; if
    that fails it is loaded as PPO1. Known model names:
    flexible_load_first, overnight, larger_margin_cost, discount_06, flex50.

    Args:
        model_name: Base name of the model file and of ``<name>_params.p``.
        seed: Seed passed to ``ActiveEnv``.

    Returns:
        ``(model, env)``, with ``env`` already attached to ``model``.
    """
    model_path = os.path.join(MODEL_PATH, model_name)
    param_path = os.path.join(MODEL_PATH, model_name + '_params.p')

    try:
        model = DDPG.load(model_path)
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit. The PPO1 fallback itself is unchanged.
        model = PPO1.load(model_path)

    env = ActiveEnv(seed=seed)
    # NOTE: pickle.load must only be used on trusted parameter files.
    with open(param_path, 'rb') as f:
        params = pickle.load(f)

    env.set_parameters(params)
    model.set_env(env)
    return model, env
def train_stable_baselines(submodule, flags):
    """Train a policy with stable-baselines, save it, then reload and replay it.

    Args:
        submodule: Object exposing ``flow_params``.
        flags: Object exposing ``num_cpus``, ``rollout_size`` and
            ``num_steps``, all forwarded to ``run_model_stablebaseline``.

    The model and a JSON dump of the flow params are written under
    ``~/baseline_results/<exp_tag>/<timestamp>``. The model is then reloaded
    with ``DDPG.load`` and run for one horizon in a rendered environment,
    and the accumulated reward is printed.
    """
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines import DDPG

    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    print('Beginning training.')
    model = run_model_stablebaseline(
        flow_params, flags.num_cpus, flags.rollout_size, flags.num_steps)

    # Save the model to a desired folder and then delete it to demonstrate
    # loading.
    print('Saving the trained model!')
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    ensure_dir(path)
    save_path = os.path.join(path, result_name)
    params_path = save_path + '.json'
    model.save(save_path)

    # dump the flow params
    with open(params_path, 'w') as outfile:
        json.dump(flow_params, outfile,
                  cls=FlowParamsEncoder, sort_keys=True, indent=4)

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    model = DDPG.load(save_path)
    flow_params = get_flow_params(params_path)
    flow_params['sim'].render = True
    env = env_constructor(params=flow_params, version=0)()

    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
def main(output_folder_path: Path):
    """Train (or resume training of) a DDPG agent on ``roar-local-planner-v1``.

    The latest model found under ``output_folder_path`` is resumed if there
    is one; otherwise a new ``LnMlpPolicy`` model is created. Tensorboard
    and checkpoint directories come from ``prep_dir``.
    """
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(
        Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(
        Path("configurations/carla_configuration.json"))
    env = gym.make('roar-local-planner-v1',
                   params={
                       "agent_config": agent_config,
                       "carla_config": carla_config,
                       "ego_agent_class": RLLocalPlannerAgent,
                       "max_collision": 5,
                   })
    env.reset()

    tensorboard_dir, ckpt_dir = prep_dir(output_folder_path)

    # full tensorboard log can take up space quickly
    model_params: dict = dict(
        verbose=1,
        render=True,
        env=env,
        n_cpu_tf_sess=2,
        buffer_size=10,
        random_exploration=0.1,
        tensorboard_log=tensorboard_dir.as_posix(),
    )

    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is not None:
        model = DDPG.load(latest_model_path, **model_params)
    else:
        model = DDPG(LnMlpPolicy, **model_params)

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    # NOTE(review): checkpoint_callback is registered both directly and via
    # event_callback, so it is stepped from two places -- confirm intent.
    callbacks = CallbackList(
        [checkpoint_callback, event_callback, logging_callback])

    model = model.learn(total_timesteps=int(1e10),
                        callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"local_planner_v1_ddpg_{datetime.now()}")
def load_model(eval_env):
    """Run the saved ``./ddpg_robot_env`` model for 20 episodes and print a summary.

    Each episode lasts at most 400 steps. Episodes that end with ``done``
    count as successes, and the number of steps they took is averaged in the
    final printout.
    """
    max_episodes = 20
    max_steps = 400

    model = DDPG.load('./ddpg_robot_env', env=eval_env)

    steps_to_goal = []
    for _ in range(max_episodes):
        obs = eval_env.reset()
        for steps_taken in range(1, max_steps + 1):
            action, _ = model.predict(obs)
            obs, reward, done, _ = eval_env.step(action)
            if not done:
                continue
            steps_to_goal.append(steps_taken)
            print("----------------It reached terminal state -------------------")
            break

    successes = len(steps_to_goal)
    print("Robot reached the goal position successfully ", successes,
          " times and the Average step count was ",
          np.average(np.array(steps_to_goal)))
def view_ddpg():
    """Render 50 episodes of the saved gimbal DDPG model and print success statistics.

    An episode counts as a success when its final reward is greater than
    -100. The printed average is the mean final reward over the successful
    episodes, or NaN when no episode succeeded.
    """
    env = gimbal(5, 500)
    model = DDPG.load("./models/baseline_ddpg_t2")

    success_rate = 0  # number of successful episodes
    reward_avg = 0  # sum of the final rewards of successful episodes
    for _ in range(50):
        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            if dones:
                # NOTE(review): the reward is accumulated only for successful
                # episodes, matching the division by the success count
                # below -- confirm against the original nesting.
                if rewards > -100:
                    success_rate += 1
                    reward_avg += rewards
                break

    # Guard the division: with no successful episode this used to raise
    # ZeroDivisionError instead of printing the summary.
    mean_reward = reward_avg / success_rate if success_rate else float("nan")
    print("Success rate: ", success_rate, "Avg rewards: ", mean_reward)
def main():
    """Run the saved Pickbot DDPG model episode after episode, printing each episode reward.

    This function never returns.
    """
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v1')
    model = DDPG.load("pickbot_model_ddpg_continuous_2019-03-11 12:45:38")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            # Predict from the current observation on every step: before,
            # one action was computed per episode and repeated until done.
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            episode_rew += rewards
        print("Episode reward", episode_rew)
def setup(model_params, output_folder_path):
    """Create or resume a DDPG model and build its training callbacks.

    Args:
        model_params: Keyword arguments for ``DDPG(CnnPolicy, ...)`` or
            ``DDPG.load(...)``.
        output_folder_path: Output directory, as a ``Path`` or a string. It
            is searched for the latest model, and its ``tensorboard`` and
            ``checkpoints`` sub-directories are created if missing.

    Returns:
        ``(model, callbacks)``, where ``callbacks`` combines a checkpoint
        callback (every 200 calls) and a logging callback.
    """
    # Normalise once: the path was wrapped in ``Path`` for the model lookup
    # but used raw with ``/`` below, which failed for string arguments.
    output_dir = Path(output_folder_path)

    latest_model_path = find_latest_model(output_dir)
    if latest_model_path is None:
        print("Creating model...")
        model = DDPG(CnnPolicy, **model_params)
    else:
        print("Loading model...")
        model = DDPG.load(latest_model_path, **model_params)

    tensorboard_dir = output_dir / "tensorboard"
    ckpt_dir = output_dir / "checkpoints"
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    checkpoint_callback = CheckpointCallback(save_freq=200,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    logging_callback = LoggingCallback(model=model, verbose=1)
    callbacks = CallbackList([checkpoint_callback, logging_callback])
    return model, callbacks
def run_baseline_ddpg(env_name, train=True):
    """Train or load a DDPG model for ``env_name``, then run it forever with rendering.

    Args:
        env_name: Gym environment id; also used in the checkpoint name
            ``checkpoints/ddpg_<env_name>``.
        train: When true, a custom three-layer MLP policy is trained for 1e5
            timesteps and saved; otherwise the checkpoint is loaded.

    This function never returns: the rollout loop has no exit condition.
    """
    import numpy as np
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    checkpoint_path = "checkpoints/ddpg_" + env_name

    raw_env = gym.make(env_name)
    env = DummyVecEnv([lambda: raw_env])

    if train:
        # mlp
        from stable_baselines.ddpg.policies import FeedForwardPolicy

        class CustomPolicy(FeedForwardPolicy):
            """Layer-normalised MLP policy with three hidden layers of 64 units."""

            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   layers=[64, 64, 64],
                                                   layer_norm=True,
                                                   feature_extraction="mlp")

        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions) + 0.15,
            sigma=0.3 * np.ones(n_actions))
        model = DDPG(CustomPolicy, env, verbose=1,
                     param_noise=param_noise, action_noise=action_noise,
                     tau=0.01,
                     observation_range=(env.observation_space.low,
                                        env.observation_space.high),
                     critic_l2_reg=0, actor_lr=1e-3, critic_lr=1e-3,
                     memory_limit=100000)
        # total_timesteps is documented as an int; 1e5 is a float.
        model.learn(total_timesteps=int(1e5))
        model.save(checkpoint_path)
    else:
        model = DDPG.load(checkpoint_path)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        print("state: ", obs, " reward: ", rewards, " done: ", dones,
              "info: ", info)