def _init_environment(self, datapath, window_size):
    df = pd.read_csv(datapath)
    bid_price_columns = [i for i in range(1, len(df.columns), 20)]
    print(bid_price_columns)
    ask_price_columns = [i for i in range(3, len(df.columns), 20)]
    bidPrices = df[df.columns[bid_price_columns]]
    askPrices = df[df.columns[ask_price_columns]]  # was indexing the bid columns twice
    df_concat = pd.concat([bidPrices, askPrices])
    midPrices = df_concat.groupby(df_concat.index).mean().transpose().values[-len(self.securities):]
    print(midPrices[:, 0])

    self.env = DummyVecEnv([lambda: securities_trading_env(np.array(midPrices).T)])
    self.env = VecCheckNan(self.env, raise_exception=True)

    n_actions = self.env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))
    print(n_actions)

    if self.policy == "DDPG":
        self.model = DDPG(ddpgMlpPolicy, self.env, verbose=int(self.verbose),
                          param_noise=param_noise, action_noise=action_noise)
    elif self.policy == "TD3":
        self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
    elif self.policy == "GAIL":
        # NOTE: this branch falls back to TD3; no GAIL model is constructed here
        self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
    else:
        self.model = PPO2(MlpLnLstmPolicy, self.env, verbose=int(self.verbose))

    if self.load:  # load model
        self.model = self.model.load("save/" + modelpath + ".h5")

    # init model class
    self.gym_model = Agent(market_event_securities, market_event_queue, securities, queue, host,
                           policy, strategy, cash_balance, self.model, self.env, window_size,
                           self.inventory)
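# The concat/groupby(index)/mean trick above averages bid and ask frames row-wise
# to get mid prices; a tiny self-contained illustration with hypothetical prices:
import pandas as pd

bids = pd.DataFrame([[10.0, 20.0]])
asks = pd.DataFrame([[11.0, 21.0]])
both = pd.concat([bids, asks])           # two rows sharing index 0
mid = both.groupby(both.index).mean()    # -> [[10.5, 20.5]], the per-column mid prices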
def main(load_policy=True):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    gamma = 0.9
    memory_limit = 1000000
    timesteps = 15000000
    discreteAction = 0
    rend = False

    env = bioEnv()
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomTD3Policy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/bioEnv_TD3",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("models/TD3/curriculum/best_model_part_11_10g_TRUE.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/bioEnv_TD3",
                         buffer_size=1000000, batch_size=256,
                         random_exploration=0.3, action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    model.save("policy_TD3_Discr")
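# What the HER wrapper above does with goal_selection_strategy='future' (a sketch,
# not the library internals): each stored transition is additionally replayed with
# n_sampled_goal=4 achieved goals drawn from later steps of the same episode,
#     (obs_t, a_t, r_t, g)  ->  (obs_t, a_t, r'_t, ag_{t+k}),
# where r'_t is recomputed against the substituted goal, so even failed episodes
# yield successful relabeled experience.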
def parse_noise_types(noise_type, nb_actions):
    """Parse noise types for policies."""
    action_noise = None
    param_noise = None
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))
    return action_noise, param_noise
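# A minimal usage sketch for parse_noise_types (the spec string is illustrative):
# specs follow the "<type>_<stddev>" convention and combine with commas.
example_action_noise, example_param_noise = parse_noise_types('adaptive-param_0.2,ou_0.2', nb_actions=4)
# example_action_noise -> OrnsteinUhlenbeckActionNoise over 4 action dimensions
# example_param_noise  -> AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2)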
def create_action_noise(env, noise_type):
    action_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            # AdaptiveParamNoiseSpec is parameter noise, not action noise; this helper
            # only builds action noise, so the spec is skipped (the original assigned
            # it to action_noise, which would hand a wrongly-typed object to the model).
            pass
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))
    return action_noise
def train_agent_with_ddpg(load):
    from stable_baselines.ddpg.policies import FeedForwardPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    # Create and wrap the environment
    env = gym.make('F16GCAS-v0')
    env = DummyVecEnv([lambda: env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.01 * np.ones(n_actions))

    # Custom MLP policy of two hidden layers of size 128 each
    class CustomPolicy(FeedForwardPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    model = DDPG(CustomPolicy, env, verbose=1, action_noise=action_noise)

    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
        model.save(ROOT + "/trained_models/TDRL/f16/ddpg/128_128")
    else:
        model = DDPG.load(ROOT + "/trained_models/TDRL/f16/ddpg/128_128", policy=CustomPolicy, env=env)

    return model
def f_fwgym_get_action_noise(noise_dict, n_actions):
    if noise_dict['name'] == 'OrnsteinUhlenbeck':
        return OrnsteinUhlenbeckActionNoise(mean=float(noise_dict['mu']) * np.ones(n_actions),
                                            sigma=float(noise_dict['sigma']) * np.ones(n_actions))
    else:
        raise RuntimeError(f"Unrecognized Noise Model {noise_dict['name']}")
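# A hedged usage sketch for f_fwgym_get_action_noise; the dict layout mirrors the
# keys the function reads ('name', 'mu', 'sigma'), and the values are illustrative.
example_noise_cfg = {'name': 'OrnsteinUhlenbeck', 'mu': 0.0, 'sigma': 0.3}
example_noise = f_fwgym_get_action_noise(example_noise_cfg, n_actions=4)
# -> OU noise centered at mu=0.0 with per-dimension sigma=0.3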
def main():
    # unpause simulation so that the robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v1')

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

    model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=200000)

    print("Saving model to pickbot_model_ddpg_continuous_" + timestamp + ".pkl")
    model.save("pickbot_model_ddpg_continuous_" + timestamp)
def main(argv):
    numControlledJoints = 6
    fixed = False
    normalize_observations = False
    gamma = 0.9
    batch_size = 16
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1000000
    policy_name = "reaching_policy"
    discreteAction = 0
    rend = False

    kukaenv = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0,
                                 isDiscrete=discreteAction, numControlledJoints=numControlledJoints,
                                 fixedPositionObj=fixed, includeVelObs=True)
    kukaenv = Monitor(kukaenv, log_dir, allow_early_resets=True)

    n_actions = kukaenv.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

    model_class = DDPG
    goal_selection_strategy = 'future'
    model = HER(CustomPolicy, kukaenv, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/kuka_reach_ddpg/reaching_DDPG_HER_PHASE",
                buffer_size=1000000, batch_size=64,
                random_exploration=0.3, action_noise=action_noise)

    print(colored("-----Timesteps:", "red"))
    print(colored(timesteps, "red"))
    print(colored("-----Number Joints Controlled:", "red"))
    print(colored(numControlledJoints, "red"))
    print(colored("-----Object Position Fixed:", "red"))
    print(colored(fixed, "red"))
    print(colored("-----Policy Name:", "red"))
    print(colored(policy_name, "red"))
    print(colored("------", "red"))
    print(colored("Launch the script with -h for further info", "red"))

    model.learn(total_timesteps=timesteps, log_interval=100, callback=callback)

    print("Saving model to kuka.pkl")
    model.save("../pybullet_logs/kukareach_ddpg_her/" + policy_name)

    del model  # remove to demonstrate saving and loading
def train_policy_ddpg(env, policy, policy_args, total_timesteps, verbose=0, actor_lr=.5, critic_lr=.001):
    """
    Parameters
    ----------
    env : vectorized set of EncoderWrapper of a TimeLimit wrapper of a restartable env
    policy : DDPG policy class
    policy_args : dict of keyword arguments for the policy class
    total_timesteps : int, how many timesteps to train the policy (e.g. 200000)
    """
    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

    model = DDPG(policy, env, verbose=verbose, param_noise=param_noise, action_noise=action_noise,
                 policy_kwargs=policy_args, actor_lr=actor_lr, critic_lr=critic_lr)
    # model = PPO2(policy, env)
    model.learn(total_timesteps)
    return model
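# Illustrative call of train_policy_ddpg under assumed imports; Pendulum-v0 is a
# stand-in for the wrapped, restartable env the docstring describes.
# from stable_baselines.ddpg.policies import MlpPolicy
# from stable_baselines.common.vec_env import DummyVecEnv
# import gym
# env = DummyVecEnv([lambda: gym.make('Pendulum-v0')])
# model = train_policy_ddpg(env, MlpPolicy, policy_args={}, total_timesteps=200000, verbose=1)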
def run_test(config):
    """Stable baselines test

    Mandatory configuration settings:
    - 'continuous' agent
    - camera_settings enabled
    - stable_baselines enabled
    """
    env = None
    try:
        # Create environment
        env = make_env(config)
        env = DummyVecEnv([lambda: env])

        # Initialize DDPG and start learning
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))
        model = DDPG(CnnPolicy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, random_exploration=0.8)
        model.learn(total_timesteps=10000)
    finally:
        if env:
            env.close()
        else:
            clear_carla(config.host, config.port)
        print("-----Carla Environment is closed-----")
def create_model(self, config_file=None, dataset=None, config_location=None, name=None):
    """Creates a new RL model."""
    self.name = name
    if config_file is None:
        args = dict(env_name=self.env_name)
        args['config_location'] = config_location
        c = self.config = get_parameters(**args)
    else:
        c = self.config = config_file
    self.n_steps = self.config['main']['n_steps']
    self.create_env()

    model_name = c['main']['model']
    model_params = c['models'][model_name]
    policy_name = c['main']['policy']
    try:
        policy_params = c['policies'][policy_name]
    except KeyError:  # was a bare except; built-in policies have no extra params
        policy_params = None

    print('\nCreating {} model...'.format(model_name))
    self.policy = self._get_policy(policy_name)
    model_object = getattr(stable_baselines, model_name)
    model_args = dict(policy=self.policy, env=self.env, tensorboard_log=self._env_path, **model_params)

    # DDPG model creation
    if 'DDPG' in model_name:
        from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec, NormalActionNoise
        n_actions = self.env.action_space.shape[0]
        model_args['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                                  sigma=0.5 * np.ones(n_actions))

    if 'Custom' in policy_name:
        if 'DQN' in model_name:
            self.policy = model_args['policy'] = self._get_policy('CustomDQNPolicy')
            model_args['policy_kwargs'] = {**c['policies']['CustomDQNPolicy']}
        else:
            model_args['policy_kwargs'] = {'params': policy_params}

    self.model = model_object(**model_args)
    return self
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 8000000
    rend = False
    obj_pose_rnd_std = 0

    env = pandaPushGymGoalEnv(renders=rend, use_IK=0, numControlledJoints=action_space,
                              obj_pose_rnd_std=obj_pose_rnd_std, includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomTD3Policy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
                         buffer_size=1000000, batch_size=256,
                         random_exploration=0.3, action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/TD3_phase1_target_fixed")
def train_TD3(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir, log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    policy = kwargs.pop('policy')
    n_timesteps = kwargs.pop('n_timesteps')
    noise_type = kwargs.pop('noise_type')

    # Parameter space noise: injects randomness directly into the parameters of the
    # agent, altering the types of decisions it makes such that they always fully
    # depend on what the agent currently senses.

    # the noise objects for TD3
    nb_actions = env.action_space.shape[-1]
    action_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                                 sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                            sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # pop() both consumes the flag (so it is not forwarded to TD3.load) and avoids
    # the double 'del kwargs["policy"]' of the original, which raised a KeyError
    if kwargs.pop('continue', False) is True:
        # Continue training; 'policy' was already popped above, so it cannot
        # override the one stored in the checkpoint
        print("Loading pretrained agent")
        model = TD3.load(os.path.join(out_dir, 'final_model.pkl'), env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'), verbose=1, **kwargs)
    else:
        model = TD3(policy, env, action_noise=action_noise, seed=seed, verbose=1,
                    tensorboard_log=os.path.join(log_dir, 'tb'),
                    full_tensorboard_log=False, **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)
    return model
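# A hedged example invocation of train_TD3; the keyword names mirror the keys the
# function pops ('policy', 'n_timesteps', 'noise_type'), while the env and output
# path are illustrative stand-ins.
# import gym
# env = gym.make('Pendulum-v0')
# model = train_TD3(env, 'runs/td3', seed=0,
#                   policy='MlpPolicy', n_timesteps=int(1e6), noise_type='ou_0.1')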
def main():
    model_class = DDPG  # works also with SAC and DDPG
    action_space = 7  # -j
    fixed = True  # -p
    normalize_observations = False  # -o
    gamma = 0.9  # -g
    # batch_size = 16  # -b
    memory_limit = 1000000  # -m
    normalize_returns = True  # -r
    timesteps = 1000000  # -t
    policy_name = "pushing_policy"
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0,
                                 isDiscrete=discreteAction, action_space=action_space,
                                 fixedPositionObj=fixed, includeVelObs=True)

    # Available strategies (cf. paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomPolicy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_ddpg/stable_baselines/DDPG+HER_FIXED_DYN_RAND",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps)
    print("Saving Policy")
    model.save("../policies/pushing_fixed_HER_Dyn_Rand")
def ppo1_nmileg_pool(sensory_value):
    RL_method = "PPO1"
    # total_MC_runs = 50
    experiment_ID = "handtest_rot_pool_with_MC_C_task0/"
    save_name_extension = RL_method
    total_timesteps = 500000
    sensory_info = "sensory_{}".format(sensory_value)
    current_mc_run_num = 22  # starts from 0

    for mc_cntr in range(current_mc_run_num, current_mc_run_num + 1):
        log_dir = "./logs/{}/MC_{}/{}/{}/".format(experiment_ID, mc_cntr, RL_method, sensory_info)

        # defining the environments
        env = gym.make('HandManipulate-v1{}'.format(sensory_value))
        # setting the Monitor
        env = gym.wrappers.Monitor(env, log_dir + "Monitor/", video_callable=False,
                                   force=True, uid="Monitor_info")

        # defining the initial model
        if RL_method == "PPO1":
            model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        elif RL_method == "PPO2":
            env = DummyVecEnv([lambda: env])
            model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        elif RL_method == "DDPG":
            env = DummyVecEnv([lambda: env])
            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=0.5 * 5 * np.ones(n_actions))
            model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=param_noise,
                         action_noise=action_noise, tensorboard_log=log_dir)
        else:
            raise ValueError("Invalid RL mode")

        # setting the random seed for some of the random instances
        random_seed = mc_cntr
        random.seed(random_seed)
        env.seed(random_seed)
        env.action_space.seed(random_seed)
        np.random.seed(random_seed)
        tf.random.set_random_seed(random_seed)

        # training the model
        model.learn(total_timesteps=total_timesteps)
        # saving the trained model
        model.save(log_dir + "/model")
    return None
def _init_ddpg(self):
    # the noise objects for DDPG
    n_actions = self.env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                theta=0.6 * np.ones(n_actions),
                                                sigma=0.2 * np.ones(n_actions))
    return DDPG(
        LnMlpPolicy,
        self.env,
        verbose=1,
        batch_size=self.ddpg_batch_size,
        clip_norm=5e-3,
        gamma=0.9,
        param_noise=None,
        action_noise=action_noise,
        memory_limit=self.ddpg_memory_size,
        nb_train_steps=self.ddpg_training_steps,
    )
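# For intuition on the theta/sigma values above: an Ornstein-Uhlenbeck process follows
#     x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1),
# so theta controls how fast the noise is pulled back toward mu while sigma scales the
# jitter. A minimal standalone simulation (a sketch; dt=1e-2 matches the
# stable-baselines default):
import numpy as np

def simulate_ou(theta=0.6, sigma=0.2, mu=0.0, dt=1e-2, steps=1000):
    x = np.zeros(steps)
    for t in range(1, steps):
        x[t] = x[t - 1] + theta * (mu - x[t - 1]) * dt + sigma * np.sqrt(dt) * np.random.randn()
    return x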
def main(load_policy=False):
    global log_dir, log_dir_policy
    if load_policy:
        log_dir_policy = '../policies/PUSHING_TD3+HER_FIXED_POSITION_DYN_RAND_FROM_FIXED_PHYSICS'
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    fixed = True
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1500000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0,
                                 isDiscrete=discreteAction, action_space=action_space,
                                 fixedPositionObj=fixed, includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    # Available strategies (cf. paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomPolicy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND_FROM_FIXED_PHYSICS",
                         buffer_size=1000000, batch_size=256,
                         random_exploration=0.3, action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps, callback=callback)
    model.save("../policies/PUSHING_FIXED_TD3_DYN_RAND")
    print("Finished train1")
def main(args):
    # Start the timer to record the operation time.
    start = time.time()

    env_id = 'fwmav_hover-v0'
    # Create a vector of size 1 which only holds this environment.
    env = DummyVecEnv([make_env(env_id, 0)])
    # env = SubprocVecEnv([make_env(env_id, i) for i in range(args.n_cpu)])

    # shape[-1] indexes the last dimension of the action space, i.e. the action count.
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

    model = DDPG(
        policy=MyDDPGPolicy,
        env=env,
        gamma=1.0,
        nb_train_steps=5000,
        nb_rollout_steps=10000,
        nb_eval_steps=10000,
        param_noise=param_noise,
        action_noise=action_noise,
        tau=0.003,
        batch_size=256,
        observation_range=(-np.inf, np.inf),
        actor_lr=0.0001,
        critic_lr=0.001,
        reward_scale=0.05,
        memory_limit=10000000,
        verbose=1,
    )

    model.learn(total_timesteps=args.time_step)
    model.save(args.model_path)

    # End timer.
    end = time.time()
    print("Time used: ", end - start)
def run_baseline_ddpg(env_name, train=True):
    import numpy as np
    # from stable_baselines.ddpg.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    env = gym.make(env_name)
    env = DummyVecEnv([lambda: env])

    if train:
        # MLP policy
        from stable_baselines.ddpg.policies import FeedForwardPolicy

        class CustomPolicy(FeedForwardPolicy):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   layers=[64, 64, 64],
                                                   layer_norm=True,
                                                   feature_extraction="mlp")

        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions) + 0.15,
                                                    sigma=0.3 * np.ones(n_actions))
        model = DDPG(CustomPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise,
                     tau=0.01,
                     observation_range=(env.observation_space.low, env.observation_space.high),
                     critic_l2_reg=0, actor_lr=1e-3, critic_lr=1e-3, memory_limit=100000)
        model.learn(total_timesteps=int(1e5))  # learn expects an int, not the float 1e5
        model.save("checkpoints/ddpg_" + env_name)
    else:
        model = DDPG.load("checkpoints/ddpg_" + env_name)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        print("state: ", obs, " reward: ", rewards, " done: ", dones, "info: ", info)

    del model  # unreachable: the render loop above never exits
def ppo1_nmileg_pool(stiffness_value):
    RL_method = "PPO1"
    experiment_ID = "experiment_4_pool_A/mc_1/"
    save_name_extension = RL_method
    total_timesteps = 500000
    stiffness_value_str = "stiffness_{}".format(stiffness_value)
    log_dir = "./logs/{}/{}/{}/".format(experiment_ID, RL_method, stiffness_value_str)

    # defining the environments
    env = gym.make('TSNMILeg{}-v1'.format(stiffness_value))

    # defining the initial model
    if RL_method == "PPO1":
        model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "PPO2":
        env = DummyVecEnv([lambda: env])
        model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "DDPG":
        env = DummyVecEnv([lambda: env])
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=0.5 * 5 * np.ones(n_actions))
        model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, tensorboard_log=log_dir)
    else:
        raise ValueError("Invalid RL mode")

    # training the model
    model.learn(total_timesteps=total_timesteps)
    # saving the trained model
    model.save(log_dir + "/model")
    return None
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    fixed = True
    # 0 completely fixed, 1 slightly random radius, 2 big random radius
    object_position = 1
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 5000000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=1,
                             isDiscrete=discreteAction, action_space=action_space,
                             fixedPositionObj=fixed, includeVelObs=True,
                             object_position=object_position)
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomTD3Policy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
                         buffer_size=1000000, batch_size=256,
                         random_exploration=0.3, action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK")
def main(args):
    start = time.time()

    env_id = 'fwmav_maneuver-v0'
    env = DummyVecEnv([make_env(env_id, 0)])
    # env = SubprocVecEnv([make_env(env_id, i) for i in range(args.n_cpu)])

    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

    model = DDPG(
        policy=MyDDPGPolicy,
        env=env,
        gamma=1.0,
        nb_train_steps=5000,
        nb_rollout_steps=10000,
        nb_eval_steps=10000,
        param_noise=param_noise,
        action_noise=action_noise,
        tau=0.003,
        batch_size=256,
        observation_range=(-np.inf, np.inf),
        actor_lr=0.0001,
        critic_lr=0.001,
        reward_scale=0.05,
        memory_limit=10000000,
        verbose=1,
    )

    model.learn(total_timesteps=args.time_step)
    model.save(args.model_path)

    end = time.time()
    print("Time used: ", end - start)
def init_model(gui=True):
    env = RobotSphereEnv(gui=gui)
    env = DummyVecEnv([lambda: env])

    if AGENT == "PPO2":  # 'is' compared identity, not string equality
        model = PPO2(MlpLstmPolicy, env, n_steps=4096, verbose=2,
                     tensorboard_log="logs/" + AGENT + "Agent/" + datetime.now().strftime("%Y%m%d-%H%M%S"))
    if AGENT == "DDPG":
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(env.action_space.shape[-1]),
                                                    sigma=0.5 * np.ones(env.action_space.shape[-1]))
        model = DDPG(DDPGMlpPolicy, env, verbose=2, param_noise=None, action_noise=action_noise,
                     tensorboard_log="logs/" + AGENT + "Agent/" + datetime.now().strftime("%Y%m%d-%H%M%S"))
    return env, model
def train(self, args, callback, env_kwargs=None, train_kwargs=None):
    env = self.makeEnv(args, env_kwargs=env_kwargs)

    if train_kwargs is None:
        train_kwargs = {}

    # Parse noise_type
    action_noise = None
    param_noise = None
    n_actions = env.action_space.shape[-1]
    if args.noise_param:
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=args.noise_param_sigma,
                                             desired_action_stddev=args.noise_param_sigma)

    if train_kwargs.get("noise_action", args.noise_action) == 'normal':
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=args.noise_action_sigma * np.ones(n_actions))
    elif train_kwargs.get("noise_action", args.noise_action) == 'ou':
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=args.noise_action_sigma * np.ones(n_actions))

    # filter the hyperparams, and set default values in case of no hyperparams
    train_kwargs = {k: v for k, v in train_kwargs.items() if k not in ["noise_action_sigma", "noise_action"]}

    # get the associated policy for the architecture requested
    if args.srl_model == "raw_pixels":
        args.policy = "cnn"
    else:
        args.policy = "mlp"

    self.policy = args.policy
    self.ob_space = env.observation_space
    self.ac_space = env.action_space

    policy_fn = {'cnn': CnnPolicy, 'mlp': MlpPolicy}[args.policy]
    param_kwargs = {
        "verbose": 1,
        "render_eval": False,
        "render": False,
        "reward_scale": 1.,
        "param_noise": param_noise,
        "normalize_returns": False,
        "normalize_observations": (args.srl_model == "raw_pixels"),
        "critic_l2_reg": 1e-2,
        "actor_lr": 1e-4,
        "critic_lr": 1e-3,
        "action_noise": action_noise,
        "enable_popart": False,
        "gamma": 0.99,
        "clip_norm": None,
        "nb_train_steps": 100,
        "nb_rollout_steps": 100,
        "nb_eval_steps": 50,
        "batch_size": args.batch_size
    }

    self.model = self.model_class(policy_fn, env, **{**param_kwargs, **train_kwargs})
    self.model.learn(total_timesteps=args.num_timesteps, seed=args.seed, callback=callback)
    env.close()
def train(training_tag):
    env = gym.make(ENVIRONMENT_NAME)
    env = DummyVecEnv([lambda: env])
    data = pd.DataFrame()
    # env._max_episode_steps = 200

    if isinstance(training_tag, float):
        model = CLAC(clac_MlpPolicy, env, mut_inf_coef=training_tag, verbose=VERBOSITY, policy_kwargs=POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            # print("length normal: ", env.unwrapped.envs[0].length)
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            # data = data.append(learning_results, ignore_index=True)
            data = data.append(test(model, "CLAC" + str(training_tag), training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC" + str(training_tag), training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC" + str(training_tag), training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))
            file_tag = str(training_tag).replace(".", "p")
            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))
        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))
        env.reset()
        del model

        step = 0
        model = SAC(sac_MlpPolicy, env, ent_coef=training_tag, verbose=VERBOSITY, policy_kwargs=POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            data = data.append(test(model, "SAC" + str(training_tag), training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC" + str(training_tag), training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC" + str(training_tag), training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))
            file_tag = str(training_tag).replace(".", "p")
            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))
        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))
        env.reset()
        del model

    if training_tag == "CLAC":
        model = CLAC(clac_MlpPolicy, env, verbose=VERBOSITY, policy_kwargs=POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            data = data.append(test(model, "CLAC", "auto", False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC", "auto", 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC", "auto", 2, (step + 1) * TRAINING_TIMESTEPS))
            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))
        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_t" + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))
        env.reset()
        del model

    if training_tag == "SAC":
        model = SAC(sac_MlpPolicy, env, verbose=VERBOSITY, policy_kwargs=POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            data = data.append(test(model, "SAC", "auto", False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC", "auto", 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC", "auto", 2, (step + 1) * TRAINING_TIMESTEPS))
            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))
        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_t" + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))
        env.reset()
        del model

    if training_tag == "DDPG":
        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))
        model = DDPG(DDPG_MlpPolicy, env, verbose=VERBOSITY, param_noise=param_noise,
                     action_noise=action_noise, policy_kwargs=POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            data = data.append(test(model, "DDPG", None, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "DDPG", None, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "DDPG", None, 2, (step + 1) * TRAINING_TIMESTEPS))
            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/DDPG_" + ENVIRONMENT_NAME + "_s" + str(step) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))
        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/DDPG_" + ENVIRONMENT_NAME + "_t" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))
        env.reset()
        del model

    if training_tag == "PPO1":
        model = PPO1(MlpPolicy, env, verbose=VERBOSITY, policy_kwargs=POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            data = data.append(test(model, "PPO1", training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "PPO1", training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "PPO1", training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))
            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/PPO1_" + ENVIRONMENT_NAME + "_s" + str(step) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))
        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/PPO1_" + ENVIRONMENT_NAME + "_t" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))
        env.reset()
        del model

    if training_tag == "A2C":
        model = A2C(MlpPolicy, env, verbose=VERBOSITY, policy_kwargs=POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            data = data.append(test(model, "A2C", training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "A2C", training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "A2C", training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))
            if SAVE_AGENTS:
                model.save(SAVE_FOLDER + "/models/A2C_" + ENVIRONMENT_NAME + "_s" + str(step) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))
        if SAVE_FINAL_AGENT:
            model.save(SAVE_FOLDER + "/models/A2C_" + ENVIRONMENT_NAME + "_t" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))
        env.reset()
        del model

    return data
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

os.environ['CUDA_VISIBLE_DEVICES'] = '1'
tstart = time.time()

env = ToyEnv(
    train=True,
    log_dir=log_dir,
)
env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

model = DDPG(env=env, policy=FeedForwardCust3Policy, verbose=1,
             param_noise=param_noise, action_noise=action_noise)
model.learn(total_timesteps=int(5e6), callback=callback)
model.save(log_dir + "last_model")

print('Time taken: {:.2f}'.format(time.time() - tstart))
def main(_algo_name, _algo_tag, _tag_suffix, _save_freq, _lock_rotation, _eval_num, _eval_freq, hyperparams):
    rotation_tag = "_LOCKED_ROT_" if _lock_rotation else "_ROTATION_"
    full_tag = _algo_name + rotation_tag + _algo_tag + _tag_suffix
    current_dir = _algo_name + "/" + full_tag
    log_dir = current_dir + "/log/"
    eval_log_dir = current_dir + "/log/eval/"
    trained_models_dir = current_dir + "/models/"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(eval_log_dir, exist_ok=True)
    os.makedirs(trained_models_dir, exist_ok=True)

    is_discrete = True if _algo_name == 'DQN' else False
    panda_env = HERGoalEnvWrapper(CustomMonitor(get_environment(_lock_rotation=_lock_rotation, _is_discrete=is_discrete), log_dir))
    eval_env = HERGoalEnvWrapper(CustomMonitor(get_environment(_lock_rotation=_lock_rotation, _is_discrete=is_discrete), eval_log_dir))

    callbacks = []
    if _save_freq > 0:
        callbacks.append(CheckpointCallback(_save_freq, trained_models_dir))
    callbacks.append(MeanHundredEpsTensorboardCallback(log_dir))
    callbacks.append(StdHundredEpsTensorboardCallback(log_dir))
    callbacks.append(SuccessRateTensorboardCallback(log_dir))
    if _algo_name == 'DDPG':
        callbacks.append(SaveOnBestTrainingRewardCallback(10000, log_dir))
    elif _eval_freq > 0:
        callbacks.append(EvalCallback(eval_env, best_model_save_path=trained_models_dir,
                                      log_path=log_dir, eval_freq=_eval_freq,
                                      deterministic=True, render=False,
                                      n_eval_episodes=_eval_num))

    time_steps = hyperparams.pop('n_timesteps') if hyperparams.get('n_timesteps') is not None else None

    param_noise = None
    action_noise = None
    if hyperparams.get('noise_type') is not None:
        noise_type = hyperparams.pop('noise_type').strip()
        if 'ornstein-uhlenbeck' in noise_type:
            n_actions = panda_env.action_space.shape[-1]
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=0.005 * np.ones(n_actions))
        elif 'param_noise' in noise_type:
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)

    # add action noise for DDPG or TD3; for DQN, noise is already a flag in hyperparams
    if _algo_name == 'DDPG' or _algo_name == 'TD3':
        hyperparams['action_noise'] = action_noise

    # add hyperparams specific to DDPG only
    if _algo_name == 'DDPG':
        hyperparams['param_noise'] = param_noise
        hyperparams['eval_env'] = eval_env

    model = ALGOS[_algo_name](env=panda_env, tensorboard_log="tensorboard/", n_cpu_tf_sess=None, **hyperparams)
    model.learn(total_timesteps=time_steps, callback=callbacks, tb_log_name=full_tag, log_interval=10)
    model.save(current_dir + "/" + full_tag + "_final")
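# A hedged sketch of the hyperparams dict this entry point consumes; the key names
# follow the pops above, while the values (and the call itself) are illustrative:
example_hyperparams = {
    'n_timesteps': 1000000,              # popped and passed to model.learn
    'noise_type': 'ornstein-uhlenbeck',  # popped; selects the action noise built above
    'gamma': 0.95,                       # remaining keys are forwarded to the constructor
    'buffer_size': 1000000,
}
# main('TD3', 'my_tag', '_v1', _save_freq=0, _lock_rotation=True,
#      _eval_num=10, _eval_freq=10000, hyperparams=example_hyperparams)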
def main(argv):
    numControlledJoints = 7  # -j
    fixed = False  # -p
    normalize_observations = False  # -o
    gamma = 0.9  # -g
    batch_size = 128  # -b
    memory_limit = 1000000  # -m
    normalize_returns = True  # -r
    timesteps = 10000000  # -t
    policy_name = "pushing_policy"

    # COMMAND LINE PARAMS MANAGEMENT:
    try:
        opts, args = getopt.getopt(argv, "hj:p:g:b:m:r:o:t:n:",
                                   ["j=", "p=", "g=", "b=", "m=", "r=", "o=", "t=", "n="])
    except getopt.GetoptError:
        print('train.py -t <timesteps> -j <numJoints> -p <fixedPoseObject> -n <policy_name> -g <gamma> -b <batchsize> -m <memory_limit> -r <norm_ret> -o <norm_obs>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('------------------ Default values:')
            print('train.py -t <timesteps: 10000000> -j <numJoints: 7> -p <fixedPoseObject: False> -n <policy_name: "pushing_policy"> -g <gamma: 0.9> -b <batch_size: 16> -m <memory_limit: 1000000> -r <norm_ret: True> -o <norm_obs: False>')
            print('------------------')
            return 0  # the original also had an unreachable sys.exit() after this return
        elif opt in ("-j", "--j"):
            numControlledJoints = int(arg)
            if numControlledJoints > 7:  # the original checked before assigning, so the bound was never enforced
                print("check dim state")
                return 0
        elif opt in ("-p", "--p"):
            fixed = bool(arg)  # note: bool('False') is True; any non-empty arg enables this
        elif opt in ("-g", "--g"):
            gamma = float(arg)
        elif opt in ("-o", "--o"):
            normalize_observations = bool(arg)
        elif opt in ("-b", "--b"):
            batch_size = int(arg)
        elif opt in ("-m", "--m"):
            memory_limit = int(arg)
        elif opt in ("-r", "--r"):
            normalize_returns = bool(arg)
        elif opt in ("-t", "--t"):
            timesteps = int(arg)
        elif opt in ("-n", "--n"):
            policy_name = str(arg)

    discreteAction = 0
    rend = False
    pandaenv = pandaPushGymEnv(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0,
                               isDiscrete=discreteAction, numControlledJoints=numControlledJoints,
                               fixedPositionObj=fixed, includeVelObs=True)

    n_actions = pandaenv.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))
    pandaenv = DummyVecEnv([lambda: pandaenv])

    model = DDPG(LnMlpPolicy, pandaenv, normalize_observations=normalize_observations,
                 gamma=gamma, batch_size=batch_size, memory_limit=memory_limit,
                 normalize_returns=normalize_returns, verbose=1,
                 param_noise=param_noise, action_noise=action_noise,
                 tensorboard_log="../pybullet_logs/pandareach_ddpg/", reward_scale=1)

    print(colored("-----Timesteps:", "red"))
    print(colored(timesteps, "red"))
    print(colored("-----Number Joints Controlled:", "red"))
    print(colored(numControlledJoints, "red"))
    print(colored("-----Object Position Fixed:", "red"))
    print(colored(fixed, "red"))
    print(colored("-----Policy Name:", "red"))
    print(colored(policy_name, "red"))
    print(colored("------", "red"))
    print(colored("Launch the script with -h for further info", "red"))

    model.learn(total_timesteps=timesteps)

    print("Saving model to panda.pkl")
    model.save("../pybullet_logs/pandareach_ddpg/policies" + policy_name)

    del model  # remove to demonstrate saving and loading
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    Run the training of DDPG.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'); multiple
        noise types can be used by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    start_time = 0
    if rank == 0:
        start_time = time.time()

    model = DDPG(policy=MlpPolicy, env=env, memory_policy=Memory, eval_env=eval_env,
                 param_noise=param_noise, action_noise=action_noise, memory_limit=int(1e6),
                 layer_norm=layer_norm, verbose=2, **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()

    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
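# Illustrative invocation of run(); the noise spec reuses the comma-separated
# "<type>_<stddev>" convention parsed above, and the env choice is a stand-in.
# run('MountainCarContinuous-v0', seed=0, noise_type='adaptive-param_0.2,ou_0.2',
#     layer_norm=True, evaluation=False)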
from stable_baselines.ddpg.policies import LnMlpPolicy
from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
from stable_baselines import DDPG
from stable_baselines.common.vec_env import DummyVecEnv  # used below but missing from the original imports
import numpy as np

powerenv = ActiveEnv()
powerenv.set_parameters({
    'state_space': ['sun', 'demand', 'imbalance'],
    'reward_terms': ['voltage', 'current', 'imbalance']
})
powerenv = DummyVecEnv([lambda: powerenv])

action_mean = np.zeros(powerenv.action_space.shape)
action_sigma = 0.3 * np.ones(powerenv.action_space.shape)
action_noise = OrnsteinUhlenbeckActionNoise(mean=action_mean, sigma=action_sigma)

param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.01)

t_steps = 800000
logdir = 'C:\\Users\\vegar\\Dropbox\\Master\\logs'
powermodel = DDPG(
    LnMlpPolicy,
    powerenv,
    verbose=2,
    action_noise=action_noise,
    gamma=0.99,
    # param_noise=param_noise,
    tensorboard_log=logdir,
    memory_limit=int(800000),
)  # the original snippet was truncated here; the closing parenthesis is added so it parses