def _init_environment(self, datapath, window_size):

        df = pd.read_csv(datapath)
        bid_price_columns = [i for i in range(1,len(df.columns),20)]
        print(bid_price_columns)
        ask_price_columns = [i for i in range(3,len(df.columns),20)]
        bidPrices = df[df.columns[bid_price_columns]]
        askPrices = df[df.columns[ask_price_columns]]
        df_concat = pd.concat([bidPrices, askPrices])
        midPrices = df_concat.groupby(df_concat.index).mean().transpose().values[-len(self.securities):]
        print(midPrices[:,0])

        self.env = DummyVecEnv([lambda: securities_trading_env(np.array(midPrices).T)])
        self.env = VecCheckNan(self.env, raise_exception=True)

        n_actions = self.env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
        print(n_actions)

        if self.policy == "DDPG":
            self.model = DDPG(ddpgMlpPolicy, self.env, verbose=int(self.verbose), param_noise=param_noise, action_noise=action_noise)
        elif self.policy == "TD3":
            self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
        elif self.policy == "GAIL":
            # NOTE: no GAIL model is built here; this branch falls back to TD3
            # (stable-baselines' GAIL would also need an expert dataset).
            self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
        else:
            self.model = PPO2(MlpLnLstmPolicy, self.env, verbose=int(self.verbose))

        if self.load: #load model
            self.model = self.model.load("save/"+modelpath+".h5")

        #init model class
        self.gym_model = Agent(market_event_securities, market_event_queue, securities, queue, host, policy,strategy, cash_balance,self.model,self.env,window_size,self.inventory)
Example n. 2
def main(load_policy=True):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    gamma = 0.9
    memory_limit = 1000000
    timesteps = 15000000
    discreteAction = 0
    rend = False
    # learning rate


    env = bioEnv()
  
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # Wrap the model

    model = HER(CustomTD3Policy, env, model_class,n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy,
                verbose=1,tensorboard_log="../pybullet_logs/bioEnv_TD3", buffer_size=1000000,batch_size= 256,
                random_exploration=0.3, action_noise=action_noise)
    
    if (load_policy):
        model = HER.load("models/TD3/curriculum/best_model_part_11_10g_TRUE.pkl", env=env, n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        tensorboard_log="../pybullet_logs/bioEnv_TD3",
        buffer_size=1000000,batch_size=256,random_exploration=0.3, action_noise=action_noise)
    
    model.learn(timesteps,log_interval=100, callback = callback)
   
    model.save("policy_TD3_Discr")
Example n. 3
def parse_noise_types(noise_type, nb_actions):
    """
    Parse noise types for policies
    """
    action_noise = None
    param_noise = None
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    return action_noise, param_noise
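
A minimal usage sketch for parse_noise_types (hedged: 'Pendulum-v0' is only an illustrative environment, and the noise classes are assumed to be imported from stable_baselines as in the snippets above):

import gym

env = gym.make('Pendulum-v0')
nb_actions = env.action_space.shape[-1]

# "ou_0.2" -> OrnsteinUhlenbeckActionNoise with sigma 0.2;
# "adaptive-param_0.1" -> AdaptiveParamNoiseSpec with stddev 0.1.
action_noise, param_noise = parse_noise_types('ou_0.2,adaptive-param_0.1', nb_actions)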
Example n. 4
def create_action_noise(env, noise_type):
    action_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            # NOTE: AdaptiveParamNoiseSpec defines parameter-space noise, not action
            # noise; assigning it to action_noise here looks like a copy-paste slip.
            action_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
            # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    return action_noise
Example n. 5
def train_agent_with_ddpg(load):
    from stable_baselines.ddpg.policies import FeedForwardPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG
    from stable_baselines.gail import ExpertDataset  # needed for pretraining below

    # Create and wrap the environment
    env = gym.make('F16GCAS-v0')
    env = DummyVecEnv([lambda: env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.01) * np.ones(n_actions))

    # Custom MLP policy of two layers of size 16 each
    class CustomPolicy(FeedForwardPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    model = DDPG(CustomPolicy, env, verbose=1, action_noise=action_noise)

    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
        model.save(ROOT+"/trained_models/TDRL/f16/ddpg/128_128")
    else:
        model = DDPG.load(ROOT+"/trained_models/TDRL/f16/ddpg/128_128", policy=CustomPolicy, env=env)

    return model
Example n. 6
def f_fwgym_get_action_noise(noise_dict, n_actions):
    if noise_dict['name'] == 'OrnsteinUhlenbeck':
        return OrnsteinUhlenbeckActionNoise(
            mean=float(noise_dict['mu']) * np.ones(n_actions),
            sigma=float(noise_dict['sigma']) * np.ones(n_actions))
    else:
        raise RuntimeError(f"Unrecognized Noise Model {noise_dict['name']}")
def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v1')

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    model = DDPG(MlpPolicy,
                 env,
                 verbose=1,
                 param_noise=param_noise,
                 action_noise=action_noise)
    model.learn(total_timesteps=200000)

    print("Saving model to pickbot_model_ddpg_continuous_" + timestamp +
          ".pkl")
    model.save("pickbot_model_ddpg_continuous_" + timestamp)
Example n. 8
def main(argv):

    numControlledJoints = 6
    fixed = False
    normalize_observations = False
    gamma = 0.9
    batch_size = 16
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1000000
    policy_name = "reaching_policy"
    discreteAction = 0
    rend = False

    kukaenv = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 numControlledJoints=numControlledJoints,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)
    kukaenv = Monitor(kukaenv, log_dir, allow_early_resets=True)

    n_actions = kukaenv.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))
    model_class = DDPG
    goal_selection_strategy = 'future'
    model = HER(CustomPolicy,
                kukaenv,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log=
                "../pybullet_logs/kuka_reach_ddpg/reaching_DDPG_HER_PHASE",
                buffer_size=1000000,
                batch_size=64,
                random_exploration=0.3,
                action_noise=action_noise)

    print(colored("-----Timesteps:", "red"))
    print(colored(timesteps, "red"))
    print(colored("-----Number Joints Controlled:", "red"))
    print(colored(numControlledJoints, "red"))
    print(colored("-----Object Position Fixed:", "red"))
    print(colored(fixed, "red"))
    print(colored("-----Policy Name:", "red"))
    print(colored(policy_name, "red"))
    print(colored("------", "red"))
    print(colored("Launch the script with -h for further info", "red"))

    model.learn(total_timesteps=timesteps, log_interval=100, callback=callback)

    print("Saving model to kuka.pkl")
    model.save("../pybullet_logs/kukareach_ddpg_her/" + policy_name)

    del model  # remove to demonstrate saving and loading
def train_policy_ddpg(env,
                      policy,
                      policy_args,
                      total_timesteps,
                      verbose=0,
                      actor_lr=.5,
                      critic_lr=.001):
    """
    Parameters
    ----------
    env : vectorized set of EncoderWrapper of a TimeLimit wrapper of a restartable env.
    policy : ddpg policy class
    policy_args : dict of keyword arguments for policy class
    total_timesteps : int, how many timesteps to train policy (i.e. 200000)
    """
    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    model = DDPG(policy,
                 env,
                 verbose=verbose,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 policy_kwargs=policy_args,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr)
    #model = PPO2(policy, env)
    model.learn(total_timesteps)
    return model
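
A hedged call sketch for train_policy_ddpg; the EncoderWrapper/TimeLimit stack described in the docstring is project-specific, so a plain DummyVecEnv-wrapped gym environment stands in for it here:

import gym
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.policies import MlpPolicy

env = DummyVecEnv([lambda: gym.make('Pendulum-v0')])  # stand-in for the wrapped env
model = train_policy_ddpg(env,
                          policy=MlpPolicy,
                          policy_args=dict(layers=[64, 64]),
                          total_timesteps=200000,
                          actor_lr=1e-4,
                          critic_lr=1e-3)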
Example n. 10
def run_test(config):
    """Stable baselines test

    Mandatory configuration settings:
        - 'continuous' agent
        - camera_settings enabled
        - stable_baselines enabled
    """
    env = None
    try:
        # Create Environment
        env = make_env(config)
        env = DummyVecEnv([lambda: env])

        # Initialize DDPG and start learning
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
        model = DDPG(CnnPolicy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, random_exploration=0.8)
        model.learn(total_timesteps=10000)

    finally:
        if env:
            env.close()
        else:
            clear_carla(config.host, config.port)
        print("-----Carla Environment is closed-----")
Example n. 11
    def create_model(self,
                     config_file=None,
                     dataset=None,
                     config_location=None,
                     name=None):
        """
        Creates a new RL Model
        """

        self.name = name
        if config_file is None:
            args = dict(env_name=self.env_name)
            args['config_location'] = config_location
            c = self.config = get_parameters(**args)
        else:
            c = self.config = config_file

        self.n_steps = self.config['main']['n_steps']
        self.create_env()

        model_name = c['main']['model']
        model_params = c['models'][model_name]
        policy_name = c['main']['policy']
        try:
            policy_params = c['policies'][policy_name]
        except KeyError:
            policy_params = None  # no policy-specific parameters configured
        print('\nCreating {} model...'.format(model_name))

        self.policy = self._get_policy(policy_name)
        model_object = getattr(stable_baselines, model_name)

        model_args = dict(policy=self.policy,
                          env=self.env,
                          tensorboard_log=self._env_path,
                          **model_params)

        # DDPG Model creation
        if 'DDPG' in model_name:
            from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec, NormalActionNoise
            n_actions = self.env.action_space.shape[0]
            model_args['action_noise'] = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions),
                sigma=float(0.5) * np.ones(n_actions))

        if 'Custom' in policy_name:
            if 'DQN' in model_name:
                self.policy = model_args['policy'] = self._get_policy(
                    'CustomDQNPolicy')
                model_args['policy_kwargs'] = {
                    **c['policies']['CustomDQNPolicy']
                }
            else:
                model_args['policy_kwargs'] = {'params': policy_params}

        self.model = model_object(**model_args)

        return self
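
A hypothetical usage sketch; the class that owns create_model (with its constructor, create_env and _get_policy helpers) is not part of this snippet, so the names below are placeholders:

trainer = Trainer(env_name='LunarLanderContinuous-v2')   # hypothetical owning class
trainer.create_model(config_location='config/', name='baseline_run')
trainer.model.learn(total_timesteps=trainer.n_steps)     # n_steps comes from config['main']['n_steps']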
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 8000000
    rend = False

    obj_pose_rnd_std = 0

    env = pandaPushGymGoalEnv(renders=rend,
                              use_IK=0,
                              numControlledJoints=action_space,
                              obj_pose_rnd_std=obj_pose_rnd_std,
                              includeVelObs=True)

    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))
    # Wrap the model

    model = HER(
        CustomTD3Policy,
        env,
        model_class,
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        verbose=1,
        tensorboard_log=
        "../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise)

    if (load_policy):
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            tensorboard_log=
            "../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
            buffer_size=1000000,
            batch_size=256,
            random_exploration=0.3,
            action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/TD3_phase1_target_fixed")
Example n. 13
def train_TD3(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir,log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir+'/', allow_early_resets=True)

    policy = kwargs['policy']
    n_timesteps = kwargs['n_timesteps']
    noise_type = kwargs['noise_type']
    del kwargs['policy']
    del kwargs['n_timesteps']
    del kwargs['noise_type']

    ''' Action noise vs. parameter space noise:
    parameter space noise injects randomness directly into the parameters of the agent,
    altering the types of decisions it makes such that they always fully depend on what
    the agent currently senses. TD3 here only uses action noise, parsed from noise_type below. '''

    # the noise objects for TD3
    nb_actions = env.action_space.shape[-1]
    action_noise = None
    if noise_type is not None:

        for current_noise_type in noise_type.split(','):

            current_noise_type = current_noise_type.strip()

            if 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))

            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))

            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    if 'continue' in kwargs and kwargs['continue'] is True:
        # Continue training
        print("Loading pretrained agent")
        # Policy should not be changed ('policy' was already removed from kwargs above)
        model = TD3.load(os.path.join(out_dir,'final_model.pkl'), env=env,
                         tensorboard_log=os.path.join(log_dir,'tb'), verbose=1, **kwargs)
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        model = TD3(policy, env, action_noise=action_noise, seed=seed,
                verbose=1, tensorboard_log=os.path.join(log_dir,'tb'),full_tensorboard_log=False, **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)

    return model
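
A hedged call sketch for train_TD3 showing the kwargs it pops (policy, n_timesteps, noise_type); the noise_type string follows the same "<type>_<stddev>" convention parsed above, and any extra keyword is forwarded to TD3:

import gym

env = gym.make('Pendulum-v0')               # illustrative env
model = train_TD3(env,
                  out_dir='runs/td3_pendulum',
                  seed=0,
                  policy='MlpPolicy',        # stable-baselines also accepts the policy name as a string
                  n_timesteps=100000,
                  noise_type='ou_0.2',       # or 'normal_0.1'; None disables action noise
                  buffer_size=100000)        # forwarded to TD3(...)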
Example n. 14
def main():
    model_class = DDPG  # works also with SAC and DDPG

    # -j
    action_space = 7
    # -p
    fixed = True
    # -o
    normalize_observations = False
    # -g
    gamma = 0.9
    # -b
    #batch_size = 16
    # -m
    memory_limit = 1000000
    # -r
    normalize_returns = True
    # -t
    timesteps = 1000000
    policy_name = "pushing_policy"
    discreteAction = 0
    rend = False
    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 action_space=action_space,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))
    # Wrap the model

    model = HER(
        CustomPolicy,
        env,
        model_class,
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        verbose=1,
        tensorboard_log=
        "../pybullet_logs/panda_push_ddpg/stable_baselines/DDPG+HER_FIXED_DYN_RAND",
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps)
    print("Saving Policy")
    model.save("../policies/pushing_fixed_HER_Dyn_Rand")
Example n. 15
def ppo1_nmileg_pool(sensory_value):
	RL_method = "PPO1" 
	# total_MC_runs = 50
	experiment_ID = "handtest_rot_pool_with_MC_C_task0/"
	save_name_extension = RL_method
	total_timesteps =  500000
	sensory_info = "sensory_{}".format(sensory_value) 
	current_mc_run_num =22 #starts from 0
	for mc_cntr in range(current_mc_run_num, current_mc_run_num+1):
		log_dir = "./logs/{}/MC_{}/{}/{}/".format(experiment_ID, mc_cntr, RL_method, sensory_info)
		# defining the environments
		env = gym.make('HandManipulate-v1{}'.format(sensory_value))
		#env = gym.wrappers.Monitor(env, "./tmp/gym-results", video_callable=False, force=True)
		## setting the Monitor
		env = gym.wrappers.Monitor(env, log_dir+"Monitor/", video_callable=False, force=True, uid="Monitor_info")
		# defining the initial model
		if RL_method == "PPO1":
			model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
		elif RL_method == "PPO2":
			env = DummyVecEnv([lambda: env])
			model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
		elif RL_method == "DDPG":
			env = DummyVecEnv([lambda: env])
			n_actions = env.action_space.shape[-1]
			param_noise = None
			action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5)* 5 * np.ones(n_actions))
			model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise, tensorboard_log=log_dir)
		else:
			raise ValueError("Invalid RL mode")
		# setting the environment on the model
		#model.set_env(env)
		# setting the random seed for some of the random instances
		random_seed = mc_cntr
		random.seed(random_seed)
		env.seed(random_seed)
		env.action_space.seed(random_seed)
		np.random.seed(random_seed)
		tf.random.set_random_seed(random_seed)
		# training the model
		model.learn(total_timesteps=total_timesteps)
		# saving the trained model
		model.save(log_dir+"/model")
	return None
Example n. 16
    def _init_ddpg(self):
        # the noise objects for DDPG
        n_actions = self.env.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            theta=float(0.6) * np.ones(n_actions),
            sigma=float(0.2) * np.ones(n_actions))

        return DDPG(
            LnMlpPolicy,
            self.env,
            verbose=1,
            batch_size=self.ddpg_batch_size,
            clip_norm=5e-3,
            gamma=0.9,
            param_noise=None,
            action_noise=action_noise,
            memory_limit=self.ddpg_memory_size,
            nb_train_steps=self.ddpg_training_steps,
        )
def main(load_policy=False):

    global log_dir, log_dir_policy
    if (load_policy):
          log_dir_policy = '../policies/PUSHING_TD3+HER_FIXED_POSITION_DYN_RAND_FROM_FIXED_PHYSICS'
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    fixed = True
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1500000
    discreteAction = 0
    rend = False
    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0,
            isDiscrete=discreteAction, action_space = action_space,
            fixedPositionObj = fixed, includeVelObs = True)


    env = Monitor(env, log_dir, allow_early_resets=True)
    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # Wrap the model

    model = HER(CustomPolicy, env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy,
                verbose=1,tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND", buffer_size=1000000,batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if (load_policy):
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl", env=env, n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND_FROM_FIXED_PHYSICS",
        buffer_size=1000000,batch_size=256,random_exploration=0.3, action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps, callback = callback )
    model.save("../policies/PUSHING_FIXED_TD3_DYN_RAND")
    print("Finished train1")
Example n. 18
def main(args):
    #Starting the timer to record the operation time.
    start = time.time()

    env_id = 'fwmav_hover-v0'
    #Creating a vector of size 1 which only has the environment.
    env = DummyVecEnv([make_env(env_id, 0)])
    # env = SubprocVecEnv([make_env(env_id, i) for i in range(args.n_cpu)])

    # -1 argument means the shape will be found automatically.
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    model = DDPG(
        policy=MyDDPGPolicy,
        env=env,
        gamma=1.0,
        nb_train_steps=5000,
        nb_rollout_steps=10000,
        nb_eval_steps=10000,
        param_noise=param_noise,
        action_noise=action_noise,
        tau=0.003,
        batch_size=256,
        observation_range=(-np.inf, np.inf),
        actor_lr=0.0001,
        critic_lr=0.001,
        reward_scale=0.05,
        memory_limit=10000000,
        verbose=1,
    )

    model.learn(total_timesteps=args.time_step)
    model.save(args.model_path)

    #End timer.
    end = time.time()
    print("Time used: ", end - start)
def run_baseline_ddpg(env_name, train=True):
    import numpy as np
    # from stable_baselines.ddpg.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    env = gym.make(env_name)
    env = DummyVecEnv([lambda: env])

    if train:
        # mlp
        from stable_baselines.ddpg.policies import FeedForwardPolicy
        class CustomPolicy(FeedForwardPolicy):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                layers=[64, 64, 64],
                                                layer_norm=True,
                                                feature_extraction="mlp")

        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions)+0.15, sigma=0.3 * np.ones(n_actions))
        model = DDPG(CustomPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise, 
            tau=0.01, observation_range=(env.observation_space.low, env.observation_space.high),
            critic_l2_reg=0, actor_lr=1e-3, critic_lr=1e-3, memory_limit=100000)
        model.learn(total_timesteps=int(1e5))
        model.save("checkpoints/ddpg_" + env_name)

    else:
        model = DDPG.load("checkpoints/ddpg_" + env_name)

        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            print("state: ", obs, " reward: ", rewards, " done: ", dones, "info: ", info)

    del model # remove to demonstrate saving and loading
def ppo1_nmileg_pool(stiffness_value):
    RL_method = "PPO1"
    experiment_ID = "experiment_4_pool_A/mc_1/"
    save_name_extension = RL_method
    total_timesteps = 500000
    stiffness_value_str = "stiffness_{}".format(stiffness_value)
    log_dir = "./logs/{}/{}/{}/".format(experiment_ID, RL_method,
                                        stiffness_value_str)
    # defining the environments
    env = gym.make('TSNMILeg{}-v1'.format(stiffness_value))
    #env = gym.wrappers.Monitor(env, "./tmp/gym-results", video_callable=False, force=True)
    # defining the initial model
    if RL_method == "PPO1":
        model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "PPO2":
        env = DummyVecEnv([lambda: env])
        model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "DDPG":
        env = DummyVecEnv([lambda: env])
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * 5 *
                                                    np.ones(n_actions))
        model = DDPG(DDPG_MlpPolicy,
                     env,
                     verbose=1,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     tensorboard_log=log_dir)
    else:
        raise ValueError("Invalid RL mode")
    # setting the environment on the model
    #model.set_env(env)
    # training the model
    model.learn(total_timesteps=total_timesteps)
    # saving the trained model
    model.save(log_dir + "/model")
    return None
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    fixed = True
    #0 completely fixed, 1 slightly random radius, 2 big random radius,
    object_position = 1
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 5000000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=1,
            isDiscrete=discreteAction, action_space = action_space,
            fixedPositionObj = fixed, includeVelObs = True, object_position=object_position)

    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # Wrap the model

    model = HER(CustomTD3Policy, env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy,
                verbose=1,tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK", buffer_size=1000000,batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if (load_policy):
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl", env=env, n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
        buffer_size=1000000,batch_size=256,random_exploration=0.3, action_noise=action_noise)

    model.learn(timesteps,log_interval=100, callback = callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK")
Example n. 22
def main(args):

    start = time.time()

    env_id = 'fwmav_maneuver-v0'
    env = DummyVecEnv([make_env(env_id, 0)])
    # env = SubprocVecEnv([make_env(env_id, i) for i in range(args.n_cpu)])

    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    model = DDPG(
        policy=MyDDPGPolicy,
        env=env,
        gamma=1.0,
        nb_train_steps=5000,
        nb_rollout_steps=10000,
        nb_eval_steps=10000,
        param_noise=param_noise,
        action_noise=action_noise,
        tau=0.003,
        batch_size=256,
        observation_range=(-np.inf, np.inf),
        actor_lr=0.0001,
        critic_lr=0.001,
        reward_scale=0.05,
        memory_limit=10000000,
        verbose=1,
    )

    model.learn(total_timesteps=args.time_step)
    model.save(args.model_path)

    end = time.time()
    print("Time used: ", end - start)
Example n. 23
def init_model(gui=True):
    env = RobotSphereEnv(gui=gui)
    env = DummyVecEnv([lambda: env])
    if AGENT == "PPO2":
        model = PPO2(MlpLstmPolicy,
                     env,
                     n_steps=4096,
                     verbose=2,
                     tensorboard_log="logs/" + AGENT + "Agent/" +
                     datetime.now().strftime("%Y%m%d-%H%M%S"))
    if AGENT == "DDPG":
        action_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(env.action_space.shape[-1]),
            sigma=float(0.5) * np.ones(env.action_space.shape[-1]))

        model = DDPG(DDPGMlpPolicy,
                     env,
                     verbose=2,
                     param_noise=None,
                     action_noise=action_noise,
                     tensorboard_log="logs/" + AGENT + "Agent/" +
                     datetime.now().strftime("%Y%m%d-%H%M%S"))
    return env, model
Example n. 24
File: ddpg.py Project: s206283/gcrl
    def train(self, args, callback, env_kwargs=None, train_kwargs=None):
        env = self.makeEnv(args, env_kwargs=env_kwargs)

        if train_kwargs is None:
            train_kwargs = {}

        # Parse noise_type
        action_noise = None
        param_noise = None
        n_actions = env.action_space.shape[-1]
        if args.noise_param:
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=args.noise_param_sigma,
                                                 desired_action_stddev=args.noise_param_sigma)

        if train_kwargs.get("noise_action", args.noise_action) == 'normal':
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=args.noise_action_sigma * np.ones(n_actions))
        elif train_kwargs.get("noise_action", args.noise_action) == 'ou':
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=args.noise_action_sigma * np.ones(n_actions))

        # filter the hyperparam, and set default values in case no hyperparam
        train_kwargs = {k: v for k, v in train_kwargs.items() if k not in ["noise_action_sigma", "noise_action"]}

        # get the associated policy for the architecture requested
        if args.srl_model == "raw_pixels":
            args.policy = "cnn"
        else:
            args.policy = "mlp"

        self.policy = args.policy
        self.ob_space = env.observation_space
        self.ac_space = env.action_space

        policy_fn = {'cnn': CnnPolicy,
                     'mlp': MlpPolicy}[args.policy]

        param_kwargs = {
            "verbose": 1,
            "render_eval": False,
            "render": False,
            "reward_scale": 1.,
            "param_noise": param_noise,
            "normalize_returns": False,
            "normalize_observations": (args.srl_model == "raw_pixels"),
            "critic_l2_reg": 1e-2,
            "actor_lr": 1e-4,
            "critic_lr": 1e-3,
            "action_noise": action_noise,
            "enable_popart": False,
            "gamma": 0.99,
            "clip_norm": None,
            "nb_train_steps": 100,
            "nb_rollout_steps": 100,
            "nb_eval_steps": 50,
            "batch_size": args.batch_size
        }

        self.model = self.model_class(policy_fn, env, **{**param_kwargs, **train_kwargs})
        self.model.learn(total_timesteps=args.num_timesteps, seed=args.seed, callback=callback)
        env.close()
Example n. 25
def train(training_tag):
    env = gym.make(ENVIRONMENT_NAME)
    env = DummyVecEnv([lambda: env]) 
    data = pd.DataFrame()
    #env._max_episode_steps = 200

    if(isinstance(training_tag, float)):
        model = CLAC(clac_MlpPolicy, env, mut_inf_coef=training_tag, verbose=VERBOSITY, policy_kwargs = POLICY_KWARGS)
        
        for step in range(TRAINING_STEPS):
            #print("length normal: ", env.unwrapped.envs[0].length)

            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            #data = data.append(learning_results, ignore_index=True)

            data = data.append(test(model, "CLAC" + str(training_tag), training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC" + str(training_tag), training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC" + str(training_tag), training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))
            
            file_tag = str(training_tag).replace(".", "p")
            if(SAVE_AGENTS):   
                model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model
        step = 0
        
        
        model = SAC(sac_MlpPolicy, env, ent_coef=training_tag, verbose=VERBOSITY, policy_kwargs = POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            #data = data.append(learning_results, ignore_index=True)

            data = data.append(test(model, "SAC" + str(training_tag), training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC" + str(training_tag), training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC" + str(training_tag), training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))
            
            file_tag = str(training_tag).replace(".", "p")
            if(SAVE_AGENTS):   
                model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))
        
        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_t" + str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model
        

    if(training_tag == "CLAC"):
        model = CLAC(clac_MlpPolicy, env, verbose=VERBOSITY, policy_kwargs = POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)
            
            #data = data.append(learning_results, ignore_index=True)

            data = data.append(test(model, "CLAC", "auto", False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC", "auto", 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "CLAC", "auto", 2, (step + 1) * TRAINING_TIMESTEPS))

            if(SAVE_AGENTS):
                model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/CLAC_" + ENVIRONMENT_NAME + "_t" + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS *  TRAINING_TIMESTEPS))

        env.reset()
        del model
    
    if(training_tag == "SAC"):
        model = SAC(sac_MlpPolicy, env, verbose=VERBOSITY, policy_kwargs = POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)

            #data = data.append(learning_results, ignore_index=True)

            data = data.append(test(model, "SAC", "auto", False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC", "auto", 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "SAC", "auto", 2, (step + 1) * TRAINING_TIMESTEPS))

            if(SAVE_AGENTS):
                model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_s" + str(step) + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/SAC_" + ENVIRONMENT_NAME + "_t" + "_auto" + "_i" + str(CURRENT_ITERATION) + "_ts" + str( TRAINING_STEPS *  TRAINING_TIMESTEPS))

        env.reset()
        del model
    
    if(training_tag == "DDPG"):
        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

        model = DDPG(DDPG_MlpPolicy, env, verbose=VERBOSITY, param_noise=param_noise, action_noise=action_noise, policy_kwargs = POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            (model, learning_results) = model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)

            #data = data.append(learning_results, ignore_index=True)

            data = data.append(test(model, "DDPG", None, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "DDPG", None, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "DDPG", None, 2, (step + 1) * TRAINING_TIMESTEPS))
            
            if(SAVE_AGENTS):
                model.save(SAVE_FOLDER + "/models/DDPG_" + ENVIRONMENT_NAME + "_s" + str(step) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/DDPG_" + ENVIRONMENT_NAME + "_t" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS *  TRAINING_TIMESTEPS))

        env.reset()
        del model

    if(training_tag == "PPO1"):
        model = PPO1(MlpPolicy, env, verbose=VERBOSITY, policy_kwargs = POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)

            data = data.append(test(model, "PPO1", training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "PPO1", training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "PPO1", training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))
            
            if(SAVE_AGENTS):
                model.save(SAVE_FOLDER + "/models/PPO1_" + ENVIRONMENT_NAME + "_s" + str(step) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/PPO1_" + ENVIRONMENT_NAME + "_t" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model
    
    if(training_tag == "A2C"):
        model = A2C(MlpPolicy, env, verbose=VERBOSITY, policy_kwargs = POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            model.learn(total_timesteps=TRAINING_TIMESTEPS, log_interval=100)

            data = data.append(test(model, "A2C", training_tag, False, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "A2C", training_tag, 1, (step + 1) * TRAINING_TIMESTEPS))
            data = data.append(test(model, "A2C", training_tag, 2, (step + 1) * TRAINING_TIMESTEPS))
            
            if(SAVE_AGENTS):
                model.save(SAVE_FOLDER + "/models/A2C_" + ENVIRONMENT_NAME + "_s" + str(step) + "_i" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_TIMESTEPS))

        if(SAVE_FINAL_AGENT):
            model.save(SAVE_FOLDER + "/models/A2C_" + ENVIRONMENT_NAME + "_t" + str(CURRENT_ITERATION) + "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    return data
Example n. 26
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

os.environ['CUDA_VISIBLE_DEVICES'] = '1'
tstart = time.time()

env = ToyEnv(
    train=True,
    log_dir=log_dir,
)
env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=float(0.5) *
                                            np.ones(n_actions))

model = DDPG(env=env,
             policy=FeedForwardCust3Policy,
             verbose=1,
             param_noise=param_noise,
             action_noise=action_noise)

model.learn(total_timesteps=int(5e6), callback=callback)
model.save(log_dir + "last_model")

print('Time taken: {:.2f}'.format(time.time() - tstart))
Example n. 27
def main(_algo_name, _algo_tag, _tag_suffix, _save_freq, _lock_rotation, _eval_num, _eval_freq, hyperparams):
    rotation_tag = "_LOCKED_ROT_" if _lock_rotation else "_ROTATION_"
    full_tag = _algo_name + rotation_tag + _algo_tag + _tag_suffix
    current_dir = _algo_name + "/" + full_tag
    log_dir = current_dir + "/log/"
    eval_log_dir = current_dir + "/log/eval/"
    trained_models_dir = current_dir + "/models/"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(eval_log_dir, exist_ok=True)
    os.makedirs(trained_models_dir, exist_ok=True)

    is_discrete = (_algo_name == 'DQN')

    panda_env = HERGoalEnvWrapper(CustomMonitor(get_environment(_lock_rotation=_lock_rotation,
                                                                _is_discrete=is_discrete), log_dir))
    eval_env = HERGoalEnvWrapper(CustomMonitor(get_environment(_lock_rotation=_lock_rotation,
                                                               _is_discrete=is_discrete), eval_log_dir))

    callbacks = []
    callbacks.append(CheckpointCallback(_save_freq, trained_models_dir)) if _save_freq > 0 else None
    callbacks.append(MeanHundredEpsTensorboardCallback(log_dir))
    callbacks.append(StdHundredEpsTensorboardCallback(log_dir))
    callbacks.append(SuccessRateTensorboardCallback(log_dir))
    if _algo_name == 'DDPG':
        callbacks.append(SaveOnBestTrainingRewardCallback(10000, log_dir))
    else:
        callbacks.append(EvalCallback(eval_env,
                                      best_model_save_path=trained_models_dir,
                                      log_path=log_dir,
                                      eval_freq=_eval_freq,
                                      deterministic=True,
                                      render=False,
                                      n_eval_episodes=_eval_num)) if _eval_freq > 0 else None

    time_steps = hyperparams.pop('n_timesteps') if hyperparams.get('n_timesteps') is not None else None

    param_noise = None
    action_noise = None
    if hyperparams.get('noise_type') is not None:
        noise_type = hyperparams.pop('noise_type').strip()
        if 'ornstein-uhlenbeck' in noise_type:
            n_actions = panda_env.action_space.shape[-1]
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=float(0.005) * np.ones(n_actions))
        elif 'param_noise' in noise_type:
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)

    # add action noise for DDPG or TD3, in DQN noise as flag already in hyperparams
    if _algo_name == 'DDPG' or _algo_name == 'TD3':
        hyperparams['action_noise'] = action_noise

    # add hyperparams specific only for DDPG
    if _algo_name == 'DDPG':
        hyperparams['param_noise'] = param_noise
        hyperparams['eval_env'] = eval_env

    model = ALGOS[_algo_name](env=panda_env,
                              tensorboard_log="tensorboard/",
                              n_cpu_tf_sess=None,
                              **hyperparams)

    model.learn(total_timesteps=time_steps,
                callback=callbacks,
                tb_log_name=full_tag,
                log_interval=10)

    model.save(current_dir + "/" + full_tag + "_final")
Example n. 28
def main(argv):

    # -j
    numControlledJoints = 7
    # -p
    fixed = False
    # -o
    normalize_observations = False
    # -g
    gamma = 0.9
    # -b
    batch_size = 128
    # -m
    memory_limit = 1000000
    # -r
    normalize_returns = True
    # -t
    timesteps = 10000000

    policy_name = "pushing_policy"

    # COMMAND LINE PARAMS MANAGEMENT:
    try:
        opts, args = getopt.getopt(argv,"hj:p:g:b:m:r:o:t:n:",["j=","p=","g=","b=","m=","r=","o=","t=","n="])
    except getopt.GetoptError:
        print ('train.py -t <timesteps> -j <numJoints> -p <fixedPoseObject> -n <policy_name> -g <gamma> -b <batchsize> -m <memory_limit> -r <norm_ret> -o <norm_obs> -p <policy_name>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('------------------ Default values:')
            print('train.py -t <timesteps: 10000000> -j <numJoints: 7> -p <fixedPoseObject: False> -n <policy_name:"pushing_policy"> -g <gamma: 0.9> -b <batch_size: 16> -m <memory_limit: 1000000> -r <norm_ret: True> -o <norm_obs: False> ')
            print('------------------')
            return 0
        elif opt in ("-j", "--j"):
            if int(arg) > 7:
                print("check dim state")
                return 0
            else:
                numControlledJoints = int(arg)
        elif opt in ("-p", "--p"):
            fixed = bool(arg)
        elif opt in ("-g", "--g"):
            gamma = float(arg)
        elif opt in ("-o", "--o"):
            normalize_observations = bool(arg)
        elif opt in ("-b", "--b"):
            batch_size = int(arg)
        elif opt in ("-m", "--m"):
            memory_limit = int(arg)
        elif opt in ("-r", "--r"):
            normalize_returns = bool(arg)
        elif opt in ("-t", "--t"):
            timesteps = int(arg)
        elif opt in ("-n","--n"):
            policy_name = str(arg)


    discreteAction = 0
    rend = False
    pandaenv = pandaPushGymEnv(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0,
        isDiscrete=discreteAction, numControlledJoints = numControlledJoints,
        fixedPositionObj = fixed, includeVelObs = True)

    n_actions = pandaenv.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))


    pandaenv = DummyVecEnv([lambda: pandaenv])

    model = DDPG(LnMlpPolicy, pandaenv,normalize_observations = normalize_observations, gamma=gamma,batch_size=batch_size,
                memory_limit=memory_limit, normalize_returns = normalize_returns, verbose=1, param_noise=param_noise,
                action_noise=action_noise, tensorboard_log="../pybullet_logs/pandareach_ddpg/", reward_scale = 1)

    print(colored("-----Timesteps:","red"))
    print(colored(timesteps,"red"))
    print(colored("-----Number Joints Controlled:","red"))
    print(colored(numControlledJoints,"red"))
    print(colored("-----Object Position Fixed:","red"))
    print(colored(fixed,"red"))
    print(colored("-----Policy Name:","red"))
    print(colored(policy_name,"red"))
    print(colored("------","red"))
    print(colored("Launch the script with -h for further info","red"))

    model.learn(total_timesteps=timesteps)
    print("Saving model to panda.pkl")
    model.save("../pybullet_logs/pandareach_ddpg/policies"+ policy_name)

    del model # remove to demonstrate saving and loading
Example n. 29
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'), can use multiple noise types by
        separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    start_time = 0
    if rank == 0:
        start_time = time.time()
    model = DDPG(policy=MlpPolicy,
                 env=env,
                 memory_policy=Memory,
                 eval_env=eval_env,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 memory_limit=int(1e6),
                 layer_norm=layer_norm,
                 verbose=2,
                 **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
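
A hedged invocation sketch matching the docstring; any extra keyword (nb_rollout_steps here) is forwarded to the DDPG constructor:

run(env_id='MountainCarContinuous-v0',
    seed=0,
    noise_type='ou_0.2,adaptive-param_0.2',  # comma-separated, as documented
    layer_norm=True,
    evaluation=False,
    nb_rollout_steps=100)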
Example n. 30
from stable_baselines.ddpg.policies import LnMlpPolicy
from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
from stable_baselines import DDPG
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec
import numpy as np

powerenv = ActiveEnv()
powerenv.set_parameters({
    'state_space': ['sun', 'demand', 'imbalance'],
    'reward_terms': ['voltage', 'current', 'imbalance']
})

powerenv = DummyVecEnv([lambda: powerenv])
action_mean = np.zeros(powerenv.action_space.shape)
action_sigma = 0.3 * np.ones(powerenv.action_space.shape)
action_noise = OrnsteinUhlenbeckActionNoise(mean=action_mean,
                                            sigma=action_sigma)

param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                     desired_action_stddev=0.01)

t_steps = 800000
logdir = 'C:\\Users\\vegar\\Dropbox\\Master\\logs'
powermodel = DDPG(
    LnMlpPolicy,
    powerenv,
    verbose=2,
    action_noise=action_noise,
    gamma=0.99,
    #param_noise=param_noise,
    tensorboard_log=logdir,
    memory_limit=int(800000),