コード例 #1
0
    def train_DDPG(self, model_name, model_params=config.DDPG_PARAMS):
        """Train a DDPG agent on ``self.env``, save it, and return it.

        Args:
            model_name: Name used for the TensorBoard sub-directory and for
                the saved model file under ``config.TRAINED_MODEL_DIR``.
            model_params: Mapping providing ``batch_size``, ``buffer_size``,
                ``verbose`` and ``timesteps``.

        Returns:
            The trained DDPG model.
        """
        from stable_baselines3 import DDPG
        from stable_baselines3.common.noise import (
            NormalActionNoise,
            OrnsteinUhlenbeckActionNoise,
        )

        training_env = self.env

        # Zero-mean Gaussian exploration noise, one component per action dim.
        action_dim = training_env.action_space.shape[-1]
        exploration_noise = NormalActionNoise(
            mean=np.zeros(action_dim),
            sigma=0.1 * np.ones(action_dim),
        )

        # The timed interval covers model construction as well as learning.
        started_at = time.time()
        agent = DDPG(
            'MlpPolicy',
            training_env,
            batch_size=model_params['batch_size'],
            buffer_size=model_params['buffer_size'],
            action_noise=exploration_noise,
            verbose=model_params['verbose'],
            tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{model_name}",
        )
        agent.learn(
            total_timesteps=model_params['timesteps'],
            tb_log_name="DDPG_run",
        )
        finished_at = time.time()

        agent.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
        elapsed_minutes = (finished_at - started_at) / 60
        print('Training time (DDPG): ', elapsed_minutes, ' minutes')
        return agent
コード例 #2
0
def ddpg(env,
         hyper,
         policy="MlpPolicy",
         verbose=0,
         tensorboard_log=None,
         seed=0,
         use_sde=True,
         device="auto"):
    """Build (but do not train) a DDPG model from tuned hyperparameters.

    Args:
        env: Environment whose ``action_space.shape[0]`` gives the number of
            action dimensions used for the exploration noise.
        hyper: Mapping of tuned values. The keys read here are
            ``params_gamma``, ``params_lr``, ``params_batch_size``,
            ``params_buffer_size`` and ``params_train_freq``;
            ``params_action_noise`` is read from the mapping returned by
            ``action_noise(...)``.
        policy: Policy passed to ``DDPG``. It used to be ignored in favour of
            a hard-coded ``'MlpPolicy'``; the default keeps that behaviour.
        verbose: Verbosity forwarded to ``DDPG``.
        tensorboard_log: TensorBoard directory forwarded to ``DDPG``.
        seed: Random seed forwarded to ``DDPG``.
        use_sde: Accepted but not forwarded to ``DDPG`` by this function.
        device: Device forwarded to ``DDPG``.

    Returns:
        The constructed ``DDPG`` model.
    """
    policy_kwargs = make_policy_kwargs(hyper, "ddpg")
    hyper = action_noise(hyper, "ddpg", n_actions=env.action_space.shape[0])

    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy, so the builtin is used for the integer casts.
    return DDPG(
        policy,
        env,
        verbose=verbose,
        tensorboard_log=tensorboard_log,
        seed=seed,
        gamma=hyper['params_gamma'],
        learning_rate=hyper['params_lr'],
        batch_size=int(hyper['params_batch_size']),
        buffer_size=int(hyper['params_buffer_size']),
        action_noise=hyper['params_action_noise'],
        train_freq=hyper['params_train_freq'],
        policy_kwargs=policy_kwargs,
        device=device,
    )
コード例 #3
0
 def create_model(env, algorithm, save_path):
     """Build an untrained off-policy agent of the requested type.

     Args:
         env: Environment; the last entry of ``action_space.shape`` sets the
             size of the exploration noise.
         algorithm: One of ``"ddpg"``, ``"td3"`` or ``"sac"``.
         save_path: Directory passed to the model as ``tensorboard_log``.

     Returns:
         The constructed DDPG, TD3 or SAC model.

     Raises:
         Exception: If ``algorithm`` is none of the supported names.
     """
     # Ornstein-Uhlenbeck exploration noise shared by every agent type.
     action_dim = env.action_space.shape[-1]
     ou_noise = OrnsteinUhlenbeckActionNoise(
         mean=np.zeros(action_dim),
         sigma=float(0.2) * np.ones(action_dim),
         theta=0.15,
     )

     if algorithm == "ddpg":
         return DDPG(
             DDPG_MlpPolicy,
             env,
             learning_rate=0.001,
             buffer_size=1000000,
             batch_size=64,
             tau=0.001,
             gamma=0.99,
             train_freq=(10, "step"),
             action_noise=ou_noise,
             policy_kwargs={"optimizer_class": th.optim.AdamW},
             tensorboard_log=save_path,
         )

     if algorithm == "td3":
         return TD3(
             TD3_MlpPolicy,
             env,
             action_noise=ou_noise,
             tensorboard_log=save_path,
         )

     if algorithm == "sac":
         return SAC(
             SAC_MlpPolicy,
             env,
             action_noise=ou_noise,
             tensorboard_log=save_path,
         )

     raise Exception("--> Alican's LOG: Unknown agent type!")
コード例 #4
0
ファイル: zz_rolling.py プロジェクト: Tinky2013/Paper-Reading
def train():
    """Train the agent selected by ``PARAM['algo']`` and save the final model.

    Training and validation environments are each wrapped in ``Monitor`` and
    a single-environment ``DummyVecEnv``. An ``EvalCallback`` runs during
    learning, its best result is printed afterwards, and the final model is
    saved under ``model_save/<MODEL_PATH>/``.

    Raises:
        ValueError: If ``PARAM['algo']`` is not ``'td3'``, ``'ddpg'`` or
            ``'ppo'``.
    """
    algo = PARAM['algo']
    if algo not in ('td3', 'ddpg', 'ppo'):
        # An unknown name used to fall through every branch and crash later
        # on an unbound ``model``; fail before creating any files or envs.
        raise ValueError(
            f"Unsupported PARAM['algo'] {algo!r}; expected 'td3', 'ddpg' or 'ppo'"
        )

    save_path = "model_save/"+MODEL_PATH+"/"
    os.makedirs(save_path, exist_ok=True)
    log_dir = save_path

    raw_train, raw_eval = ENV(util='train', par=PARAM, dt=DT), ENV(util='val', par=PARAM, dt=DT)
    # NOTE(review): both monitors are given the same log_dir -- confirm they
    # do not write to the same log file.
    mon_train, mon_eval = Monitor(raw_train, log_dir), Monitor(raw_eval, log_dir)
    env = DummyVecEnv([lambda: mon_train])
    env_eval = DummyVecEnv([lambda: mon_eval])

    if algo == 'td3':
        model = TD3('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'],
                    learning_starts=PARAM['learning_starts'])
    elif algo == 'ddpg':
        model = DDPG('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'],
                     learning_starts=PARAM['learning_starts'])
    else:  # 'ppo' -- no learning_starts is passed for this algorithm
        model = PPO('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'])

    # NOTE(review): ``save_freq`` and the ``best_mean_reward_*`` attributes
    # read below presumably come from a project-specific EvalCallback -- confirm.
    eval_callback = EvalCallback(env_eval, best_model_save_path=save_path+MODEL_PATH+'_best_model',
                                 log_path=log_dir, eval_freq=PARAM['eval_freq'], save_freq=PARAM['save_freq'],
                                 deterministic=True, render=False)

    model.learn(total_timesteps=int(PARAM['total_time_step']), callback=eval_callback, log_interval=500)
    print("best mean reward:", eval_callback.best_mean_reward_overall, "timesteps:", eval_callback.best_mean_reward_timestep)
    model.save(save_path+MODEL_PATH+'_final_timesteps')
コード例 #5
0
def main():
    """Train a DDPG agent on ``panda-ip-reach-v0`` and save it.

    The environment is wrapped in a single-environment ``DummyVecEnv`` and
    exploration uses Ornstein-Uhlenbeck action noise. After learning, the
    model is saved as ``ddpg_panda_reach``.
    """
    # Disabled alternative: vectorized env across several processes.
    #   num_cpu = 4  # Number of processes to use
    #   my_env_kwargs={'renders': False}
    #   env = make_vec_env('panda-ip-reach-v0', n_envs=num_cpu, env_kwargs=my_env_kwargs)

    # Simple single-environment Dummy vec env.
    panda_env = gym.envs.make('panda-ip-reach-v0', renders=False)
    env = DummyVecEnv([lambda: panda_env])

    # check_env(pandaenv)

    # Exploration noise for DDPG, one component per action dimension.
    action_dim = env.action_space.shape[-1]
    print("n_actions = {0}".format(action_dim))

    # Disabled alternative:
    #   NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=0.1 * np.ones(action_dim),
    )

    ddpg_settings = dict(
        policy='MlpPolicy',
        env=env,
        learning_rate=0.001,
        buffer_size=1000000,
        learning_starts=100,
        batch_size=100,
        tau=0.005,
        gamma=0.99,
        train_freq=1,
        gradient_steps=-1,
        action_noise=ou_noise,
        optimize_memory_usage=False,
        tensorboard_log="./ddpg_panda_reach_tensorboard/",
        create_eval_env=False,
        policy_kwargs=None,
        verbose=1,
        seed=None,
        device='auto',
        _init_setup_model=True,
    )
    model = DDPG(**ddpg_settings)

    # Disabled: evaluation before learning.
    #   mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=1)

    print("start model learning !")
    model.learn(total_timesteps=200000, log_interval=10)
    print("end model learning !")

    # The "saved" message is printed before the save call, as in the original.
    print("-> model saved !!")
    model.save("ddpg_panda_reach")

    # Disabled: evaluation after learning.
    #   mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=1)
    """
def objective(trial):
    """Score one hyperparameter trial for DDPG.

    Samples a noise scale in [0.1, 0.8] and a training length of 10 to 100
    thousand timesteps from ``trial``, trains a DDPG agent on the
    module-level ``env`` and returns the result of ``test_model``.

    Args:
        trial: Trial object providing ``suggest_uniform`` and ``suggest_int``.

    Returns:
        Whatever ``test_model(env, model, '')`` returns.
    """
    sigma_scale = trial.suggest_uniform('Noise', 0.1, 0.8)
    kilo_steps = trial.suggest_int('Timesteps', 10, 100)

    action_dim = env.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=float(sigma_scale) * np.ones(action_dim),
    )

    agent = DDPG('MlpPolicy', env, action_noise=ou_noise)
    agent.learn(total_timesteps=kilo_steps * 1000, log_interval=1000)

    return test_model(env, agent, '')
コード例 #7
0
def train_ddpg():
    """Train a CNN-policy DDPG agent and save it as ``model_save/ddpg_cnn``.

    The training environment is wrapped in ``Monitor`` (logging to
    ``model_save/``) and a single-environment ``DummyVecEnv``. A
    ``SaveOnBestTrainingRewardCallback`` is attached with a check frequency
    of 480 steps.
    """
    log_dir = "model_save/"

    monitored_env = Monitor(ENV(istest=False), log_dir)
    vec_env = DummyVecEnv([lambda: monitored_env])
    # Observation/reward normalisation is intentionally left disabled:
    # vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.)

    agent = DDPG(
        "CnnPolicy",
        vec_env,
        policy_kwargs=policy_kwargs,
        verbose=1,
        batch_size=2048,
        seed=1,
        learning_starts=500000,
    )
    best_reward_cb = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    agent.learn(total_timesteps=1000000, callback=best_reward_cb, log_interval=480)
    agent.save('model_save/ddpg_cnn')
コード例 #8
0
    def __init__(self, env, hyperparameters=DEFAULT_HYPERPARAMETERS):
        """Instantiate the Stable-Baselines3 model named in the hyperparameters.

        Args:
            env: Environment handed to the model constructor.
            hyperparameters: Mapping stored as ``self.P``. ``model_class``
                selects the algorithm (``dqn``, ``a2c``, ``ddpg``, ``td3``,
                ``ppo`` or ``sac``) and ``verbose`` is forwarded to it.

        Raises:
            NotImplementedError: If ``model_class`` is not a supported name.
        """
        self.P = hyperparameters
        requested = self.P["model_class"]
        extra_kwargs = {}

        if requested == "dqn":
            from stable_baselines3 import DQN
            algo_cls, policy = DQN, 'MlpPolicy'

        elif requested == "a2c":
            from stable_baselines3 import A2C
            from stable_baselines3.a2c import MlpPolicy
            algo_cls, policy = A2C, MlpPolicy

        elif requested in ("ddpg", "td3"):
            from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
            if requested == "ddpg":
                from stable_baselines3 import DDPG
                algo_cls, policy = DDPG, 'MlpPolicy'
            else:
                from stable_baselines3 import TD3
                from stable_baselines3.td3.policies import MlpPolicy
                algo_cls, policy = TD3, MlpPolicy
            # Both algorithms get the same zero-mean Gaussian action noise.
            action_dim = env.action_space.shape[-1]
            extra_kwargs["action_noise"] = NormalActionNoise(
                mean=np.zeros(action_dim),
                sigma=0.1 * np.ones(action_dim),
            )

        elif requested == "ppo":
            from stable_baselines3 import PPO
            from stable_baselines3.ppo import MlpPolicy
            algo_cls, policy = PPO, MlpPolicy

        elif requested == "sac":
            from stable_baselines3 import SAC
            from stable_baselines3.sac import MlpPolicy
            algo_cls, policy = SAC, MlpPolicy

        else:
            raise NotImplementedError()

        self.model = algo_cls(policy, env, verbose=self.P["verbose"], **extra_kwargs)
        self.model_class = algo_cls
コード例 #9
0
def train_DDPG(env_train, model_name, timesteps=10000):
    """Train a DDPG agent with Ornstein-Uhlenbeck noise, save it, and return it.

    Args:
        env_train: Training environment.
        model_name: File name for the saved model under
            ``config.TRAINED_MODEL_DIR``.
        timesteps: Total number of training timesteps.

    Returns:
        The trained DDPG model.
    """
    # Exploration noise, one component per action dimension.
    action_dim = env_train.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=float(0.5) * np.ones(action_dim),
    )

    # The timed interval covers model construction as well as learning.
    started_at = time.time()
    agent = DDPG('MlpPolicy', env_train, action_noise=ou_noise)
    agent.learn(total_timesteps=timesteps)
    finished_at = time.time()

    agent.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    elapsed_minutes = (finished_at - started_at) / 60
    print('Training time (DDPG): ', elapsed_minutes, ' minutes')
    return agent
コード例 #10
0
    def create(self, n_envs=1):
        """Create the DDPG agent and store it on ``self.model``.

        Wraps the helper's environment in a ``Monitor`` that logs to the
        helper's ``config_dir``, resolves the policy named in the config and
        builds a DDPG model from the configured hyperparameters.

        Args:
            n_envs: Not used by this method.
        """
        helper = self.agent_helper

        self.env = helper.env
        monitor_dir = helper.config_dir
        os.makedirs(monitor_dir, exist_ok=True)
        self.env = Monitor(self.env, monitor_dir)

        cfg = helper.config

        # NOTE(review): eval() executes whatever string the config holds; it
        # is only acceptable while the config file is trusted.
        self.policy = eval(cfg['policy'])

        # Gaussian exploration noise scaled by the configured sigma.
        action_dim = int(helper.env.action_space.shape[0])
        exploration_noise = NormalActionNoise(
            mean=np.zeros(action_dim),
            sigma=cfg['rand_sigma'] * np.ones(action_dim),
        )

        # TODO: verify that the observation and action spaces passed to the
        # model are correct.
        logger.info("Create the DDPG model")
        net_kwargs = dict(net_arch=cfg['layers'])
        self.model = DDPG(
            self.policy,
            self.env,
            learning_rate=cfg['learning_rate'],
            buffer_size=cfg['buffer_size'],
            batch_size=cfg['batch_size'],
            tau=cfg['tau'],
            gamma=cfg['gamma'],
            gradient_steps=cfg['gradient_steps'],
            action_noise=exploration_noise,
            optimize_memory_usage=cfg['optimize_memory_usage'],
            create_eval_env=cfg['create_eval_env'],
            policy_kwargs=net_kwargs,
            verbose=cfg['verbose'],
            learning_starts=cfg['learning_starts'],
            tensorboard_log=helper.graph_path,
            seed=helper.seed,
        )
コード例 #11
0
def main():
    """Train a DDPG agent on the VIX trading environment.

    The environment is wrapped in a ``Monitor`` that logs to ``./ddpg_data``
    and a ``CustomCallback`` with a check frequency of 20000 is attached.
    """
    log_dir = './ddpg_data'
    os.makedirs(log_dir, exist_ok=True)

    monitored_env = Monitor(trading_vix_env.trading_vix_env(), log_dir)

    # DDPG's policy is deterministic, so exploration comes from action noise.
    action_dim = monitored_env.action_space.shape[-1]
    gaussian_noise = NormalActionNoise(
        mean=np.zeros(action_dim),
        sigma=0.1 * np.ones(action_dim),
    )

    periodic_check = custom_call_back.CustomCallback(check_freq=20000, log_dir=log_dir)

    agent = DDPG(
        'MlpPolicy',
        monitored_env,
        action_noise=gaussian_noise,
        verbose=2,
        batch_size=10000,
    )
    agent.learn(total_timesteps=int(5e9), callback=periodic_check)
コード例 #12
0
def train_DDPG(env_train, model_name, timesteps=10000):
    """Train a DDPG agent with Ornstein-Uhlenbeck noise, save it, and return it.

    Args:
        env_train: Training environment.
        model_name: File name for the saved model under
            ``config.TRAINED_MODEL_DIR``.
        timesteps: Total number of training timesteps.

    Returns:
        The trained DDPG model.
    """
    # Exploration noise, one component per action dimension.
    n_actions = env_train.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=0.5 * np.ones(n_actions),
    )

    # The unused ``param_noise`` local was dropped: it was never passed to
    # DDPG. The timed interval covers construction as well as learning.
    start = time.time()
    model = DDPG('MlpPolicy', env_train, action_noise=action_noise)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model
コード例 #13
0
def train():
    """Train a DDPG agent from ``PARAM`` and save it under ``model_save/``.

    The training environment is wrapped in ``Monitor`` (logging to
    ``model_save/``) and a single-environment ``DummyVecEnv``. A
    ``SaveOnBestTrainingRewardCallback`` is attached with a check frequency
    of 480 steps.
    """
    log_dir = "model_save/"

    monitored_env = Monitor(ENV(istest=False), log_dir)
    vec_env = DummyVecEnv([lambda: monitored_env])
    # Observation/reward normalisation is intentionally left disabled:
    # vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.)

    agent = DDPG(
        'MlpPolicy',
        vec_env,
        verbose=1,
        batch_size=PARAM['batch_size'],
        seed=PARAM['seed'],
        learning_starts=PARAM['learning_starts'],
    )
    best_reward_cb = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    agent.learn(
        total_timesteps=int(PARAM['total_time_step']),
        callback=best_reward_cb,
        log_interval=480,
    )
    agent.save('model_save/' + MODEL_PATH)
コード例 #14
0
def train_DDPG(env):
    """Train a DDPG agent on ``env`` and save it as ``DDPG_pkl``.

    Args:
        env: Environment; the last entry of ``action_space.shape`` sets the
            size of the exploration noise.
    """
    print(f"action space shape -1:{env.action_space.shape[-1]}")

    # Gaussian exploration noise for DDPG, one component per action dimension.
    action_dim = env.action_space.shape[-1]
    gaussian_noise = NormalActionNoise(
        mean=np.zeros(action_dim),
        sigma=0.02 * np.ones(action_dim),
    )

    # NOTE(review): ``n_episodes_rollout`` is presumably tied to the installed
    # stable-baselines3 version -- confirm it is still accepted.
    ddpg_settings = dict(
        learning_rate=0.0003,
        learning_starts=5,
        train_freq=10,
        n_episodes_rollout=-1,
        buffer_size=100000,
        action_noise=gaussian_noise,
        batch_size=128,
        verbose=2,
    )
    agent = DDPG('MlpPolicy', env, **ddpg_settings)
    agent.learn(total_timesteps=1000000, log_interval=1)

    agent.save("DDPG_pkl")
コード例 #15
0
ファイル: DRL_battery.py プロジェクト: jajimer/energym
                        seed=args.seed,
                        tensorboard_log=args.tensorboard)
        #--------------------------------------------------------#
        #                           DDPG                         #
        #--------------------------------------------------------#
        elif args.algorithm == 'DDPG':
            if args.sigma:
                # noise objects for DDPG
                n_actions = env.action_space.shape[-1]
                action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                                 sigma=0.1 *
                                                 np.ones(n_actions))

            model = DDPG("MlpPolicy",
                         env,
                         action_noise=action_noise,
                         verbose=1,
                         seed=args.seed,
                         tensorboard_log=args.tensorboard)
        #--------------------------------------------------------#
        #                           A2C                          #
        #--------------------------------------------------------#
        elif args.algorithm == 'A2C':
            model = A2C('MlpPolicy',
                        env,
                        verbose=1,
                        learning_rate=args.learning_rate,
                        n_steps=args.n_steps,
                        gamma=args.gamma,
                        gae_lambda=args.gae_lambda,
                        ent_coef=args.ent_coef,
                        vf_coef=args.vf_coef,
コード例 #16
0
        return True


# Script entry point: train a DDPG agent on the SPM environment and report
# its mean evaluation reward.
if __name__ == '__main__':
    # Instantiate Environment
    # NOTE(review): env_id is not used in the visible code; gym.make repeats
    # the literal instead.
    env_id = 'gym_spm:spm-v0'
    env = gym.make('gym_spm:spm-v0')

    # HyperParameters
    # NOTE(review): lr is not passed to DDPG in the visible code.
    lr = 3e-4

    # Instantiate Model
    n_actions = env.action_space.shape[-1]
    # NOTE(review): -30 * np.zeros(...) is still an all-zero mean; if a mean
    # of -30 was intended this presumably should use np.ones -- confirm.
    action_noise = NormalActionNoise(mean=-30 * np.zeros(n_actions),
                                     sigma=.75 * np.ones(n_actions))
    model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=1)
    # model = PPO('MlpPolicy', env, tensorboard_log=log_dir)

    # Train OR Load Model
    model.learn(total_timesteps=25000)

    # model.save(model_dir_description)

    # Evaluate on the environment held by the model, over 10 episodes.
    mean_reward, std_reward = evaluate_policy(model,
                                              model.get_env(),
                                              n_eval_episodes=10)

    print("Mean Reward = ", mean_reward)

    # Empty lists; nothing in the visible code fills them.
    epsi_sp_list = []
    action_list = []
コード例 #17
0
                        verbose=1)
    if ARGS.algo == 'td3':
        model = TD3(td3ddpgMlpPolicy,
                    train_env,
                    policy_kwargs=offpolicy_kwargs,
                    tensorboard_log=filename + '/tb/',
                    verbose=1) if ARGS.obs == ObservationType.KIN else TD3(
                        td3ddpgCnnPolicy,
                        train_env,
                        policy_kwargs=offpolicy_kwargs,
                        tensorboard_log=filename + '/tb/',
                        verbose=1)
    if ARGS.algo == 'ddpg':
        model = DDPG(td3ddpgMlpPolicy,
                     train_env,
                     policy_kwargs=offpolicy_kwargs,
                     tensorboard_log=filename + '/tb/',
                     verbose=1) if ARGS.obs == ObservationType.KIN else DDPG(
                         td3ddpgCnnPolicy,
                         train_env,
                         policy_kwargs=offpolicy_kwargs,
                         tensorboard_log=filename + '/tb/',
                         verbose=1)

    #### Create eveluation environment #########################
    if ARGS.obs == ObservationType.KIN:
        eval_env = gym.make(
            env_name,
            aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS,
            obs=ARGS.obs,
            act=ARGS.act)
コード例 #18
0
    log_dir = "./Logs/DDPG/"
    model_dir = "./Models/DDPG/"

    details = f"Model_v{train_version}_" + description

    log_dir_description = log_dir + details
    model_dir_description = model_dir + details

    # Instantiate Model
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=25.67 * np.ones(n_actions))
    model = DDPG(MlpPolicy,
                 env,
                 action_noise=action_noise,
                 verbose=1,
                 tensorboard_log=log_dir)

    # Train OR Load Model
    if train_model:
        model.learn(total_timesteps=25000, tb_log_name=details)
        model.save(model_dir_description)
    else:
        model.load(model_dir_description)

    mean_reward, std_reward = evaluate_policy(model,
                                              model.get_env(),
                                              n_eval_episodes=10)

    print("Mean Reward = ", mean_reward)
                             deterministic=True,
                             render=False)

### DDPG Noise
### Try increasing the noise when retraining.
### Try less noise based on the policy plot.
# Ornstein-Uhlenbeck exploration noise with unit sigma per action dimension.
n_actions = env.action_space.shape[-1]
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=1 * np.ones(n_actions))
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Fresh DDPG model; TensorBoard logs go to a directory named after
# horizons[rank].
model = DDPG(
    'MlpPolicy',
    env,
    action_noise=action_noise,
    verbose=1,
    tensorboard_log="./h={}/".format(horizons[rank]),
    gamma=0.99,
    learning_rate=0.0003,
)
# Disabled alternative: resume from a saved model and override its settings.
# model = DDPG.load("Model_DDPG_FS_30.zip")
# model.learning_rate = 0.0003
# model.gamma = 0.99
# action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.05*np.ones(n_actions))
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.075 * np.ones(n_actions))
# model.action_noise = action_noise
# Hand the model to the project's Trainer, using the same "./h=<...>/" path
# as the TensorBoard log directory.
trainer = Trainer(env)
trainer.retrain_rl(model,
                   episodes=20000,
                   path="./h={}/".format(horizons[rank]))
コード例 #20
0
# Evaluate the current model over 100 episodes and record it as "A2C".
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
leaderboard("A2C", ENV, mean_reward, std_reward, url)
print("algo:", "A2C", "env:", ENV, "mean reward:", mean_reward, "std:",
      std_reward)

## simulate and plot results for reference
df = env.simulate(model, reps=10)
env.plot(df, "results/a2c.png")
#policy = env.policyfn(model, reps=10)
#env.plot(policy, "results/a2c-policy.png")

## DDPG ######################################################################

# FIXME load best tuned parameters first...

# DDPG with default hyperparameters: train, evaluate over 100 episodes and
# record the result, mirroring the A2C section above.
model = DDPG('MlpPolicy', env, verbose=0, tensorboard_log=tensorboard_log)
model.learn(total_timesteps=300000)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
leaderboard("DDPG", ENV, mean_reward, std_reward, url)
print("algo:", "DDPG", "env:", ENV, "mean reward:", mean_reward, "std:",
      std_reward)

## simulate and plot results for reference
df = env.simulate(model, reps=10)
env.plot(df, "results/ddpg.png")
#policy = env.policyfn(model, reps=10)
#env.plot(policy, "results/ddpg-policy.png")

## SAC #######################################################################

# FIXME read from YAML
コード例 #21
0
import os
import pybullet_envs
import kukakr5Arc

# Train a DDPG agent on the kukakr5Arc-v1 environment, save it, then reload
# it from disk.
env = gym.make('kukakr5Arc-v1')

# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
# NOTE(review): param_noise is only referenced by the commented-out
# constructor call below.
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=float(0.5) *
                                            np.ones(n_actions))

# model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
model = DDPG(MlpPolicy, env, verbose=1, action_noise=action_noise)
model.learn(total_timesteps=400000)
# NOTE(review): the save/load location is an absolute path under one user's
# home directory.
model.save(
    "/home/nightmareforev/git/bullet_stuff/multi_kuka_sim/kukakr5Arc/envs/saved_policies/kukakr5Arc_reacher"
)
print('Saving model.... Model saved')

del model  # remove to demonstrate saving and loading

model = DDPG.load(
    "/home/nightmareforev/git/bullet_stuff/multi_kuka_sim/kukakr5Arc/envs/saved_policies/kukakr5Arc_reacher",
    env=env)
print('Loading model.....Model loaded')

#env.render() goes before env.reset() for the render to work
#env.render()
コード例 #22
0
# Build the exploration noise selected by the tuned hyperparameters.
# The second branch used to test an undefined name ``noise_type``; both
# branches now read the value from ``hyper``.
if hyper["noise_type"] == "normal":
    hyper["action_noise"] = NormalActionNoise(mean=np.zeros(n_actions),
                                              sigma=hyper['noise_std'] *
                                              np.ones(n_actions))
elif hyper["noise_type"] == "ornstein-uhlenbeck":
    hyper["action_noise"] = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=hyper['noise_std'] * np.ones(n_actions))

# DDPG configured from the tuned hyperparameters. A second, default-configured
# DDPG used to be assigned to ``model`` right after this call, which silently
# discarded every tuned setting before training; that overwrite is removed.
# NOTE(review): ``n_episodes_rollout`` is presumably tied to the installed
# stable-baselines3 version -- confirm it is still accepted.
model = DDPG('MlpPolicy',
             env,
             verbose=0,
             tensorboard_log=tensorboard_log,
             seed=seed,
             gamma=hyper['gamma'],
             learning_rate=hyper['lr'],
             batch_size=hyper['batch_size'],
             buffer_size=hyper['buffer_size'],
             action_noise=hyper['action_noise'],
             train_freq=hyper['train_freq'],
             gradient_steps=hyper['train_freq'],
             n_episodes_rollout=hyper['n_episodes_rollout'],
             policy_kwargs=policy_kwargs)
model.learn(total_timesteps=300000)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
# Rescale score against optimum solution in this environment.
# The optimum's own std is discarded so that the agent's ``std_reward`` (not
# the optimum's) is the value being rescaled and reported.
opt = escapement(env)
opt_reward, _ = evaluate_policy(opt, env, n_eval_episodes=100)
mean_reward = mean_reward / opt_reward
std_reward = std_reward / opt_reward
leaderboard("DDPG", ENV, mean_reward, std_reward, url)
コード例 #23
0
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Build PPO, TD3 and DDPG models on the IntelligentPantry-v1 environment;
# only the TD3 model is trained and evaluated in the visible code.
env = gym.make("IntelligentPantry-v1")
#env = gym.make("Reacher-v2")

observation = env.reset()
print(env.action_space)
# NOTE(review): a, b and f are not used in the visible code.
a = 0.45
b = 0.45
f = 1200
log_path = os.path.join('training', 'Logs')
#env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
model3 = TD3("MlpPolicy", env, verbose=1, tensorboard_log=log_path)
model2 = DDPG('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
model3.learn(total_timesteps=500000, log_interval=100)
# NOTE(review): this name shadows the builtin eval() for the rest of the module.
eval = evaluate_policy(model3, env, n_eval_episodes=20, render=True)
# Disabled: random-action rollout loop for manually inspecting the environment.
# episodes = 5
# for episode in range(1, episodes+1):
#     state = env.reset()
#     done = False
#     score = 0
#
#     while not done:
#         env.render()
#         action = env.action_space.sample()
#         n_state, reward, done, info = env.step(action)
#         score += reward
#     print("Episode:{} Score:{}".format(episode, score))
# env.close()
コード例 #24
0
# Script entry point: trains and evaluates a DDPG agent on the gym_spm
# environment, then repeats the exercise on a directly constructed SPMenv.
if __name__ == '__main__':
    # NOTE(review): env_id and num_cpu are only referenced by commented-out
    # code in the visible part of this block.
    env_id = 'gym_spm:spm-v0'
    num_cpu = 4  # Number of processes to use

    env = gym.make('gym_spm:spm-v0')
    # env = make_vec_env(env_id, n_envs=1, seed=0)
    # env = VecCheckNan(env, raise_exception=True)
    # env = check_env(env)

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=.1 * np.ones(n_actions))

    # model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./TD3_spm_v2_SOC_point5_two_state/")
    model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./DDPG_spm_v2_SOC_point5_two_state/")
    model.learn(total_timesteps=25000, tb_log_name='DDPG_test_run_3_SOCpoint5_two_state')
    # model.save('DDPG_test_3_SOC_point5_two_states')
    #
    #
    # model.load('DDPG_test_2_SOC_point5_two_states')
    # Evaluate on the environment held by the model, over 10 episodes.
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    
    print("Mean Reward = ", mean_reward)
    
    # Empty lists; nothing in the visible code fills them.
    epsi_sp_list = []
    action_list = []
    soc_list = []
    Concentration_list = []
    Concentration_list1 = []
    
    # Second run: a directly constructed SPMenv replaces the gym.make env.
    env = SPMenv()

    # HyperParameters
    # NOTE(review): lr is not passed to DDPG in the visible code.
    lr = 3e-4

    # NOTE(review): "DDGP" in the file name looks like a typo for "DDPG";
    # left unchanged because the string determines the saved file's name.
    model_name = "DDGP_2.pt"
    model_path = "./Model/" + model_name

    # Instantiate Model
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=.75 * np.ones(n_actions))
    # NOTE(review): n_episodes_rollout is presumably tied to the installed
    # stable-baselines3 version -- confirm it is still accepted.
    model = DDPG(MlpPolicy,
                 env,
                 action_noise=action_noise,
                 verbose=1,
                 train_freq=25000,
                 n_episodes_rollout=-1)
    # model = DDPG(MlpPolicy, env, verbose=1, train_freq=2500, n_episodes_rollout=-1)

    # wandb.watch(model)

    # Train OR Load Model
    model.learn(total_timesteps=25000)
    # Turn off the environment's log_state flag once training is done.
    env.log_state = False

    model.save(model_path)

    mean_reward, std_reward = evaluate_policy(model,
                                              model.get_env(),
                                              n_eval_episodes=10)
コード例 #26
0
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

from stable_baselines3.common.evaluation import evaluate_policy

# Train a DDPG agent on Pendulum-v0: evaluate the untrained model, train it
# for 10000 timesteps, then save it.
env = gym.make('Pendulum-v0')

# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
#action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.1 * np.ones(n_actions))

model = DDPG('MlpPolicy',
             env,
             action_noise=action_noise,
             verbose=1,
             tensorboard_log="./ddpg_pendulum_tensorboard/")

# Baseline: score of the untrained model over 100 episodes.
print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model,
                                                        env,
                                                        n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=10)
print("end model learning !")

# NOTE: the "saved" message is printed before model.save() is called.
print("-> model saved !!")
model.save("ddpg_pendulum")
コード例 #27
0
    check_env(env, warn=True, skip_render_check=True)

    ####
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(env.N_ACTIONS),
                                                sigma=0.1 *
                                                np.ones(env.N_ACTIONS),
                                                dt=0.005)

    #### Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)

    #### Train the model ###############################################################################
    model = DDPG(CustomPolicy,
                 env,
                 verbose=1,
                 batch_size=64,
                 action_noise=action_noise)

    for i in range(step_iters):  # run for step_iters * training_timesteps

        model.learn(total_timesteps=training_timesteps)

        model.save("./models/ddpg" + str((i + 1) * training_timesteps))
        model.save_replay_buffer("./experiences/ddpg_experience" +
                                 str((i + 1) * training_timesteps))

        #### Show (and record a video of) the model's performance ##########################################
        env_test = RLTetherAviary(gui=False, record=True)
        obs = env_test.reset()
        start = time.time()
コード例 #28
0
def run_model_stablebaseline(flow_params,
                             num_cpus=1,
                             rollout_size=50,
                             num_steps=50,
                             algorithm="ppo",
                             exp_config=None):
    """Train a Stable-Baselines3 model for num_steps steps.

    Parameters
    ----------
    flow_params : dict
        flow-specific parameters
    num_cpus : int
        number of CPUs used during training; one environment is created
        per CPU
    rollout_size : int
        length of a single rollout (passed as ``n_steps`` to PPO and as
        ``n_episodes_rollout`` to DDPG)
    num_steps : int
        total number of training steps
    algorithm : str
        algorithm to train, "ppo" or "ddpg" (matched case-insensitively)
    exp_config : str or None
        name of the experiment configuration; 'singleagent_figure_eight'
        selects a dedicated set of DDPG hyper-parameters, any other value
        uses the default DDPG set. Ignored for PPO.

    Returns
    -------
    stable_baselines3.*
        the trained model

    Raises
    ------
    ValueError
        if ``algorithm`` is neither "ppo" nor "ddpg"
    """
    # Normalise the name so that the default ("ppo") matches, and reject
    # unknown algorithms before any environment (or subprocess) is created.
    algo = str(algorithm).upper()
    if algo not in ("PPO", "DDPG"):
        raise ValueError(
            "Unsupported algorithm {!r}; expected 'ppo' or 'ddpg'.".format(
                algorithm))

    from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

    if num_cpus == 1:
        # env_constructor returns a factory; call it once and hand the
        # resulting environment to the vectorized wrapper.
        single_env = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: single_env])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    if algo == "PPO":
        from stable_baselines3 import PPO
        train_model = PPO('MlpPolicy', env, verbose=1, n_steps=rollout_size)
        train_model.learn(total_timesteps=num_steps)
        print("Learning Process is Done.")
        return train_model

    # algo == "DDPG"
    from stable_baselines3 import DDPG
    from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
    import numpy as np

    # Only these three settings differ between the two experiment setups.
    if exp_config == 'singleagent_figure_eight':
        learning_starts, batch_size, device = 3000, 128, 'cuda'
    else:
        learning_starts, batch_size, device = 1200, 512, 'cpu'

    train_model = DDPG(
        'MlpPolicy',
        env,
        verbose=1,
        n_episodes_rollout=rollout_size,
        learning_starts=learning_starts,
        learning_rate=0.0001,
        # The noise vectors are hard-coded to a single action dimension.
        action_noise=OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(1),
            sigma=0.15 * np.ones(1),
            initial_noise=0.7 * np.ones(1)),
        tau=0.005,
        batch_size=batch_size,
        tensorboard_log='tensorboard_ddpg',
        device=device,
    )

    # Kept from the original code; the callback itself is currently not
    # passed to learn().
    from tensorboard_baselines.callbacks_ddpg import TensorboardCallback
    train_model.learn(
        total_timesteps=num_steps,
        log_interval=2,
        eval_log_path='ddpg_log',
        # The original call passed eval_freq twice (2, then 10), which is a
        # SyntaxError; the later value is kept.
        eval_freq=10,
    )
    print("Learning Process is Done.")
    return train_model
コード例 #29
0
# A2C: train, save and test one fresh model per test run.
for i in range(n_tests):
    n_actions = env.action_space.shape[-1]
    test_name = f'saved_models/a2c_soccer_actions_env_1_{i}'
    model = A2C(policy='MlpPolicy', env=env)
    model.learn(log_interval=1000, total_timesteps=25000)
    model.save(test_name)
    test_model(env, model, test_name)

# DDPG with the default network: train, save and test one fresh model per run.
for i in range(n_tests):
    n_actions = env.action_space.shape[-1]
    test_name = f'saved_models/ddpg_soccer_actions_env_1_{i}'
    # Zero-mean Ornstein-Uhlenbeck exploration noise, sigma 0.3 per action.
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=np.full(n_actions, 0.3),
    )
    model = DDPG(policy='MlpPolicy', env=env, action_noise=action_noise)
    model.learn(log_interval=1000, total_timesteps=10000)
    model.save(test_name)
    test_model(env, model, test_name)

# DDPG with an explicit [400, 300] network architecture: same procedure as
# the other runs, saved under the "env_2" prefix.
for i in range(n_tests):
    n_actions = env.action_space.shape[-1]
    test_name = f'saved_models/ddpg_soccer_actions_env_2_{i}'
    # Zero-mean Ornstein-Uhlenbeck exploration noise, sigma 0.3 per action.
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=np.full(n_actions, 0.3),
    )
    policy_kwargs = {'net_arch': [400, 300]}
    model = DDPG(
        policy='MlpPolicy',
        env=env,
        action_noise=action_noise,
        policy_kwargs=policy_kwargs,
    )
    model.learn(log_interval=1000, total_timesteps=10000)
    model.save(test_name)
    test_model(env, model, test_name)

for i in range(n_tests):
コード例 #30
0
ファイル: train_push.py プロジェクト: qgallouedec/panda-gym
import gym
import panda_gym
from stable_baselines3 import DDPG, HerReplayBuffer

# Create the pushing task (presumably registered with gym by the
# panda_gym import above).
env = gym.make("PandaPush-v2")

# DDPG agent using the multi-input policy and a HER replay buffer.
model = DDPG(
    "MultiInputPolicy",
    env,
    verbose=1,
    replay_buffer_class=HerReplayBuffer,
)

# Train for 100k environment steps.
model.learn(total_timesteps=100000)