Example No. 1
    def load(self, path, env):
        # Load a saved model with the stable-baselines loader that matches the
        # selected algorithm.
        if self.trpo():
            return TRPO.load(path, env=env)
        elif self.ppo():
            return PPO2.load(path, env=env)
        else:
            return SAC.load(path, env=env)
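For reference, the SAC branch above resolves to the standard stable-baselines classmethod loader; a minimal self-contained sketch (the environment id and checkpoint path are assumptions, not part of the original snippet):

import gym
from stable_baselines import SAC

# SAC.load() is a classmethod: it rebuilds the model from the saved .zip file
# and attaches the given environment. The path below is hypothetical.
env = gym.make("Pendulum-v0")
model = SAC.load("models/saved_model.zip", env=env)
action, _states = model.predict(env.reset(), deterministic=True)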
Example No. 2
    def create_learner(self, env, parameters):
        # TRPO and PPO2 expect a vectorized environment, so wrap a plain gym
        # env in a DummyVecEnv when needed (a standalone sketch of this
        # wrapping follows the example).
        if (self.trpo() or self.ppo()) and not issubclass(type(env), VecEnv):
            env = DummyVecEnv([lambda: env])

        if self.trpo():
            model = TRPO(MlpPolicy, env, **parameters["common"],
                         **parameters[str(self)])
            interface = TRPOInterface(model, env.observation_space.shape[0])
        elif self.ppo():
            model = PPO2(MlpPolicy, env, **parameters["common"],
                         **parameters[str(self)])
            interface = PPOInterface(model, env.observation_space.shape[0])
        else:
            model = SAC(SACMlpPolicy, env, **parameters["common"],
                        **parameters[str(self)])
            interface = SACInterface(model, env.observation_space.shape[0])

        if "pretrain_data_path" in parameters:
            data_path = parameters["pretrain_data_path"]
            model.pretrain(ExpertDataset(expert_path=data_path, verbose=0),
                           n_epochs=25)

        return model, interface
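The DummyVecEnv wrapping used above is the standard stable-baselines pattern for algorithms that require vectorized environments; a self-contained sketch (the environment id is an assumption):

import gym
from stable_baselines.common.vec_env import DummyVecEnv

# Wrap a single gym environment in a one-env DummyVecEnv, as create_learner()
# does for TRPO and PPO2.
vec_env = DummyVecEnv([lambda: gym.make("Pendulum-v0")])
print(vec_env.reset().shape)  # observations come back batched: (1, obs_dim)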
        model.learn(total_timesteps=10000, tb_log_name="tb/AVEC-PPO")
        # model.learn(total_timesteps=1000000, tb_log_name="tb/PPO")

        ######################## TRPO ###########################
        log_dir = "./logs/%s/AVEC-TRPO_%s" % (env_id, seed)
        # log_dir = "./logs/%s/TRPO_%s" % (env_id, seed)
        os.makedirs(log_dir, exist_ok=True)
        env = make_vec_env(env_id, 1, seed, monitor_dir=log_dir)
        model = TRPO('MlpPolicy',
                     env,
                     verbose=1,
                     avec_coef=1.,
                     vf_coef=0.,
                     tensorboard_log=log_dir)
        model.learn(total_timesteps=10000, tb_log_name="tb/AVEC-TRPO")
        # model.learn(total_timesteps=1000000, tb_log_name="tb/TRPO")

        ######################### SAC #############################
        log_dir = "./logs/%s/AVEC-SAC_%s" % (env_id, seed)
        # log_dir = "./logs/%s/SAC_%s" % (env_id, seed)
        os.makedirs(log_dir, exist_ok=True)
        env = make_vec_env(env_id, 1, seed, monitor_dir=log_dir)
        model = SAC('CustomSACPolicy',
                    env,
                    verbose=1,
                    avec_coef=1.,
                    value_coef=0.,
                    tensorboard_log=log_dir)
        model.learn(total_timesteps=10000, tb_log_name="tb/AVEC-SAC")
        # model.learn(total_timesteps=1000000, tb_log_name="tb/SAC")
Example No. 4
import gym
import gym_handOfJustice
import cv2
import tensorflow as tf
from stable_baselines.sac import SAC
from stable_baselines.sac.policies import LnCnnPolicy
import os

strea = cv2.VideoCapture(os.getcwd() + "\\dataset\\%06d.png")
if not strea.isOpened():
    raise Exception("Problem exporting the video stream")
env = gym.make("handOfJustice-v0", cap=strea, epsilon=300)
#tf.test.is_gpu_available()
model = SAC(LnCnnPolicy,
            env,
            verbose=1,
            tensorboard_log=os.getcwd() + "\\logs\\",
            full_tensorboard_log=True)
# SAC.load is a classmethod that returns a new model, so calling it on the
# instance above has no effect; to resume from a saved checkpoint, rebind:
# model = SAC.load("handicap_justice", env=env)
model.learn(total_timesteps=100000, log_interval=10)
model.save("handicap_justice")
#model.load("handicap_justice")

import time

time.sleep(3)
print("\n" + ("=" * 20) + "\nTraining complete\n" + ("=" * 20) + "\n\n")
## Now the first image is going to be taken
obs = env.reset()
done = False
i = 45000
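The snippet is truncated here; a generic continuation of such a rollout (a hypothetical sketch, not the original code) could look like:

# Hypothetical continuation: step the env with the model's actions and count
# frames until the episode ends.
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    i += 1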
Example No. 5
def run(env_id, seed, layer_norm, evaluation, agent, delay_step, gamma=0.99, **kwargs):
    # Create envs.
    env = create_env(env_id, delay_step, str(0))
    print(env.observation_space, env.action_space)
    if evaluation:
        eval_env = create_env(env_id, delay_step, "eval_env")
    else:
        eval_env = None

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Record the start time so the total runtime can be reported at the end.
    start_time = time.time()

    policy = 'MlpPolicy'
    td3_variants = {
        "TD3": TD3,
        "TD3SIL": TD3SIL,
        "TD3NSTEP": TD3NSTEP,
        "TD3REDQ": TD3REDQ,
        "TD3DoubleTwin": TD3DoubleTwin,
    }
    if agent in td3_variants:
        model_func = td3_variants[agent]
        model = model_func(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=128,
                           tau=0.005, policy_delay=2, learning_starts=25000,
                           action_noise=create_action_noise(env, "normal_0.1"), buffer_size=100000, verbose=2,
                           n_cpu_tf_sess=10,
                           policy_kwargs={"layers": [400, 300]})
    elif agent == "DDPG":
        model = DDPG(policy=policy, env=env, eval_env=eval_env, gamma=gamma, nb_eval_steps=5, batch_size=100,
                     nb_train_steps=100, nb_rollout_steps=100, learning_starts=10000,
                     actor_lr=1e-3, critic_lr=1e-3, critic_l2_reg=0,
                     tau=0.005, normalize_observations=False,
                     action_noise=create_action_noise(env, "normal_0.1"), buffer_size=int(1e6),
                     verbose=2, n_cpu_tf_sess=10,
                     policy_kwargs={"layers": [400, 300]})
    elif agent == "SAC":
        model = SAC(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=256,
                    action_noise=create_action_noise(env, "normal_0.1"), buffer_size=int(1e6), verbose=2,
                    n_cpu_tf_sess=10, learning_starts=10000,
                    policy_kwargs={"layers": [256, 256]})
    elif agent == "GEM":
        policy = 'TD3LnMlpPolicy'
        model = TD3MemGEM(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=128,
                          tau=0.005, policy_delay=2, learning_starts=25000,
                          action_noise=create_action_noise(env, "normal_0.1"), buffer_size=100000, verbose=2,
                          n_cpu_tf_sess=10,
                          alpha=0.5, beta=-1, iterative_q=-1,
                          num_q=4, gradient_steps=200, max_step=kwargs['max_steps'], reward_scale=1., nb_eval_steps=10,
                          policy_kwargs={"layers": [400, 300]})
    elif agent == "BP":
        policy = 'TD3LnMlpPolicy'
        model = TD3MemBackProp(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=128,
                               tau=0.005, policy_delay=2, learning_starts=25000,
                               action_noise=create_action_noise(env, "normal_0.1"), buffer_size=100000, verbose=2,
                               n_cpu_tf_sess=10,
                               alpha=0.5, beta=-1, gradient_steps=200, max_step=kwargs['max_steps'], reward_scale=1., nb_eval_steps=10,
                               policy_kwargs={"layers": [400, 300]})
    else:
        raise NotImplementedError

    print("model building finished")
    model.learn(total_timesteps=kwargs['num_timesteps'])

    env.close()
    if eval_env is not None:
        eval_env.close()

    logger.info('total runtime: {}s'.format(time.time() - start_time))
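A sketch of how run() might be invoked; the environment id and hyperparameter values are assumptions, only the keyword names follow the signature and the kwargs lookups above:

# Hypothetical invocation; num_timesteps and max_steps are consumed inside
# run() via kwargs['num_timesteps'] and kwargs['max_steps'].
run(env_id="Hopper-v2", seed=0, layer_norm=True, evaluation=True,
    agent="TD3", delay_step=0, gamma=0.99,
    num_timesteps=int(1e6), max_steps=1000)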
Example No. 6
                    # Tail of the SaveOnBestTrainingRewardCallback used below
                    # (the rest of the class body was cut off): save the model
                    # whenever a new best mean reward is reached.
                    self.model.save(self.save_path)

        return True


# Create log dir
log_dir = "/home/prem/Downloads/Hyper_4"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = gym.make('BipedalWalker-v3')
# Logs will be saved in log_dir/monitor.csv
env = Monitor(env, log_dir)
# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
# Instantiate SAC with an MlpPolicy and tuned hyperparameters (SAC's stochastic policy needs no parameter noise)
model = SAC(
    MlpPolicy,
    env,
    verbose=1,
    #n_timesteps= int(float(1e6)),
    learning_rate=3e-4,
    #buffer_size= 1000000,
    batch_size=64,
    ent_coef=0.005,
    train_freq=1,
    gradient_steps=1,
    learning_starts=1000)
# Train the agent
model.learn(total_timesteps=1000000, callback=callback)
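Once training finishes, the checkpoint written by the callback can be reloaded for evaluation; the "best_model" filename is an assumption about what this particular callback saves:

# Reload the checkpoint (filename assumed) and run one greedy episode.
best_model = SAC.load(os.path.join(log_dir, "best_model"), env=env)
obs = env.reset()
done = False
while not done:
    action, _states = best_model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)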