Example #1
    np.random.seed(args.seed)

    # create environment
    env = gym.make(args.env)
    env.seed(args.seed)
    env.action_space.seed(args.seed)
    train_tools.EVAL_SEED = args.seed

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_bound = env.action_space.high[0]

    # create nets
    policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound,
                                                  hidden_size=[256, 256], hidden_activation=nn.ReLU)
    q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256],
                       hidden_activation=nn.ReLU)

    q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256],
                       hidden_activation=nn.ReLU)

    # create buffer
    if args.show:
        replay_buffer = None
    else:
        replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                     act_dim=act_dim,
                                     capacity=args.capacity,
                                     batch_size=args.batch_size)

    agent = SAC_Agent(env,
                      replay_buffer=replay_buffer,
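
For reference, the squashed reparameterized Gaussian policy constructed above is usually implemented along the lines sketched below. This is an illustrative stand-in, not the repository's MLPSquashedReparamGaussianPolicy; it only assumes the network produces a mean and a log standard deviation per action dimension.

# minimal sketch of tanh-squashed, reparameterized Gaussian sampling (SAC-style)
import torch
from torch.distributions import Normal

def sample_squashed_action(mean, log_std, act_bound):
    """Reparameterized sample, squashed by tanh and scaled to act_bound."""
    dist = Normal(mean, log_std.exp())
    u = dist.rsample()    # reparameterization trick
    a = torch.tanh(u)     # squash to (-1, 1)
    # change-of-variables correction for the tanh squashing
    log_prob = (dist.log_prob(u) - torch.log(1 - a.pow(2) + 1e-6)).sum(-1, keepdim=True)
    return act_bound * a, log_prob
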
Example #2
    # env = gym.make('LunarLanderContinuous-v2')
    # env = gym.make('BipedalWalker-v3')

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_bound = env.action_space.high[0]

    # create nets
    actor_net = DDPGMLPActor(obs_dim=obs_dim,
                             act_dim=act_dim,
                             act_bound=act_bound,
                             hidden_size=[400, 300],
                             hidden_activation=nn.ReLU)

    critic_net = MLPQsaNet(obs_dim=obs_dim,
                           act_dim=act_dim,
                           hidden_size=[400, 300],
                           hidden_activation=nn.ReLU)

    # create optimizer
    actor_optimizer = torch.optim.Adam(actor_net.parameters(), lr=1e-4)
    critic_optimizer = torch.optim.Adam(critic_net.parameters(), lr=1e-3)

    # create buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 capacity=50000,
                                 batch_size=64)

    # create agent
    agent = DDPG_Agent(
        env=env,
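
The DDPG_Agent created above typically explores by adding Gaussian noise to the deterministic action. The helper below is a hypothetical sketch, not the repository's code; it assumes actor_net maps an observation tensor to an action already scaled to [-act_bound, act_bound].

# minimal sketch of DDPG-style action selection with exploration noise
import numpy as np
import torch

def noisy_action(actor_net, obs, act_bound, noise_scale=0.1):
    """Deterministic action plus Gaussian exploration noise, clipped to the bounds."""
    obs_t = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        action = actor_net(obs_t).squeeze(0).numpy()
    action += noise_scale * act_bound * np.random.randn(*action.shape)
    return np.clip(action, -act_bound, act_bound)
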
Example #3
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # create environment
    env = gym.make(args.env)
    env.seed(args.seed)
    env.action_space.seed(args.seed)
    train_tools.EVAL_SEED = args.seed

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_bound = env.action_space.high[0]

    critic_net1 = MLPQsaNet(obs_dim=obs_dim,
                            act_dim=act_dim,
                            hidden_size=[400, 300],
                            hidden_activation=nn.ReLU)
    critic_net2 = MLPQsaNet(obs_dim=obs_dim,
                            act_dim=act_dim,
                            hidden_size=[400, 300],
                            hidden_activation=nn.ReLU)

    perturbation_net = BCQ_Perturbation(obs_dim=obs_dim,
                                        act_dim=act_dim,
                                        act_bound=act_bound,
                                        hidden_size=[400, 300],
                                        hidden_activation=nn.ReLU,
                                        phi=0.05)

    cvae_net = CVAE(obs_dim=obs_dim,
                    act_dim=act_dim,
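
The CVAE being constructed above (cut off in this excerpt) is the generative model BCQ uses to keep candidate actions close to the batch data. The snippet below sketches the two standard ingredients, reparameterization and the VAE training loss; it is generic, not the repository's CVAE class, and the 0.5 KL weight is an assumption.

# minimal sketch of the CVAE pieces commonly used in BCQ-style methods
import torch
import torch.nn.functional as F

def reparameterize(mean, log_std):
    """z = mean + std * eps with eps ~ N(0, 1)."""
    return mean + log_std.exp() * torch.randn_like(log_std)

def cvae_loss(recon_action, action, mean, log_std, kl_weight=0.5):
    """Reconstruction error plus KL(N(mean, std^2) || N(0, 1))."""
    recon = F.mse_loss(recon_action, action)
    kl = -0.5 * (1 + 2 * log_std - mean.pow(2) - (2 * log_std).exp()).mean()
    return recon + kl_weight * kl
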
Example #4
    def __init__(
        self,
        env,
        data_buffer: OfflineBuffer,
        policy_net: MLPSquashedReparamGaussianPolicy,  # actor
        q_net1: MLPQsaNet,  # critic
        q_net2: MLPQsaNet,
        cvae_net: CVAE,
        policy_lr=1e-4,
        qf_lr=3e-4,
        cvae_lr=3e-4,
        gamma=0.99,
        tau=0.05,

        # BEAR
        lmbda=0.75,  # used for clipped double Q-learning
        mmd_sigma=20.0,  # the bandwidth (sigma) of the MMD kernel
        kernel_type='gaussian',  # the type of MMD kernel (gaussian or laplacian)
        lagrange_thresh=0.05,  # the threshold used when automatically tuning the Lagrange multiplier of the MMD constraint
        n_action_samples=100,  # the number of action samples used to pick the best action when selecting an action
        n_target_samples=10,  # the number of action samples used to compute the BCQ-like target value
        n_mmd_action_samples=4,  # the number of action samples used to compute the MMD
        warmup_step=40000,  # number of support-matching (warm start) steps before the policy (actor) is trained
        max_train_step=1000000,
        log_interval=1000,
        eval_freq=5000,
        train_id="bear_hopper-medium-v2_test",
        resume=False,  # if True, train from last checkpoint
        device='cpu',
    ):

        self.env = env
        self.data_buffer = data_buffer

        self.device = torch.device(device)

        # the networks and optimizers
        self.policy_net = policy_net.to(self.device)
        self.q_net1 = q_net1.to(self.device)
        self.q_net2 = q_net2.to(self.device)
        self.target_q_net1 = copy.deepcopy(self.q_net1).to(self.device)
        self.target_q_net2 = copy.deepcopy(self.q_net2).to(self.device)
        self.cvae_net = cvae_net.to(self.device)
        self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(),
                                                 lr=policy_lr)
        self.q_optimizer1 = torch.optim.Adam(self.q_net1.parameters(),
                                             lr=qf_lr)
        self.q_optimizer2 = torch.optim.Adam(self.q_net2.parameters(),
                                             lr=qf_lr)
        self.cvae_optimizer = torch.optim.Adam(self.cvae_net.parameters(),
                                               lr=cvae_lr)

        self.gamma = gamma
        self.tau = tau

        self.max_train_step = max_train_step
        self.eval_freq = eval_freq
        self.train_step = 0

        self.resume = resume  # whether to load the last checkpoint and resume training

        # BEAR
        self.lmbda = lmbda
        self.mmd_sigma = mmd_sigma
        self.kernel_type = kernel_type
        self.lagrange_thresh = lagrange_thresh
        self.n_action_samples = n_action_samples
        self.n_target_samples = n_target_samples
        self.n_mmd_action_samples = n_mmd_action_samples
        self.warmup_step = warmup_step

        # Lagrange multiplier (temperature) for the MMD loss
        self.log_alpha_prime = torch.zeros(1,
                                           requires_grad=True,
                                           device=self.device)
        self.alpha_prime_optimizer = torch.optim.Adam([self.log_alpha_prime],
                                                      lr=1e-3)

        # log dir and interval
        self.log_interval = log_interval
        self.result_dir = os.path.join(log_tools.ROOT_DIR, "run/results",
                                       train_id)
        log_tools.make_dir(self.result_dir)
        self.checkpoint_path = os.path.join(self.result_dir, "checkpoint.pth")
        self.tensorboard_writer = log_tools.TensorboardLogger(self.result_dir)
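
The kernel_type and mmd_sigma arguments above parameterize the MMD constraint between policy actions and CVAE actions. The function below is a generic (biased) estimator of the squared MMD for both kernel choices; it is an illustration, not the repository's implementation, and the exact kernel scaling may differ.

# minimal sketch of a squared-MMD estimate over batched action samples
import torch

def mmd_squared(x, y, sigma=20.0, kernel='gaussian'):
    """x: (B, n, d) policy actions; y: (B, m, d) CVAE actions; returns (B,)."""
    diff_xx = x.unsqueeze(2) - x.unsqueeze(1)   # (B, n, n, d)
    diff_xy = x.unsqueeze(2) - y.unsqueeze(1)   # (B, n, m, d)
    diff_yy = y.unsqueeze(2) - y.unsqueeze(1)   # (B, m, m, d)
    if kernel == 'gaussian':
        k_xx = torch.exp(-diff_xx.pow(2).sum(-1) / (2.0 * sigma))
        k_xy = torch.exp(-diff_xy.pow(2).sum(-1) / (2.0 * sigma))
        k_yy = torch.exp(-diff_yy.pow(2).sum(-1) / (2.0 * sigma))
    else:  # 'laplacian'
        k_xx = torch.exp(-diff_xx.abs().sum(-1) / (2.0 * sigma))
        k_xy = torch.exp(-diff_xy.abs().sum(-1) / (2.0 * sigma))
        k_yy = torch.exp(-diff_yy.abs().sum(-1) / (2.0 * sigma))
    return k_xx.mean(dim=(1, 2)) - 2.0 * k_xy.mean(dim=(1, 2)) + k_yy.mean(dim=(1, 2))
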
Example #5
    def __init__(
        self,
        env,
        data_buffer: OfflineBuffer,
        critic_net1: MLPQsaNet,
        critic_net2: MLPQsaNet,
        actor_net: PLAS_Actor,
        cvae_net: CVAE,  # generative model
        critic_lr=1e-3,
        actor_lr=1e-4,
        cvae_lr=1e-4,
        gamma=0.99,
        tau=0.005,
        lmbda=0.75,  # used for clipped double Q-learning
        max_cvae_iterations=500000,  # the number of iterations for training the CVAE model
        max_train_step=2000000,
        log_interval=1000,
        eval_freq=5000,
        train_id="plas_test",
        resume=False,  # if True, train from last checkpoint
        device='cpu',
    ):
        self.env = env
        self.data_buffer = data_buffer
        self.device = torch.device(device)

        self.critic_net1 = critic_net1.to(self.device)
        self.critic_net2 = critic_net2.to(self.device)
        self.target_critic_net1 = copy.deepcopy(self.critic_net1).to(
            self.device)
        self.target_critic_net2 = copy.deepcopy(self.critic_net2).to(
            self.device)
        self.actor_net = actor_net.to(self.device)
        self.target_actor_net = copy.deepcopy(self.actor_net).to(self.device)
        self.cvae_net = cvae_net.to(self.device)
        self.critic_optimizer1 = torch.optim.Adam(
            self.critic_net1.parameters(), lr=critic_lr)
        self.critic_optimizer2 = torch.optim.Adam(
            self.critic_net2.parameters(), lr=critic_lr)
        self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(),
                                                lr=actor_lr)
        self.cvae_optimizer = torch.optim.Adam(self.cvae_net.parameters(),
                                               lr=cvae_lr)

        self.gamma = gamma
        self.tau = tau
        self.lmbda = lmbda

        self.max_cvae_iterations = max_cvae_iterations
        self.max_train_step = max_train_step
        self.eval_freq = eval_freq
        self.cvae_iterations = 0
        self.train_step = 0

        self.resume = resume  # whether to load the last checkpoint and resume training

        # log dir and interval
        self.log_interval = log_interval
        self.result_dir = os.path.join(log_tools.ROOT_DIR, "run/results",
                                       train_id)
        log_tools.make_dir(self.result_dir)
        self.checkpoint_path = os.path.join(self.result_dir, "checkpoint.pth")
        self.tensorboard_writer = log_tools.TensorboardLogger(self.result_dir)
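
The tau hyper-parameter in both agents above corresponds to a Polyak (soft) update of the target networks. A minimal sketch, assuming the usual parameter-wise rule:

# minimal sketch of a Polyak (soft) target-network update
import torch

def soft_update(target_net, net, tau=0.005):
    """target <- tau * online + (1 - tau) * target, applied parameter-wise."""
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(tau * param.data)
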
Example #6

if __name__ == '__main__':
    # create environment
    env = gym.make("Pendulum-v0")
    # env = gym.make('LunarLanderContinuous-v2')
    # env = gym.make('BipedalWalker-v3')

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_bound = env.action_space.high[0]

    # create nets
    policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound,
                                                  hidden_size=[256, 256], hidden_activation=nn.ReLU)
    q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256],
                       hidden_activation=nn.ReLU)

    q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim, hidden_size=[256, 256],
                       hidden_activation=nn.ReLU)

    policy_optimizer = torch.optim.Adam(policy_net.parameters(), lr=4e-3)
    q_optimizer1 = torch.optim.Adam(q_net1.parameters(), lr=4e-3)
    q_optimizer2 = torch.optim.Adam(q_net2.parameters(), lr=4e-3)

    # create buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 capacity=50000,
                                 batch_size=128)

    agent = SAC_Agent(env,
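
The ReplayBuffer in these examples is always constructed with obs_dim, act_dim, capacity, and batch_size. The class below is a minimal NumPy sketch with the same constructor arguments, not the repository's ReplayBuffer; its store/sample method names are hypothetical.

# minimal sketch of a ring-buffer replay memory
import numpy as np

class MinimalReplayBuffer:
    def __init__(self, obs_dim, act_dim, capacity, batch_size):
        self.obs = np.zeros((capacity, obs_dim), dtype=np.float32)
        self.act = np.zeros((capacity, act_dim), dtype=np.float32)
        self.rew = np.zeros((capacity, 1), dtype=np.float32)
        self.next_obs = np.zeros((capacity, obs_dim), dtype=np.float32)
        self.done = np.zeros((capacity, 1), dtype=np.float32)
        self.capacity, self.batch_size = capacity, batch_size
        self.ptr, self.size = 0, 0

    def store(self, obs, act, rew, next_obs, done):
        """Overwrite the oldest slot once the buffer is full."""
        self.obs[self.ptr], self.act[self.ptr] = obs, act
        self.rew[self.ptr], self.next_obs[self.ptr], self.done[self.ptr] = rew, next_obs, done
        self.ptr = (self.ptr + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self):
        """Uniformly sample a batch of stored transitions."""
        idx = np.random.randint(0, self.size, size=self.batch_size)
        return dict(obs=self.obs[idx], act=self.act[idx], rew=self.rew[idx],
                    next_obs=self.next_obs[idx], done=self.done[idx])
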