Code Example #1
File: sac.py Project: SaminYeasar/soft_td3
    def __init__(self, num_inputs, action_space, args):

        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau
        self.scale_R = args.scale_R
        self.reparam = args.reparam
        self.deterministic = args.deterministic
        self.target_update_interval = args.target_update_interval

        self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                     args.hidden_size)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.deterministic == False:
            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
            self.value_criterion = nn.MSELoss()
        else:
            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size)
            hard_update(self.critic_target, self.critic)

        self.soft_q_criterion = nn.MSELoss()
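
Note: the hard_update helper called in the example above (and the soft_update used by later examples on this page) is not shown anywhere on this page. Below is a minimal sketch of how such helpers are commonly written, assuming the (target, source) and (target, source, tau) signatures seen in the calls; it is not the original project's code.

import torch


def hard_update(target, source):
    # Copy every parameter of `source` into `target` (used to initialize target networks).
    with torch.no_grad():
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.copy_(param)


def soft_update(target, source, tau):
    # Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target.
    with torch.no_grad():
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.copy_(tau * param + (1.0 - tau) * target_param)
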
Code Example #2
    def __init__(self, cfg):
        self.device = cfg.device
        self.gamma = cfg.gamma
        self.batch_size = cfg.batch_size

        self.value_net = ValueNetwork(cfg.state_dim, cfg.action_dim,
                                      cfg.hidden_dim).to(self.device)
        self.policy_net = PolicyNetwork(cfg.state_dim, cfg.action_dim,
                                        cfg.hidden_dim).to(self.device)
        self.target_value_net = ValueNetwork(cfg.state_dim, cfg.action_dim,
                                             cfg.hidden_dim).to(self.device)
        self.target_value_net.load_state_dict(self.value_net.state_dict())
        self.target_policy_net = PolicyNetwork(cfg.state_dim, cfg.action_dim,
                                               cfg.hidden_dim).to(self.device)
        self.target_policy_net.load_state_dict(self.policy_net.state_dict())
        self.soft_tau = cfg.soft_tau

        self.value_lr = cfg.value_lr
        self.policy_lr = cfg.policy_lr
        self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                          lr=self.value_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=self.policy_lr)

        # mean squared error
        self.value_criterion = nn.MSELoss()
        self.replay_buffer = ReplayBuffer(cfg.replay_buffer_size)
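
Note: ReplayBuffer here (like the Replay, ReplayMemory, and Memory classes in later examples) is constructed but never defined on this page. A minimal sketch of a uniform-sampling buffer with the ReplayBuffer(capacity) constructor used above; the push/sample method names are an assumption, not necessarily the project's API.

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity):
        # Fixed-size FIFO store of (state, action, reward, next_state, done) tuples.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a batch and stack each field into a numpy array.
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.stack, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)
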
Code Example #3
    def __init__(self, state_size, action_size, num_agents):
        state_dim = state_size
        #agent_input_state_dim = state_size*2  # Previous state is passed in with the current state.
        action_dim = action_size

        self.num_agents = num_agents

        max_size = 100000  ###
        self.replay = Replay(max_size)

        hidden_dim = 128

        self.critic_net = ValueNetwork(state_dim, action_dim,
                                       hidden_dim).to(device)
        self.target_critic_net = ValueNetwork(state_dim, action_dim,
                                              hidden_dim).to(device)

        self.actor_net = PolicyNetwork(state_dim, action_dim,
                                       hidden_dim).to(device)
        self.target_actor_net = PolicyNetwork(state_dim, action_dim,
                                              hidden_dim).to(device)

        for target_param, param in zip(self.target_critic_net.parameters(),
                                       self.critic_net.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_actor_net.parameters(),
                                       self.actor_net.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic_net.parameters(),
                                           lr=CRITIC_LEARNING_RATE)
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(),
                                          lr=ACTOR_LEARNING_RATE)
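
Note: this example (and the fuller version of the same agent in Code Example #15) relies on module-level names that are defined elsewhere in the project: device, CRITIC_LEARNING_RATE, ACTOR_LEARNING_RATE, and later BATCH_SIZE, DISCOUNT_RATE, and TAU. A sketch of plausible definitions; the concrete values are assumptions, not the project's actual settings.

import torch

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Typical DDPG-style hyperparameters (assumed values).
CRITIC_LEARNING_RATE = 1e-3
ACTOR_LEARNING_RATE = 1e-4
BATCH_SIZE = 128
DISCOUNT_RATE = 0.99
TAU = 1e-3
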
Code Example #4
File: sac1.py Project: haewngX/PytorchRL
    def __init__(self, config, env):
        self.device = config.device

        self.gamma = config.gamma  # discount factor

        self.tau = config.tau

        # learning rates
        self.value_lr = config.value_lr
        self.soft_q_lr = config.soft_q_lr
        self.policy_lr = config.policy_lr

        self.replace_target_iter = config.replace_target_iter  # target network update frequency
        self.replay_size = config.replay_size  # replay buffer size
        self.batch_size = config.batch_size  # batch size

        self.num_states = env.observation_space.shape[0]  # state space dimension
        self.num_actions = env.action_space.shape[0]  # action space dimension

        self.learn_start = self.batch_size * 3  # controls when learning starts

        self.learn_step_counter = 0  # total number of learning steps

        self.memory = ReplayMemory(self.replay_size)  # initialize the replay buffer

        # initialize the V network
        self.value_net = ValueNetwork(self.num_states, 256).to(self.device)
        # initialize the target V network
        self.target_value_net = ValueNetwork(self.num_states,
                                             256).to(self.device)

        # the target V network starts with the same parameters as the V network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param.data)

        # initialize the Q network
        self.soft_q_net = SoftQNetwork(self.num_states, self.num_actions,
                                       256).to(self.device)

        # initialize the policy network
        self.policy_net = PolicyNetwork(self.num_states, self.num_actions,
                                        256).to(self.device)

        # optimizers for training
        self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                          lr=self.value_lr)
        self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(),
                                           lr=self.soft_q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=self.policy_lr)

        # mean squared error loss functions
        self.value_criterion = nn.MSELoss()
        self.soft_q_criterion = nn.MSELoss()
Code Example #5
    def __init__(self, num_inputs, action_space, args):

        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
            else:
                pass

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs,
                                      args.hidden_size).to(self.device)
            self.value_target = ValueNetwork(self.num_inputs,
                                             args.hidden_size).to(self.device)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs,
                                              self.action_space,
                                              args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size).to(self.device)
            hard_update(self.critic_target, self.critic)
Code Example #6
File: decoupled_a2c.py Project: xchris1015/a2c
    def __init__(self, env, gamma, lr):

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=self.lr)
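
Note: the ValueNetwork and PolicyNetwork classes instantiated above are defined elsewhere in the project; the docstring in Code Example #13 only says they are two-layer networks. A minimal sketch matching the (input_dim, output_dim) constructor signature used here; the hidden size and exact architecture are assumptions.

import torch.nn as nn


class ValueNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=256):
        super().__init__()
        # Two-layer MLP mapping an observation to a scalar state value.
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, state):
        return self.net(state)


class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_dim=256):
        super().__init__()
        # Two-layer MLP producing unnormalized action logits; the caller applies softmax.
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        )

    def forward(self, state):
        return self.net(state)
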
Code Example #7
def main():
    parser = argparse.ArgumentParser('Parse configuration file')
    parser.add_argument('--config', type=str, default='configs/model.config')
    parser.add_argument('--gpu', default=False, action='store_true')
    args = parser.parse_args()
    config_file = args.config
    model_config = configparser.RawConfigParser()
    model_config.read(config_file)
    env_config = configparser.RawConfigParser()
    env_config.read('configs/env.config')

    # configure paths
    output_dir = os.path.splitext(os.path.basename(args.config))[0]
    output_dir = os.path.join('data', output_dir)
    if os.path.exists(output_dir):
        # raise FileExistsError('Output folder already exists')
        print('Output folder already exists')
    else:
        os.mkdir(output_dir)
    log_file = os.path.join(output_dir, 'output.log')
    shutil.copy(args.config, output_dir)
    initialized_weights = os.path.join(output_dir, 'initialized_model.pth')
    trained_weights = os.path.join(output_dir, 'trained_model.pth')

    # configure logging
    file_handler = logging.FileHandler(log_file, mode='w')
    stdout_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(level=logging.INFO, handlers=[stdout_handler, file_handler],
                        format='%(asctime)s, %(levelname)s: %(message)s', datefmt="%Y-%m-%d %H:%M:%S")

    # configure device
    device = torch.device("cuda:0" if torch.cuda.is_available() and args.gpu else "cpu")
    logging.info('Using device: {}'.format(device))

    # configure model
    state_dim = model_config.getint('model', 'state_dim')
    kinematic = env_config.getboolean('agent', 'kinematic')
    model = ValueNetwork(state_dim=state_dim, fc_layers=[100, 100, 100], kinematic=kinematic).to(device)
    logging.debug('Trainable parameters: {}'.format([name for name, p in model.named_parameters() if p.requires_grad]))

    # load simulated data from ORCA
    traj_dir = model_config.get('init', 'traj_dir')
    gamma = model_config.getfloat('model', 'gamma')
    capacity = model_config.getint('train', 'capacity')
    memory = initialize_memory(traj_dir, gamma, capacity, kinematic, device)

    # initialize model
    if os.path.exists(initialized_weights):
        model.load_state_dict(torch.load(initialized_weights))
        logging.info('Load initialized model weights')
    else:
        initialize_model(model, memory, model_config, device)
        torch.save(model.state_dict(), initialized_weights)
        logging.info('Finish initializing model. Model saved')

    # train the model
    train(model, memory, model_config, env_config, device, trained_weights)
    torch.save(model.state_dict(), trained_weights)
    logging.info('Finish training model. Model saved')
Code Example #8
    def __init__(self, env_name, n_states, n_actions, memory_size, batch_size,
                 gamma, alpha, lr, action_bounds, reward_scale):
        self.env_name = env_name
        self.n_states = n_states
        self.n_actions = n_actions
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.alpha = alpha
        self.lr = lr
        self.action_bounds = action_bounds
        self.reward_scale = reward_scale
        self.memory = Memory(memory_size=self.memory_size)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.policy_network = PolicyNetwork(
            n_states=self.n_states,
            n_actions=self.n_actions,
            action_bounds=self.action_bounds).to(self.device)
        self.q_value_network1 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(
                                                  self.device)
        self.q_value_network2 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(
                                                  self.device)
        self.value_network = ValueNetwork(n_states=self.n_states).to(
            self.device)
        self.value_target_network = ValueNetwork(n_states=self.n_states).to(
            self.device)
        self.value_target_network.load_state_dict(
            self.value_network.state_dict())
        self.value_target_network.eval()

        self.value_loss = torch.nn.MSELoss()
        self.q_value_loss = torch.nn.MSELoss()

        self.value_opt = Adam(self.value_network.parameters(), lr=self.lr)
        self.q_value1_opt = Adam(self.q_value_network1.parameters(),
                                 lr=self.lr)
        self.q_value2_opt = Adam(self.q_value_network2.parameters(),
                                 lr=self.lr)
        self.policy_opt = Adam(self.policy_network.parameters(), lr=self.lr)
Code Example #9
File: sac.py Project: km01/myrl
    def __init__(self, input_size, action_size, gamma, tau, alpha, hidden_size,
                 lr, device):

        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha
        self.device = device

        self.policy = Actor(input_size, hidden_size,
                            action_size).to(self.device)
        self.critic = Critic(input_size, hidden_size,
                             action_size).to(self.device)
        self.value = ValueNetwork(input_size, hidden_size).to(self.device)

        self.policy_optim = torch.optim.Adam(self.policy.parameters(), lr=lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=lr)
        self.value_optim = torch.optim.Adam(self.value.parameters(), lr=lr)

        self.value_target = copy.deepcopy(self.value)
        self.value_target.requires_grad_(False)
Code Example #10
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        # self.action_range = [env.action_space.low, env.action_space.high]
        # TODO: as a simple demo, I changed this here; for the real implementation, we should pass these in as parameters
        self.action_range = [[-1, 1], [-1, 1]]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = 2
        # self.action_dim = 1

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
Code Example #11
File: agent.py Project: wuyuup/iems469
    def __init__(self, env, args, work_dir):
        self.env = env
        self.args = args
        self.work_dir = work_dir

        self.n_action = self.env.action_space.n
        self.arr_actions = np.arange(self.n_action)
        self.memory = ReplayMemory(self.args.buffer_size, self.args.device)
        self.qNetwork = ValueNetwork(self.n_action,
                                     self.env).to(self.args.device)
        self.targetNetwork = ValueNetwork(self.n_action,
                                          self.env).to(self.args.device)
        self.qNetwork.train()
        self.targetNetwork.eval()
        self.optimizer = optim.RMSprop(self.qNetwork.parameters(),
                                       lr=0.00025,
                                       eps=0.001,
                                       alpha=0.95)
        self.crit = nn.MSELoss()
        self.eps = max(self.args.eps, self.args.eps_min)
        self.eps_delta = (
            self.eps - self.args.eps_min) / self.args.exploration_decay_speed
Code Example #12
def test_rotate():
    vn = ValueNetwork(14, [150, 150, 100], kinematic=False)
    state = JointState(2, 2, 0, 1, 0.3, 2, 4, 1, 0, 4, 2, 2, 0, 0.3)
    state = torch.Tensor(state).expand(1, 14)
    rotated_state = vn.rotate(state, torch.device('cpu')).squeeze().numpy()
    assert np.allclose(rotated_state,
                       [2, 1, 1, 0, 0.3, 0, 0, -2, 0, -2, 0.3, 0.6, 1, 0, 2],
                       atol=1e-06)

    vn = ValueNetwork(14, [150, 150, 100], kinematic=True)
    state = JointState(2, 2, 0, 1, 0.3, 2, 4, 1, 0, 4, 2, 2, 0, 0.3)
    state = torch.Tensor(state).expand(1, 14)
    rotated_state = vn.rotate(state, torch.device('cpu')).squeeze().numpy()
    assert np.allclose(
        rotated_state,
        [2, 1, 1, 0, 0.3, -np.pi / 2, 0, -2, 0, -2, 0.3, 0.6, 0, -1, 2],
        atol=1e-06)
Code Example #13
File: decoupled_a2c.py Project: xchris1015/a2c
class A2CAgent():
    """
    init function
    input: env, which is the CartPole-v0
           gamma, 0.99 in this case
           lr, learning rate is 1e-4

    define: env = env, which is the CartPole-v0
            obs_dim: 4 observations
                    Observation:
                    Type: Box(4)
                    Num	Observation                 Min         Max
                    0	Cart Position             -4.8            4.8
                    1	Cart Velocity             -Inf            Inf
                    2	Pole Angle                 -24 deg        24 deg
                    3	Pole Velocity At Tip      -Inf            Inf

            action_dim: 2 actions
                        Actions:
                        Type: Discrete(2)
                        Num	Action
                        0	Push cart to the left
                        1	Push cart to the right

            value_network: two layer network with input 4 (observation dim) and output 1 (the state value)
            policy_network: two layer network with input 4 (observation dim) and output 2 (action dim)

            value and policy optimizer using default Adam and learning rate
    """
    def __init__(self, env, gamma, lr):

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=self.lr)

    """
    input state to get the next action
    
    using policy network to get the next state by using softmax
    """

    def get_action(self, state):
        state = torch.FloatTensor(state)
        logits = self.policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    """
    form trajectory get all of the information, and calculated the discounted_rewards
    use value network to train the states with new values and compute the loss between the value and target value by using MSE
    same logic for policy network
    
    FloatTensor = FLOAT TYPE ARRAY
   
   t
tensor([[1, 2, 3],
        [4, 5, 6]])
t.view(-1,1)
tensor([[1],
        [2],
        [3],
        [4],
        [5],
        [6]]) 
     
    """

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory])
        actions = torch.LongTensor([sars[1]
                                    for sars in trajectory]).view(-1, 1)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory])
        next_states = torch.FloatTensor([sars[3] for sars in trajectory])
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1)

        # compute value target

        ## Nested loops calculate the discounted reward for each time step
        ## (see the equivalent reverse-accumulation sketch after this example)
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))]) \
                                        * rewards[j:]) for j in
                              range(rewards.size(0))]  # sorry, not the most readable code.
        value_targets = rewards.view(
            -1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))  # Shannon entropy: -sum(p * log p)
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(
            -1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    """
        zero_grad clears old gradients from the last step (otherwise you’d just accumulate the gradients from all loss.backward() calls).

        loss.backward() computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation.
        
        opt.step() causes the optimizer to take a step based on the gradients of the parameters.
        """

    def update(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
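
Note: the discounted_rewards comprehension in compute_loss above is quadratic in the trajectory length and, as its own comment admits, hard to read. Below is the equivalent linear-time reverse-accumulation sketch referenced from that comment; it assumes rewards is the 1-D FloatTensor built in compute_loss and gamma is the same discount factor, and the helper name is hypothetical.

def discounted_returns(rewards, gamma):
    # returns[j] = rewards[j] + gamma * returns[j + 1], accumulated back to front,
    # which equals sum_i gamma**i * rewards[j + i] -- the same values the comprehension produces.
    returns = []
    running = 0.0
    for r in reversed(rewards.tolist()):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()
    return returns  # plain list of floats, drop-in for the comprehension

# Usage inside compute_loss:
#   discounted_rewards = discounted_returns(rewards, self.gamma)
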
Code Example #14
class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        # self.action_range = [env.action_space.low, env.action_space.high]
        # TODO: as a simple demo, I changed this here; for the real implementation, we should pass these in as parameters
        self.action_range = [[-1, 1], [-1, 1]]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = 2
        # self.action_dim = 1

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    # pi: state -> action
    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        '''if action < 0.5:
            return 0
        else:
            return 1'''
        scaled_action = []
        for idx, a in enumerate(action):
            action_range = self.action_range[idx]
            # map a from [-1, 1] to [action_range[0], action_range[1]]
            a = a * (action_range[1] - action_range[0]) / 2.0 + (
                action_range[1] + action_range[0]) / 2.0
            scaled_action.append(a)
        return scaled_action

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value Loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        #TODO: Question: why use 2 Q-networks?
        # To reduce bias in training.

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy net and target value nets
        # TODO: Question: what does this part do?
        # The original paper mentions 2 ways of keeping the target value network in sync:
        # 1. an exponential moving average (EMA) of the value network weights
        # 2. periodically copying the value network weights
        # This code applies a tau-weighted (EMA-style) update every delay_step updates.
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        self.update_step += 1
Code Example #15
class Agent():
    def __init__(self, state_size, action_size, num_agents):
        state_dim = state_size
        #agent_input_state_dim = state_size*2  # Previous state is passed in with the current state.
        action_dim = action_size

        self.num_agents = num_agents

        max_size = 100000  ###
        self.replay = Replay(max_size)

        hidden_dim = 128

        self.critic_net = ValueNetwork(state_dim, action_dim,
                                       hidden_dim).to(device)
        self.target_critic_net = ValueNetwork(state_dim, action_dim,
                                              hidden_dim).to(device)

        self.actor_net = PolicyNetwork(state_dim, action_dim,
                                       hidden_dim).to(device)
        self.target_actor_net = PolicyNetwork(state_dim, action_dim,
                                              hidden_dim).to(device)

        for target_param, param in zip(self.target_critic_net.parameters(),
                                       self.critic_net.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_actor_net.parameters(),
                                       self.actor_net.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic_net.parameters(),
                                           lr=CRITIC_LEARNING_RATE)
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(),
                                          lr=ACTOR_LEARNING_RATE)

    def get_action(self, state):
        return self.actor_net.get_action(state)[0]

    def add_replay(self, state, action, reward, next_state, done):
        for i in range(self.num_agents):
            self.replay.add(state[i], action[i], reward[i], next_state[i],
                            done[i])

    def learning_step(self):

        #Check if replay buffer contains enough samples for 1 batch
        if (self.replay.cursize < BATCH_SIZE):
            return

        #Get Samples
        state, action, reward, next_state, done = self.replay.get(BATCH_SIZE)

        #calculate loss
        actor_loss = self.critic_net(state, self.actor_net(state))
        actor_loss = -actor_loss.mean()

        next_action = self.target_actor_net(next_state)
        target_value = self.target_critic_net(next_state, next_action.detach())
        expected_value = reward + (1.0 - done) * DISCOUNT_RATE * target_value

        value = self.critic_net(state, action)
        critic_loss = F.mse_loss(value, expected_value.detach())

        #backprop
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        #soft update
        self.soft_update(self.critic_net, self.target_critic_net, TAU)
        self.soft_update(self.actor_net, self.target_actor_net, TAU)

    def save(self, name):
        torch.save(self.critic_net.state_dict(), name + "_critic")
        torch.save(self.actor_net.state_dict(), name + "_actor")

    def load(self, name):
        self.critic_net.load_state_dict(torch.load(name + "_critic"))
        self.critic_net.eval()
        self.actor_net.load_state_dict(torch.load(name + "_actor"))
        self.actor_net.eval()

        for target_param, param in zip(self.target_critic_net.parameters(),
                                       self.critic_net.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_actor_net.parameters(),
                                       self.actor_net.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Code Example #16
    bxmax = env_config.getfloat('sim', 'xmax')
    bymin = env_config.getfloat('sim', 'ymin')
    bymax = env_config.getfloat('sim', 'ymax')
    xmin = env_config.getfloat('visualization', 'xmin')
    xmax = env_config.getfloat('visualization', 'xmax')
    ymin = env_config.getfloat('visualization', 'ymin')
    ymax = env_config.getfloat('visualization', 'ymax')
    crossing_radius = env_config.getfloat('sim', 'crossing_radius')
    kinematic = env_config.getboolean('agent', 'kinematic')
    radius = env_config.getfloat('agent', 'radius')

    device = torch.device('cpu')
    test_env = ENV(config=env_config, phase='test')
    test_env.reset(case)
    model = ValueNetwork(state_dim=state_dim,
                         fc_layers=[100, 100, 100],
                         kinematic=kinematic)
    model.load_state_dict(
        torch.load(weight_path, map_location=lambda storage, loc: storage))
    _, state_sequences, _, _ = run_one_episode(model, 'test', test_env, gamma,
                                               None, kinematic, device)

    positions = list()
    colors = list()
    counter = list()
    line_positions = list()
    for i in range(len(state_sequences[0])):
        counter.append(i)
        if state_sequences[0][i] is None:
            p0 = positions[-4][0]
            c0 = 'tab:red'
Code Example #17
File: agent.py Project: wuyuup/iems469
class DQNAgent(object):
    def __init__(self, env, args, work_dir):
        self.env = env
        self.args = args
        self.work_dir = work_dir

        self.n_action = self.env.action_space.n
        self.arr_actions = np.arange(self.n_action)
        self.memory = ReplayMemory(self.args.buffer_size, self.args.device)
        self.qNetwork = ValueNetwork(self.n_action,
                                     self.env).to(self.args.device)
        self.targetNetwork = ValueNetwork(self.n_action,
                                          self.env).to(self.args.device)
        self.qNetwork.train()
        self.targetNetwork.eval()
        self.optimizer = optim.RMSprop(self.qNetwork.parameters(),
                                       lr=0.00025,
                                       eps=0.001,
                                       alpha=0.95)
        self.crit = nn.MSELoss()
        self.eps = max(self.args.eps, self.args.eps_min)
        self.eps_delta = (
            self.eps - self.args.eps_min) / self.args.exploration_decay_speed

    def reset(self):
        return torch.cat([preprocess_state(self.env.reset(), self.env)] * 4, 1)

    def select_action(self, state):
        action_prob = np.zeros(self.n_action, np.float32)
        action_prob.fill(self.eps / self.n_action)
        max_q, max_q_index = self.qNetwork(Variable(state.to(
            self.args.device))).data.cpu().max(1)
        action_prob[max_q_index[0]] += 1 - self.eps
        action = np.random.choice(self.arr_actions, p=action_prob)
        next_state, reward, done, _ = self.env.step(action)
        next_state = torch.cat(
            [state.narrow(1, 1, 3),
             preprocess_state(next_state, self.env)], 1)
        self.memory.push(
            (state, torch.LongTensor([int(action)]), torch.Tensor([reward]),
             next_state, torch.Tensor([done])))
        return next_state, reward, done, max_q[0]

    def run(self):
        state = self.reset()
        # init buffer
        for _ in range(self.args.buffer_init_size):
            next_state, _, done, _ = self.select_action(state)
            state = self.reset() if done else next_state

        total_frame = 0
        reward_list = np.zeros(self.args.log_size, np.float32)
        qval_list = np.zeros(self.args.log_size, np.float32)

        start_time = time.time()

        for epi in count():
            reward_list[epi % self.args.log_size] = 0
            qval_list[epi % self.args.log_size] = -1e9
            state = self.reset()
            done = False
            ep_len = 0

            if epi % self.args.save_freq == 0:
                model_file = os.path.join(self.work_dir, 'model.th')
                with open(model_file, 'wb') as f:
                    torch.save(self.qNetwork, f)

            while not done:
                if total_frame % self.args.sync_period == 0:
                    self.targetNetwork.load_state_dict(
                        self.qNetwork.state_dict())

                self.eps = max(self.args.eps_min, self.eps - self.eps_delta)
                next_state, reward, done, qval = self.select_action(state)
                reward_list[epi % self.args.log_size] += reward
                qval_list[epi % self.args.log_size] = max(
                    qval_list[epi % self.args.log_size], qval)
                state = next_state

                total_frame += 1
                ep_len += 1

                if ep_len % self.args.learn_freq == 0:
                    batch_state, batch_action, batch_reward, batch_next_state, batch_done = self.memory.sample(
                        self.args.batch_size)
                    batch_q = self.qNetwork(batch_state).gather(
                        1, batch_action.unsqueeze(1)).squeeze(1)
                    batch_next_q = self.targetNetwork(batch_next_state).detach(
                    ).max(1)[0] * self.args.gamma * (1 - batch_done)
                    loss = self.crit(batch_q, batch_reward + batch_next_q)
                    self.optimizer.zero_grad()

                    loss.backward()
                    self.optimizer.step()

            output_str = 'episode %d frame %d time %.2fs cur_rew %.3f mean_rew %.3f cur_maxq %.3f mean_maxq %.3f' % (
                epi, total_frame, time.time() - start_time,
                reward_list[epi % self.args.log_size], np.mean(reward_list),
                qval_list[epi % self.args.log_size], np.mean(qval_list))
            print(output_str)
            logging.info(output_str)
Code Example #18
class SAC(object):
    def __init__(self, num_inputs, action_space, args):

        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
            else:
                pass

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs,
                                              self.action_space,
                                              args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).unsqueeze(0)
        if eval == False:
            self.policy.train()
            action, _, _, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, _, action, _ = self.policy.sample(state)
            if self.policy_type == "Gaussian":
                action = torch.tanh(action)
            else:
                pass
        #action = torch.tanh(action)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        action_batch = torch.FloatTensor(action_batch)
        reward_batch = torch.FloatTensor(reward_batch).unsqueeze(1)
        mask_batch = torch.FloatTensor(np.float32(mask_batch)).unsqueeze(1)
        """
        Use two Q-functions to mitigate positive bias in the policy improvement step that is known
        to degrade the performance of value-based methods. Two Q-functions also significantly speed
        up training, especially on harder tasks.
        """
        expected_q1_value, expected_q2_value = self.critic(
            state_batch, action_batch)
        new_action, log_prob, _, mean, log_std = self.policy.sample(
            state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                """
                Alpha Loss
                """
                alpha_loss = -(
                    self.log_alpha *
                    (log_prob + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = self.log_alpha.exp()
                alpha_logs = self.alpha.clone()  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.)
                alpha_logs = self.alpha  # For TensorboardX logs
            """
            Including a separate function approximator for the soft value can stabilize training.
            """
            expected_value = self.value(state_batch)
            target_value = self.value_target(next_state_batch)
            next_q_value = reward_batch + mask_batch * self.gamma * (
                target_value).detach()
        else:
            """
            There is no need in principle to include a separate function approximator for the state value.
            We use a target critic network for the deterministic policy and eradicate the value network completely.
            """
            alpha_loss = torch.tensor(0.)
            alpha_logs = self.alpha  # For TensorboardX logs
            next_state_action, _, _, _, _, = self.policy.sample(
                next_state_batch)
            target_critic_1, target_critic_2 = self.critic_target(
                next_state_batch, next_state_action)
            target_critic = torch.min(target_critic_1, target_critic_2)
            next_q_value = reward_batch + mask_batch * self.gamma * (
                target_critic).detach()
        """
        Soft Q-function parameters can be trained to minimize the soft Bellman residual
        JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1))
        """
        q1_value_loss = F.mse_loss(expected_q1_value, next_q_value)
        q2_value_loss = F.mse_loss(expected_q2_value, next_q_value)
        q1_new, q2_new = self.critic(state_batch, new_action)
        expected_new_q_value = torch.min(q1_new, q2_new)

        if self.policy_type == "Gaussian":
            """
            Including a separate function approximator for the soft value can stabilize training and is convenient to 
            train simultaneously with the other networks
            Update the V towards the min of two Q-functions in order to reduce overestimation bias from function approximation error.
            JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]
            ∇JV = ∇V(st)(V(st) - Q(st,at) + (α * logπ(at|st)))
            """
            next_value = expected_new_q_value - (self.alpha * log_prob)
            value_loss = F.mse_loss(expected_value, next_value.detach())
        else:
            pass
        """
        Reparameterization trick is used to get a low variance estimator
        f(εt;st) = action sampled from the policy
        εt is an input noise vector, sampled from some fixed distribution
        Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        ∇Jπ = ∇log π + ([∇at (α * logπ(at|st)) − ∇at Q(st,at)])∇f(εt;st)
        """
        policy_loss = ((self.alpha * log_prob) - expected_new_q_value).mean()

        # Regularization Loss
        mean_loss = 0.001 * mean.pow(2).mean()
        std_loss = 0.001 * log_std.pow(2).mean()

        policy_loss += mean_loss + std_loss

        self.critic_optim.zero_grad()
        q1_value_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        q2_value_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.)

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()
        """
        We update the target weights to match the current value function weights periodically
        Update target parameter after every n(args.target_update_interval) updates
        """
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)

        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)
        return value_loss.item(), q1_value_loss.item(), q2_value_loss.item(
        ), policy_loss.item(), alpha_loss.item(), alpha_logs

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None,
                   value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path,
                                                      value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(
            actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
Code Example #19
class SAC(object):
    def __init__(self, num_inputs, action_space, args):

        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs,
                                      args.hidden_size).to(self.device)
            self.value_target = ValueNetwork(self.num_inputs,
                                             args.hidden_size).to(self.device)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs,
                                              self.action_space,
                                              args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size).to(self.device)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if eval == False:
            self.policy.train()
            action, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, action = self.policy.sample(state)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        pi, log_pi, _ = self.policy.sample(state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                alpha_loss = -(self.log_alpha *
                               (log_pi + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = self.log_alpha.exp()
                alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.).to(self.device)
                alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs

            vf = self.value(
                state_batch
            )  # separate function approximator for the soft value can stabilize training.
            with torch.no_grad():
                vf_next_target = self.value_target(next_state_batch)
                next_q_value = reward_batch + mask_batch * self.gamma * (
                    vf_next_target)
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_logs = self.alpha  # For TensorboardX logs
            with torch.no_grad():
                next_state_action, _, _, _, _, = self.policy.sample(
                    next_state_batch)
                # Use a target critic network for the deterministic policy and eradicate the value network completely.
                qf1_next_target, qf2_next_target = self.critic_target(
                    next_state_batch, next_state_action)
                min_qf_next_target = torch.min(qf1_next_target,
                                               qf2_next_target)
                next_q_value = reward_batch + mask_batch * self.gamma * (
                    min_qf_next_target)

        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        if self.policy_type == "Gaussian":
            vf_target = min_qf_pi - (self.alpha * log_pi)
            value_loss = F.mse_loss(
                vf, vf_target.detach()
            )  # JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )  # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        # Regularization Loss
        # mean_loss = 0.001 * mean.pow(2).mean()
        # std_loss = 0.001 * log_std.pow(2).mean()

        # policy_loss += mean_loss + std_loss

        self.critic_optim.zero_grad()
        qf1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        qf2_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.).to(self.device)

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()
        """
        We update the target weights to match the current value function weights periodically
        Update target parameter after every n(args.target_update_interval) updates
        """
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)

        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)
        return value_loss.item(), qf1_loss.item(), qf2_loss.item(
        ), policy_loss.item(), alpha_loss.item(), alpha_logs.item()

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None,
                   value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path,
                                                      value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(
            actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
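
The SAC class above calls hard_update() and soft_update() without defining them; in this family of repositories they normally live in a small utils module. A minimal sketch of what those helpers usually look like (the signatures are taken from the calls above; the bodies are an assumption, not the project's actual file):

def hard_update(target, source):
    # Copy every parameter of `source` into `target` (used once, at construction time).
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) +
                                param.data * tau)
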
コード例 #20
0
class SAC:
    def __init__(self, env_name, n_states, n_actions, memory_size, batch_size,
                 gamma, alpha, lr, action_bounds, reward_scale):
        self.env_name = env_name
        self.n_states = n_states
        self.n_actions = n_actions
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.alpha = alpha
        self.lr = lr
        self.action_bounds = action_bounds
        self.reward_scale = reward_scale
        self.memory = Memory(memory_size=self.memory_size)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.policy_network = PolicyNetwork(
            n_states=self.n_states,
            n_actions=self.n_actions,
            action_bounds=self.action_bounds).to(self.device)
        self.q_value_network1 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(
                                                  self.device)
        self.q_value_network2 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(
                                                  self.device)
        self.value_network = ValueNetwork(n_states=self.n_states).to(
            self.device)
        self.value_target_network = ValueNetwork(n_states=self.n_states).to(
            self.device)
        self.value_target_network.load_state_dict(
            self.value_network.state_dict())
        self.value_target_network.eval()

        self.value_loss = torch.nn.MSELoss()
        self.q_value_loss = torch.nn.MSELoss()

        self.value_opt = Adam(self.value_network.parameters(), lr=self.lr)
        self.q_value1_opt = Adam(self.q_value_network1.parameters(),
                                 lr=self.lr)
        self.q_value2_opt = Adam(self.q_value_network2.parameters(),
                                 lr=self.lr)
        self.policy_opt = Adam(self.policy_network.parameters(), lr=self.lr)

    def store(self, state, reward, done, action, next_state):
        state = from_numpy(state).float().to("cpu")
        reward = torch.Tensor([reward]).to("cpu")
        done = torch.Tensor([done]).to("cpu")
        action = torch.Tensor([action]).to("cpu")
        next_state = from_numpy(next_state).float().to("cpu")
        self.memory.add(state, reward, done, action, next_state)

    def unpack(self, batch):
        batch = Transition(*zip(*batch))

        states = torch.cat(batch.state).view(self.batch_size,
                                             self.n_states).to(self.device)
        rewards = torch.cat(batch.reward).view(self.batch_size,
                                               1).to(self.device)
        dones = torch.cat(batch.done).view(self.batch_size, 1).to(self.device)
        actions = torch.cat(batch.action).view(-1,
                                               self.n_actions).to(self.device)
        next_states = torch.cat(batch.next_state).view(
            self.batch_size, self.n_states).to(self.device)

        return states, rewards, dones, actions, next_states

    def train(self):
        if len(self.memory) < self.batch_size:
            return 0, 0, 0
        else:
            batch = self.memory.sample(self.batch_size)
            states, rewards, dones, actions, next_states = self.unpack(batch)

            # Calculating the value target
            reparam_actions, log_probs = self.policy_network.sample_or_likelihood(
                states)
            q1 = self.q_value_network1(states, reparam_actions)
            q2 = self.q_value_network2(states, reparam_actions)
            q = torch.min(q1, q2)
            target_value = q.detach() - self.alpha * log_probs.detach()

            value = self.value_network(states)
            value_loss = self.value_loss(value, target_value)

            # Calculating the Q-Value target
            with torch.no_grad():
                target_q = self.reward_scale * rewards + \
                           self.gamma * self.value_target_network(next_states) * (1 - dones)
            q1 = self.q_value_network1(states, actions)
            q2 = self.q_value_network2(states, actions)
            q1_loss = self.q_value_loss(q1, target_q)
            q2_loss = self.q_value_loss(q2, target_q)

            policy_loss = (self.alpha * log_probs - q).mean()

            self.policy_opt.zero_grad()
            policy_loss.backward()
            self.policy_opt.step()

            self.value_opt.zero_grad()
            value_loss.backward()
            self.value_opt.step()

            self.q_value1_opt.zero_grad()
            q1_loss.backward()
            self.q_value1_opt.step()

            self.q_value2_opt.zero_grad()
            q2_loss.backward()
            self.q_value2_opt.step()

            self.soft_update_target_network(self.value_network,
                                            self.value_target_network)

            return value_loss.item(), 0.5 * (
                q1_loss + q2_loss).item(), policy_loss.item()

    def choose_action(self, states):
        states = np.expand_dims(states, axis=0)
        states = from_numpy(states).float().to(self.device)
        action, _ = self.policy_network.sample_or_likelihood(states)
        return action.detach().cpu().numpy()[0]

    @staticmethod
    def soft_update_target_network(local_network, target_network, tau=0.005):
        for target_param, local_param in zip(target_network.parameters(),
                                             local_network.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)

    def save_weights(self):
        torch.save(self.policy_network.state_dict(),
                   self.env_name + "_weights.pth")

    def load_weights(self):
        self.policy_network.load_state_dict(
            torch.load(self.env_name + "_weights.pth"))

    def set_to_eval_mode(self):
        self.policy_network.eval()
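
store() and unpack() in the class above rely on a Memory buffer and a Transition namedtuple defined elsewhere in the source project. A minimal sketch that is compatible with how they are used here (field order and method names are inferred from the calls above, not copied from the original code):

import random
from collections import deque, namedtuple

Transition = namedtuple("Transition",
                        ["state", "reward", "done", "action", "next_state"])


class Memory:
    def __init__(self, memory_size):
        self.buffer = deque(maxlen=memory_size)

    def add(self, state, reward, done, action, next_state):
        # Store one transition; the oldest entry is dropped once the deque is full.
        self.buffer.append(Transition(state, reward, done, action, next_state))

    def sample(self, batch_size):
        # Uniformly sample a batch of Transition tuples.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)
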
コード例 #21
0
class DDPG:
    def __init__(self, cfg):
        self.device = cfg.device
        self.gamma = cfg.gamma
        self.batch_size = cfg.batch_size

        self.value_net = ValueNetwork(cfg.state_dim, cfg.action_dim,
                                      cfg.hidden_dim).to(self.device)
        self.policy_net = PolicyNetwork(cfg.state_dim, cfg.action_dim,
                                        cfg.hidden_dim).to(self.device)
        self.target_value_net = ValueNetwork(cfg.state_dim, cfg.action_dim,
                                             cfg.hidden_dim).to(self.device)
        self.target_value_net.load_state_dict(self.value_net.state_dict())
        self.target_policy_net = PolicyNetwork(cfg.state_dim, cfg.action_dim,
                                               cfg.hidden_dim).to(self.device)
        self.target_policy_net.load_state_dict(self.policy_net.state_dict())
        self.soft_tau = cfg.soft_tau

        self.value_lr = cfg.value_lr
        self.policy_lr = cfg.policy_lr
        self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                          lr=self.value_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=self.policy_lr)

        # mean squared error
        self.value_criterion = nn.MSELoss()
        self.replay_buffer = ReplayBuffer(cfg.replay_buffer_size)

    def update(self, cfg):
        state, action, reward, next_state, done = self.replay_buffer.sample(
            cfg.batch_size)
        # print(np.shape(state), np.shape(action), np.shape(reward), np.shape(next_state), np.shape(done))
        # (128, 3) (128, 1) (128,) (128, 3) (128,)
        state = torch.FloatTensor(state).to(cfg.device)
        action = torch.FloatTensor(action).to(cfg.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(cfg.device)
        next_state = torch.FloatTensor(next_state).to(cfg.device)
        done = torch.FloatTensor(done).unsqueeze(1).to(cfg.device)

        # Actor Loss
        policy_loss = self.value_net(state, self.policy_net(state))
        policy_loss = -policy_loss.mean()

        next_action = self.target_policy_net(next_state)
        target_value = self.target_value_net(next_state, next_action.detach())
        TD_target = reward + (1.0 - done) * self.gamma * target_value

        value = self.value_net(state, action)
        # Critic Loss
        value_loss = self.value_criterion(value, TD_target.detach())

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Update target network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)

        for target_param, param in zip(self.target_policy_net.parameters(),
                                       self.policy_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)
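
The DDPG agent samples from self.replay_buffer, but the buffer class itself is not shown. A common implementation that matches the shapes in the commented print above (the push() method name is an assumption):

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # Store one transition.
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Sample a batch and stack each field into an array,
        # e.g. states -> (batch_size, state_dim), rewards -> (batch_size,).
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)

A training loop would push a transition after every environment step and call agent.update(cfg) once the buffer holds at least cfg.batch_size transitions.
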
コード例 #22
0
ファイル: sac.py プロジェクト: SaminYeasar/soft_td3
class SAC(object):
    def __init__(self, num_inputs, action_space, args):

        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau
        self.scale_R = args.scale_R
        self.reparam = args.reparam
        self.deterministic = args.deterministic
        self.target_update_interval = args.target_update_interval

        self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                     args.hidden_size)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.deterministic == False:
            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
            self.value_criterion = nn.MSELoss()
        else:
            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size)
            hard_update(self.critic_target, self.critic)

        self.soft_q_criterion = nn.MSELoss()

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).unsqueeze(0)
        if eval == False:
            self.policy.train()
            _, _, action, _, _ = self.policy.evaluate(state)
        else:
            self.policy.eval()
            _, _, _, action, _ = self.policy.evaluate(state)

        action = torch.tanh(action)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        action_batch = torch.FloatTensor(action_batch)
        reward_batch = torch.FloatTensor(reward_batch)
        mask_batch = torch.FloatTensor(np.float32(mask_batch))

        reward_batch = reward_batch.unsqueeze(
            1)  # reward_batch = [batch_size, 1]
        mask_batch = mask_batch.unsqueeze(1)  # mask_batch = [batch_size, 1]
        """
        Use two Q-functions to mitigate positive bias in the policy improvement step, which is known
        to degrade the performance of value-based methods. Two Q-functions also significantly speed
        up training, especially on harder tasks.
        """
        expected_q1_value, expected_q2_value = self.critic(
            state_batch, action_batch)
        new_action, log_prob, x_t, mean, log_std = self.policy.evaluate(
            state_batch, reparam=self.reparam)
        """
        Including a separate function approximator for the soft value can stabilize training.
        """
        expected_value = self.value(state_batch)
        target_value = self.value_target(next_state_batch)
        next_q_value = self.scale_R * reward_batch + mask_batch * self.gamma * target_value  # scale_R * r(st,at) + γ * mask * V_target(st+1)
        """
        Soft Q-function parameters can be trained to minimize the soft Bellman residual
        JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1))
        """
        q1_value_loss = self.soft_q_criterion(expected_q1_value,
                                              next_q_value.detach())
        q2_value_loss = self.soft_q_criterion(expected_q2_value,
                                              next_q_value.detach())

        q1_new, q2_new = self.critic(state_batch, new_action)
        expected_new_q_value = torch.min(q1_new, q2_new)
        """
        Including a separate function approximator for the soft value can stabilize training and is convenient to 
        train simultaneously with the other networks
        Update the V towards the min of two Q-functions in order to reduce overestimation bias from function approximation error.
        JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - log π(at|st)]))^2]
        ∇JV = ∇V(st)(V(st) - Q(st,at) + logπ(at|st))
        """
        next_value = expected_new_q_value - log_prob
        value_loss = self.value_criterion(expected_value, next_value.detach())
        log_prob_target = expected_new_q_value - expected_value

        if self.reparam == True:
            """
            Reparameterization trick is used to get a low variance estimator
            f(εt;st) = action sampled from the policy
            εt is an input noise vector, sampled from some fixed distribution
            Jπ = 𝔼st∼D,εt∼N[logπ(f(εt;st)|st)−Q(st,f(εt;st))]
            ∇Jπ = ∇θ log π(at|st) + (∇at log π(at|st) − ∇at Q(st,at)) ∇θ f(εt;st)
            """
            policy_loss = (log_prob - expected_new_q_value).mean()
        else:
            policy_loss = (log_prob * (log_prob - log_prob_target).detach()
                           ).mean()  # likelihood ratio gradient estimator

        # Regularization Loss
        mean_loss = 0.001 * mean.pow(2).mean()
        std_loss = 0.001 * log_std.pow(2).mean()

        policy_loss += mean_loss + std_loss

        self.critic_optim.zero_grad()
        q1_value_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        q2_value_loss.backward()
        self.critic_optim.step()

        if self.deterministic == False:
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()
        """
        We update the target weights to match the current value function weights periodically
        Update target parameter after every n(args.target_update_interval) updates
        """
        if updates % self.target_update_interval == 0 and self.deterministic == True:
            soft_update(self.critic_target, self.critic, self.tau)
            return 0, q1_value_loss.item(), q2_value_loss.item(
            ), policy_loss.item()
        elif updates % self.target_update_interval == 0 and self.deterministic == False:
            soft_update(self.value_target, self.value, self.tau)
            return value_loss.item(), q1_value_loss.item(), q2_value_loss.item(
            ), policy_loss.item()

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None,
                   value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path,
                                                      value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(
            actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
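
select_action() and update_parameters() above assume GaussianPolicy.evaluate() returns a five-tuple (action, log_prob, x_t, mean, log_std), where x_t appears to be the pre-tanh sample. A minimal sketch of a policy with that interface (the hidden layout and the log-std clamping range are assumptions; only the return signature is taken from the calls above):

import torch
import torch.nn as nn
from torch.distributions import Normal


class GaussianPolicy(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(num_inputs, hidden_size), nn.ReLU(),
                                  nn.Linear(hidden_size, hidden_size), nn.ReLU())
        self.mean_linear = nn.Linear(hidden_size, num_actions)
        self.log_std_linear = nn.Linear(hidden_size, num_actions)

    def evaluate(self, state, reparam=False, epsilon=1e-6):
        h = self.body(state)
        mean = self.mean_linear(h)
        log_std = self.log_std_linear(h).clamp(-20, 2)
        std = log_std.exp()
        normal = Normal(mean, std)
        # rsample() keeps sampling differentiable (reparameterization trick).
        x_t = normal.rsample() if reparam else normal.sample()
        action = torch.tanh(x_t)
        # Change-of-variables correction for the tanh squashing.
        log_prob = normal.log_prob(x_t) - torch.log(1 - action.pow(2) + epsilon)
        log_prob = log_prob.sum(-1, keepdim=True)
        return action, log_prob, x_t, mean, log_std
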
コード例 #23
0
ファイル: sac1.py プロジェクト: haewngX/PytorchRL
class SAC(object):
    def __init__(self, config, env):
        self.device = config.device

        self.gamma = config.gamma  # discount factor

        self.tau = config.tau

        # learning rates
        self.value_lr = config.value_lr
        self.soft_q_lr = config.soft_q_lr
        self.policy_lr = config.policy_lr

        self.replace_target_iter = config.replace_target_iter  # target network update frequency
        self.replay_size = config.replay_size  # replay buffer size
        self.batch_size = config.batch_size  # batch size

        self.num_states = env.observation_space.shape[0]  # state space dimension
        self.num_actions = env.action_space.shape[0]  # action space dimension

        self.learn_start = self.batch_size * 3  # parameter controlling when learning starts

        self.learn_step_counter = 0  # total number of learning steps

        self.memory = ReplayMemory(self.replay_size)  # initialize the replay buffer

        # initialize the V network
        self.value_net = ValueNetwork(self.num_states, 256).to(self.device)
        # initialize the target V network
        self.target_value_net = ValueNetwork(self.num_states,
                                             256).to(self.device)

        # the target V network starts with the same parameters as the V network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param.data)

        # initialize the Q network
        self.soft_q_net = SoftQNetwork(self.num_states, self.num_actions,
                                       256).to(self.device)

        # initialize the policy network
        self.policy_net = PolicyNetwork(self.num_states, self.num_actions,
                                        256).to(self.device)

        # optimizers for training
        self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                          lr=self.value_lr)
        self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(),
                                           lr=self.soft_q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=self.policy_lr)

        # mean squared error loss functions
        self.value_criterion = nn.MSELoss()
        self.soft_q_criterion = nn.MSELoss()

    # store a transition
    def store_transition(self, state, action, reward, next_state, done):
        self.memory.push((state, action, reward, next_state, done))

    # choose an action
    def choose_action(self, s):
        s = torch.FloatTensor(s).to(self.device)
        mean, log_std = self.policy_net(s)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)

        action = action.detach().cpu().numpy()
        return action[0]

    # get an action and its log_prob
    def get_action_log_prob(self, s, epsilon=1e-6):
        mean, log_std = self.policy_net(s)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)

        log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
        log_prob = log_prob.sum(-1, keepdim=True)
        # log_prob = Normal(mean, std).log_prob(mean + std * z.to(self.device)) - torch.log(1 - action.pow(2) + epsilon)  # reparameterization

        return action, log_prob, z, mean, log_std

    # sample a batch from the replay buffer
    def get_batch(self):
        transitions, _, _ = self.memory.sample(self.batch_size)  # batch of transitions

        # unpack the batch
        # e.g. if zipped is [(1, 4), (2, 5), (3, 6)], zip(*zipped) unpacks it into [(1, 2, 3), (4, 5, 6)]
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
            *transitions)

        # convert the samples to tensors
        batch_state = torch.tensor(batch_state,
                                   device=self.device,
                                   dtype=torch.float)
        batch_action = torch.tensor(batch_action,
                                    device=self.device,
                                    dtype=torch.float).squeeze().view(
                                        -1, 1)  # view() reshapes into a column tensor
        batch_reward = torch.tensor(batch_reward,
                                    device=self.device,
                                    dtype=torch.float).squeeze().view(-1, 1)
        batch_next_state = torch.tensor(batch_next_state,
                                        device=self.device,
                                        dtype=torch.float)
        batch_done = torch.tensor(batch_done,
                                  device=self.device,
                                  dtype=torch.float).squeeze().view(-1, 1)
        # print("状态:", batch_state.shape) 128,4
        # print("动作:", batch_action.shape)
        # print("奖励:", batch_reward.shape)
        # print("done:", batch_done.shape)
        #
        return batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _

    # learn from a batch
    def learn(self):
        # get a batch of samples
        batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _ = self.get_batch(
        )

        # print("状态:", batch_state)
        # print("动作:", batch_action)
        # print("done:", batch_done)

        expected_q_value = self.soft_q_net(batch_state, batch_action)  # q(s,a)
        expected_value = self.value_net(batch_state)  # v(s)
        new_action, log_prob, z, mean, log_std = self.get_action_log_prob(
            batch_state)  # a~, log pi(a~|s), z (pre-tanh sample), mean, log_std

        target_value = self.target_value_net(batch_next_state)  # vtar(s')
        next_q_value = batch_reward + (
            1 -
            batch_done) * self.gamma * target_value  # r + gamma*(1-d)*vtar(s')
        q_value_loss = self.soft_q_criterion(expected_q_value,
                                             next_q_value.detach()).mean()

        expected_new_q_value = self.soft_q_net(batch_state,
                                               new_action)  # q(s,a~)
        next_value = expected_new_q_value - log_prob
        value_loss = self.value_criterion(expected_value,
                                          next_value.detach()).mean()

        log_prob_target = expected_new_q_value - expected_value  # q(s,a) - v(s)
        policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        self.soft_q_optimizer.zero_grad()
        q_value_loss.backward()
        self.soft_q_optimizer.step()

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)

        # increment the learning step counter
        self.learn_step_counter += 1

    # save the models
    def save(self):
        torch.save(self.soft_q_net, 'sac1_q.pkl')
        torch.save(self.value_net, 'sac1_v.pkl')
        torch.save(self.policy_net, 'sac1_policy.pkl')

    # load the models
    def load(self):
        self.soft_q_net = torch.load('sac1_q.pkl')
        self.value_net = torch.load('sac1_v.pkl')
        self.policy_net = torch.load('sac1_policy.pkl')
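
choose_action() and get_action_log_prob() above expect self.policy_net to return a (mean, log_std) pair. A minimal sketch of a policy network with that interface (layer sizes and the clamping range are assumptions):

import torch.nn as nn


class PolicyNetwork(nn.Module):
    def __init__(self, num_states, num_actions, hidden_size=256):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(num_states, hidden_size), nn.ReLU(),
                                  nn.Linear(hidden_size, hidden_size), nn.ReLU())
        self.mean_head = nn.Linear(hidden_size, num_actions)
        self.log_std_head = nn.Linear(hidden_size, num_actions)

    def forward(self, state):
        h = self.body(state)
        mean = self.mean_head(h)
        # Clamp log_std to keep the resulting Gaussian numerically well behaved.
        log_std = self.log_std_head(h).clamp(-20, 2)
        return mean, log_std
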
コード例 #24
0
ファイル: sac.py プロジェクト: km01/myrl
class SACV(object):
    def __init__(self, input_size, action_size, gamma, tau, alpha, hidden_size,
                 lr, device):

        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha
        self.device = device

        self.policy = Actor(input_size, hidden_size,
                            action_size).to(self.device)
        self.critic = Critic(input_size, hidden_size,
                             action_size).to(self.device)
        self.value = ValueNetwork(input_size, hidden_size).to(self.device)

        self.policy_optim = torch.optim.Adam(self.policy.parameters(), lr=lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=lr)
        self.value_optim = torch.optim.Adam(self.value.parameters(), lr=lr)

        self.value_target = copy.deepcopy(self.value)
        self.value_target.requires_grad_(False)

    def select_action(self, obs, sample=True):
        obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0)
        policy = TanhGaussian(*self.policy(obs))
        action = policy.sample(sample)
        action = action.detach().cpu().numpy()[0]
        return action

    def update_parameters(self, batch):

        obs, act, rew, done, obs_next = batch
        obs = torch.FloatTensor(obs).to(self.device)
        act = torch.FloatTensor(act).to(self.device)
        rew = torch.FloatTensor(rew).unsqueeze(-1).to(self.device)
        done = torch.BoolTensor(done).unsqueeze(-1).to(self.device)
        obs_next = torch.FloatTensor(obs_next).to(self.device)

        with torch.no_grad():
            next_v = self.value_target(obs_next).masked_fill(done, 0.)
            q_targ = rew + self.gamma * next_v

        self.critic_optim.zero_grad()
        q1, q2 = self.critic(obs, act)
        critic_loss = (q1 - q_targ).pow(2.).mul(0.5) + (
            q2 - q_targ).pow(2.).mul(0.5)
        critic_loss = critic_loss.mean()
        critic_loss.backward()
        self.critic_optim.step()
        with torch.no_grad():
            critic_loss = (torch.min(q1, q2) - q_targ).pow(2).mul(0.5).mean()

        self.policy_optim.zero_grad()
        policy = TanhGaussian(*self.policy(obs))
        action = policy.sample()
        log_pi = policy.log_prob(action, param_grad=False).sum(dim=-1,
                                                               keepdim=True)

        action_value = torch.min(*self.critic(obs, action))

        with torch.no_grad():
            v_targ = action_value - self.alpha * log_pi

        self.value_optim.zero_grad()
        v = self.value(obs)
        value_loss = (v - v_targ).pow(2.).mul(0.5)
        value_loss = value_loss.mean()
        value_loss.backward()
        self.value_optim.step()

        policy_loss = self.alpha * log_pi - action_value
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.policy_optim.step()

        soft_update(self.value_target, self.value, self.tau)
        loss_info = {
            'critic_loss': critic_loss.item(),
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'policy_entropy': -log_pi.mean().item()
        }
        return loss_info
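
SACV above wraps the policy output in TanhGaussian and calls sample() and log_prob(action, param_grad=False) on it. The real class comes from km01/myrl; the sketch below only mirrors the interface used above, assuming the policy outputs (mean, log_std) and that param_grad=False detaches the distribution parameters for that log-probability:

import torch
from torch.distributions import Normal


class TanhGaussian:
    def __init__(self, mean, log_std):
        self.mean = mean
        self.std = log_std.exp()

    def sample(self, sample=True):
        if not sample:
            # Deterministic action for evaluation.
            return torch.tanh(self.mean)
        # rsample() keeps the squashed sample differentiable w.r.t. the policy.
        return torch.tanh(Normal(self.mean, self.std).rsample())

    def log_prob(self, action, param_grad=True, epsilon=1e-6):
        mean = self.mean if param_grad else self.mean.detach()
        std = self.std if param_grad else self.std.detach()
        # Invert the squashing, then apply the tanh change-of-variables correction.
        pre_tanh = torch.atanh(action.clamp(-1 + epsilon, 1 - epsilon))
        return Normal(mean, std).log_prob(pre_tanh) - torch.log(1 - action.pow(2) + epsilon)
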