Example no. 1
    def __init__(self, state_dim, use_gpu=True):
        super(Value_Model, self).__init__()

        self.basic_model = Basic_Model().float().to(set_device(use_gpu))

        self.critic_layer = nn.Sequential(nn.Linear(128, 1)).float().to(
            set_device(use_gpu))
Example no. 2
    def __init__(self, use_gpu=True):
        super(Basic_Model, self).__init__()

        self.conv1 = nn.Sequential(
            DepthwiseSeparableConv2d(3, 16, kernel_size=3, stride=1,
                                     padding=1),
            nn.ReLU(),
            DepthwiseSeparableConv2d(16,
                                     32,
                                     kernel_size=4,
                                     stride=2,
                                     padding=1),
            nn.ReLU(),
            DepthwiseSeparableConv2d(32,
                                     64,
                                     kernel_size=4,
                                     stride=2,
                                     padding=1),
            nn.ReLU(),
        ).float().to(set_device(use_gpu))

        self.conv2 = nn.Sequential(
            DepthwiseSeparableConv2d(64,
                                     128,
                                     kernel_size=4,
                                     stride=2,
                                     padding=1),
            nn.ReLU(),
            DepthwiseSeparableConv2d(128,
                                     256,
                                     kernel_size=4,
                                     stride=2,
                                     padding=1),
            nn.ReLU(),
        ).float().to(set_device(use_gpu))

        self.conv3 = nn.Sequential(
            DepthwiseSeparableConv2d(64,
                                     128,
                                     kernel_size=8,
                                     stride=4,
                                     padding=2),
            nn.ReLU(),
            DepthwiseSeparableConv2d(128,
                                     256,
                                     kernel_size=3,
                                     stride=1,
                                     padding=1),
            nn.ReLU(),
        ).float().to(set_device(use_gpu))

        self.state_extractor = nn.Sequential(nn.Linear(1, 64),
                                             nn.ReLU()).float().to(
                                                 set_device(use_gpu))

        self.nn_layer = nn.Sequential(
            nn.Linear(320, 128),
            nn.ReLU(),
        ).float().to(set_device(use_gpu))
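The DepthwiseSeparableConv2d module used in the convolutional stacks above is not shown in these examples; a minimal stand-in sketch, assuming it factors a standard convolution into a depthwise convolution (groups=in_channels) followed by a 1x1 pointwise convolution, could look like this (hypothetical implementation, not from the source):

import torch.nn as nn

class DepthwiseSeparableConv2d(nn.Module):
    # Hypothetical stand-in: depthwise Conv2d followed by 1x1 pointwise Conv2d.
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super(DepthwiseSeparableConv2d, self).__init__()
        self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size,
                                   stride=stride, padding=padding, groups=in_channels)
        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        return self.pointwise(self.depthwise(x))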
    def __init__(self, state_dim, use_gpu = True):
      super(Value_Model, self).__init__()

      self.conv                 = CnnModel().float().to(set_device(use_gpu))
      self.memory_layer         = nn.LSTM(256, 256).float().to(set_device(use_gpu))

      self.state_extractor      = nn.Sequential( nn.Linear(2, 64), nn.ReLU() ).float().to(set_device(use_gpu))
      self.nn_layer             = nn.Sequential( nn.Linear(320, 64), nn.ReLU() ).float().to(set_device(use_gpu))

      self.critic_layer         = nn.Sequential( nn.Linear(64, 1) ).float().to(set_device(use_gpu))
    def __init__(self, state_dim, action_dim, use_gpu = True):
      super(Policy_Model, self).__init__()

      self.std                  = torch.FloatTensor([1.0, 0.5, 0.5]).to(set_device(use_gpu))

      self.conv                 = CnnModel().float().to(set_device(use_gpu))
      self.memory_layer         = nn.LSTM(256, 256).float().to(set_device(use_gpu))

      self.state_extractor      = nn.Sequential( nn.Linear(2, 64), nn.ReLU() ).float().to(set_device(use_gpu))      
      self.nn_layer             = nn.Sequential( nn.Linear(320, 64), nn.ReLU() ).float().to(set_device(use_gpu))

      self.critic_layer         = nn.Sequential( nn.Linear(64, 1) ).float().to(set_device(use_gpu))
      self.actor_tanh_layer     = nn.Sequential( nn.Linear(64, 1), nn.Tanh() ).float().to(set_device(use_gpu))
      self.actor_sigmoid_layer  = nn.Sequential( nn.Linear(64, 2), nn.Sigmoid() ).float().to(set_device(use_gpu))            
    def __init__(self, state_dim, action_dim, use_gpu=True):
        super(Policy_Model, self).__init__()

        self.nn_layer = nn.Sequential(nn.Linear(state_dim, 640), nn.ReLU(),
                                      nn.Linear(640,
                                                640), nn.ReLU()).float().to(
                                                    set_device(use_gpu))

        self.actor_layer = nn.Sequential(nn.Linear(640, action_dim),
                                         nn.Softmax(-1)).float().to(
                                             set_device(use_gpu))

        self.critic_layer = nn.Sequential(nn.Linear(640, 1)).float().to(
            set_device(use_gpu))
    def __init__(self, state_dim, use_gpu=True):
        super(Value_Model, self).__init__()

        self.nn_layer = nn.Sequential(nn.Linear(state_dim, 256), nn.ReLU(),
                                      nn.Linear(256, 128), nn.ReLU(),
                                      nn.Linear(128, 1)).float().to(
                                          set_device(use_gpu))
Example no. 7
    def logprob(self, datas, value_data):
        mean, std = datas

        distribution    = Normal(mean, std)
        old_logprob     = distribution.log_prob(value_data).float().to(set_device(self.use_gpu))

        return old_logprob - (1.0 - value_data.sigmoid().pow(2)).log()
    def kldivergence(self, datas1, datas2):
        alpha1, beta1 = datas1
        alpha2, beta2 = datas2

        distribution1 = Beta(alpha1, beta1)
        distribution2 = Beta(alpha2, beta2)
        return kl_divergence(distribution1, distribution2).float().to(set_device(self.use_gpu))
Example no. 9
    def kldivergence(self, datas1, datas2):
        mean1, std1 = datas1
        mean2, std2 = datas2

        distribution1 = Normal(mean1, std1)
        distribution2 = Normal(mean2, std2)
        return kl_divergence(distribution1, distribution2).float().to(set_device(self.use_gpu))
    def sample(self, datas):
        alpha, beta = datas

        distribution    = Beta(alpha, beta)
        action          = distribution.sample().float().to(set_device(self.use_gpu))

        return action
    def sample(self, datas):
        mean, std = datas

        distribution = MultivariateNormal(mean, std)
        action = distribution.sample().squeeze(0).float().to(
            set_device(self.use_gpu))
        return action
    def __init__(self, state_dim, action_dim, use_gpu=True):
        super(Q_Model, self).__init__()

        self.nn_layer = nn.Sequential(nn.Linear(state_dim + action_dim, 256),
                                      nn.ReLU(), nn.Linear(256, 64), nn.ReLU(),
                                      nn.Linear(64, 1)).float().to(
                                          set_device(use_gpu))
    def compute_loss(self, first_encoded, second_encoded):
        indexes = torch.arange(first_encoded.shape[0]).long().to(
            set_device(self.use_gpu))

        similarity = torch.nn.functional.cosine_similarity(
            first_encoded.unsqueeze(1), second_encoded.unsqueeze(0), dim=2)
        return torch.nn.functional.cross_entropy(similarity, indexes)
Example no. 14
    def __init__(self, state_dim, action_dim, use_gpu=True):
        super(Policy_Model, self).__init__()

        self.basic_model = Basic_Model().float().to(set_device(use_gpu))

        self.actor_tanh_layer = nn.Sequential(nn.Linear(128, 1),
                                              nn.Tanh()).float().to(
                                                  set_device(use_gpu))

        self.actor_sigmoid_layer = nn.Sequential(nn.Linear(128, 2),
                                                 nn.Sigmoid()).float().to(
                                                     set_device(use_gpu))

        self.critic_layer = nn.Sequential(nn.Linear(128, 1)).float().to(
            set_device(use_gpu))

        self.std = torch.FloatTensor([1.0, 0.5, 0.5]).to(set_device(use_gpu))
    def __init__(self, state_dim, action_dim, use_gpu=True):
        super(Policy_Model, self).__init__()

        self.nn_layer = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
        ).float().to(set_device(use_gpu))

        self.actor_layer = nn.Sequential(nn.Linear(128, action_dim),
                                         nn.Tanh()).float().to(
                                             set_device(use_gpu))

        self.critic_layer = nn.Sequential(nn.Linear(128, 1)).float().to(
            set_device(use_gpu))

        self.std = torch.FloatTensor([1.0]).to(set_device(use_gpu))
    def compute_loss(self, first_encoded, second_encoded):
        indexes     = torch.arange(first_encoded.shape[0]).long().to(set_device(self.use_gpu))   
        
        similarity  = torch.mm(first_encoded, second_encoded.t())
        
        loss1       = torch.nn.functional.cross_entropy(similarity, indexes)
        loss2       = torch.nn.functional.cross_entropy(similarity.t(), indexes)

        return (loss1 + loss2) / 2.0
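A minimal usage sketch for this symmetric contrastive loss, assuming first_encoded and second_encoded are L2-normalized projection batches whose i-th rows are positive pairs (names, shapes, and values here are illustrative, not from the source):

import torch
import torch.nn.functional as F

# Illustrative stand-ins: two views of the same 8 samples, projected to 64 dims.
first_encoded  = F.normalize(torch.randn(8, 64), dim=1)
second_encoded = F.normalize(torch.randn(8, 64), dim=1)

indexes    = torch.arange(first_encoded.shape[0]).long()
similarity = torch.mm(first_encoded, second_encoded.t())   # (8, 8) logits, diagonal = positive pairs

loss1 = F.cross_entropy(similarity, indexes)               # view 1 -> view 2 direction
loss2 = F.cross_entropy(similarity.t(), indexes)           # view 2 -> view 1 direction
loss  = (loss1 + loss2) / 2.0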
        
Example no. 17
    def __init__(self, state_dim, action_dim, use_gpu = True):
        super(Policy_Model, self).__init__()

        self.nn_layer = nn.Sequential(
          nn.Linear(state_dim, 256),
          nn.ReLU(),
          nn.Linear(256, 128),
          nn.ReLU(),
        ).float().to(set_device(use_gpu))

        self.actor_layer = nn.Sequential(
          nn.Linear(64, action_dim),
          nn.Tanh()
        ).float().to(set_device(use_gpu))

        self.actor_std_layer = nn.Sequential(
          nn.Linear(64, action_dim),
          nn.Sigmoid()
        ).float().to(set_device(use_gpu))
    def __init__(self, state_dim, action_dim, use_gpu=True):
        super(Policy_Model, self).__init__()

        self.nn_layer = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 96),
            nn.ReLU(),
        ).float().to(set_device(use_gpu))

        self.actor_alpha_layer = nn.Sequential(nn.Linear(32, action_dim),
                                               nn.Softplus()).float().to(
                                                   set_device(use_gpu))

        self.actor_beta_layer = nn.Sequential(nn.Linear(32, action_dim),
                                              nn.Softplus()).float().to(
                                                  set_device(use_gpu))

        self.critic_layer = nn.Sequential(nn.Linear(32, 1)).float().to(
            set_device(use_gpu))
    def __init__(self, state_dim, action_dim, use_gpu=True):
        super(PolicyModel, self).__init__()

        self.std = torch.FloatTensor([1.0, 0.5, 0.5]).to(set_device(use_gpu))

        self.state_extractor = nn.Sequential(nn.Linear(2, 32), nn.ReLU())
        self.image_extractor = nn.LSTM(128, 128)

        self.nn_layer = nn.Sequential(nn.Linear(160, 192), nn.ReLU())

        self.actor_steer = nn.Sequential(nn.Linear(64, 1), nn.Tanh())
        self.actor_gas_break = nn.Sequential(nn.Linear(64, 2), nn.Sigmoid())

        self.critic_layer = nn.Sequential(nn.Linear(64, 1))
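The head layers above take 64-dimensional inputs while nn_layer outputs 192 features; the forward pass is not shown in this snippet, but one plausible reading, assuming the trunk output is split evenly across the three heads, is sketched below (layer names follow the definitions above; the split itself is an assumption, not code from the source):

    # Hypothetical forward sketch: split the 192-dim trunk output into three 64-dim chunks,
    # one per head (steer, gas/brake, critic). Purely illustrative.
    def forward(self, image_feature, state):
        s          = self.state_extractor(state)               # (batch, 32)
        i, _       = self.image_extractor(image_feature)       # LSTM over (seq, batch, 128)
        x          = self.nn_layer(torch.cat([s, i[-1]], -1))  # (batch, 32 + 128) -> (batch, 192)
        x1, x2, x3 = x.chunk(3, -1)                            # three (batch, 64) chunks

        action = torch.cat([self.actor_steer(x1), self.actor_gas_break(x2)], -1)
        return (action, self.std), self.critic_layer(x3)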
Example no. 20
    def __init__(self,
                 soft_q1,
                 soft_q2,
                 policy,
                 state_dim,
                 action_dim,
                 distribution,
                 q_loss,
                 policy_loss,
                 memory,
                 soft_q_optimizer,
                 policy_optimizer,
                 is_training_mode=True,
                 batch_size=32,
                 epochs=1,
                 soft_tau=0.95,
                 folder='model',
                 use_gpu=True):

        self.batch_size = batch_size
        self.is_training_mode = is_training_mode
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.folder = folder
        self.use_gpu = use_gpu
        self.epochs = epochs
        self.soft_tau = soft_tau

        self.policy = policy
        self.soft_q1 = soft_q1
        self.soft_q2 = soft_q2

        self.target_policy = deepcopy(self.policy)
        self.target_soft_q2 = deepcopy(self.soft_q2)
        self.target_soft_q1 = deepcopy(self.soft_q1)

        self.distribution = distribution
        self.memory = memory

        self.qLoss = q_loss
        self.policyLoss = policy_loss

        self.device = set_device(self.use_gpu)
        self.q_update = 1

        self.soft_q_optimizer = soft_q_optimizer
        self.policy_optimizer = policy_optimizer

        self.soft_q_scaler = torch.cuda.amp.GradScaler()
        self.policy_scaler = torch.cuda.amp.GradScaler()
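The two torch.cuda.amp.GradScaler instances above suggest mixed-precision updates; the standard scaler pattern is shown below as a generic PyTorch AMP sketch (illustrative model, data, and optimizer, not code from this agent):

import torch
import torch.nn as nn

# Generic AMP training step; requires a CUDA device.
model     = nn.Linear(4, 1).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scaler    = torch.cuda.amp.GradScaler()                 # same API as soft_q_scaler / policy_scaler above

x, y = torch.randn(8, 4).cuda(), torch.randn(8, 1).cuda()

optimizer.zero_grad()
with torch.cuda.amp.autocast():                         # forward pass runs in mixed precision
    loss = nn.functional.mse_loss(model(x), y)
scaler.scale(loss).backward()                           # scale loss to avoid fp16 gradient underflow
scaler.step(optimizer)                                  # unscales gradients, then optimizer.step()
scaler.update()                                         # adjust the scale factor for the next iteration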
    def compute_loss(self, logits):
        indexes = torch.arange(logits.shape[0]).long().to(
            set_device(self.use_gpu))
        return torch.nn.functional.cross_entropy(logits, indexes)
Example no. 22
    def entropy(self, datas):
        mean, std = datas

        distribution = Normal(mean, std)
        return distribution.entropy().float().to(set_device(self.use_gpu))
Example no. 23
    def sample(self, datas):
        mean, std = datas

        distribution = Normal(torch.zeros_like(mean), torch.ones_like(std))
        rand = distribution.sample().float().to(set_device(self.use_gpu))
        return mean + std * rand
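This sample implementation is the reparameterization trick: noise is drawn from a fixed standard normal and then shifted and scaled by mean and std, so gradients can flow back through those parameters. A small self-contained check (illustrative tensors, not from the source):

import torch
from torch.distributions import Normal

mean = torch.zeros(3, requires_grad=True)
std  = torch.ones(3, requires_grad=True)

# Randomness comes from a parameter-free N(0, 1), so mean and std stay in the autograd graph.
rand   = Normal(torch.zeros_like(mean), torch.ones_like(std)).sample()
action = mean + std * rand

action.sum().backward()
print(mean.grad, std.grad)   # both populated; a direct Normal(mean, std).sample() would detach them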
Example no. 24
else:
    print('continuous')

if action_dim is None:
    action_dim = environment.get_action_dim()
print('action_dim: ', action_dim)

redis_obj = redis.Redis()

agent_memory = Policy_Memory(redis_obj, capacity=capacity)
runner_memory = Policy_Memory(redis_obj, capacity=capacity)
q_loss = Q_loss()
policy_loss = Policy_loss()

policy = Policy_Model(state_dim, action_dim,
                      use_gpu).float().to(set_device(use_gpu))
soft_q1 = Q_Model(state_dim, action_dim).float().to(set_device(use_gpu))
soft_q2 = Q_Model(state_dim, action_dim).float().to(set_device(use_gpu))
policy_optimizer = Adam(list(policy.parameters()), lr=learning_rate)
soft_q_optimizer = Adam(list(soft_q1.parameters()) +
                        list(soft_q2.parameters()),
                        lr=learning_rate)

agent = Agent(soft_q1, soft_q2, policy, state_dim, action_dim, q_loss,
              policy_loss, agent_memory, soft_q_optimizer, policy_optimizer,
              is_training_mode, batch_size, epochs, soft_tau, folder, use_gpu)

runner = Runner(
    agent, environment, runner_memory, is_training_mode, render,
    environment.is_discrete(), max_action, SummaryWriter(), n_plot_batch
)  # Runner(agent, environment, runner_memory, is_training_mode, render, n_update, environment.is_discrete(), max_action, SummaryWriter(), n_plot_batch) # [Runner.remote(i_env, render, training_mode, n_update, Wrapper.is_discrete(), agent, max_action, None, n_plot_batch) for i_env in env]
    def entropy(self, datas):
        alpha, beta = datas

        distribution = Beta(alpha, beta)
        return distribution.entropy().float().to(set_device(self.use_gpu))
if action_dim is None:
    action_dim = environment.get_action_dim()
print('action_dim: ', action_dim)

policy_dist = Policy_Dist(use_gpu)
advantage_function = Advantage_Function(gamma)
aux_ppg_memory = Aux_Memory()
ppo_memory = Policy_Memory()
runner_memory = Policy_Memory()
aux_clr_memory = Clr_Memory()
aux_ppg_loss = Aux_loss(policy_dist)
ppo_loss = Policy_loss(policy_dist, advantage_function, policy_kl_range,
                       policy_params, value_clip, vf_loss_coef, entropy_coef,
                       gamma)
aux_clr_loss = Clr_loss(use_gpu)

cnn = Cnn_Model().float().to(set_device(use_gpu))
policy = Policy_Model(state_dim, action_dim,
                      use_gpu).float().to(set_device(use_gpu))
value = Value_Model(state_dim).float().to(set_device(use_gpu))
projector = Projection_Model().float().to(set_device(use_gpu))
ppo_optimizer = Adam(list(policy.parameters()) + list(value.parameters()) +
                     list(cnn.parameters()),
                     lr=learning_rate)
aux_ppg_optimizer = Adam(list(policy.parameters()), lr=learning_rate)
aux_clr_optimizer = Adam(list(cnn.parameters()) + list(projector.parameters()),
                         lr=learning_rate)

agent = Agent(projector, cnn, policy, value, state_dim, action_dim,
              policy_dist, ppo_loss, aux_ppg_loss, aux_clr_loss, ppo_memory,
              aux_ppg_memory, aux_clr_memory, ppo_optimizer, aux_ppg_optimizer,
              aux_clr_optimizer, ppo_epochs, aux_ppg_epochs, aux_clr_epochs,
if action_dim is None:
    action_dim = environment.get_action_dim()
print('action_dim: ', action_dim)

redis_obj           = redis.Redis()
policy_dist         = Policy_Dist(use_gpu)

advantage_function  = Advantage_Function(gamma)
aux_ppg_loss        = Aux_loss(policy_dist)
aux_ppg_memory      = Aux_Memory()
ppo_loss            = Policy_loss(policy_dist, advantage_function, policy_kl_range, policy_params, value_clip, vf_loss_coef, entropy_coef, gamma)
ppo_memory          = Policy_Memory()
child_runner_memory = Policy_Memory()

policy              = Policy_Model(state_dim, action_dim, use_gpu).float().to(set_device(use_gpu))
value               = Value_Model(state_dim).float().to(set_device(use_gpu))
ppo_optimizer       = Adam(list(policy.parameters()) + list(value.parameters()), lr = learning_rate)        
aux_ppg_optimizer   = Adam(list(policy.parameters()), lr = learning_rate)

child_agent         = AgentChild(policy, value, state_dim, action_dim, policy_dist, ppo_loss, aux_ppg_loss, ppo_memory, aux_ppg_memory, 
                        ppo_optimizer, aux_ppg_optimizer, PPO_epochs, Aux_epochs, n_aux_update, is_training_mode, policy_kl_range, 
                        policy_params, value_clip, entropy_coef, vf_loss_coef, batch_size,  folder, use_gpu = True)

cql_memory          = Policy_Memory()
cql_loss            = Cql_loss()
offpolicy_loss      = OffPolicy_loss()
central_runner_memory   = Policy_Memory()

offpolicy           = Policy_Model(state_dim, action_dim, use_gpu).float().to(set_device(use_gpu))
soft_q1             = Q_Model(state_dim, action_dim).float().to(set_device(use_gpu))
    def logprob(self, datas, value_data):
        alpha, beta = datas

        distribution = Beta(alpha, beta)
        return distribution.log_prob(value_data).float().to(set_device(self.use_gpu))
    def logprob(self, datas, value_data):
        mean, std = datas

        distribution = MultivariateNormal(mean, std)
        return distribution.log_prob(value_data).float().to(
            set_device(self.use_gpu))
    def kldivergence(self, datas1, datas2):
        distribution1 = Categorical(datas1)
        distribution2 = Categorical(datas2)
        return kl_divergence(distribution1, distribution2).unsqueeze(1).float().to(set_device(self.use_gpu))
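A brief usage sketch for the categorical KL divergence above, assuming datas1 and datas2 are batches of action-probability rows (illustrative values, not from the source):

import torch
from torch.distributions import Categorical
from torch.distributions.kl import kl_divergence

# Two batches of action probabilities over 3 actions (each row sums to 1).
probs_old = torch.tensor([[0.7, 0.2, 0.1],
                          [0.3, 0.3, 0.4]])
probs_new = torch.tensor([[0.6, 0.3, 0.1],
                          [0.2, 0.5, 0.3]])

kl = kl_divergence(Categorical(probs_old), Categorical(probs_new)).unsqueeze(1)  # shape (2, 1)
print(kl)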