def __init__(self, state_dim, use_gpu=True):
    super(Value_Model, self).__init__()

    self.basic_model = Basic_Model().float().to(set_device(use_gpu))
    self.critic_layer = nn.Sequential(nn.Linear(128, 1)).float().to(set_device(use_gpu))
def __init__(self, use_gpu=True):
    super(Basic_Model, self).__init__()

    # Convolutional stem built from depthwise-separable convolutions
    self.conv1 = nn.Sequential(
        DepthwiseSeparableConv2d(3, 16, kernel_size=3, stride=1, padding=1), nn.ReLU(),
        DepthwiseSeparableConv2d(16, 32, kernel_size=4, stride=2, padding=1), nn.ReLU(),
        DepthwiseSeparableConv2d(32, 64, kernel_size=4, stride=2, padding=1), nn.ReLU(),
    ).float().to(set_device(use_gpu))

    # Two branches that both take the 64-channel stem output
    self.conv2 = nn.Sequential(
        DepthwiseSeparableConv2d(64, 128, kernel_size=4, stride=2, padding=1), nn.ReLU(),
        DepthwiseSeparableConv2d(128, 256, kernel_size=4, stride=2, padding=1), nn.ReLU(),
    ).float().to(set_device(use_gpu))

    self.conv3 = nn.Sequential(
        DepthwiseSeparableConv2d(64, 128, kernel_size=8, stride=4, padding=2), nn.ReLU(),
        DepthwiseSeparableConv2d(128, 256, kernel_size=3, stride=1, padding=1), nn.ReLU(),
    ).float().to(set_device(use_gpu))

    self.state_extractor = nn.Sequential(nn.Linear(1, 64), nn.ReLU()).float().to(set_device(use_gpu))

    self.nn_layer = nn.Sequential(
        nn.Linear(320, 128), nn.ReLU(),
    ).float().to(set_device(use_gpu))
def __init__(self, state_dim, use_gpu=True):
    super(Value_Model, self).__init__()

    self.conv = CnnModel().float().to(set_device(use_gpu))
    self.memory_layer = nn.LSTM(256, 256).float().to(set_device(use_gpu))
    self.state_extractor = nn.Sequential(nn.Linear(2, 64), nn.ReLU()).float().to(set_device(use_gpu))
    self.nn_layer = nn.Sequential(nn.Linear(320, 64), nn.ReLU()).float().to(set_device(use_gpu))
    self.critic_layer = nn.Sequential(nn.Linear(64, 1)).float().to(set_device(use_gpu))
def __init__(self, state_dim, action_dim, use_gpu=True):
    super(Policy_Model, self).__init__()

    self.std = torch.FloatTensor([1.0, 0.5, 0.5]).to(set_device(use_gpu))

    self.conv = CnnModel().float().to(set_device(use_gpu))
    self.memory_layer = nn.LSTM(256, 256).float().to(set_device(use_gpu))
    self.state_extractor = nn.Sequential(nn.Linear(2, 64), nn.ReLU()).float().to(set_device(use_gpu))
    self.nn_layer = nn.Sequential(nn.Linear(320, 64), nn.ReLU()).float().to(set_device(use_gpu))

    self.critic_layer = nn.Sequential(nn.Linear(64, 1)).float().to(set_device(use_gpu))
    self.actor_tanh_layer = nn.Sequential(nn.Linear(64, 1), nn.Tanh()).float().to(set_device(use_gpu))
    self.actor_sigmoid_layer = nn.Sequential(nn.Linear(64, 2), nn.Sigmoid()).float().to(set_device(use_gpu))
def __init__(self, state_dim, action_dim, use_gpu=True):
    super(Policy_Model, self).__init__()

    self.nn_layer = nn.Sequential(
        nn.Linear(state_dim, 640), nn.ReLU(),
        nn.Linear(640, 640), nn.ReLU(),
    ).float().to(set_device(use_gpu))

    self.actor_layer = nn.Sequential(nn.Linear(640, action_dim), nn.Softmax(-1)).float().to(set_device(use_gpu))
    self.critic_layer = nn.Sequential(nn.Linear(640, 1)).float().to(set_device(use_gpu))
def __init__(self, state_dim, use_gpu=True):
    super(Value_Model, self).__init__()

    self.nn_layer = nn.Sequential(
        nn.Linear(state_dim, 256), nn.ReLU(),
        nn.Linear(256, 128), nn.ReLU(),
        nn.Linear(128, 1),
    ).float().to(set_device(use_gpu))
def logprob(self, datas, value_data):
    mean, std = datas

    distribution = Normal(mean, std)
    old_logprob = distribution.log_prob(value_data).float().to(set_device(self.use_gpu))

    # Correct the Gaussian log-probability for the squashed action
    # (the correction term here is computed from the sigmoid of the action value)
    return old_logprob - (1.0 - value_data.sigmoid().pow(2)).log()
def kldivergence(self, datas1, datas2):
    alpha1, beta1 = datas1
    alpha2, beta2 = datas2

    distribution1 = Beta(alpha1, beta1)
    distribution2 = Beta(alpha2, beta2)
    return kl_divergence(distribution1, distribution2).float().to(set_device(self.use_gpu))
def kldivergence(self, datas1, datas2):
    mean1, std1 = datas1
    mean2, std2 = datas2

    distribution1 = Normal(mean1, std1)
    distribution2 = Normal(mean2, std2)
    return kl_divergence(distribution1, distribution2).float().to(set_device(self.use_gpu))
def sample(self, datas):
    alpha, beta = datas

    distribution = Beta(alpha, beta)
    action = distribution.sample().float().to(set_device(self.use_gpu))
    return action
def sample(self, datas):
    mean, std = datas

    distribution = MultivariateNormal(mean, std)
    action = distribution.sample().squeeze(0).float().to(set_device(self.use_gpu))
    return action
def __init__(self, state_dim, action_dim, use_gpu=True):
    super(Q_Model, self).__init__()

    self.nn_layer = nn.Sequential(
        nn.Linear(state_dim + action_dim, 256), nn.ReLU(),
        nn.Linear(256, 64), nn.ReLU(),
        nn.Linear(64, 1),
    ).float().to(set_device(use_gpu))
def compute_loss(self, first_encoded, second_encoded):
    indexes = torch.arange(first_encoded.shape[0]).long().to(set_device(self.use_gpu))

    # Pairwise cosine similarity between the two batches of encodings;
    # matching pairs sit on the diagonal, so the targets are the row indices
    similarity = torch.nn.functional.cosine_similarity(
        first_encoded.unsqueeze(1), second_encoded.unsqueeze(0), dim=2)
    return torch.nn.functional.cross_entropy(similarity, indexes)
def __init__(self, state_dim, action_dim, use_gpu=True):
    super(Policy_Model, self).__init__()

    self.basic_model = Basic_Model().float().to(set_device(use_gpu))

    self.actor_tanh_layer = nn.Sequential(nn.Linear(128, 1), nn.Tanh()).float().to(set_device(use_gpu))
    self.actor_sigmoid_layer = nn.Sequential(nn.Linear(128, 2), nn.Sigmoid()).float().to(set_device(use_gpu))
    self.critic_layer = nn.Sequential(nn.Linear(128, 1)).float().to(set_device(use_gpu))

    self.std = torch.FloatTensor([1.0, 0.5, 0.5]).to(set_device(use_gpu))
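# A minimal sketch of how the heads above could be wired in a forward pass.
# The actual forward() is not shown here, so the routing below (Basic_Model
# features feeding all three heads, actions concatenated as [steer, gas, brake],
# the fixed std paired with the action mean) is an assumption rather than the
# author's implementation.
def forward(self, image, state):
    feature = self.basic_model(image, state)          # assumed to be a 128-dim feature vector

    steer = self.actor_tanh_layer(feature)            # 1 value in [-1, 1]
    gas_brake = self.actor_sigmoid_layer(feature)     # 2 values in [0, 1]
    action_mean = torch.cat((steer, gas_brake), dim=-1)

    value = self.critic_layer(feature)
    return (action_mean, self.std), value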
def __init__(self, state_dim, action_dim, use_gpu=True):
    super(Policy_Model, self).__init__()

    self.nn_layer = nn.Sequential(
        nn.Linear(state_dim, 256), nn.ReLU(),
        nn.Linear(256, 128), nn.ReLU(),
    ).float().to(set_device(use_gpu))

    self.actor_layer = nn.Sequential(nn.Linear(128, action_dim), nn.Tanh()).float().to(set_device(use_gpu))
    self.critic_layer = nn.Sequential(nn.Linear(128, 1)).float().to(set_device(use_gpu))

    self.std = torch.FloatTensor([1.0]).to(set_device(use_gpu))
def compute_loss(self, first_encoded, second_encoded):
    indexes = torch.arange(first_encoded.shape[0]).long().to(set_device(self.use_gpu))

    # Dot-product similarity matrix; matching pairs lie on the diagonal
    similarity = torch.mm(first_encoded, second_encoded.t())

    # Symmetric cross-entropy over rows and columns
    loss1 = torch.nn.functional.cross_entropy(similarity, indexes)
    loss2 = torch.nn.functional.cross_entropy(similarity.t(), indexes)
    return (loss1 + loss2) / 2.0
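# A small standalone usage sketch of the symmetric loss above. The batch size,
# embedding size, and random encodings are hypothetical stand-ins for projector
# outputs; only the loss computation mirrors the code shown.
import torch

first_encoded = torch.randn(32, 128)    # e.g. projections of augmented view 1
second_encoded = torch.randn(32, 128)   # projections of augmented view 2

similarity = torch.mm(first_encoded, second_encoded.t())   # (32, 32)
targets = torch.arange(32).long()                          # positives on the diagonal
loss = (torch.nn.functional.cross_entropy(similarity, targets)
        + torch.nn.functional.cross_entropy(similarity.t(), targets)) / 2.0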
def __init__(self, state_dim, action_dim, use_gpu=True):
    super(Policy_Model, self).__init__()

    self.nn_layer = nn.Sequential(
        nn.Linear(state_dim, 256), nn.ReLU(),
        nn.Linear(256, 128), nn.ReLU(),
    ).float().to(set_device(use_gpu))

    self.actor_layer = nn.Sequential(nn.Linear(64, action_dim), nn.Tanh()).float().to(set_device(use_gpu))
    self.actor_std_layer = nn.Sequential(nn.Linear(64, action_dim), nn.Sigmoid()).float().to(set_device(use_gpu))
def __init__(self, state_dim, action_dim, use_gpu=True):
    super(Policy_Model, self).__init__()

    self.nn_layer = nn.Sequential(
        nn.Linear(state_dim, 128), nn.ReLU(),
        nn.Linear(128, 96), nn.ReLU(),
    ).float().to(set_device(use_gpu))

    self.actor_alpha_layer = nn.Sequential(nn.Linear(32, action_dim), nn.Softplus()).float().to(set_device(use_gpu))
    self.actor_beta_layer = nn.Sequential(nn.Linear(32, action_dim), nn.Softplus()).float().to(set_device(use_gpu))
    self.critic_layer = nn.Sequential(nn.Linear(32, 1)).float().to(set_device(use_gpu))
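# The backbone above ends in 96 units while each head expects a 32-dim input,
# which suggests the feature is split into three chunks before the heads. That
# split is not shown in the snippet, so the forward pass below is only an
# assumption about how the pieces fit together.
def forward(self, states):
    feature = self.nn_layer(states)                                     # (batch, 96)
    alpha_feat, beta_feat, critic_feat = torch.chunk(feature, 3, dim=-1)

    alpha = self.actor_alpha_layer(alpha_feat)   # Softplus keeps alpha positive
    beta = self.actor_beta_layer(beta_feat)      # Softplus keeps beta positive
    value = self.critic_layer(critic_feat)
    return (alpha, beta), value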
def __init__(self, state_dim, action_dim, use_gpu=True):
    super(PolicyModel, self).__init__()

    self.std = torch.FloatTensor([1.0, 0.5, 0.5]).to(set_device(use_gpu))

    self.state_extractor = nn.Sequential(nn.Linear(2, 32), nn.ReLU())
    self.image_extractor = nn.LSTM(128, 128)
    self.nn_layer = nn.Sequential(nn.Linear(160, 192), nn.ReLU())

    self.actor_steer = nn.Sequential(nn.Linear(64, 1), nn.Tanh())
    self.actor_gas_break = nn.Sequential(nn.Linear(64, 2), nn.Sigmoid())
    self.critic_layer = nn.Sequential(nn.Linear(64, 1))
def __init__(self, soft_q1, soft_q2, policy, state_dim, action_dim, distribution, q_loss, policy_loss,
             memory, soft_q_optimizer, policy_optimizer, is_training_mode=True, batch_size=32,
             epochs=1, soft_tau=0.95, folder='model', use_gpu=True):

    self.batch_size = batch_size
    self.is_training_mode = is_training_mode
    self.action_dim = action_dim
    self.state_dim = state_dim
    self.folder = folder
    self.use_gpu = use_gpu
    self.epochs = epochs
    self.soft_tau = soft_tau

    # Online networks and their target copies for soft (Polyak) updates
    self.policy = policy
    self.soft_q1 = soft_q1
    self.soft_q2 = soft_q2
    self.target_policy = deepcopy(self.policy)
    self.target_soft_q2 = deepcopy(self.soft_q2)
    self.target_soft_q1 = deepcopy(self.soft_q1)

    self.distribution = distribution
    self.memory = memory
    self.qLoss = q_loss
    self.policyLoss = policy_loss

    self.device = set_device(self.use_gpu)
    self.q_update = 1

    # Optimizers and mixed-precision gradient scalers
    self.soft_q_optimizer = soft_q_optimizer
    self.policy_optimizer = policy_optimizer
    self.soft_q_scaler = torch.cuda.amp.GradScaler()
    self.policy_scaler = torch.cuda.amp.GradScaler()
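# The agent keeps target copies of the policy and both Q networks together with a
# soft_tau coefficient, which points at Polyak averaging of the target parameters.
# The update itself is not shown above, so this is a sketch of one common
# convention (target <- tau * target + (1 - tau) * online); the author's code may
# use the opposite convention for tau.
def soft_update(self, online_net, target_net):
    for online_param, target_param in zip(online_net.parameters(), target_net.parameters()):
        target_param.data.copy_(
            self.soft_tau * target_param.data + (1.0 - self.soft_tau) * online_param.data
        )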
def compute_loss(self, logits):
    indexes = torch.arange(logits.shape[0]).long().to(set_device(self.use_gpu))
    return torch.nn.functional.cross_entropy(logits, indexes)
def entropy(self, datas):
    mean, std = datas

    distribution = Normal(mean, std)
    return distribution.entropy().float().to(set_device(self.use_gpu))
def sample(self, datas):
    mean, std = datas

    # Reparameterization trick: sample standard normal noise, then shift and scale
    distribution = Normal(torch.zeros_like(mean), torch.ones_like(std))
    rand = distribution.sample().float().to(set_device(self.use_gpu))
    return mean + std * rand
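# Unlike calling Normal(mean, std).sample() directly, the shift-and-scale form
# above keeps the sample differentiable with respect to mean and std. A tiny
# standalone check of that property (the tensors here are hypothetical inputs,
# not taken from the code above):
import torch

mean = torch.zeros(3, requires_grad=True)
std = torch.ones(3, requires_grad=True)
noise = torch.randn(3)              # stands in for the sampled standard normal
action = mean + std * noise
action.sum().backward()             # gradients flow back into mean and std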
else:
    print('continuous')

if action_dim is None:
    action_dim = environment.get_action_dim()
    print('action_dim: ', action_dim)

redis_obj = redis.Redis()
agent_memory = Policy_Memory(redis_obj, capacity=capacity)
runner_memory = Policy_Memory(redis_obj, capacity=capacity)

q_loss = Q_loss()
policy_loss = Policy_loss()

policy = Policy_Model(state_dim, action_dim, use_gpu).float().to(set_device(use_gpu))
soft_q1 = Q_Model(state_dim, action_dim).float().to(set_device(use_gpu))
soft_q2 = Q_Model(state_dim, action_dim).float().to(set_device(use_gpu))

policy_optimizer = Adam(list(policy.parameters()), lr=learning_rate)
soft_q_optimizer = Adam(list(soft_q1.parameters()) + list(soft_q2.parameters()), lr=learning_rate)

agent = Agent(soft_q1, soft_q2, policy, state_dim, action_dim, q_loss, policy_loss, agent_memory,
              soft_q_optimizer, policy_optimizer, is_training_mode, batch_size, epochs, soft_tau,
              folder, use_gpu)

runner = Runner(agent, environment, runner_memory, is_training_mode, render,
                environment.is_discrete(), max_action, SummaryWriter(), n_plot_batch)
def entropy(self, datas):
    alpha, beta = datas

    distribution = Beta(alpha, beta)
    return distribution.entropy().float().to(set_device(self.use_gpu))
action_dim = environment.get_action_dim()
print('action_dim: ', action_dim)

policy_dist = Policy_Dist(use_gpu)
advantage_function = Advantage_Function(gamma)

aux_ppg_memory = Aux_Memory()
ppo_memory = Policy_Memory()
runner_memory = Policy_Memory()
aux_clr_memory = Clr_Memory()

aux_ppg_loss = Aux_loss(policy_dist)
ppo_loss = Policy_loss(policy_dist, advantage_function, policy_kl_range, policy_params,
                       value_clip, vf_loss_coef, entropy_coef, gamma)
aux_clr_loss = Clr_loss(use_gpu)

cnn = Cnn_Model().float().to(set_device(use_gpu))
policy = Policy_Model(state_dim, action_dim, use_gpu).float().to(set_device(use_gpu))
value = Value_Model(state_dim).float().to(set_device(use_gpu))
projector = Projection_Model().float().to(set_device(use_gpu))

ppo_optimizer = Adam(list(policy.parameters()) + list(value.parameters()) + list(cnn.parameters()), lr=learning_rate)
aux_ppg_optimizer = Adam(list(policy.parameters()), lr=learning_rate)
aux_clr_optimizer = Adam(list(cnn.parameters()) + list(projector.parameters()), lr=learning_rate)

agent = Agent(projector, cnn, policy, value, state_dim, action_dim, policy_dist, ppo_loss,
              aux_ppg_loss, aux_clr_loss, ppo_memory, aux_ppg_memory, aux_clr_memory,
              ppo_optimizer, aux_ppg_optimizer, aux_clr_optimizer,
              ppo_epochs, aux_ppg_epochs, aux_clr_epochs,
if action_dim is None:
    action_dim = environment.get_action_dim()
    print('action_dim: ', action_dim)

redis_obj = redis.Redis()

policy_dist = Policy_Dist(use_gpu)
advantage_function = Advantage_Function(gamma)

aux_ppg_loss = Aux_loss(policy_dist)
aux_ppg_memory = Aux_Memory()
ppo_loss = Policy_loss(policy_dist, advantage_function, policy_kl_range, policy_params,
                       value_clip, vf_loss_coef, entropy_coef, gamma)
ppo_memory = Policy_Memory()
child_runner_memory = Policy_Memory()

policy = Policy_Model(state_dim, action_dim, use_gpu).float().to(set_device(use_gpu))
value = Value_Model(state_dim).float().to(set_device(use_gpu))

ppo_optimizer = Adam(list(policy.parameters()) + list(value.parameters()), lr=learning_rate)
aux_ppg_optimizer = Adam(list(policy.parameters()), lr=learning_rate)

child_agent = AgentChild(policy, value, state_dim, action_dim, policy_dist, ppo_loss, aux_ppg_loss,
                         ppo_memory, aux_ppg_memory, ppo_optimizer, aux_ppg_optimizer,
                         PPO_epochs, Aux_epochs, n_aux_update, is_training_mode, policy_kl_range,
                         policy_params, value_clip, entropy_coef, vf_loss_coef, batch_size, folder,
                         use_gpu=True)

cql_memory = Policy_Memory()
cql_loss = Cql_loss()
offpolicy_loss = OffPolicy_loss()
central_runner_memory = Policy_Memory()

offpolicy = Policy_Model(state_dim, action_dim, use_gpu).float().to(set_device(use_gpu))
soft_q1 = Q_Model(state_dim, action_dim).float().to(set_device(use_gpu))
def logprob(self, datas, value_data):
    alpha, beta = datas

    distribution = Beta(alpha, beta)
    return distribution.log_prob(value_data).float().to(set_device(self.use_gpu))
def logprob(self, datas, value_data):
    mean, std = datas

    distribution = MultivariateNormal(mean, std)
    return distribution.log_prob(value_data).float().to(set_device(self.use_gpu))
def kldivergence(self, datas1, datas2):
    distribution1 = Categorical(datas1)
    distribution2 = Categorical(datas2)
    return kl_divergence(distribution1, distribution2).unsqueeze(1).float().to(set_device(self.use_gpu))