def load_models(self, episode):
    """
    Loads the target actor and critic models and copies them onto the actor and critic models.
    :param episode: the count of episodes iterated (used to find the file name)
    :return:
    ## NOTE: SOURCE MODELS ARE SAVED NOW, NOT TARGET
    """
    self.actor.load_state_dict(
        torch.load('./Models/' + str(episode) + '_actor_' + str(version) + '.pt'))
    self.critic.load_state_dict(
        torch.load('./Models/' + str(episode) + '_critic_' + str(version) + '.pt'))
    utils.hard_update(self.target_actor, self.actor)
    utils.hard_update(self.target_critic, self.critic)
    print('Models loaded successfully')
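# The snippets in this section all rely on a `hard_update` helper (from `utils` or a
# local module) to copy parameters between source and target networks. Its code is not
# shown here; a minimal sketch of what such a helper typically looks like (an
# assumption, not the implementation used above) is:
def hard_update(target, source):
    """Copy every parameter of `source` into `target` (used to initialize or resync target networks)."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)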
def __init__(self, num_inputs, action_space, args):
    self.num_inputs = num_inputs
    self.action_space = action_space.shape[0]
    self.gamma = args.gamma
    self.tau = args.tau
    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning

    self.device = torch.device("cuda" if args.cuda else "cpu")

    self.critic = QNetwork(self.num_inputs, self.action_space, args.hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

    if self.policy_type == "Gaussian":
        self.alpha = args.alpha
        # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(self.num_inputs, self.action_space, args.hidden_size).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        self.value = ValueNetwork(self.num_inputs, args.hidden_size).to(self.device)
        self.value_target = ValueNetwork(self.num_inputs, args.hidden_size).to(self.device)
        self.value_optim = Adam(self.value.parameters(), lr=args.lr)
        hard_update(self.value_target, self.value)
    else:
        self.policy = DeterministicPolicy(self.num_inputs, self.action_space, args.hidden_size).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        self.critic_target = QNetwork(self.num_inputs, self.action_space, args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)
def __init__(self, num_inputs, action_space, args):
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning

    self.device = torch.device("cuda" if args.cuda else "cpu")

    # Q network, which yields a value for each (a_t | s_t) pair
    self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

    # A replica of the Q network: because of the recursive Bellman definition the
    # Q network learns from its own estimates, which is unstable without a separate target.
    self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
    # The starting point is the same weights in both networks.
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # todo: work through the automatic alpha update
        # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        # Instantiate the policy - given a state it produces probabilities for actions
        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        # todo: what is the difference between the deterministic and Gaussian policies?
        self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
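# The "automatic alpha update" flagged in the todo above happens later, in the update
# step rather than in __init__. A minimal sketch of that step, assuming `log_pi` holds
# the log-probability of the freshly sampled action (variable name is illustrative):
if self.automatic_entropy_tuning:
    # alpha is adjusted so the policy entropy tracks target_entropy = -dim(A)
    alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
    self.alpha_optim.zero_grad()
    alpha_loss.backward()
    self.alpha_optim.step()
    self.alpha = self.log_alpha.exp()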
def load_models(self, name, test=False):
    """
    Loads the target actor and critic models and copies them onto the actor and critic models.
    """
    saved_state = torch.load(name, map_location=lambda storage, loc: storage)
    if test:
        saved_state = {
            name: param
            for name, param in saved_state.items() if 'actor' in name
        }
        strict = False
    else:
        strict = True
    self.AC_T.load_state_dict(saved_state, strict=strict)
    utils.hard_update(self.AC, self.AC_T)
    print('Models loaded successfully')
def __init__(self, num_inputs, action_space, args):
    self.gamma = args.gamma  # γ
    self.tau = args.tau  # τ
    self.alpha = args.alpha  # α
    self.policy_type = args.policy  # policy type: Gaussian stochastic policy or deterministic policy
    self.target_update_interval = args.target_update_interval  # target network update interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning  # automatic entropy tuning

    self.device = torch.device("cuda" if args.cuda else "cpu")

    self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)  # critic network (Q network)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

    self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)  # target Q network
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning:
            # torch.prod(input): returns the product of all elements
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
def __init__(self, num_inputs, action_space, args): self.gamma = args.gamma self.tau = args.tau self.alpha = args.alpha self.policy_type = args.policy self.target_update_interval = args.target_update_interval self.automatic_entropy_tuning = args.automatic_entropy_tuning self.device = torch.device("cuda" if args.cuda else "cpu") #Similar to Double-QNetwork self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device) self.critic_optim = Adam(self.critic.parameters(), lr=args.lr) self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device) hard_update(self.critic_target, self.critic) #The two networks are with the same initialization #Two option policy, stochastic(Gaussian) or Deterministic if self.policy_type == "Gaussian": # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper if self.automatic_entropy_tuning is True: self.target_entropy = -torch.prod( torch.Tensor(action_space.shape).to(self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optim = Adam([self.log_alpha], lr=args.lr) self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device) self.policy_optim = Adam(self.policy.parameters(), lr=args.lr) else: self.alpha = 0 self.automatic_entropy_tuning = False self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device) self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
def __init__(self, num_frame_obs, num_goal_obs, num_vel_obs, action_space, args):
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning

    # self.device = torch.device("cuda" if args.cuda else "cpu")
    self.device = torch.device("cuda")

    self.action_space_array = np.array(action_space)
    self.action_space = action_space

    self.critic_1 = QNetwork_1(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space.shape[0], args.hidden_size).to(device=self.device)
    self.critic_1_optim = Adam(self.critic_1.parameters(), lr=args.lr)
    self.critic_1_target = QNetwork_1(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space.shape[0], args.hidden_size).to(self.device)
    hard_update(self.critic_1_target, self.critic_1)

    self.critic_2 = QNetwork_2(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space.shape[0], args.hidden_size).to(device=self.device)
    self.critic_2_optim = Adam(self.critic_2.parameters(), lr=args.lr)
    self.critic_2_target = QNetwork_2(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space.shape[0], args.hidden_size).to(self.device)
    hard_update(self.critic_2_target, self.critic_2)

    if self.policy_type == "Gaussian":
        if self.automatic_entropy_tuning is True:
            self.target_entropy = -torch.prod(torch.Tensor(self.action_space_array.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        # self.policy = GaussianPolicy(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space_array.shape[0], args.hidden_size, self.action_space_array).to(self.device)
        self.policy = GaussianPolicy(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space.shape[0], args.hidden_size, self.action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(num_frame_obs, num_goal_obs, num_vel_obs, self.action_space.shape[0], args.hidden_size, self.action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
def __init__(self, num_inputs, action_space, args):
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning

    tmp_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.device = torch.device(tmp_device)

    self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

    self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
def __init__(self, act_sp, ob_sp, all_obs, all_acts, hidden_dim=64,
             start_steps=10000, update_after=1000, update_every=50):
    self.lr = 1e-2
    self.target_noise = 0.2
    self.target_noise_clip = 0.3
    self.act_noise = 0.1
    self.act_sp = act_sp
    self.ob_sp = ob_sp
    self.start_steps = start_steps
    self.update_after = update_after
    self.update_every = update_every
    # self.start_steps = 1
    # self.update_after = 1
    # self.update_every = 2
    print(f"act_sp: {act_sp}, ob_sp: {ob_sp}, all_obs: {all_obs}, all_acts: {all_acts}")

    self.policy = MLPNetwork(ob_sp, act_sp, constrain_out=True, discrete_action=False,
                             td3_policy=True, hidden_dim=hidden_dim).to(device)
    self.policy_targ = MLPNetwork(ob_sp, act_sp, constrain_out=True, discrete_action=False,
                                  td3_policy=True, hidden_dim=hidden_dim).to(device)

    self.q_nets_n = 2
    self.qnets = []
    self.qnet_targs = []
    self.q_optimizers = []
    for i in range(self.q_nets_n):
        qnet = MLPNetwork(all_obs + all_acts, 1, constrain_out=False, hidden_dim=hidden_dim).to(device)
        qnet_targ = MLPNetwork(all_obs + all_acts, 1, constrain_out=False, hidden_dim=hidden_dim).to(device)
        qnet.to(device)
        qnet_targ.to(device)
        hard_update(qnet_targ, qnet)
        self.qnets.append(qnet)
        self.qnet_targs.append(qnet_targ)
        self.q_optimizers.append(optim.Adam(qnet.parameters(), lr=self.lr))

    self.policy.to(device)
    self.p_optimizer = optim.Adam(self.policy.parameters(), lr=self.lr)
    self.policy_targ.to(device)
    self.p_targ_optimizer = optim.Adam(self.policy_targ.parameters(), lr=self.lr)
    self.action_count = 0
    self.use_warmup = True
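# The target_noise and target_noise_clip attributes above are typically consumed in the
# TD3 target computation ("target policy smoothing"). A hedged sketch of that use,
# assuming a batch tensor `next_obs` and the `self.policy_targ` network defined above:
with torch.no_grad():
    next_actions = self.policy_targ(next_obs)
    noise = (torch.randn_like(next_actions) * self.target_noise).clamp(
        -self.target_noise_clip, self.target_noise_clip)
    next_actions = (next_actions + noise).clamp(-1.0, 1.0)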
def __init__(self, state_size, action_size, tau, lr_actor, lr_critic, num_agents,
             agent_idx, seed, device, gamma, tensorboard_writer=None):
    self.state_size = state_size
    self.action_size = action_size
    self.tau = tau
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.num_agents = num_agents
    self.agent_idx = agent_idx
    self.seed = seed
    self.device = device
    self.gamma = gamma
    random.seed(seed)
    self.tensorboard_writer = tensorboard_writer

    self.actor_local = Actor(state_size, action_size, seed)
    self.actor_target = Actor(state_size, action_size, seed)

    critic_state_size = (state_size + action_size) * num_agents
    self.critic_local = Critic(critic_state_size, seed)
    self.critic_target = Critic(critic_state_size, seed)

    hard_update(self.actor_local, self.actor_target)
    hard_update(self.critic_local, self.critic_target)

    self.actor_optim = torch.optim.Adam(self.actor_local.parameters(), lr=lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_local.parameters(), lr=lr_critic)
    self.noise = OUNoise(action_size, seed)
    self.iteration = 0
def __init__(self, num_inputs, action_space, variant):
    self.gamma = variant['gamma']
    self.tau = variant['tau']
    self.alpha = variant['alpha']
    self.policy_type = variant['policy_type']
    self.target_update_interval = variant['target_update_interval']
    self.automatic_entropy_tuning = variant['automatic_entropy_tuning']
    self.lr = variant.get("lr", 1e-3)

    self.device = torch.device("cuda" if variant['cuda'] else "cpu")
    self.hidden_size = variant.get('hidden_size', [128, 128])

    self.critic = QNetwork(num_inputs, action_space.shape[0], self.hidden_size).to(self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

    self.critic_target = QNetwork(num_inputs, action_space.shape[0], self.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == 'Gaussian':
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], self.hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], self.hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)
def __init__(self, act_sp, ob_sp, all_obs, all_acts, hidden_dim=64):
    self.act_sp = act_sp
    self.ob_sp = ob_sp
    # print(ob_sp)
    print(f"ob_sp: {ob_sp} act_sp: {act_sp}")

    self.policy = MLPNetwork(ob_sp, act_sp, constrain_out=True, hidden_dim=hidden_dim).to(device)
    self.policy_targ = MLPNetwork(ob_sp, act_sp, constrain_out=True, hidden_dim=hidden_dim).to(device)
    self.qnet = MLPNetwork(all_obs + all_acts, 1, constrain_out=False, hidden_dim=hidden_dim).to(device)
    self.qnet_targ = MLPNetwork(all_obs + all_acts, 1, constrain_out=False, hidden_dim=hidden_dim).to(device)

    self.policy.to(device)
    self.qnet.to(device)
    self.policy_targ.to(device)
    self.qnet_targ.to(device)

    hard_update(self.policy_targ, self.policy)
    hard_update(self.qnet_targ, self.qnet)

    self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
    self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR)
def __init__(self, obs_space, action_space, ram, writer, device, args):
    """
    :param obs_space: environment observation space
    :param action_space: environment action space
    :param ram: replay memory buffer object
    :return:
    """
    self.state_dim = obs_space.shape[0]
    self.action_dim = action_space.shape[0]
    self.action_high = action_space.high
    self.action_low = action_space.low
    self.ram = ram
    self.iter = 1
    self.steps = 0
    self.gamma = args.gamma
    self.batch_size = args.batch_size
    self.tau = args.tau
    self.decay_rate = args.decay_rate
    self.eps_start = args.eps_start
    self.eps_end = args.eps_end
    self.eps_decay = args.eps_decay
    self.start_step = args.start_learning
    self.device = device
    self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim)
    self.writer = writer
    self.args = args

    # init networks
    target_net = DDPG(obs_space.shape, self.action_dim, args).to(device)
    learn_net = DDPG(obs_space.shape, self.action_dim, args).to(device)
    utils.hard_update(target_net, learn_net)

    self.AC = learn_net
    self.AC_T = target_net
    self.actor_optimizer = torch.optim.Adam(self.AC.actor.policyNet.parameters(), args.lr_a)
    self.critic_optimizer = torch.optim.Adam(self.AC.critic.parameters(), args.lr_c)
    self.actor = self.AC.actor
    self.target_actor = self.AC_T.actor
    self.critic = self.AC.critic
    self.target_critic = self.AC_T.critic
def __init__(self, config):
    self.config = config

    self.online_actor = config.actor_fn().to(self.config.device)
    self.target_actor = config.actor_fn().to(self.config.device)
    self.online_actor_opt = config.actor_opt_fn(self.online_actor.parameters())

    self.online_critic = config.critic_fn().to(self.config.device)
    self.target_critic = config.critic_fn().to(self.config.device)
    self.online_critic_opt = config.critic_opt_fn(self.online_critic.parameters())

    self.noises = [config.noise_fn() for _ in range(self.config.num_agents)]
    self.replay = config.replay_fn()

    hard_update(self.target_actor, self.online_actor)    # initialize to be equal
    hard_update(self.target_critic, self.online_critic)  # initialize to be equal
def __init__(self, input_space, action_space, args):
    self.use_expert = args.use_expert
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.action_range = [action_space.low, action_space.high]
    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning

    # self.device = torch.device("cuda" if args.cuda else "cpu")
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # print(torch.cuda.is_available())
    # print(torch.cuda.current_device())
    # print(torch.cuda.device(0))
    # print(torch.cuda.device_count())
    # print(torch.cuda.get_device_name())
    # print(torch.backends.cudnn.version())
    # print(torch.backends.cudnn.is_available())

    self.critic = QNetwork(input_space, action_space.shape[0], args.hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

    self.critic_target = QNetwork(input_space, action_space.shape[0], args.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning is True:
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(input_space, action_space.shape[0], args.hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    else:
        raise ValueError("Policy types other than Gaussian are not supported yet.")
def __init__(self, obs_space, action_space, ram):
    self.obs_dim = obs_space.shape[0]
    self.act_dim = action_space.n  # only for discrete action spaces
    self.ram = ram
    self.iter = 1
    self.steps = 0
    self.gamma = 0.90
    self.batch_size = 64
    self.initial_e = 0.5
    self.end_e = 0.01
    self.e = self.initial_e
    self.target_update_freq = 100
    self.tau = 0.01
    self.lr = 0.001

    self.learning_net = DQN_Model.DQN(self.obs_dim, self.act_dim).cuda()
    self.target_net = DQN_Model.DQN(self.obs_dim, self.act_dim).cuda()
    utils.hard_update(self.target_net, self.learning_net)

    self.optimizer = torch.optim.Adam(self.learning_net.parameters(), self.lr)
    self.loss_f = nn.MSELoss()
def __init__(self, state_size, action_size, num_agents, lr_actor=1.0e-4, lr_critic=1.0e-3):
    super(DDPGAgent, self).__init__()

    self.actor = Actor(state_size, action_size).to(DEVICE)
    self.critic = Critic(state_size, action_size, num_agents).to(DEVICE)
    self.target_actor = Actor(state_size, action_size).to(DEVICE)
    self.target_critic = Critic(state_size, action_size, num_agents).to(DEVICE)

    self.noise = OUNoise(action_size, scale=1.0)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic)
def __init__(self, num_inputs, action_space,
             device, hidden_size, lr, gamma, tau, alpha):
    self.gamma = gamma
    self.tau = tau
    self.alpha = alpha
    self.device = device

    self.critic = QNetwork(num_inputs, action_space.shape[0], hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=lr)

    self.critic_target = QNetwork(num_inputs, action_space.shape[0], hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
    self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
    self.alpha_optim = Adam([self.log_alpha], lr=lr)

    self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                 hidden_size, action_space).to(self.device)
    self.policy_optim = Adam(self.policy.parameters(), lr=lr)
def __init__(self, obs_space, action_space, ram):
    self.obs_dim = obs_space.shape[0]
    self.act_dim = action_space.shape[0]  # only for continuous action spaces
    # just for one action
    self.action_low = action_space.low[0]
    self.action_high = action_space.high[0]
    self.ram = ram
    self.iter = 1
    self.steps = 0
    self.gamma = 0.9
    self.batch_size = 64
    self.initial_e = 0.5
    self.end_e = 0.01
    self.e = self.initial_e
    self.start_training = 100
    self.tau = 0.01
    self.critic_lr = 0.001
    self.actor_lr = 0.001
    self.noise = utils.RandomActionNoise(self.act_dim)

    target_net = DDPG_Model.DDPG(self.obs_dim, self.act_dim).cuda()
    learning_net = DDPG_Model.DDPG(self.obs_dim, self.act_dim).cuda()
    utils.hard_update(target_net, learning_net)

    self.AC = learning_net
    self.AC_T = target_net
    self.actor = self.AC.actor
    self.critic = self.AC.critic
    self.actor_T = self.AC_T.actor
    self.critic_T = self.AC_T.critic

    self.actor_optimizer = torch.optim.Adam(self.AC.actor.parameters(), self.actor_lr)
    self.critic_optimizer = torch.optim.Adam(self.AC.critic.parameters(), self.critic_lr)
    self.loss_f = nn.MSELoss()
def __init__(self, s_dim, a_dim, n_agents, **kwargs):
    self.s_dim = s_dim
    self.a_dim = a_dim
    self.config = kwargs['config']
    self.n_agents = n_agents
    self.device = 'cuda' if self.config.use_cuda else 'cpu'

    # Networks
    self.policy = Actor(s_dim, a_dim, n_agents)
    self.policy_target = Actor(s_dim, a_dim, n_agents)
    self.critic = Critic(s_dim, a_dim, n_agents)
    self.critic_target = Critic(s_dim, a_dim, n_agents)

    if self.config.use_cuda:
        self.policy.cuda()
        self.policy_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.config.a_lr)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.c_lr)

    hard_update(self.policy, self.policy_target)
    hard_update(self.critic, self.critic_target)

    self.random_process = OrnsteinUhlenbeckProcess(size=self.a_dim,
                                                   theta=self.config.ou_theta,
                                                   mu=self.config.ou_mu,
                                                   sigma=self.config.ou_sigma)
    self.replay_buffer = list()
    self.epsilon = 1.
    self.depsilon = self.epsilon / self.config.epsilon_decay

    self.c_loss = None
    self.a_loss = None
    self.action_log = list()
def __init__(self, nb_states, nb_actions, args):
    self.nb_states = nb_states
    self.nb_actions = nb_actions
    self.discrete = args.discrete

    net_config = {
        'hidden1': args.hidden1,
        'hidden2': args.hidden2
    }

    # Actor and critic initialization
    self.actor = Actor(self.nb_states, self.nb_actions, **net_config)
    self.actor_target = Actor(self.nb_states, self.nb_actions, **net_config)
    self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr)

    self.critic = Critic(self.nb_states, self.nb_actions, **net_config)
    self.critic_target = Critic(self.nb_states, self.nb_actions, **net_config)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr)

    hard_update(self.critic_target, self.critic)
    hard_update(self.actor_target, self.actor)

    # Replay buffer and noise
    self.memory = ReplayBuffer(args.memory_size)
    self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(nb_actions),
                                          sigma=float(0.2) * np.ones(nb_actions))
    self.last_state = None
    self.last_action = None

    # Hyperparameters
    self.batch_size = args.batch_size
    self.tau = args.tau
    self.discount = args.discount

    # CUDA
    self.use_cuda = args.cuda
    if self.use_cuda:
        self.cuda()
def __init__(self, state_dim, action_dim, ram):
    """
    Initialize actor and critic networks.
    """
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.ram = ram
    self.iter = 0
    self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim)

    self.actor = model.Actor(self.state_dim, self.action_dim)
    self.target_actor = model.Actor(self.state_dim, self.action_dim)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), LEARNING_RATE)

    self.critic = model.Critic(self.state_dim, self.action_dim)
    self.target_critic = model.Critic(self.state_dim, self.action_dim)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), LEARNING_RATE)

    # copy parameters to target networks
    utils.hard_update(self.target_actor, self.actor)
    utils.hard_update(self.target_critic, self.critic)
def __init__(self, gamma, tau, hidden_size, num_inputs, action_space):
    self.num_inputs = num_inputs
    self.action_space = action_space

    self.actor = Actor(hidden_size, self.num_inputs, self.action_space)
    self.actor_target = Actor(hidden_size, self.num_inputs, self.action_space)
    self.actor_perturbed = Actor(hidden_size, self.num_inputs, self.action_space)
    self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

    self.critic = Critic(hidden_size, self.num_inputs, self.action_space)
    self.critic_target = Critic(hidden_size, self.num_inputs, self.action_space)
    self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

    self.gamma = gamma
    self.tau = tau

    # Make sure the targets start with the same weights
    hard_update(self.actor_target, self.actor)
    hard_update(self.critic_target, self.critic)
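# The `tau` stored above is normally paired with a `soft_update` counterpart to
# `hard_update`, applied after each learning step. A minimal sketch of that helper
# (an assumption about its shape, not the exact code of these repositories):
def soft_update(target, source, tau):
    """Polyak average: theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)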
def __init__(self, state_dim, action_dim, action_lim, ram):
    """
    :param state_dim: Dimensions of state (int)
    :param action_dim: Dimension of action (int)
    :param action_lim: Used to limit action in [-action_lim, action_lim]
    :param ram: replay memory buffer object
    :return:
    """
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.action_lim = action_lim
    self.ram = ram
    self.iter = 0
    self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim)

    self.actor = model.Actor(self.state_dim, self.action_dim, self.action_lim)
    self.target_actor = model.Actor(self.state_dim, self.action_dim, self.action_lim)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), LEARNING_RATE, weight_decay=1e-5)

    self.critic = model.Critic(self.state_dim, self.action_dim)
    self.target_critic = model.Critic(self.state_dim, self.action_dim)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), LEARNING_RATE * 10, weight_decay=1e-5)

    if USEGPU:
        self.target_actor = self.target_actor.cuda()
        self.actor = self.actor.cuda()
        self.target_critic = self.target_critic.cuda()
        self.critic = self.critic.cuda()

    utils.hard_update(self.target_actor, self.actor)
    utils.hard_update(self.target_critic, self.critic)
def __init__(self, state_dim, action_dim, ram, LR_actor, LR_critic, gamma, tau,
             batchsize, expl_rate, version):
    """
    :param state_dim: Dimensions of state (int)
    :param action_dim: Dimension of action (int)
    :param ram: replay memory buffer object
    :return:
    """
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.LR_actor = LR_actor
    self.LR_critic = LR_critic
    self.gamma = gamma
    self.tau = tau
    self.ram = ram
    self.batchsize = batchsize
    self.iter = 0
    self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim, 0, 0.15, expl_rate)
    self.action_lim = 1.0

    self.actor = model.Actor(self.state_dim, self.action_dim, self.action_lim)
    self.target_actor = model.Actor(self.state_dim, self.action_dim, self.action_lim)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), self.LR_actor)

    self.critic = model.Critic(self.state_dim, self.action_dim)
    self.target_critic = model.Critic(self.state_dim, self.action_dim)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), self.LR_critic)

    utils.hard_update(self.target_actor, self.actor)
    utils.hard_update(self.target_critic, self.critic)
def __init__(self, env: Env = None, capacity=2e6, batch_size=128, action_lim=1,
             learning_rate=0.001, gamma=0.999, epochs=2):
    if env is None:
        raise ValueError("the agent is missing an environment (env)")
    super(DDPGAgent, self).__init__(env, capacity)

    self.state_dim = env.observation_space.shape[0]  # continuous states: state dimension
    self.action_dim = env.action_space.shape[0]      # continuous actions: action dimension
    self.action_lim = action_lim                     # limit on action values
    self.batch_size = batch_size                     # number of transitions per learning batch
    self.learning_rate = learning_rate               # learning rate
    self.gamma = gamma                               # discount factor
    self.epochs = epochs                             # number of learning passes over a batch of transitions
    self.tau = 0.01                                  # soft-update coefficient
    self.noise = OrnsteinUhlenbeckActionNoise(self.action_dim)

    self.actor = Actor(self.state_dim, self.action_dim, self.action_lim)
    self.target_actor = Actor(self.state_dim, self.action_dim, self.action_lim)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), self.learning_rate)

    self.critic = Critic(self.state_dim, self.action_dim)
    self.target_critic = Critic(self.state_dim, self.action_dim)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), self.learning_rate)

    # Initialization: hard-copy the critic and actor parameters into the
    # target networks target_critic and target_actor
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)
    return
def __init__(self, act_sp, ob_sp, all_obs, all_acts, hidden_dim=64):
    self.act_sp = act_sp
    self.ob_sp = ob_sp

    self.policy = MLPNetwork(ob_sp, act_sp, sac_policy=True, constrain_out=True,
                             hidden_dim=hidden_dim).to(device)

    self.q_nets_n = 2
    self.qnets = []
    self.qnet_targs = []
    self.q_optimizers = []
    for i in range(self.q_nets_n):
        qnet = MLPNetwork(all_obs + all_acts, 1, constrain_out=False, hidden_dim=hidden_dim).to(device)
        qnet_targ = MLPNetwork(all_obs + all_acts, 1, constrain_out=False, hidden_dim=hidden_dim).to(device)
        qnet.to(device)
        qnet_targ.to(device)
        hard_update(qnet_targ, qnet)
        self.qnets.append(qnet)
        self.qnet_targs.append(qnet_targ)
        self.q_optimizers.append(optim.Adam(qnet.parameters(), lr=LR))

    self.policy.to(device)
    self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
    self.action_count = 0
    self.use_warmup = True
def __init__(self, num_inputs, action_space, config):
    self.gamma = config['gamma']
    self.tau = config['tau']
    self.alpha = config['alpha']
    self.policy_type = config['policy']
    self.target_update_interval = config['target_update_interval']
    self.automatic_entropy_tuning = config['automatic_entropy_tuning']

    self.device = (torch.device('cuda:' + str(config['cuda']))
                   if torch.cuda.is_available() and config['cuda'] >= 0
                   else torch.device('cpu'))

    self.critic = QNetwork(num_inputs, action_space.shape[0], config['hidden_size']).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=config['lr'])

    self.critic_target = QNetwork(num_inputs, action_space.shape[0], config['hidden_size']).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=config['lr'])

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], config['hidden_size'], action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=config['lr'])
def load_model_params(self, dir, i_eps):
    params = torch.load('%s/policy_state_dict_%d.pkl' % (dir, i_eps))
    self.policy_net.load_state_dict(params)
    utils.hard_update(self.target_net, self.policy_net)
# Agent
if args.Qapproximation == 'baseline':
    agent = SAC_baseline(env.observation_space.shape[0], env.action_space, args)
else:
    agent = SAC_fourier(env.observation_space.shape[0], env.action_space, args)

agent.load_model(
    actor_path="./models/sac_actor_{}_{}_{}_{}_{}_{}_{}".format(
        'miguelca_test01', args.Qapproximation, args.filter, args.TDfilter,
        str(args.noise), str(args.rnoise).replace('.', '_'), str(args.num_steps)),
    critic_path="./models/sac_critic_{}_{}_{}_{}_{}_{}_{}".format(
        'miguelca_test01', args.Qapproximation, args.filter, args.TDfilter,
        str(args.noise), str(args.rnoise).replace('.', '_'), str(args.num_steps)))

hard_update(agent.critic_target, agent.critic)

# TensorboardX
writer = SummaryWriter(logdir='./runs/{}_SAC_eval_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name, args.policy,
    "autotune" if args.automatic_entropy_tuning else ""))

# Normalization constants
action_scale = ((env.action_space.high - env.action_space.low) / 2.)
action_bias = ((env.action_space.high + env.action_space.low) / 2.)

# Training loop
total_numsteps = 0
updates = 0
action_history = list([])
action_history_w_noise = list([])