def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    isdueling: bool = False,
    restore: Optional[str] = None,
) -> None:
    self.__action_dim = action_dim  # action dimension
    self.__device = device  # device
    self.__gamma = gamma  # discount factor
    self.__eps_start = eps_start  # initial value of the eps-greedy parameter
    self.__eps_final = eps_final  # final value of the eps-greedy parameter
    self.__eps_decay = eps_decay  # decay rate of the eps-greedy parameter
    self.__eps = eps_start
    self.__r = random.Random()  # random float generator
    self.__r.seed(seed)  # random seed

    ### modified
    if isdueling:  # use the DuelingDQN network
        self.__policy = DuelingDQN(action_dim, device).to(device)  # value network
        self.__target = DuelingDQN(action_dim, device).to(device)  # target network
    else:
        self.__policy = DQN(action_dim, device).to(device)  # value network
        self.__target = DQN(action_dim, device).to(device)  # target network

    if restore is None:
        if isdueling:
            self.__policy.apply(DuelingDQN.init_weights)  # initialize weights
        else:
            self.__policy.apply(DQN.init_weights)  # initialize weights
    ### modified
    else:
        self.__policy.load_state_dict(torch.load(restore))  # load the checkpoint from restore into the network
    self.__target.load_state_dict(self.__policy.state_dict())  # copy the policy parameters to target; the two networks start out identical
    self.__optimizer = optim.Adam(
        self.__policy.parameters(),
        lr=0.0000625,
        eps=1.5e-4,
    )
    self.__target.eval()
def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    dueling: bool,
    restore: Optional[str] = None,
    stable_arg: float = 0.1,
) -> None:
    self.__action_dim = action_dim
    self.__device = device
    self.__gamma = gamma
    self.__eps_start = eps_start
    self.__eps_final = eps_final
    self.__eps_decay = eps_decay
    self.__eps = eps_start
    self.__r = random.Random()
    self.__r.seed(seed)
    self.__stable_arg = stable_arg

    if dueling:
        self.__policy = DuelingDQN(action_dim, device).to(device)
        self.__target = DuelingDQN(action_dim, device).to(device)
    else:
        self.__policy = DQN(action_dim, device).to(device)
        self.__target = DQN(action_dim, device).to(device)

    if restore is None:
        self.__policy.apply(DQN.init_weights)
    else:
        # if dueling:
        #     self.__policy.Convs_load(restore)
        # else:
        self.__policy.load_state_dict(torch.load(restore))
    self.__target.load_state_dict(self.__policy.state_dict())
    self.__optimizer = optim.Adam(
        self.__policy.parameters(),
        lr=0.0000625,
        eps=1.5e-4,
    )
    self.__target.eval()
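# The two variants above delegate to a DuelingDQN class that is not shown here.
# A minimal standalone sketch of the value/advantage decomposition such a network
# typically implements follows; the names and layer sizes (DuelingHead, in_features)
# are assumptions for illustration, not the original network definition.
import torch
import torch.nn as nn

class DuelingHead(nn.Module):
    def __init__(self, in_features: int, action_dim: int) -> None:
        super().__init__()
        self.value = nn.Linear(in_features, 1)               # state value V(s)
        self.advantage = nn.Linear(in_features, action_dim)  # advantages A(s, a)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        v = self.value(x)
        a = self.advantage(x)
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a): the standard dueling aggregation
        return v + a - a.mean(dim=1, keepdim=True)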
def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    restore: Optional[str] = None,
    use_dueling: bool = False,
    use_DDQN: bool = False,
    use_PR: bool = False,  # Prioritized Experience Replay
) -> None:
    self.__action_dim = action_dim
    self.__device = device
    self.__gamma = gamma
    self.__eps_start = eps_start
    self.__eps_final = eps_final
    self.__eps_decay = eps_decay
    self.__eps = eps_start
    self.__r = random.Random()
    self.__r.seed(seed)
    self.use_dueling = use_dueling
    self.use_DDQN = use_DDQN
    self.use_PR = use_PR

    if not use_dueling:
        self.__policy = DQN(action_dim, device).to(device)
        self.__target = DQN(action_dim, device).to(device)
    else:
        self.__policy = Dueling_DQN(action_dim, device).to(device)
        self.__target = Dueling_DQN(action_dim, device).to(device)

    if restore is None:
        self.__policy.apply(self.__policy.init_weights)
    else:
        self.__policy.load_state_dict(torch.load(restore))
    self.__target.load_state_dict(self.__policy.state_dict())
    self.__optimizer = optim.Adam(
        self.__policy.parameters(),
        lr=0.0000625,
        eps=1.5e-4,
    )
    self.__target.eval()
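# The variant above stores a use_DDQN flag but does not consume it in __init__.
# Below is a minimal standalone sketch of how such a flag is typically used when
# forming TD targets; the function and tensor names (td_target, q_policy_next,
# q_target_next, reward, done) are hypothetical, not taken from the original code.
# With double DQN the greedy action is chosen by the policy network but evaluated
# by the target network, which reduces the overestimation bias of vanilla DQN.
import torch

def td_target(q_policy_next: torch.Tensor, q_target_next: torch.Tensor,
              reward: torch.Tensor, done: torch.Tensor,
              gamma: float, use_DDQN: bool) -> torch.Tensor:
    if use_DDQN:
        best = q_policy_next.argmax(dim=1, keepdim=True)        # action picked by the policy net
        next_value = q_target_next.gather(1, best)              # value read from the target net
    else:
        next_value = q_target_next.max(dim=1, keepdim=True)[0]  # vanilla DQN: max over the target net
    return reward + gamma * (1.0 - done) * next_value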
def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    restore: Optional[str] = None,
    rlmodel: Optional[str] = None,
) -> None:
    self.__action_dim = action_dim
    self.__device = device
    self.__gamma = gamma
    self.__eps_start = eps_start
    self.__eps_final = eps_final
    self.__eps_decay = eps_decay
    self.__eps = eps_start
    self.__r = random.Random()
    self.__r.seed(seed)

    if rlmodel is None or rlmodel == "DQN":
        self.__policy = DQN(action_dim, device).to(device)
        self.__target = DQN(action_dim, device).to(device)
    else:
        print("rlmodel %s is not supported" % rlmodel)
        exit(-1)

    if restore is None:
        if rlmodel is None or rlmodel == "DQN":
            self.__policy.apply(DQN.init_weights)
    else:
        self.__policy.load_state_dict(torch.load(restore))
    self.__target.load_state_dict(self.__policy.state_dict())
    self.__optimizer = optim.Adam(
        self.__policy.parameters(),
        lr=0.0000625,
        eps=1.5e-4,
    )
    self.__target.eval()
def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    restore: Optional[str] = None,
) -> None:
    self.__action_dim = action_dim
    self.__device = device
    self.__gamma = gamma
    self.__eps_start = eps_start
    self.__eps_final = eps_final
    self.__eps_decay = eps_decay
    self.__eps = eps_start
    self.__r = random.Random()
    self.__r.seed(seed)

    self.__policy = DQN(action_dim, device).to(device)
    self.__target = DQN(action_dim, device).to(device)
    if restore is None:
        self.__policy.apply(DQN.init_weights)
    else:
        self.__policy.load_state_dict(torch.load(restore))
    self.__target.load_state_dict(self.__policy.state_dict())

    self.__optimizer = optim.Adam(
        self.__policy.parameters(),
        lr=0.0000625 * 1.25**10,
        eps=1.5e-4,
    )
    self.__scheduler = optim.lr_scheduler.StepLR(
        self.__optimizer, step_size=100_000, gamma=0.8)
    self.__target.eval()
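# Note on the variant above (an illustrative check, not part of the original code):
# Adam starts at 0.0000625 * 1.25**10 ≈ 5.8e-4 and StepLR multiplies the rate by
# 0.8 every 100_000 scheduler steps.  Because 1.25 * 0.8 == 1.0, ten decays bring
# the learning rate back to the 0.0000625 baseline used by the other variants.
base_lr = 0.0000625
boosted_lr = base_lr * 1.25 ** 10         # ≈ 5.8e-4, the value passed to Adam above
decayed_lr = boosted_lr * 0.8 ** 10       # after ten StepLR decays (1_000_000 steps)
assert abs(decayed_lr - base_lr) < 1e-12  # the schedule anneals back to the baseline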
def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    restore: Optional[str] = None,
) -> None:
    self.__action_dim = action_dim
    self.__device = device
    self.__gamma = gamma
    self.__eps_start = eps_start
    self.__eps_final = eps_final
    self.__eps_decay = eps_decay
    self.__eps = eps_start
    self.__r = random.Random()
    self.__r.seed(seed)

    # Move the networks' tensors to the device specified by `device` (e.g. a GPU);
    # all subsequent computation runs there.
    self.__policy = DQN(action_dim, device).to(device)  # policy network
    self.__target = DQN(action_dim, device).to(device)  # target network
    if restore is None:
        self.__policy.apply(DQN.init_weights)  # custom weight initialization for the policy network
    else:
        self.__policy.load_state_dict(
            torch.load(restore))  # load previously learned parameters into the policy network
    self.__target.load_state_dict(
        self.__policy.state_dict())  # target copies the policy network's parameters
    self.__optimizer = optim.Adam(  # use Adam as the optimizer
        self.__policy.parameters(),
        lr=0.0000625,
        eps=1.5e-4,
    )
    self.__target.eval()  # switch to evaluation mode so BatchNorm and Dropout do not interfere at test time
def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    restore: Optional[str] = None,
) -> None:
    self.__action_dim = action_dim  # number of actions
    self.__device = device  # device to use
    self.__gamma = gamma  # discount on future rewards?
    self.__eps_start = eps_start  # initial epsilon value
    self.__eps_final = eps_final  # final epsilon value
    self.__eps_decay = eps_decay
    self.__eps = eps_start
    self.__r = random.Random()
    self.__r.seed(seed)

    self.__policy = DQN(action_dim, device).to(device)  # the actual Q network
    self.__target = DQN(action_dim, device).to(device)  # a temporarily fixed Q network, used as the target
    if restore is None:
        self.__policy.apply(DQN.init_weights)  # initialize the weights
    else:
        self.__policy.load_state_dict(torch.load(restore))  # load existing weights
    self.__target.load_state_dict(self.__policy.state_dict())
    self.__optimizer = optim.Adam(
        self.__policy.parameters(),
        lr=0.0000625,
        eps=1.5e-4,
    )
    self.__target.eval()
def __init__(
    self,
    action_dim: int,  # 3
    device: TorchDevice,  # cuda
    gamma: float,  # 0.99
    seed: int,
    eps_start: float,  # 1
    eps_final: float,  # 0.1
    eps_decay: float,  # 10000
    restore: Optional[str] = None,
) -> None:
    self.__action_dim = action_dim  # 3
    self.__device = device
    self.__gamma = gamma  # 0.99
    self.__eps_start = eps_start  # 1
    self.__eps_final = eps_final  # 0.1
    self.__eps_decay = eps_decay  # 1e6
    self.__eps = eps_start  # 1
    self.__r = random.Random()
    self.__r.seed(seed)

    self.__policy = DQN(action_dim, device).to(device)  # policy DQN
    self.__target = DQN(action_dim, device).to(device)  # target DQN; reduces correlation between the target computation and the current values
    if restore is None:
        self.__policy.apply(DQN.init_weights)
    else:
        self.__policy.load_state_dict(torch.load(restore))
    self.__target.load_state_dict(
        self.__policy.state_dict())  # sync the policy network's weights into the target network
    self.__optimizer = optim.Adam(
        self.__policy.parameters(),
        lr=0.0000625,
        eps=1.5e-4,
    )  # optimizer
    self.__target.eval()  # evaluation mode
def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    restore: Optional[str] = None,  # restore defaults to None
) -> None:
    self.__action_dim = action_dim  # set the model's initial parameters
    self.__device = device
    self.__gamma = gamma
    self.__eps_start = eps_start
    self.__eps_final = eps_final
    self.__eps_decay = eps_decay
    self.__eps = eps_start
    self.__r = random.Random()
    self.__r.seed(seed)

    self.__policy = DQN(action_dim, device).to(device)  # can be configured to run on CPU or GPU
    self.__target = DQN(action_dim, device).to(device)
    if restore is None:
        self.__policy.apply(DQN.init_weights)  # use the DQN weight initialization
    else:
        self.__policy.load_state_dict(
            torch.load(restore))  # otherwise load the weights from restore
    self.__target.load_state_dict(self.__policy.state_dict())
    self.__optimizer = optim.Adam(  # optimizer
        self.__policy.parameters(),
        lr=0.0000625,
        eps=1.5e-4,
    )
    self.__target.eval()
def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    restore: Optional[str] = None,
) -> None:
    self.__action_dim = action_dim  # number of available actions
    self.__device = device  # device
    self.__gamma = gamma  # γ value (discount factor)
    self.__eps_start = eps_start
    self.__eps_final = eps_final
    self.__eps_decay = eps_decay
    self.__eps = eps_start
    self.__r = random.Random()
    self.__r.seed(seed)

    self.__policy = DQN(action_dim, device).to(device)  # policy network
    self.__target = DQN(action_dim, device).to(device)  # target network
    if restore is None:
        self.__policy.apply(DQN.init_weights)
    else:
        self.__policy.load_state_dict(torch.load(restore))
    self.__target.load_state_dict(self.__policy.state_dict())
    self.__optimizer = optim.Adam(
        self.__policy.parameters(),
        lr=0.0000625,
        eps=1.5e-4,
    )
    self.__target.eval()
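# Hypothetical usage sketch: the class name Agent and the argument values are
# assumptions, not taken from the original code.  Apart from their extra flags,
# all the variants above share this constructor signature.
import torch

agent = Agent(
    action_dim=4,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    gamma=0.99,
    seed=0,
    eps_start=1.0,
    eps_final=0.1,
    eps_decay=1_000_000,
    restore=None,  # or a path to a saved state_dict checkpoint
)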