def __init__(
            self,
            action_dim: int,
            device: TorchDevice,
            gamma: float,
            seed: int,

            eps_start: float,
            eps_final: float,
            eps_decay: float,

            isdueling: bool = False,
            restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim  # action dimension
        self.__device = device  # device
        self.__gamma = gamma    # discount factor

        self.__eps_start = eps_start    # initial value of the eps-greedy parameter
        self.__eps_final = eps_final    # final value of the eps-greedy parameter
        self.__eps_decay = eps_decay    # decay rate of the eps-greedy parameter

        self.__eps = eps_start
        self.__r = random.Random()  # random number generator
        self.__r.seed(seed)     # random seed

        ### modified section
        if isdueling:   # use the DuelingDQN network
            self.__policy = DuelingDQN(action_dim, device).to(device)  # value-function network
            self.__target = DuelingDQN(action_dim, device).to(device)  # target network
        else:
            self.__policy = DQN(action_dim, device).to(device)  # value-function network
            self.__target = DQN(action_dim, device).to(device)  # target network

        if restore is None:
            if isdueling:
                self.__policy.apply(DuelingDQN.init_weights)  # initialize weights
            else:
                self.__policy.apply(DQN.init_weights)  # initialize weights
        ### modified section
        else:
            self.__policy.load_state_dict(torch.load(restore))    # load the weights from restore into the network

        self.__target.load_state_dict(self.__policy.state_dict()) # copy policy's parameters to target; the two networks start identical
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()
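The fields set up above (`__eps`, `__eps_start`, `__eps_final`, `__eps_decay`, and the seeded `random.Random`) drive epsilon-greedy exploration, but the decay and action selection happen outside `__init__`. A minimal standalone sketch of how such a schedule is commonly applied; the linear annealing form and the helper names (`decayed_eps`, `select_action`) are assumptions, not taken from this code:

import random

import torch
import torch.nn as nn


def decayed_eps(step: int, eps_start: float, eps_final: float, eps_decay: float) -> float:
    # Linear annealing from eps_start to eps_final over eps_decay steps (assumed schedule).
    frac = min(step / eps_decay, 1.0)
    return eps_start + frac * (eps_final - eps_start)


def select_action(policy: nn.Module, state: torch.Tensor, eps: float,
                  action_dim: int, rng: random.Random) -> int:
    # With probability eps take a uniformly random action, otherwise act greedily.
    # state is expected to be batched, shape (1, ...).
    if rng.random() < eps:
        return rng.randint(0, action_dim - 1)
    with torch.no_grad():
        return int(policy(state).argmax(dim=1).item())

Seeding a private `random.Random` instance, as the constructor does, keeps exploration reproducible without touching the global `random` state.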
Example #2
    def __init__(
            self,
            action_dim: int,
            device: TorchDevice,
            gamma: float,
            seed: int,

            eps_start: float,
            eps_final: float,
            eps_decay: float,

            dueling: bool,
            restore: Optional[str] = None,
            stable_arg: float = 0.1,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)
        
        self.__stable_arg = stable_arg

        if dueling:
            self.__policy = DuelingDQN(action_dim, device).to(device)
            self.__target = DuelingDQN(action_dim, device).to(device)
        else:
            self.__policy = DQN(action_dim, device).to(device)
            self.__target = DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            #if dueling:
            #self.__policy.Convs_load(restore)
            #else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()
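The `DuelingDQN` class referenced here is defined elsewhere in the repository. For context, a minimal sketch of the dueling head such a network typically ends with, combining a state-value stream and an advantage stream as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a); the layer sizes and the `DuelingHead` name are assumptions:

import torch
import torch.nn as nn


class DuelingHead(nn.Module):
    # Dueling head: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).

    def __init__(self, feature_dim: int, action_dim: int) -> None:
        super().__init__()
        self.value = nn.Sequential(
            nn.Linear(feature_dim, 512), nn.ReLU(), nn.Linear(512, 1))
        self.advantage = nn.Sequential(
            nn.Linear(feature_dim, 512), nn.ReLU(), nn.Linear(512, action_dim))

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        v = self.value(features)        # (batch, 1), state value
        a = self.advantage(features)    # (batch, action_dim), advantages
        # Subtracting the mean advantage keeps V and A identifiable.
        return v + a - a.mean(dim=1, keepdim=True)

Subtracting the mean advantage is the aggregation used in practice in the original dueling-architecture paper (Wang et al., 2016).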
Example #3
    def __init__(
            self,
            action_dim: int,
            device: TorchDevice,
            gamma: float,
            seed: int,
            eps_start: float,
            eps_final: float,
            eps_decay: float,
            restore: Optional[str] = None,
            use_dueling: bool = False,
            use_DDQN: bool = False,
            use_PR: bool = False,  # Prioritized Experience Replay
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.use_dueling = use_dueling
        self.use_DDQN = use_DDQN
        self.use_PR = use_PR

        if not use_dueling:
            self.__policy = DQN(action_dim, device).to(device)
            self.__target = DQN(action_dim, device).to(device)
        else:
            self.__policy = Dueling_DQN(action_dim, device).to(device)
            self.__target = Dueling_DQN(action_dim, device).to(device)

        if restore is None:
            self.__policy.apply(self.__policy.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()
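`use_DDQN` and `use_PR` are only stored here; their effect lives in the learning step, which is not part of this snippet. For context, a minimal sketch of how a Double DQN target differs from the vanilla DQN target; the `td_target` name and the tensor shapes are assumptions:

import torch
import torch.nn as nn


@torch.no_grad()
def td_target(policy: nn.Module, target: nn.Module,
              reward: torch.Tensor,      # (batch, 1)
              next_state: torch.Tensor,  # (batch, ...) stacked frames
              done: torch.Tensor,        # (batch, 1), 1.0 where the episode ended
              gamma: float, use_ddqn: bool) -> torch.Tensor:
    if use_ddqn:
        # Double DQN: the online network picks the action, the target network scores it.
        next_actions = policy(next_state).argmax(dim=1, keepdim=True)
        next_q = target(next_state).gather(1, next_actions)
    else:
        # Vanilla DQN: max over the target network's own estimates.
        next_q = target(next_state).max(dim=1, keepdim=True).values
    return reward + gamma * next_q * (1.0 - done)

Letting the online network choose the action while the target network evaluates it is what reduces the overestimation bias of the plain max-over-target-Q bootstrap.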
Example #4
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
        rlmodel: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        if rlmodel is None or rlmodel == "DQN":
            self.__policy = DQN(action_dim, device).to(device)
            self.__target = DQN(action_dim, device).to(device)
        else:
            raise ValueError("rlmodel %s is not supported" % rlmodel)

        if restore is None:
            if rlmodel is None or rlmodel == "DQN":
                self.__policy.apply(DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))

        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()
Example #5
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)
        self.__target = DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625 * 1.25**10,
            eps=1.5e-4,
        )
        self.__scheduler = optim.lr_scheduler.StepLR(self.__optimizer,
                                                     step_size=100_000,
                                                     gamma=0.8)
        self.__target.eval()
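This variant starts the learning rate at 0.0000625 * 1.25**10 and multiplies it by 0.8 every 100,000 scheduler steps; since 0.8 * 1.25 = 1, the rate returns to the usual 0.0000625 after ten decay periods. The scheduler only has an effect if `step()` is called during training; a minimal sketch of where that call usually goes (the `learn_step` helper and the stand-in network are illustrative, not from this code):

import torch
import torch.nn as nn
import torch.optim as optim

net = nn.Linear(4, 2)  # stand-in for the policy network
optimizer = optim.Adam(net.parameters(), lr=0.0000625 * 1.25**10, eps=1.5e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100_000, gamma=0.8)


def learn_step(loss: torch.Tensor) -> None:
    # One gradient update; stepping the scheduler once per optimizer step
    # makes step_size=100_000 mean "every 100,000 updates".
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()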
Example #6
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        # Copy the tensors onto the GPU specified by device as soon as the data is read in; all later computation then runs on the GPU
        self.__policy = DQN(action_dim, device).to(device)  # policy network
        self.__target = DQN(action_dim, device).to(device)  # target network
        if restore is None:
            self.__policy.apply(DQN.init_weights)  # custom weight initialization for policy
        else:
            self.__policy.load_state_dict(
                torch.load(restore))  # load previously learned parameters into policy
        self.__target.load_state_dict(
            self.__policy.state_dict())  # target copies policy's parameters
        self.__optimizer = optim.Adam(  # Adam optimizer
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()  # switch the model to evaluation mode so BatchNorm and Dropout do not interfere at inference time
Example #7
    def __init__(
            self,
            action_dim: int,
            device: TorchDevice,
            gamma: float,
            seed: int,

            eps_start: float,
            eps_final: float,
            eps_decay: float,

            restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim # number of actions
        self.__device = device # device in use
        self.__gamma = gamma # discount factor for future rewards

        self.__eps_start = eps_start # initial epsilon value
        self.__eps_final = eps_final # final epsilon value
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device) # the online Q-network
        self.__target = DQN(action_dim, device).to(device) # a frozen copy of the Q-network used as the target
        if restore is None:
            self.__policy.apply(DQN.init_weights) # initialize the weights
        else:
            self.__policy.load_state_dict(torch.load(restore)) # load existing weights
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()
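Every variant above copies the policy weights into the target network exactly once, at construction time. During training that hard update is normally repeated every fixed number of steps; a minimal sketch of such a sync helper (the function name and the 10,000-step interval are assumptions):

import torch.nn as nn


def sync_target(policy: nn.Module, target: nn.Module) -> None:
    # Hard update: overwrite the target network with the policy network's weights.
    target.load_state_dict(policy.state_dict())
    target.eval()  # the target network is only ever used for inference


# Inside a training loop, something like:
#     if step % 10_000 == 0:
#         sync_target(policy, target)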
Example #8
    def __init__(
        self,
        action_dim: int,  # 3
        device: TorchDevice,  # cuda
        gamma: float,  # 0.99
        seed: int,
        eps_start: float,  # 1
        eps_final: float,  # 0.1
        eps_decay: float,  # 10000
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim  # 3
        self.__device = device
        self.__gamma = gamma  # 0.99

        self.__eps_start = eps_start  # 1
        self.__eps_final = eps_final  # 0.1
        self.__eps_decay = eps_decay  # 1e6

        self.__eps = eps_start  # 1
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)  # policy DQN
        self.__target = DQN(action_dim, device).to(device)  # target DQN, reduces correlation between the targets and the current estimates

        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))

        self.__target.load_state_dict(
            self.__policy.state_dict())  # sync the policy network's weights into the target network
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )  # optimizer
        self.__target.eval()  # evaluation mode
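The target network set up here supplies the bootstrap values for the TD error, which is computed in the (not shown) learning step. A minimal sketch of that loss under standard assumptions (Huber loss, float 0/1 `done` mask; the `dqn_loss` name and tensor shapes are illustrative):

import torch
import torch.nn as nn
import torch.nn.functional as F


def dqn_loss(policy: nn.Module, target: nn.Module, gamma: float,
             state: torch.Tensor, action: torch.Tensor, reward: torch.Tensor,
             next_state: torch.Tensor, done: torch.Tensor) -> torch.Tensor:
    # action: (batch, 1) int64 indices; reward, done: (batch, 1) floats.
    q = policy(state).gather(1, action)  # Q(s, a) for the actions actually taken
    with torch.no_grad():
        next_q = target(next_state).max(dim=1, keepdim=True).values
        y = reward + gamma * next_q * (1.0 - done)  # bootstrap only if not terminal
    return F.smooth_l1_loss(q, y)  # Huber loss, the common choice for Atari DQN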
Example #9
    def __init__(
            self,
            action_dim: int,
            device: TorchDevice,
            gamma: float,
            seed: int,
            eps_start: float,
            eps_final: float,
            eps_decay: float,
            restore: Optional[str] = None,  # restore defaults to None
    ) -> None:
        self.__action_dim = action_dim  # set the model's initial parameters
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)  # can run on either CPU or GPU
        self.__target = DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(DQN.init_weights)  # use DQN's own weight initialization
        else:
            self.__policy.load_state_dict(
                torch.load(restore))  # otherwise load the weights from restore
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(  # optimizer
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()
Example #10
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim  # number of selectable actions
        self.__device = device  # device
        self.__gamma = gamma  # gamma (discount factor)

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)  # policy network
        self.__target = DQN(action_dim, device).to(device)  # target network

        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()
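All ten variants accept a `restore` path and load it with `torch.load(restore)` into `load_state_dict`, so the file is expected to contain a plain state dict. The matching save side is not shown; a minimal sketch of what it presumably looks like (the helper name and the file name are assumptions):

import torch
import torch.nn as nn


def save_checkpoint(policy: nn.Module, path: str) -> None:
    # Persist only the state dict, which is what torch.load(restore) above expects.
    torch.save(policy.state_dict(), path)


# save_checkpoint(agent_policy, "dqn_policy.pth")

When restoring on a different device than the one used for training, `torch.load(restore, map_location=device)` avoids device-mismatch errors.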