    def learn(self, memory: ReplayMemory, batch_size: int, choice: int) -> float:
        """learn trains the value network via TD-learning."""

        ## Modified section
        if choice == 0:   # plain replay memory
            state_batch, action_batch, reward_batch, next_batch, done_batch = memory.sample(batch_size)
        else:             # prioritized replay (PER) memory also returns the sampled indices
            state_batch, action_batch, reward_batch, next_batch, done_batch, idx_batch = \
                memory.sample(batch_size)

        values = self.__policy(state_batch.float()).gather(1, action_batch)  # Q-values of the actions actually taken
        values_next = self.__target(next_batch.float()).max(1).values.detach()  # max Q-value of the next state
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
                   (1. - done_batch) + reward_batch
        # the loss is computed between values and expected (the TD target)
        if choice == 0:
            loss = F.smooth_l1_loss(values, expected)
        else:  # PER memory: keep per-sample losses to update the priorities
            loss_batch = F.smooth_l1_loss(values, expected, reduction='none')  # TD error
            loss = torch.mean(loss_batch, dim=0)
            # loss.requires_grad = True
            memory.update(loss_batch.detach(), idx_batch)
        ## End of modified section

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():  # clamp the gradients to [-1, 1] in place
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()
Example #2
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        idxs, (state_batch, next_batch, action_batch, reward_batch,
               done_batch), is_weights = memory.sample(batch_size)

        y_batch = []
        current_Q_batch = self.policy(next_batch).cpu().data.numpy()
        max_action_next = np.argmax(current_Q_batch, axis=1)
        target_Q_batch = self.target(next_batch).detach()  # target-network Q-values; no gradient flows through the target

        for i in range(batch_size):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                target_Q_value = target_Q_batch[i, max_action_next[i]]
                y_batch.append(reward_batch[i] + self.__gamma * target_Q_value)

        y_batch = torch.stack(y_batch)
        values = self.policy(state_batch).gather(1, action_batch)

        abs_error = torch.abs(y_batch - values).detach()  # |TD error| used as the new priority
        memory.batch_update(idxs, abs_error)

        loss = (torch.FloatTensor(is_weights).to(self.__device) *
                F.mse_loss(values, y_batch, reduction='none')).mean()  # importance-sampling-weighted MSE

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()
Example #3
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch, indices = \
            memory.sample(batch_size)

        values = self.__policy(state_batch.float()).gather(1, action_batch)
        #print(type(values),values.size())
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch

        #print(type(values))
        #print(type(values_next))
        #print(values.size(),values_next.size(),expected.size())
        # TD error against the same target as above (terminal states masked by done_batch)
        error = (expected - values).detach()
        for i in range(batch_size):
            # data index i maps to leaf index i + capacity - 1 in the sum tree
            memory.Tree.update(
                int(indices[i]) + memory.Tree.capacity - 1,
                abs(float(error[i])))
        #print(error.size())
        #print(error)

        loss = F.smooth_l1_loss(values, expected)

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()
Example #4
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        # Sample a batch from the replay buffer; it is used to update the policy network
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            memory.sample(batch_size)   # all variables are returned as tensors
        # Compute the value function Q_j with the behavior (policy) network
        values = self.__policy(state_batch.float()).gather(
            1, action_batch)  # Q_j indexed by state_batch and action_batch

        # Use the target network to compute Q_{j+1} and the TD target expected = r_{j+1} + gamma * max_a' Q_{j+1}
        # (1 - done_batch) masks terminal states, for which the target reduces to expected = r_{j+1}
        # This is the Q-learning update target, evaluated with the target network.
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch

        # Gradient descent on the objective (Q_j - expected)^2 (implemented here as a smooth L1 / Huber loss)
        loss = F.smooth_l1_loss(values, expected)

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()
Example #5
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:  # returns the loss value
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            memory.sample(batch_size)  # randomly sample a batch of transitions
        # A basic requirement of SGD is that the training data be independent and identically distributed.
        # Consecutive experience tuples from agent-environment interaction are highly correlated,
        # which is why sampling is randomized.
        # Feed the sampled batch to the learner:
        values = self.__policy(state_batch.float()).gather(
            1, action_batch)  # Q-values: value = Q(s, a)

        ########## Dueling DQN idea: decompose Q(s,a) = V(s) + A(s,a), where V(s) is the value of state s ##########
        ########## itself and A(s,a) the advantage of action a. V(s) is a scalar and A(s,a) a vector; when added, ##########
        ########## V(s) is broadcast to the shape of A(s,a). A sketch of such a head follows this example. ##########

        values_next = self.__target(
            next_batch.float()).max(1).values.detach()  # Q' from the target network
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch  # if done (done=1), y_j = r_j; otherwise y_j = r_j + gamma * max_a' Q'(s', a') (see the DQN paper)
        loss = F.smooth_l1_loss(values, expected)  # Huber loss

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()
Example #6
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        if self.use_PR:
            state_batch, action_batch, reward_batch, next_batch, done_batch, idxs, ISWeights = \
                memory.sample(batch_size)
        else:
            state_batch, action_batch, reward_batch, next_batch, done_batch = \
                memory.sample(batch_size)

        if self.use_DDQN:
            # Double DQN: select the best next action with the policy network,
            # then evaluate it with the target network.
            actions_value = self.__policy(next_batch.float())
            max_val_action = actions_value.max(1)[1].unsqueeze(-1)

            actions_value = self.__target(next_batch.float()).detach()
            expected = reward_batch + (self.__gamma * actions_value.gather(
                1, max_val_action)) * (1. - done_batch)
            values = self.__policy(state_batch.float()).gather(1, action_batch)
        else:
            values = self.__policy(state_batch.float()).gather(1, action_batch)
            values_next = self.__target(
                next_batch.float()).max(1).values.detach()
            expected = (self.__gamma * values_next.unsqueeze(1)) * \
                (1. - done_batch) + reward_batch

        if self.use_PR:
            abs_errors = torch.abs(expected - values).data.cpu().numpy()
            # update priority
            memory.batch_update(idxs, abs_errors)
            loss = (
                ISWeights *
                F.smooth_l1_loss(values, expected, reduction='none')).mean()
        else:
            loss = F.smooth_l1_loss(values, expected)

        self.__optimizer.zero_grad()
        loss.backward()
        #for param in self.__policy.parameters():
        #    param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()
Example #7
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning.应该是Q network """
        state_batch, action_batch, reward_batch, \
            next_batch, done_batch, idx_batch = memory.sample(batch_size)

        values = self.__policy(state_batch.float()).gather(1, action_batch)
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch                                        #TD target
        loss_batch = F.smooth_l1_loss(values, expected, reduction='none')           # per-sample TD error
        loss = torch.mean(loss_batch, dim=0)
        # loss already requires grad, since it is computed from the policy network's output
        memory.update(loss_batch.detach(), idx_batch)                               # refresh the priorities
        
        self.__optimizer.zero_grad()
        loss.backward()                                                             #backward
        for param in self.__policy.parameters():
            if param.grad is not None:#grad clamp to (-1,1)
                param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()                                                     #update
        
        return loss.item()
Example #8
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch, indices = \
            memory.sample(batch_size)

        values = self.__policy(state_batch.float()).gather(1, action_batch)
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
                   (1. - done_batch) + reward_batch

        loss = F.smooth_l1_loss(values, expected)

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        # refresh the priorities with the absolute TD errors
        td_errors = torch.abs(expected - values).detach()
        for i in range(batch_size):
            memory.update(indices[i], td_errors[i])

        return loss.item()
Example #9
 def __init__(self, n_state, n_action, device='cpu'):
     #params
     self.n_state = n_state
     self.n_action = n_action
     self.device = device
     self.epsilon = 1.0
     self.epsilon_decay = 0.999
     self.epsilon_min = 0.1
     self.discount_factor = 0.99
     self.learning_rate = 0.000625  # 0.001
     self.batch_size = 32

     self.num_step = 0
     self.num_exploration = 0
     self.num_train = 0
     self.model_update_interval = 10000
     
     self.train_start = 50000
     self.replay_memory_size = 100000
     # self.replay_memory = deque(maxlen=self.replay_memory_size)        
     
     self.memory = ReplayMemory(4 + 1, self.replay_memory_size, self.device)
     
     #model define
     #action-value function
     self.model = CNN(self.n_state, self.n_action).to(self.device)        
     self.loss = nn.MSELoss().to(self.device)
     self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate, eps=1.5e-4)
        
     #target action-value function
     self.target_model = CNN(self.n_state, self.n_action).to(self.device)  
     
     self.model.apply(CNN.init_weights)
     self.target_model.eval()
     
     self.rand = random.Random()
     self.rand.seed(0)
Example #10
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        # Sample batch_size transitions from the replay buffer and compute the current target Q-values
        indices, (state_batch, next_batch, action_batch, reward_batch, done_batch), is_weights = \
            memory.sample(batch_size)
        # Compute the value function Q_j with the behavior (policy) network
        values = self.__policy(state_batch).gather(1, action_batch)

        expected = []
        policy_Q_batch = self.__policy(next_batch).cpu().data.numpy()
        max_action_next = np.argmax(policy_Q_batch, axis=1)
        target_Q_batch = self.__target(next_batch).detach()  # target-network Q-values; no gradient through the target

        for i in range(batch_size):
            if done_batch[i]:
                expected.append(reward_batch[i])
            else:
                target_Q_value = target_Q_batch[i, max_action_next[i]]
                expected.append(reward_batch[i] +
                                self.__gamma * target_Q_value)

        expected = torch.stack(expected)
        TD_error = torch.abs(expected - values).detach()
        memory.update(indices, TD_error)

        # Gradient descent on the objective (Q_j - expected)^2, weighted by the importance-sampling weights
        loss = (torch.FloatTensor(is_weights).to(self.__device) *
                F.mse_loss(values, expected, reduction='none')).mean()

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()
Example #11
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        state_batch, action_batch, reward_batch, next_batch, weight_batch, done_batch = \
            memory.sample(batch_size)
        weight_batch = weight_batch.to(self.__device)
        values = self.__policy(state_batch.float()).gather(1, action_batch)
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch
        # Normalize the importance weights and fold them into the Huber loss
        weight_batch /= weight_batch.mean()
        loss = F.smooth_l1_loss(weight_batch * values, weight_batch * expected)

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()
Example #12
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:  # training step
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch = memory.sample(
            batch_size)  # sample a minibatch of transitions from the replay buffer
        values = self.__policy(state_batch.float()).gather(
            1, action_batch)  # Q-values from the policy (DQN) network
        values_next = self.__target(
            next_batch.float()).max(1).values.detach()  # max_a(Q(S',a))
        expected = (self.__gamma * values_next.unsqueeze(1)) * (
            1. - done_batch) + reward_batch  # Q-learning target for Q(S, A)
        loss = F.smooth_l1_loss(values, expected)  # compute the loss
        # update the network parameters: zero the gradients, backpropagate, step
        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()  # return the loss
new_seed = lambda: rand.randint(0, 1000_000)
os.mkdir(SAVE_PREFIX)  # directory for saving the trained models

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)  # create the environment that runs the Atari game
agent = Agent(  # create the agent
    env.get_action_dim(),  # number of actions in the game: three in total (no-op, right, left)
    device,  # device used for training
    GAMMA,
    new_seed(),
    EPS_START,  # initial value of epsilon
    EPS_END,  # minimum value of epsilon
    EPS_DECAY,  # epsilon decay rate
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE,
                      device)  # stores transitions linking the agent's actions to their outcomes, used later to train the network

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS),
                   total=MAX_STEPS,
                   ncols=50,
                   leave=False,
                   unit="b")
for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)
Example #14
# The number of threads here needs to be adjusted based on the number of CPU cores available
# (a sketch of deriving it from the core count follows this example).
torch.set_num_threads(4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)
agent = Agent(
    env.get_action_dim(),
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
    restore=restore,
    rlmodel=rlmodel,
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS),
                   total=MAX_STEPS,
                   ncols=50,
                   leave=False,
                   unit="b")
for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)
Example #15
os.mkdir(SAVE_PREFIX)  # create the directory for saving models

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
env = MyEnv(device)  # environment
agent = Agent(  # agent
    env.get_action_dim(),  # 3
    device,  # cuda
    GAMMA,  # 0.99
    new_seed(),
    EPS_START,  # 1
    EPS_END,  # 0.1
    EPS_DECAY,  # 1e6
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)  # initialize the replay buffer

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS),
                   total=MAX_STEPS,
                   ncols=50,
                   leave=False,
                   unit="b")  # 可视化进度条
for step in progressive:
    if done:  # start a new episode
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)
Example #16
    for version in versions:
        #set_trace()
        print(version)
        dueling = 'dueling' in version
        stable = 'stable' in version
        if stable:
            action_queue = []
        env = MyEnv(device)
        agent = Agent(env.get_action_dim(), device, GAMMA, new_seed(),
                      EPS_START, EPS_END, EPS_DECAY, dueling, pretrained,
                      stable * 0.1)
        if version.find('PER') != -1:
            memory = PERMemory(STACK_SIZE + 1, MEM_SIZE, device)
            #memory = Memory_Buffer_PER(MEM_SIZE)
        else:
            memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)
            #memory = Memory_Buffer_PER(MEM_SIZE)

        #### Training ####
        obs_queue: deque = deque(maxlen=5)
        done = True

        avg_reward_arr = []

        progressive = tqdm(range(MAX_STEPS),
                           total=MAX_STEPS,
                           ncols=50,
                           leave=False,
                           unit="b")
        for step in progressive:
            if done:
Example #17
class DDQN:
    def __init__(self, n_state, n_action, device='cpu'):
        #params
        self.n_state = n_state
        self.n_action = n_action
        self.device = device
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.1
        self.discount_factor = 0.99
        self.learning_rate = 0.000625  # 0.001
        self.batch_size = 32

        self.num_step = 0
        self.num_exploration = 0
        self.num_train = 0
        self.model_update_interval = 10000
        
        self.train_start = 50000
        self.replay_memory_size = 100000
        # self.replay_memory = deque(maxlen=self.replay_memory_size)        
        
        self.memory = ReplayMemory(4 + 1, self.replay_memory_size, self.device)
        
        #model define
        #action-value function
        self.model = CNN(self.n_state, self.n_action).to(self.device)        
        self.loss = nn.MSELoss().to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate, eps=1.5e-4)
           
        #target action-value function
        self.target_model = CNN(self.n_state, self.n_action).to(self.device)  
        
        self.model.apply(CNN.init_weights)
        self.target_model.eval()
        
        self.rand = random.Random()
        self.rand.seed(0)
        
    def reset_params(self):
        self.num_step =0
        self.num_exploration = 0
        
    def save_model(self, path):
        torch.save(self.model.state_dict(),path)
        
    def load_model(self, path):
        self.model.load_state_dict(torch.load(path))
        
    # def save_sample(self, sample):
    def save_sample(self, state_queue, action, reward, done):
        #sample = [state, action, reward, next_state, done]
        self.memory.push(state_queue, action, reward, done)
        # self.replay_memory.append(sample)


        
    # epsilon-greedy action selection in a given state
    def get_action(self, state, test= False):

        self.num_step +=1
        
        if test:
            epsilon = self.epsilon_min
        else:
            epsilon = self.epsilon
            
        if np.random.rand(1) < epsilon:    
            self.num_exploration+=1
            #action = random.randrange(self.n_action)
            # action = env.action_space.sample()
            action= self.rand.randint(0, self.n_action-1)
        else:
            # T means torh.Tensor
            # print(np.shape(list(state)[1:]))
            with torch.no_grad():
                # T_state = torch.FloatTensor([list(state)[1:]]).to(self.device)            
            # T_state = T_state.permute(0, 3, 1, 2)
                T_q = self.model(state) 
                action = np.argmax(T_q.to('cpu').detach().numpy())        
        
        # action += 1
        return action
    
    def epsilon_update(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    
    #Update
    def train(self):
        
        state_batch, action_batch, reward_batch, next_batch, done_batch = self.memory.sample(self.batch_size)

        values = self.model(state_batch.float()).gather(1, action_batch)
        values_next = self.target_model(next_batch.float()).max(1).values.detach()
        expected = (self.discount_factor * values_next.unsqueeze(1)) * (1. - done_batch) + reward_batch
        loss = F.smooth_l1_loss(values, expected)

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        
        self.num_train += 1
        
        # mini_batch = random.sample(self.replay_memory, self.batch_size)
        
        # #print(np.shape(np.array(mini_batch[:,0])))

        # # print(np.shape(mini_batch))
        # mini_batch = np.array(mini_batch)
        

        # T_states = np.stack(mini_batch[:,0])[:,:4]
        # T_actions = np.stack(mini_batch[:,1])
        # T_rewards = np.stack(mini_batch[:,2])
        # T_next_states = np.stack(mini_batch[:,0])[:,1:]
        # T_dones = np.stack(mini_batch[:,3])
        
        # T_states = torch.FloatTensor(T_states).to(self.device)
        # T_actions = torch.LongTensor(T_actions).to(self.device)
        # T_rewards = torch.FloatTensor(T_rewards).to(self.device)
        # T_next_states = torch.FloatTensor(T_next_states).to(self.device)
        # T_dones = torch.FloatTensor(T_dones).to(self.device)
        
        
        # T_q = self.model(T_states)        

        # #_ shows max value, T_next_q shows index of max value
        # _, T_next_q = self.model(T_next_states).detach().max(1)
       
        # T_next_tq = self.target_model(T_next_states).detach()
        # T_next_tq = T_next_tq.gather(1, T_next_q.unsqueeze(1))
        # T_next_a = T_next_tq.squeeze()

        # TD_target = torch.zeros((self.batch_size, self.n_action)).to(self.device)
        
        
        # for i in range(self.batch_size):
        #     TD_target[i][T_actions[i]] = T_rewards[i] + self.discount_factor*(1. - T_dones[i]) *T_next_a[i]

        # TD_target = TD_target.detach()       
        
        # self.optimizer.zero_grad()        
        # cost = self.loss(T_q, TD_target).mean()
        # cost.backward()#Gradient calculation
        # self.optimizer.step()#Gradient update
        
        if self.num_train%self.model_update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())
def choosememory(c):
    if c == 0:
        return ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)
    else:
        return PERMemory(STACK_SIZE + 1, MEM_SIZE, device)
Example #19
os.mkdir(SAVE_PREFIX)  # 在"./models"创建目录

torch.manual_seed(new_seed())  # seed the CPU RNG with new_seed()
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # select the device: GPU or CPU
device = torch.device("cpu")
env = MyEnv(device)
agent = Agent(  # initialize the agent with the preset hyperparameters
    env.get_action_dim(),  # returns 3; the three actions are ["NOOP", "RIGHT", "LEFT"]
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE,
                      device)  # circular buffer; the arguments are channels, capacity, and device, with capacity MEM_SIZE = 100_000

#### Training ####
obs_queue: deque = deque(maxlen=5)  # observation queue
done = True

progressive = tqdm(
    range(MAX_STEPS),
    total=MAX_STEPS,  # expected number of iterations
    ncols=50,  # total width of the progress bar
    leave=False,
    unit="b")
for step in progressive:  # step: int
    if done:  # start of a new episode?
        observations, _, _ = env.reset()
        for obs in observations: