def saveBestModel(self):
     pathlib.Path('mdls/').mkdir(parents=True, exist_ok=True)
     state = {
         'mdl': self.best_model.state_dict(),
         'avgFeat': self.avgFeature
     }
     import datetime
     now = datetime.datetime.now()
     save_name = 'mdls/' + 'mdl_DATE-' + now.isoformat() + '.pth.tar'
     db.printInfo(save_name)
     torch.save(state, save_name)
 def train(self):
     student = DQN_Trainer(args, self.env, 'Student_0')
     sampleFeat = student.featurefn(self.env.reset())
     w_0 = torch.rand(sampleFeat.size(0), 1)
     w_0 /= w_0.norm(1)
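      #
      # w_0 is the initial reward weight for the apprenticeship-learning loop
      # below: the induced reward is r(s) = w^T phi(s), and normalizing the
      # weight keeps that reward bounded.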
     rwd_list = []
     t_list = []
     weights = [w_0]
     i = 1
     #
     # Train zeroth student.
     student.train(w_0)
     studentFeat, studentRwd = student.gatherAverageFeature()
     rwd_list.append(studentRwd)
     t_list.append((self.expert_feat - studentFeat).norm().item())
     #
     # Create first student.
     weights.append((self.expert_feat - studentFeat).view(-1, 1))
     feature_bar_list = [studentFeat]
     feature_list = [studentFeat]
     #
     # Iterate training.
     n_iter = 20
     for i in tqdm.tqdm(range(n_iter)):
         student = DQN_Trainer(args, self.env, 'Student_%d' % (i + 1))
         student.train(weights[-1])
         studentFeat, studentRwd = student.gatherAverageFeature()
         rwd_list.append(studentRwd)
         feature_list.append(studentFeat)
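          #
          # Projection step (Abbeel & Ng, 2004): project the expert feature
          # expectation mu_E onto the line through mu_bar_{i-1} and mu_i:
          #   mu_bar_i = mu_bar_{i-1}
          #              + [(mu_i - mu_bar_{i-1})^T (mu_E - mu_bar_{i-1})]
          #                / [(mu_i - mu_bar_{i-1})^T (mu_i - mu_bar_{i-1})]
          #                * (mu_i - mu_bar_{i-1})
          # The next reward weight is w_{i+1} = mu_E - mu_bar_i, and
          # t_i = ||mu_E - mu_bar_i||_2 tracks convergence.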
          diff = (feature_list[-1] - feature_bar_list[-1]).view(-1, 1)
          err = (self.expert_feat - feature_bar_list[-1]).view(-1, 1)
          step = (diff.t() @ err) / (diff.t() @ diff)
          feat_bar_next = feature_bar_list[-1] + step * (feature_list[-1] - feature_bar_list[-1])
         feature_bar_list.append(feat_bar_next)
         weights.append((self.expert_feat - feat_bar_next).view(-1, 1))
         t_list.append((self.expert_feat - feat_bar_next).norm().item())
         db.printInfo('t: ', t_list[-1])
     db.printInfo(feat_bar_next)
     plt.figure()
     ax = plt.gca()
     ax.xaxis.set_major_locator(MaxNLocator(integer=True))
     plt.plot(rwd_list)
     plt.title('Average Episode Reward')
     plt.xlabel('Student Number')
     plt.ylabel('Episode Length')
     plt.savefig('plts/avgRewardProgress.png')
     plt.figure()
     ax = plt.gca()
     ax.xaxis.set_major_locator(MaxNLocator(integer=True))
     plt.plot(t_list)
     plt.title('L2 Policy Error')
     plt.xlabel('Student Number')
      plt.ylabel('L2 norm of feature expectation error')
     plt.savefig('plts/sqerr.png')
    def __init__(self, args, env, name):
        # Get screen size so that we can initialize layers correctly based on shape
        # returned from AI gym. Typical dimensions at this point are close to 3x40x90
        # which is the result of a clamped and down-scaled render buffer in get_screen()
        save_path = 'vids/%s/' % name
        pathlib.Path(save_path).mkdir(parents=True, exist_ok=True)
        self.env = env
        self.env = gym.wrappers.Monitor(
            env,
            save_path,
            video_callable=lambda episode_id: episode_id % 199 == 0)
        self.env.reset()
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        self.is_trained = False
        self.avgFeature = None
        if args.configStr is not None:
            self.is_trained = True
            pth = os.path.abspath(args.configStr)
            assert pathlib.Path(pth).exists()
            data = torch.load(pth)
            self.policy_net.load_state_dict(data['mdl'])
            if 'avgFeat' in data:
                self.avgFeature = data['avgFeat']
            db.printInfo('LOADED MODEL')

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.best_model = None
        self.best_rwd = -float('inf')

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=0.001)
        self.memory = ReplayMemory(100000)

        self.NUM_UPDATE = 1
        self.steps_done = 0
        self.episode_durations = []
        self.plot = args.plot
        self.name = name
        plt.ion()
        if self.plot:
            plt.figure()
            self.init_screen = self.get_screen()
            plt.imshow(self.get_screen().cpu().squeeze(0).permute(1, 2,
                                                                  0).numpy(),
                       interpolation='none')
            plt.title('Example extracted screen')
    def gatherAverageFeature(self, _return_s_init=False):

        mus = []
        s_init = []
        mu = 0.0
        t = 0
        is_s_init = True
        gamma = 0.99

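        # Estimate the discounted feature expectation over the episodes stored
        # in the replay memory:
        #   mu_hat = (1/m) * sum_episodes sum_t gamma^t * phi(s_t, a_t)
        # where t is the step index within each episode (the estimate is
        # L2-normalized below).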
        for i in range(memory.position):
            s, a, s_next, reward, done = memory.memory[i]
            if is_s_init:
                s_init.append(s)
                is_s_init = False
            mu += gamma**t * self.phi(s, a).flatten()  # accumulate the discounted feature sum for mu
            t += 1

            if done:
                mus.append(mu)
                mu = 0.0
                t = 0
                is_s_init = True

        mu_est = torch.tensor([0.0, 0.0, 0.0, 0.0])
        for mu in mus:
            mu_est += mu
        mu_est /= len(mus)
        mu_est /= mu_est.norm(2)
        with torch.no_grad():
            n_iter = 20  # 2000
            rwd_sum = None
            for i in tqdm.tqdm(range(n_iter)):
                rwd, states = self.testModel(self.best_model, True)
                if rwd_sum is None:
                    rwd_sum = rwd
                else:
                    rwd_sum += rwd
            rwd_sum /= n_iter
            db.printInfo(mu_est)
            db.printInfo(rwd_sum)
        self.avgFeature = mu_est
        if _return_s_init:
            return mu_est, s_init, rwd_sum
        return mu_est, rwd_sum
def saveFigs(figs=None):
    if figs is None:
        figs = [plt.figure(n) for n in plt.get_fignums()]
    import pathlib
    save_dir = str(pathlib.Path().cwd()) + '/plts/'
    db.printInfo(save_dir)
    pathlib.Path(save_dir).mkdir(exist_ok=True)
    for fig in figs:
        title = fig.axes[0].get_title()
        for a in fig.axes:
            a.axis('off')
            a.set_title('')
        db.printInfo(title)
        if title == '.png':
            title = 'noName'
        save_file = save_dir + title + '.pdf'
        fig.savefig(save_file.replace(' ', '_'),
                    bbox_inches='tight',
                    pad_inches=0)
 def gatherAverageFeature(self):
     with torch.no_grad():
         n_iter = 2000
         sample_sum = None
         rwd_sum = None
         for i in tqdm.tqdm(range(n_iter)):
             rwd, states = self.testModel(self.best_model, True)
             episodeMean = torch.stack(states).mean(0)
             if sample_sum is None:
                 sample_sum = episodeMean
                 rwd_sum = rwd
             else:
                 sample_sum += episodeMean
                 rwd_sum += rwd
         sample_sum /= n_iter
         rwd_sum /= n_iter
         db.printInfo(sample_sum)
         db.printInfo(rwd_sum)
     self.avgFeature = sample_sum
     return sample_sum, rwd_sum
    def showProgress(self, e_num):
        means = 0
        durations_t = torch.tensor(self.episode_durations, dtype=torch.float)
        if len(self.episode_durations) >= 100:
            means = durations_t[-100:].mean().item()
        db.printInfo('Episode %d/%d Duration: %d AVG: %d' %
                     (e_num, self.num_episodes, durations_t[-1], means))
        plt.figure(2)
        plt.clf()
        plt.title('Performance: %s' % self.name)
        plt.xlabel('Episode')
        plt.ylabel('Duration')
        plt.plot(durations_t.numpy())
        if self.plot:
            # Take 100 episode averages and plot them too
            if len(durations_t) >= 100:
                means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
                means = torch.cat((torch.zeros(99), means))
                plt.plot(means.numpy())

            plt.pause(0.001)  # pause a bit so that plots are updated
    def create_network(self, blocks):
        models = nn.ModuleList()

        prev_filters = 3
        out_filters = []
        conv_id = 0
        dynamic_count = 0
        for block in blocks:
            if block['type'] == 'net' or block['type'] == 'learnet':
                prev_filters = int(block['channels'])
                continue
            elif block['type'] == 'convolutional':
                conv_id = conv_id + 1
                batch_normalize = int(block['batch_normalize'])
                filters = int(block['filters'])
                kernel_size = int(block['size'])
                stride = int(block['stride'])
                is_pad = int(block['pad'])
                pad = (kernel_size - 1) / 2 if is_pad else 0
                activation = block['activation']
                groups = 1
                bias = bool(int(block['bias'])) if 'bias' in block else True

                if self.is_dynamic(block):
                    partial = int(
                        block['partial']) if 'partial' in block else None
                    Conv2d = dynamic_conv2d(dynamic_count == 0,
                                            partial=partial)
                    dynamic_count += 1
                else:
                    Conv2d = self.c2d_old
                if 'groups' in block:
                    groups = int(block['groups'])

                model = nn.Sequential()
                if batch_normalize:
                    model.add_module(
                        'conv{0}'.format(conv_id),
                        Conv2d(prev_filters,
                               filters,
                               kernel_size,
                               stride,
                               int(pad),
                               groups=groups,
                               bias=False))
                    model.add_module('bn{0}'.format(conv_id),
                                     self.bn2d(filters))
                    #model.add_module('bn{0}'.format(conv_id), BN2d(filters))
                else:
                    model.add_module(
                        'conv{0}'.format(conv_id),
                        Conv2d(prev_filters,
                               filters,
                               kernel_size,
                               stride,
                               int(pad),
                               groups=groups,
                               bias=bias))
                if activation == 'leaky':
                    model.add_module('leaky{0}'.format(conv_id),
                                     nn.LeakyReLU(0.1, inplace=True))
                elif activation == 'relu':
                    model.add_module('relu{0}'.format(conv_id),
                                     nn.ReLU(inplace=True))
                prev_filters = filters
                out_filters.append(prev_filters)
                models.append(model)
            elif block['type'] == 'maxpool':
                pool_size = int(block['size'])
                stride = int(block['stride'])
                if stride > 1:
                    model = nn.MaxPool2d(pool_size, stride)
                else:
                    model = MaxPoolStride1()
                out_filters.append(prev_filters)
                models.append(model)
            elif block['type'] == 'avgpool':
                model = GlobalAvgPool2d()
                out_filters.append(prev_filters)
                models.append(model)
            elif block['type'] == 'softmax':
                model = nn.Softmax()
                out_filters.append(prev_filters)
                models.append(model)
            elif block['type'] == 'cost':
                if block['_type'] == 'sse':
                    model = nn.MSELoss(reduction='mean')
                elif block['_type'] == 'L1':
                    model = nn.L1Loss(reduction='mean')
                elif block['_type'] == 'smooth':
                    model = nn.SmoothL1Loss(reduction='mean')
                out_filters.append(1)
                models.append(model)
            elif block['type'] == 'reorg':
                stride = int(block['stride'])
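                # Reorg rearranges each stride x stride spatial block into
                # channels, so the channel count grows by a factor of stride^2.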
                prev_filters = stride * stride * prev_filters
                out_filters.append(prev_filters)
                models.append(Reorg(stride))
            elif block['type'] == 'route':
                layers = block['layers'].split(',')
                ind = len(models)
                layers = [
                    int(i) if int(i) > 0 else int(i) + ind for i in layers
                ]
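                # Negative indices in a 'route' block are relative to the
                # current layer; the route concatenates the referenced feature
                # maps, so the output channel count is the sum of their filters.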
                if len(layers) == 1:
                    prev_filters = out_filters[layers[0]]
                elif len(layers) == 2:
                    assert (layers[0] == ind - 1)
                    prev_filters = out_filters[layers[0]] + out_filters[
                        layers[1]]
                out_filters.append(prev_filters)
                models.append(EmptyModule())
            elif block['type'] == 'shortcut':
                ind = len(models)
                prev_filters = out_filters[ind - 1]
                out_filters.append(prev_filters)
                models.append(EmptyModule())
            elif block['type'] == 'connected':
                filters = int(block['output'])
                if block['activation'] == 'linear':
                    db.printInfo(
                        'Linear needs to have an init weight function')
                    exit(0)
                    model = nn.Linear(prev_filters, filters)
                elif block['activation'] == 'leaky':

                    model = nn.Sequential(nn.Linear(prev_filters, filters),
                                          nn.LeakyReLU(0.1, inplace=True))
                elif block['activation'] == 'relu':
                    model = nn.Sequential(nn.Linear(prev_filters, filters),
                                          nn.ReLU(inplace=True))
                prev_filters = filters
                out_filters.append(prev_filters)
                models.append(model)
            elif block['type'] == 'region':
                loss = RegionLossV2()
                anchors = block['anchors'].split(',')
                loss.anchors = [float(i) for i in anchors]
                loss.num_classes = int(block['classes'])
                loss.num_anchors = int(block['num'])
                loss.anchor_step = len(loss.anchors) // loss.num_anchors
                loss.object_scale = float(block['object_scale'])
                loss.noobject_scale = float(block['noobject_scale'])
                loss.class_scale = float(block['class_scale'])
                loss.coord_scale = float(block['coord_scale'])
                out_filters.append(prev_filters)
                models.append(loss)
            elif block['type'] == 'globalmax':
                model = GlobalMaxPool2d()
                out_filters.append(prev_filters)
                models.append(model)
            elif block['type'] == 'globalavg':
                model = GlobalAvgPool2d()
                out_filters.append(prev_filters)
                models.append(model)
            elif block['type'] == 'split':
                splits = [int(sz) for sz in block['splits'].split(',')]
                model = Split(splits)
                prev_filters = splits[-1]
                out_filters.append(prev_filters)
                models.append(model)
            else:
                print('unknown type %s' % (block['type']))

        # pdb.set_trace()
        return models
 def train(self, rwd_weight=None):
     #
     # Train.
     for i_episode in tqdm.tqdm(range(self.num_episodes)):
         #
         # Initialize the environment and state
         state = torch.from_numpy(self.env.reset()).unsqueeze(0).to(
             self.device, dtype=torch.float)
         for t in count():
             #
             # Select and perform an action
             action = self.select_action(state)
             next_state_np, reward, done, _ = self.env.step(action.item())
             if self.plot and i_episode % 100 == 0:
                 self.get_screen()
             next_state = torch.from_numpy(next_state_np).unsqueeze(0).to(
                 self.device, dtype=torch.float)
             if rwd_weight is None:
                 reward = torch.tensor([reward], device=self.device)
                 x, x_dot, theta, theta_dot = next_state_np
                 r1 = (self.env.unwrapped.x_threshold -
                       abs(x)) / self.env.unwrapped.x_threshold - 0.8
                 r2 = (self.env.unwrapped.theta_threshold_radians -
                       abs(theta)
                       ) / self.env.unwrapped.theta_threshold_radians - 0.5
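                  #
                  # Shaped reward: r1 is largest when the cart is near the
                  # center and r2 is largest when the pole is upright, so
                  # staying close to the balanced state is rewarded.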
                 #
                 # Must be R ∈ [-1, 1]
                 reward = torch.tensor([r1 + r2])
             else:
                 feat = self.featurefn(next_state_np)
                 reward = rwd_weight.t() @ feat
             #
             # Observe new state
             if done:
                 next_state = None
             #
             # Store the transition in self.memory
             self.memory.push(state, action, next_state, reward)
             #
             # Move to the next state
             state = next_state
             #
             # Perform one step of the optimization (on the target network)
             self.optimize_model()
             if done or t > 30000:
                 self.episode_durations.append(t + 1)
                 self.showProgress(i_episode)
                 break
         #
         # Do not test the model until we have been through at least 100
         policy_rwd = 0
         if i_episode > 100:
             policy_rwd = self.testModel(self.policy_net)
             db.printInfo('Policy Reward: %d' % policy_rwd)
         #
         # Update the target network, copying all weights and biases in DQN
         if i_episode % self.TARGET_UPDATE == 0:
             self.target_net.load_state_dict(self.policy_net.state_dict())
     #
     # Done training.
     print('Complete')
     self.is_trained = True
     pathlib.Path('plts/').mkdir(parents=True, exist_ok=True)
     plt.savefig('plts/train-%s.png' % self.name)
     if self.plot:
         self.env.render()
         self.env.close()
         plt.ioff()
         plt.show()
    def train(self):
        # student = DQN_Trainer(args, self.env, 'Student_0')
        student = discrete_BCQ(
            self.env,
            'Student_0',
            False,
            self.env.action_space.n,
            self.env.observation_space.shape[0],
            self.device,
            args.plot,
            # use defaults for the remaining arguments
            optimizer_parameters={"lr": 3e-4},  #3e-4
        )
        # sampleFeat = student.featurefn_1(self.env.reset())  # random initial feature vector (size 8)
        # w_0 = torch.randn(sampleFeat.size(0), 1)  # random initial weight w, shape (8, 1)
        w_0 = torch.tensor([[0.5], [0.5], [0.5], [0.5]])  # use state features only
        # w_0 = torch.tensor([[0.1], [0.2], [0.3], [0.4]])  # use state features only
        w_0 /= w_0.norm(2)  # normalize
        rwd_list = []
        t_list = []
        weights = [w_0]
        i = 1
        #
        # Sanity check: verify that BCQ trains correctly using the true reward.
        # for i in tqdm.tqdm(range(10)):
        #     student.train(memory, w_0)
        #     studentRwd = student.gatherAverageFeature()
        #     bestreward = student.gatherAverageFeature(best=True)

        # Train zeroth student.
        student.train(memory, w_0)  # train policy pi_0
        # Obtaining this feature expectation would require online interaction,
        # so a neural-network approximation is used instead.

        # TODO: train the feature-expectation network mu_0
        studentFeat = student.train_feaexp(memory, self.s_init)

        studentRwd = student.gatherAverageFeature()  # average feature and reward of policy pi_0
        rwd_list.append(studentRwd)
        t_list.append((self.expert_feat - studentFeat).norm().item())  # this corresponds to w_1
        # The projection method simplifies the problem: t is the L2 norm of w,
        # measuring the distance between the two feature expectations.
        # Create first student.
        weights.append((self.expert_feat - studentFeat).view(-1, 1))
        feature_bar_list = [studentFeat]  # projected feature expectation mu-bar
        feature_list = [studentFeat]  # feature expectation mu
        #
        # Iterate training.
        n_iter = 6  # 20
        for i in tqdm.tqdm(range(n_iter)):
            # student = DQN_Trainer(args, self.env, 'Student_%d' % (i + 1))  # interactively train policy pi_i
            student = discrete_BCQ(
                self.env,
                'Student_%d' % (i + 1),
                False,
                self.env.action_space.n,
                self.env.observation_space.shape[0],
                self.device,
                args.plot,
                # use defaults for the remaining arguments
                optimizer_parameters={"lr": 3e-4},  # default is 3e-4
            )
            student.train(memory, weights[-1])
            studentRwd = student.gatherAverageFeature()
            studentFeat = student.train_feaexp(memory, self.s_init)

            db.printInfo("studentFeat:", studentFeat)

            db.printInfo("self.expert_feat:", self.expert_feat)

            rwd_list.append(studentRwd)
            feature_list.append(studentFeat)
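            #
            # Same projection step (Abbeel & Ng, 2004) as in the DQN-based loop above:
            #   mu_bar_i = mu_bar_{i-1}
            #              + [(mu_i - mu_bar_{i-1})^T (mu_E - mu_bar_{i-1})]
            #                / [(mu_i - mu_bar_{i-1})^T (mu_i - mu_bar_{i-1})]
            #                * (mu_i - mu_bar_{i-1})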
            diff = (feature_list[-1] - feature_bar_list[-1]).view(-1, 1)
            err = (self.expert_feat - feature_bar_list[-1]).view(-1, 1)
            step = (diff.t() @ err) / (diff.t() @ diff)  # '@' is matrix multiplication
            feat_bar_next = feature_bar_list[-1] + step * (feature_list[-1] - feature_bar_list[-1])
            db.printInfo("feature_bar:", feat_bar_next)
            feature_bar_list.append(feat_bar_next)

            weights.append((self.expert_feat - feat_bar_next).view(-1, 1))
            t_list.append((self.expert_feat - feat_bar_next).norm().item())
            db.printInfo('t: ', t_list[-1])
        # db.printInfo(feat_bar_next)
        print('w:', weights[-1])
    factor = 15.
elif cfg.neg_ratio == 1:
    factor = 3.0
elif cfg.neg_ratio == 0:
    factor = 1.5
elif cfg.neg_ratio == 5:
    factor = 8.0

print('factor:', factor)
learning_rate /= factor

if use_cuda:
    if ngpus > 1:
        model = torch.nn.DataParallel(model).cuda()
    else:
        db.printInfo(torch.cuda.is_available())
        model = model.cuda()

optimizer = optim.SGD(model.parameters(),
                      lr=learning_rate / batch_size,
                      momentum=momentum,
                      dampening=0,
                      weight_decay=decay * batch_size * factor)
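# Note: the learning rate is divided by batch_size, presumably because the
# detection loss is summed (not averaged) over the batch; weight_decay is
# scaled by batch_size * factor, likely so the effective decay step
# (lr * weight_decay) stays roughly constant.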


def adjust_learning_rate(optimizer, batch):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = learning_rate
    for i in range(len(steps)):
        scale = scales[i] if i < len(scales) else 1
        if batch >= steps[i]: