def saveBestModel(self):
    pathlib.Path('mdls/').mkdir(parents=True, exist_ok=True)
    state = {
        'mdl': self.best_model.state_dict(),
        'avgFeat': self.avgFeature,
    }
    import datetime
    now = datetime.datetime.now()
    save_name = 'mdls/mdl_DATE-' + now.isoformat() + '.pth.tar'
    db.printInfo(save_name)
    torch.save(state, save_name)
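# Hedged sketch (not in the original file): a loading counterpart to
# saveBestModel(), mirroring the checkpoint keys written above and the loading
# logic in __init__ below; ckpt_path is a placeholder argument.
def loadBestModel(self, ckpt_path):
    data = torch.load(ckpt_path)
    self.policy_net.load_state_dict(data['mdl'])
    if 'avgFeat' in data:
        self.avgFeature = data['avgFeat']
    # Keep the target network in sync with the freshly loaded policy.
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()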
def train(self):
    student = DQN_Trainer(args, self.env, 'Student_0')
    sampleFeat = student.featurefn(self.env.reset())
    # Random initial reward weights, normalized to unit L1 norm.
    w_0 = torch.rand(sampleFeat.size(0), 1)
    w_0 /= w_0.norm(1)
    rwd_list = []
    t_list = []
    weights = [w_0]
    #
    # Train the zeroth student.
    student.train(w_0)
    studentFeat, studentRwd = student.gatherAverageFeature()
    rwd_list.append(studentRwd)
    t_list.append((self.expert_feat - studentFeat).norm().item())
    #
    # Create the first student.
    weights.append((self.expert_feat - studentFeat).view(-1, 1))
    feature_bar_list = [studentFeat]
    feature_list = [studentFeat]
    #
    # Iterate training.
    n_iter = 20
    for i in tqdm.tqdm(range(n_iter)):
        student = DQN_Trainer(args, self.env, 'Student_%d' % (i + 1))
        student.train(weights[-1])
        studentFeat, studentRwd = student.gatherAverageFeature()
        rwd_list.append(studentRwd)
        feature_list.append(studentFeat)
        # Projection step: project the expert features onto the line through
        # the previous projection and the newest student features.
        delta = (feature_list[-1] - feature_bar_list[-1]).view(-1, 1)
        err = (self.expert_feat - feature_bar_list[-1]).view(-1, 1)
        feat_bar_next = feature_bar_list[-1] \
            + (delta.t() @ err) / (delta.t() @ delta) \
            * (feature_list[-1] - feature_bar_list[-1])
        feature_bar_list.append(feat_bar_next)
        weights.append((self.expert_feat - feat_bar_next).view(-1, 1))
        t_list.append((self.expert_feat - feat_bar_next).norm().item())
        db.printInfo('t: ', t_list[-1])
        db.printInfo(feat_bar_next)
    plt.figure()
    ax = plt.gca()
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.plot(rwd_list)
    plt.title('Average Episode Reward')
    plt.xlabel('Student Number')
    plt.ylabel('Average Reward')
    plt.savefig('plts/avgRewardProgress.png')
    plt.figure()
    ax = plt.gca()
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.plot(t_list)
    plt.title('L2 Policy Error')
    plt.xlabel('Student Number')
    plt.ylabel('L2 error of feature expectations')
    plt.savefig('plts/sqerr.png')
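# For reference, the update above is the projection step of the
# apprenticeship-learning algorithm (the "projection method"; see Abbeel & Ng,
# 2004). With mu_i the newest student's feature expectation and mu_E the
# expert's:
#
#   mu_bar_i = mu_bar_{i-1}
#              + [ (mu_i - mu_bar_{i-1})^T (mu_E - mu_bar_{i-1}) ]
#              / [ (mu_i - mu_bar_{i-1})^T (mu_i - mu_bar_{i-1}) ]
#              * (mu_i - mu_bar_{i-1})
#
#   w_{i+1} = mu_E - mu_bar_i,    t_i = ||mu_E - mu_bar_i||_2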
def __init__(self, args, env, name):
    # Get screen size so that we can initialize layers correctly based on the
    # shape returned from AI Gym. Typical dimensions at this point are close
    # to 3x40x90, which is the result of a clamped and down-scaled render
    # buffer in get_screen().
    save_path = 'vids/%s/' % name
    pathlib.Path(save_path).mkdir(parents=True, exist_ok=True)
    self.env = gym.wrappers.Monitor(
        env, save_path,
        video_callable=lambda episode_id: episode_id % 199 == 0)
    self.env.reset()
    self.policy_net = DQN().to(self.device)
    self.target_net = DQN().to(self.device)
    self.is_trained = False
    self.avgFeature = None
    if args.configStr is not None:
        self.is_trained = True
        pth = os.path.abspath(args.configStr)
        assert pathlib.Path(pth).exists()
        data = torch.load(pth)
        self.policy_net.load_state_dict(data['mdl'])
        if 'avgFeat' in data:
            self.avgFeature = data['avgFeat']
        db.printInfo('LOADED MODEL')
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()
    self.best_model = None
    self.best_rwd = -float('inf')
    self.optimizer = optim.Adam(self.policy_net.parameters(), lr=0.001)
    self.memory = ReplayMemory(100000)
    self.NUM_UPDATE = 1
    self.steps_done = 0
    self.episode_durations = []
    self.plot = args.plot
    self.name = name
    plt.ion()
    if self.plot:
        plt.figure()
        self.init_screen = self.get_screen()
        plt.imshow(self.get_screen().cpu().squeeze(0).permute(1, 2, 0).numpy(),
                   interpolation='none')
        plt.title('Example extracted screen')
def gatherAverageFeature(self, _return_s_init=False):
    mus = []
    s_init = []
    mu = 0.0
    t = 0
    is_s_init = True
    gamma = 0.99
    for i in range(memory.position):
        # Index by buffer position; t is the timestep within the episode.
        s, a, s_next, reward, done = memory.memory[i]
        if is_s_init:
            s_init.append(s)
            is_s_init = False
        # Accumulate the discounted features for this episode.
        mu += gamma**t * self.phi(s, a).flatten()
        t += 1
        if done:
            mus.append(mu)
            mu = 0.0
            t = 0
            is_s_init = True
    mu_est = torch.tensor([0.0, 0.0, 0.0, 0.0])
    for mu in mus:
        mu_est += mu
    mu_est /= len(mus)
    mu_est /= mu_est.norm(2)
    with torch.no_grad():
        n_iter = 20  # 2000
        rwd_sum = None
        for i in tqdm.tqdm(range(n_iter)):
            rwd, states = self.testModel(self.best_model, True)
            if rwd_sum is None:
                rwd_sum = rwd
            else:
                rwd_sum += rwd
        rwd_sum /= n_iter
    db.printInfo(mu_est)
    db.printInfo(rwd_sum)
    self.avgFeature = mu_est
    if _return_s_init:
        return mu_est, s_init, rwd_sum
    return mu_est, rwd_sum
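# The quantity estimated above is the empirical discounted feature expectation
# of the policy that filled the replay buffer, averaged over episodes:
#
#   mu = (1/N) * sum_{episodes} sum_t gamma^t * phi(s_t, a_t),   gamma = 0.99,
#
# normalized to unit 2-norm before being compared with the expert's features.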
def saveFigs(figs=None):
    if figs is None:
        figs = [plt.figure(n) for n in plt.get_fignums()]
    import pathlib
    save_dir = str(pathlib.Path().cwd()) + '/plts/'
    db.printInfo(save_dir)
    pathlib.Path(save_dir).mkdir(exist_ok=True)
    for fig in figs:
        title = fig.axes[0].get_title()
        for a in fig.axes:
            a.axis('off')
            a.set_title('')
        db.printInfo(title)
        if title == '.png':
            title = 'noName'
        save_file = save_dir + title + '.pdf'
        fig.savefig(save_file.replace(' ', '_'),
                    bbox_inches='tight', pad_inches=0)
def gatherAverageFeature(self):
    with torch.no_grad():
        n_iter = 2000
        sample_sum = None
        rwd_sum = None
        for i in tqdm.tqdm(range(n_iter)):
            rwd, states = self.testModel(self.best_model, True)
            episodeMean = torch.stack(states).mean(0)
            if sample_sum is None:
                sample_sum = episodeMean
                rwd_sum = rwd
            else:
                sample_sum += episodeMean
                rwd_sum += rwd
        sample_sum /= n_iter
        rwd_sum /= n_iter
        db.printInfo(sample_sum)
        db.printInfo(rwd_sum)
        self.avgFeature = sample_sum
        return sample_sum, rwd_sum
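# Hedged usage sketch (not in the original file): one way the expert feature
# expectation consumed as self.expert_feat in the IRL loop could be produced.
# Assumes args.configStr points at a trained expert checkpoint so that
# DQN_Trainer loads it in __init__.
def gatherExpertFeature(args, env):
    expert = DQN_Trainer(args, env, 'Expert')
    expert_feat, expert_rwd = expert.gatherAverageFeature()
    return expert_feat, expert_rwd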
def showProgress(self, e_num):
    means = 0
    durations_t = torch.tensor(self.episode_durations, dtype=torch.float)
    if len(self.episode_durations) >= 100:
        # Average over the last 100 episodes.
        means = durations_t[-100:].mean().item()
    db.printInfo('Episode %d/%d Duration: %d AVG: %d'
                 % (e_num, self.num_episodes, durations_t[-1], means))
    plt.figure(2)
    plt.clf()
    plt.title('Performance: %s' % self.name)
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    if self.plot:
        # Take 100-episode averages and plot them too.
        if len(durations_t) >= 100:
            means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
            means = torch.cat((torch.zeros(99), means))
            plt.plot(means.numpy())
        plt.pause(0.001)  # pause a bit so that plots are updated
def create_network(self, blocks):
    models = nn.ModuleList()
    prev_filters = 3
    out_filters = []
    conv_id = 0
    dynamic_count = 0
    for block in blocks:
        if block['type'] == 'net' or block['type'] == 'learnet':
            prev_filters = int(block['channels'])
            continue
        elif block['type'] == 'convolutional':
            conv_id = conv_id + 1
            batch_normalize = int(block['batch_normalize'])
            filters = int(block['filters'])
            kernel_size = int(block['size'])
            stride = int(block['stride'])
            is_pad = int(block['pad'])
            pad = (kernel_size - 1) // 2 if is_pad else 0
            activation = block['activation']
            groups = 1
            bias = bool(int(block['bias'])) if 'bias' in block else True
            if self.is_dynamic(block):
                partial = int(block['partial']) if 'partial' in block else None
                Conv2d = dynamic_conv2d(dynamic_count == 0, partial=partial)
                dynamic_count += 1
            else:
                Conv2d = self.c2d_old
            if 'groups' in block:
                groups = int(block['groups'])
            model = nn.Sequential()
            if batch_normalize:
                model.add_module(
                    'conv{0}'.format(conv_id),
                    Conv2d(prev_filters, filters, kernel_size, stride, pad,
                           groups=groups, bias=False))
                model.add_module('bn{0}'.format(conv_id), self.bn2d(filters))
                # model.add_module('bn{0}'.format(conv_id), BN2d(filters))
            else:
                model.add_module(
                    'conv{0}'.format(conv_id),
                    Conv2d(prev_filters, filters, kernel_size, stride, pad,
                           groups=groups, bias=bias))
            if activation == 'leaky':
                model.add_module('leaky{0}'.format(conv_id),
                                 nn.LeakyReLU(0.1, inplace=True))
            elif activation == 'relu':
                model.add_module('relu{0}'.format(conv_id),
                                 nn.ReLU(inplace=True))
            prev_filters = filters
            out_filters.append(prev_filters)
            models.append(model)
        elif block['type'] == 'maxpool':
            pool_size = int(block['size'])
            stride = int(block['stride'])
            if stride > 1:
                model = nn.MaxPool2d(pool_size, stride)
            else:
                model = MaxPoolStride1()
            out_filters.append(prev_filters)
            models.append(model)
        elif block['type'] == 'avgpool':
            model = GlobalAvgPool2d()
            out_filters.append(prev_filters)
            models.append(model)
        elif block['type'] == 'softmax':
            model = nn.Softmax()
            out_filters.append(prev_filters)
            models.append(model)
        elif block['type'] == 'cost':
            if block['_type'] == 'sse':
                model = nn.MSELoss(size_average=True)
            elif block['_type'] == 'L1':
                model = nn.L1Loss(size_average=True)
            elif block['_type'] == 'smooth':
                model = nn.SmoothL1Loss(size_average=True)
            out_filters.append(1)
            models.append(model)
        elif block['type'] == 'reorg':
            stride = int(block['stride'])
            prev_filters = stride * stride * prev_filters
            out_filters.append(prev_filters)
            models.append(Reorg(stride))
        elif block['type'] == 'route':
            layers = block['layers'].split(',')
            ind = len(models)
            # Negative layer indices are relative to the current layer.
            layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers]
            if len(layers) == 1:
                prev_filters = out_filters[layers[0]]
            elif len(layers) == 2:
                assert layers[0] == ind - 1
                prev_filters = out_filters[layers[0]] + out_filters[layers[1]]
            out_filters.append(prev_filters)
            models.append(EmptyModule())
        elif block['type'] == 'shortcut':
            ind = len(models)
            prev_filters = out_filters[ind - 1]
            out_filters.append(prev_filters)
            models.append(EmptyModule())
        elif block['type'] == 'connected':
            filters = int(block['output'])
            if block['activation'] == 'linear':
                db.printInfo('Linear needs to have an init weight function')
                exit(0)
                model = nn.Linear(prev_filters, filters)
            elif block['activation'] == 'leaky':
                model = nn.Sequential(nn.Linear(prev_filters, filters),
                                      nn.LeakyReLU(0.1, inplace=True))
            elif block['activation'] == 'relu':
                model = nn.Sequential(nn.Linear(prev_filters, filters),
                                      nn.ReLU(inplace=True))
            prev_filters = filters
            out_filters.append(prev_filters)
            models.append(model)
        elif block['type'] == 'region':
            loss = RegionLossV2()
            anchors = block['anchors'].split(',')
            loss.anchors = [float(i) for i in anchors]
            loss.num_classes = int(block['classes'])
            loss.num_anchors = int(block['num'])
            loss.anchor_step = len(loss.anchors) // loss.num_anchors
            loss.object_scale = float(block['object_scale'])
            loss.noobject_scale = float(block['noobject_scale'])
            loss.class_scale = float(block['class_scale'])
            loss.coord_scale = float(block['coord_scale'])
            out_filters.append(prev_filters)
            models.append(loss)
        elif block['type'] == 'globalmax':
            model = GlobalMaxPool2d()
            out_filters.append(prev_filters)
            models.append(model)
        elif block['type'] == 'globalavg':
            model = GlobalAvgPool2d()
            out_filters.append(prev_filters)
            models.append(model)
        elif block['type'] == 'split':
            splits = [int(sz) for sz in block['splits'].split(',')]
            model = Split(splits)
            prev_filters = splits[-1]
            out_filters.append(prev_filters)
            models.append(model)
        else:
            print('unknown type %s' % (block['type']))
            # pdb.set_trace()
    return models
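# Hedged sketch (not in the original file): the parsed config this function
# consumes appears to be a list of dicts with string values, as produced by a
# darknet-style cfg parser. The keys mirror those read above; the numbers are
# illustrative only, and self.is_dynamic() may expect additional keys for
# dynamic convolutions that are omitted here.
_example_blocks = [
    {'type': 'net', 'channels': '3'},
    {'type': 'convolutional', 'batch_normalize': '1', 'filters': '32',
     'size': '3', 'stride': '1', 'pad': '1', 'activation': 'leaky'},
    {'type': 'maxpool', 'size': '2', 'stride': '2'},
    {'type': 'connected', 'output': '10', 'activation': 'relu'},
]
# models = self.create_network(_example_blocks)  # called from within the class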
def train(self, rwd_weight=None):
    #
    # Train.
    for i_episode in tqdm.tqdm(range(self.num_episodes)):
        #
        # Initialize the environment and state.
        state = torch.from_numpy(self.env.reset()).unsqueeze(0).to(
            self.device, dtype=torch.float)
        for t in count():
            #
            # Select and perform an action.
            action = self.select_action(state)
            next_state_np, reward, done, _ = self.env.step(action.item())
            if self.plot and i_episode % 100 == 0:
                self.get_screen()
            next_state = torch.from_numpy(next_state_np).unsqueeze(0).to(
                self.device, dtype=torch.float)
            if rwd_weight is None:
                # Shaped CartPole reward from cart position and pole angle.
                x, x_dot, theta, theta_dot = next_state_np
                r1 = (self.env.unwrapped.x_threshold - abs(x)) \
                    / self.env.unwrapped.x_threshold - 0.8
                r2 = (self.env.unwrapped.theta_threshold_radians - abs(theta)) \
                    / self.env.unwrapped.theta_threshold_radians - 0.5
                #
                # Must be R ∈ [-1, 1].
                reward = torch.tensor([r1 + r2])
            else:
                # Linear reward in the features: R(s) = w^T phi(s).
                feat = self.featurefn(next_state_np)
                reward = rwd_weight.t() @ feat
            #
            # Observe the new state.
            if done:
                next_state = None
            #
            # Store the transition in self.memory.
            self.memory.push(state, action, next_state, reward)
            #
            # Move to the next state.
            state = next_state
            #
            # Perform one step of the optimization (on the target network).
            self.optimize_model()
            if done or t > 30000:
                self.episode_durations.append(t + 1)
                self.showProgress(i_episode)
                break
        #
        # Do not test the model until we have been through at least 100 episodes.
        policy_rwd = 0
        if i_episode > 100:
            policy_rwd = self.testModel(self.policy_net)
            db.printInfo('Policy Reward: %d' % policy_rwd)
        #
        # Update the target network, copying all weights and biases in DQN.
        if i_episode % self.TARGET_UPDATE == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
    #
    # Done training.
    print('Complete')
    self.is_trained = True
    pathlib.Path('plts/').mkdir(parents=True, exist_ok=True)
    plt.savefig('plts/train-%s.png' % self.name)
    if self.plot:
        self.env.render()
        self.env.close()
        plt.ioff()
        plt.show()
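# Usage note (sketch, not in the original file): with rwd_weight=None the
# shaped CartPole reward above is used, which is how an expert policy would be
# trained; with a weight vector the student is trained against the linear
# reward R(s) = w^T phi(s), as in the IRL loop in train() above.
#
#   expert = DQN_Trainer(args, env, 'Expert')
#   expert.train()            # shaped environment reward
#   student.train(w_0)        # linear feature reward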
def train(self):
    # student = DQN_Trainer(args, self.env, 'Student_0')
    student = discrete_BCQ(
        self.env,
        'Student_0',
        False,
        self.env.action_space.n,
        self.env.observation_space.shape[0],
        self.device,
        args.plot,
        # Use defaults for the remaining parameters.
        optimizer_parameters={"lr": 3e-4},
    )
    # sampleFeat = student.featurefn_1(self.env.reset())  # random initial feature vector [8]
    # w_0 = torch.randn(sampleFeat.size(0), 1)  # random initial weights w (8, 1)
    w_0 = torch.tensor([[0.5], [0.5], [0.5], [0.5]])  # stick with state-only features
    # w_0 = torch.tensor([[0.1], [0.2], [0.3], [0.4]])  # stick with state-only features
    w_0 /= w_0.norm(2)  # normalize
    rwd_list = []
    t_list = []
    weights = [w_0]
    #
    # Sanity check that BCQ runs correctly, using the true reward:
    # for i in tqdm.tqdm(range(10)):
    #     student.train(memory, w_0)
    #     studentRwd = student.gatherAverageFeature()
    #     bestreward = student.gatherAverageFeature(best=True)
    #
    # Train the zeroth student (policy pi_0).
    student.train(memory, w_0)
    # The feature expectation would otherwise have to be estimated online, so
    # approximate it with a neural network.
    # TODO: train the feature-expectation network mu_0.
    studentFeat = student.train_feaexp(memory, self.s_init)
    # Average features and reward of policy pi_0.
    studentRwd = student.gatherAverageFeature()
    rwd_list.append(studentRwd)
    t_list.append((self.expert_feat - studentFeat).norm().item())
    #
    # Create the first student; this yields w_1. The projection method
    # simplifies the problem: t is the 2-norm of w, measuring the distance
    # between the two feature expectations.
    weights.append((self.expert_feat - studentFeat).view(-1, 1))
    feature_bar_list = [studentFeat]  # projected features mu-bar
    feature_list = [studentFeat]      # features mu
    #
    # Iterate training.
    n_iter = 6  # 20
    for i in tqdm.tqdm(range(n_iter)):
        # student = DQN_Trainer(args, self.env, 'Student_%d' % (i + 1))
        # Iteratively train policy pi_i.
        student = discrete_BCQ(
            self.env,
            'Student_%d' % (i + 1),
            False,
            self.env.action_space.n,
            self.env.observation_space.shape[0],
            self.device,
            args.plot,
            # Use defaults for the remaining parameters (lr default is 3e-4).
            optimizer_parameters={"lr": 3e-4},
        )
        student.train(memory, weights[-1])
        studentRwd = student.gatherAverageFeature()
        studentFeat = student.train_feaexp(memory, self.s_init)
        db.printInfo("studentFeat:", studentFeat)
        db.printInfo("self.expert_feat:", self.expert_feat)
        rwd_list.append(studentRwd)
        feature_list.append(studentFeat)
        # Projection step (@ is matrix multiplication).
        delta = (feature_list[-1] - feature_bar_list[-1]).view(-1, 1)
        err = (self.expert_feat - feature_bar_list[-1]).view(-1, 1)
        feat_bar_next = feature_bar_list[-1] \
            + (delta.t() @ err) / (delta.t() @ delta) \
            * (feature_list[-1] - feature_bar_list[-1])
        db.printInfo("feature_bar:", feat_bar_next)
        feature_bar_list.append(feat_bar_next)
        weights.append((self.expert_feat - feat_bar_next).view(-1, 1))
        t_list.append((self.expert_feat - feat_bar_next).norm().item())
        db.printInfo('t: ', t_list[-1])
        # db.printInfo(feat_bar_next)
        print('w:', weights[-1])
    factor = 15.
elif cfg.neg_ratio == 1:
    factor = 3.0
elif cfg.neg_ratio == 0:
    factor = 1.5
elif cfg.neg_ratio == 5:
    factor = 8.0
print('factor:', factor)
learning_rate /= factor

if use_cuda:
    if ngpus > 1:
        model = torch.nn.DataParallel(model).cuda()
    else:
        db.printInfo(torch.cuda.is_available())
        model = model.cuda()

optimizer = optim.SGD(model.parameters(),
                      lr=learning_rate / batch_size,
                      momentum=momentum,
                      dampening=0,
                      weight_decay=decay * batch_size * factor)


def adjust_learning_rate(optimizer, batch):
    """Decay the learning rate by the configured scales at the given step
    boundaries."""
    lr = learning_rate
    for i in range(len(steps)):
        scale = scales[i] if i < len(scales) else 1
        if batch >= steps[i]: