def forward(self, x, m, s, seq_len, p=None):
    n, _, _ = x.shape
    z = self.encode(x)
    # take the hidden state at the last valid time step of each sequence
    z = z[torch.arange(n), (seq_len - 1).long()]
    pred = self.decode(x[:, :, 0], z)  # index: 0-time, 1-flux, 2-flux_err
    euc, cos = distances(x, pred)
    # concatenate latent code with reconstruction-error features and metadata
    if p is None:
        zc = torch.cat((z, euc, cos, m, s), dim=1)
    else:
        zc = torch.cat((z, euc, cos, m, s, p.unsqueeze(-1)), dim=1)
    logits = self.estimation_network(zc)
    gamma = self.softmax(logits)
    phi, mu, cov = compute_params(zc, gamma)
    return pred, zc, logits, phi, mu, cov
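
# NOTE: `distances` is used above but not defined in this file. A minimal
# sketch of what it plausibly computes, following the DAGMM-style recipe of
# relative Euclidean distance and cosine similarity between the input flux
# and its reconstruction; padding/masking handling is assumed away here:
import torch
import torch.nn.functional as F

def distances(x, pred):
    flux = x[:, :, 1]  # index: 0-time, 1-flux, 2-flux_err (per the comment above)
    euc = torch.norm(flux - pred, p=2, dim=1, keepdim=True) / (
        torch.norm(flux, p=2, dim=1, keepdim=True) + 1e-8)
    cos = F.cosine_similarity(flux, pred, dim=1).unsqueeze(-1)
    return euc, cos  # both shaped (n, 1), ready for torch.cat(..., dim=1)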
def prepare_filters(self,
                    filters,
                    ratio=1,
                    neuralscale=False,
                    iteration=None,
                    search=False,
                    descent_idx=14,
                    prune_fname=None,
                    num_classes=100,
                    pruned_filters=None):
    if ratio is not None:
        if neuralscale and iteration != 0:
            if search:  # iterative search of params (architecture descent)
                pkl_ld = pickle.load(
                    open("prune_record/param{}.pk".format(iteration - 1),
                         "rb"))
                alpha = pkl_ld["train_alpha"]
                beta = pkl_ld["train_beta"]
            else:  # list of params (iterative search/architecture descent done)
                pkl_ld = pickle.load(
                    open("prune_record/" + prune_fname + ".pk",
                         "rb"))["param"]
                alpha = pkl_ld[descent_idx][0]
                beta = pkl_ld[descent_idx][1]
            total_param = compute_params(
                [int(cfg * ratio) for cfg in sum(filters, [])],
                classes=num_classes,
                model='mobilenetv2',
                last=True)
            if np.sum(beta) == 0:  # fall back to uniform scaling
                cfg_tmp = list(alpha)
                ratio_ = 1.2
                cur_param = 0
                while abs(cur_param - total_param) > 0.1 * total_param:
                    cur_param = compute_params(
                        [int(cfg * ratio_) for cfg in cfg_tmp],
                        classes=num_classes,
                        model='mobilenetv2',
                        last=True)
                    if cur_param < total_param:
                        ratio_ += 0.05
                    else:
                        ratio_ -= 0.05
                filt_cnt = 0
                new_config = []
                for block in filters:
                    block_cfg = []
                    for blk_sz in range(len(block)):
                        filt = int(cfg_tmp[filt_cnt] * ratio_)
                        if filt < 10:  # filter count too low; add 10 to keep the layer alive
                            block_cfg.append(filt + 10)
                        else:
                            block_cfg.append(filt)
                        filt_cnt += 1
                    new_config.append(block_cfg)
                filters = new_config
            else:
                tau = total_param  # initialize tau
                approx_total_param = 0
                # iterate tau until the approximated parameter count matches
                # the target budget
                while abs(approx_total_param - total_param) > int(
                        0.0005 * total_param):
                    approx_filts = []
                    for a, b in zip(alpha, beta):
                        approx_filts.append(a * tau**b)
                    approx_total_param = compute_params(
                        approx_filts,
                        classes=num_classes,
                        model='mobilenetv2',
                        last=True)
                    tau_update = 0
                    for a, b in zip(alpha, beta):
                        tau_update += a * tau**b * b / tau
                    if ((approx_total_param - total_param) * tau_update) > tau:
                        tau *= 0.5  # step would overshoot; halve tau instead
                    else:
                        tau = tau - 1.0 * (
                            (approx_total_param - total_param) * tau_update)
                filt_cnt = 0
                new_config = []
                for idx, block in enumerate(filters):
                    block_cfg = []
                    for blk_sz in range(len(block)):
                        filt = int(alpha[filt_cnt] * tau**beta[filt_cnt])
                        if search:  # only pad low counts during architecture search
                            if filt < 10:  # filter count too low; add 10 to keep the layer alive
                                block_cfg.append(filt + 10)
                            else:
                                block_cfg.append(filt)
                        else:
                            block_cfg.append(filt)
                        filt_cnt += 1
                    new_config.append(block_cfg)
                print(
                    new_config,
                    "approx parameters: {} total parameters: {}".format(
                        approx_total_param, total_param))
                filters = new_config
        elif pruned_filters is not None:
            total_param = compute_params(
                [int(cfg * ratio) for cfg in sum(filters, [])],
                classes=num_classes,
                model='mobilenetv2',
                last=True)
            cfg_tmp = pruned_filters
            ratio_ = 1.2
            cur_param = 0
            while abs(cur_param - total_param) > 0.005 * total_param:
                cur_param = compute_params(
                    [int(cfg * ratio_) for cfg in cfg_tmp],
                    classes=num_classes,
                    model='mobilenetv2',
                    last=True)
                if cur_param < total_param:
                    ratio_ += 0.00005
                else:
                    ratio_ -= 0.00005
            filt_cnt = 0
            new_config = []
            for block in filters:
                block_cfg = []
                for blk_sz in range(len(block)):
                    filt = int(cfg_tmp[filt_cnt] * ratio_)
                    block_cfg.append(filt)
                    filt_cnt += 1
                new_config.append(block_cfg)
            filters = new_config
            print("pruned uniform", new_config, "cur_params:", cur_param,
                  "total_params:", total_param)
        else:  # uniform scaling
            new_config = []
            for idx, block in enumerate(filters):
                block_cfg = []
                for blk_sz in range(len(block)):
                    block_cfg.append(int(block[blk_sz] * ratio))
                new_config.append(block_cfg)
            filters = new_config
    print(filters)
    return filters
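
# The tau loop above solves for the tau at which the configuration
# [alpha_i * tau**beta_i] meets the parameter budget. Since the count is
# monotone in tau when all beta_i >= 0, the same root can also be found by
# bisection; a toy sketch with a stand-in parameter counter (the real code
# calls compute_params with model='mobilenetv2'):
import numpy as np

alpha = np.array([32.0, 64.0, 128.0])  # hypothetical fitted scaling params
beta = np.array([0.05, 0.10, 0.15])

def toy_param_count(filts):
    # stand-in counter: consecutive-layer products, roughly how conv
    # parameters scale with channel widths
    filts = np.asarray(filts, dtype=float)
    return float(np.sum(filts[:-1] * filts[1:]))

target = toy_param_count([64, 128, 256])  # arbitrary budget
lo, hi = 1.0, 1e12
for _ in range(200):
    tau = np.sqrt(lo * hi)  # geometric midpoint for a wide search range
    if toy_param_count(alpha * tau**beta) < target:
        lo = tau
    else:
        hi = tau
print("tau:", tau, "filters:", (alpha * tau**beta).astype(int))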
# #####
# EARLY
# #####
f = open('prune_record/' + args.prune_fname + '_0.csv', newline='')
reader = csv.reader(f, delimiter=',')
filters = []
for row in reader:
    filters.append(list(map(int, row)))
filters = np.array(filters)

# Compute total parameters over all iterations
total_params = []
for filt in filters:
    total_params.append(
        compute_params(filt, classes=num_classes, model=args.model))
total_params = np.array(total_params)

# Closed-form least-squares fit in log space: filters ~ alpha * k**beta,
# i.e. log(filters) = beta * log(k) + log(alpha)
lin_reg = linear_model.LinearRegression()  # unused; normal equations solved directly below
ln_k = np.log(total_params)
A = np.stack((ln_k, np.ones(ln_k.shape)), axis=1)
b = np.log(filters)
x = np.matmul(
    np.matmul(np.linalg.inv(np.matmul(A.transpose(), A)), A.transpose()), b)
beta = x[0, :]
alpha = np.exp(x[1, :])
# reconstruct per-layer filter counts from the fitted power law
filt = np.array([total_params**beta_i for beta_i in beta])
filt = np.multiply(filt.transpose(), alpha).transpose()
print('early')
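
# Sanity check for the closed-form fit above: the normal-equation solution
# x = (A^T A)^{-1} A^T b recovers alpha and beta exactly on noiseless toy
# data generated as filters = alpha * k**beta (np.linalg.lstsq is used here
# as a better-conditioned equivalent of the explicit inverse):
import numpy as np

k = np.array([1e5, 2e5, 4e5, 8e5])  # stand-in total parameter counts
layer = 50.0 * k**0.2               # one layer with alpha=50, beta=0.2
A = np.stack((np.log(k), np.ones(k.shape)), axis=1)
x = np.linalg.lstsq(A, np.log(layer), rcond=None)[0]
print("beta:", x[0], "alpha:", np.exp(x[1]))  # ~0.2 and ~50.0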
def prepare_filters(self,
                    config,
                    ratio=1,
                    neuralscale=False,
                    iteration=None,
                    search=False,
                    descent_idx=14,
                    prune_fname=None,
                    num_classes=100,
                    pruned_filters=None):
    if ratio is not None:  # for ratio sweeping
        if neuralscale and iteration != 0:  # use proposed efficient scaling method
            if search:  # iterative search of params (architecture descent)
                pkl_ld = pickle.load(
                    open("prune_record/param{}.pk".format(iteration - 1),
                         "rb"))
                alpha = pkl_ld["train_alpha"]
                beta = pkl_ld["train_beta"]
            else:  # list of params (iterative search/architecture descent done)
                pkl_ld = pickle.load(
                    open("prune_record/" + prune_fname + ".pk",
                         "rb"))["param"]
                alpha = pkl_ld[descent_idx][0]
                beta = pkl_ld[descent_idx][1]
            total_param = compute_params(
                [cfg * ratio for cfg in list(filter(lambda a: a != 'M', config))],
                classes=num_classes,
                model='vgg')
            tau = total_param  # initialize tau
            # fixed number of gradient steps on tau (cf. the convergence
            # loop in the mobilenetv2 variant)
            for j in range(2000):
                approx_filts = []
                for a, b in zip(alpha, beta):
                    approx_filts.append(int(a * tau**b))
                approx_total_param = compute_params(approx_filts,
                                                    classes=num_classes,
                                                    model='vgg')
                tau_update = 0
                for a, b in zip(alpha, beta):
                    tau_update += a * tau**b * b / tau
                tau = tau - 50.0 * (
                    (approx_total_param - total_param) * tau_update)
            new_config = []
            cfg_cnt = 0
            for i in range(len(config)):
                if config[i] != 'M':
                    new_config.append(int(alpha[cfg_cnt] * tau**beta[cfg_cnt]))
                    cfg_cnt += 1
                else:
                    new_config.append(config[i])  # 'M' (max-pool marker)
            print(
                new_config,
                "approx params: {} total params: {}".format(
                    approx_total_param, total_param))
        elif pruned_filters is not None:
            total_param = compute_params(
                [cfg * ratio for cfg in list(filter(lambda a: a != 'M', config))],
                classes=num_classes,
                model='vgg')
            cfg_tmp = pruned_filters
            ratio_ = 1.2
            cur_param = 0
            while abs(cur_param - total_param) > 0.005 * total_param:
                cur_param = compute_params(
                    [int(cfg * ratio_) for cfg in cfg_tmp],
                    classes=num_classes,
                    model='vgg')
                if cur_param < total_param:
                    ratio_ += 0.00005
                else:
                    ratio_ -= 0.00005
            filt_cnt = 0
            new_config = []
            for i in range(len(config)):
                if config[i] != 'M':
                    new_config.append(int(cfg_tmp[filt_cnt] * ratio_))
                    filt_cnt += 1
                else:
                    new_config.append(config[i])  # 'M' (max-pool marker)
            print("pruned uniform", new_config, "cur_params:", cur_param,
                  "total_params:", total_param)
        else:  # uniform scaling
            new_config = []
            for i in range(len(config)):
                if config[i] != 'M':
                    new_config.append(int(config[i] * ratio))
                else:
                    new_config.append(config[i])  # 'M' (max-pool marker)
    else:
        new_config = config
    return new_config
y_pred = np.argmax(y_prob_val, axis=1)
accuracy = (y_val == y_pred).sum() / len(y_val)
cm = confusion_matrix(y_val, y_pred)
plot_confusion_matrix(cm,
                      idx2lab,
                      args.name,
                      "{}/{}_cm_norm.png".format(fig_path, args.arch),
                      normalize=True)
plot_confusion_matrix(cm,
                      idx2lab,
                      "{}, accuracy: {:.4f}".format(args.name, accuracy),
                      "{}/{}_cm.png".format(fig_path, args.arch),
                      normalize=False)

softmax = torch.nn.Softmax(dim=1)
if args.name == "asas_sn" or args.name == "toy":
    z_val = torch.tensor(val_features, dtype=torch.float, device=args.d)
    z_test = torch.tensor(test_features, dtype=torch.float, device=args.d)
    logits_val = torch.tensor(val_logits, dtype=torch.float, device=args.d)
    logits_test = torch.tensor(test_logits, dtype=torch.float, device=args.d)
    # GMM parameters are fit on the validation set and reused for the test
    # energies
    phi_val, mu_val, cov_val = compute_params(z_val, softmax(logits_val))
    # phi_test, mu_test, cov_test = compute_params(z_test, softmax(logits_test))
    val_energy = compute_energy(z_val,
                                phi=phi_val,
                                mu=mu_val,
                                cov=cov_val,
                                size_average=False).cpu().numpy()
    test_energy = compute_energy(z_test,
                                 phi=phi_val,
                                 mu=mu_val,
                                 cov=cov_val,
                                 size_average=False).cpu().numpy()
    labels = np.ones(len(y_test))
    if args.name == "asas_sn":
        labels[y_test == 8] = 0  # class 8 is the outlier class
    elif args.name == "toy":
        labels[y_test == 3] = 0
        labels[y_test == 4] = 0
    scores_in = test_energy[labels == 1]
    scores_out = test_energy[labels == 0]
    # chance-level average precision: the outlier fraction, i.e. the AP of a
    # random detector
    average_precision = (labels == 0).sum() / len(labels)
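
# A common follow-up (not in the original snippet) is to reduce the energy
# scores to threshold-free metrics with sklearn, treating outliers
# (labels == 0) as the positive class since higher energy means more
# anomalous:
from sklearn.metrics import average_precision_score, roc_auc_score

auroc = roc_auc_score(labels == 0, test_energy)
aupr = average_precision_score(labels == 0, test_energy)
print("AUROC: {:.4f}  AUPR: {:.4f}  chance-level AP: {:.4f}".format(
    auroc, aupr, average_precision))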
def prune_neurons(self, optimizer):
    if self.dataset == "CIFAR10":
        num_classes = 10
    elif self.dataset == "CIFAR100":
        num_classes = 100
    elif self.dataset == "tinyimagenet":
        num_classes = 200
    # set number of pruned neurons to be a certain percentage
    all_neuron_units, neuron_units = self._count_number_of_neurons()
    filters = self.compute_filter_number()
    targeted_filter = [filt * self.size for filt in filters]
    targeted_params = compute_params(targeted_filter,
                                     classes=num_classes,
                                     model=self.model_name)
    cur_params = compute_params(filters,
                                classes=num_classes,
                                model=self.model_name)
    print("Before: ", filters)
    ratio = 0.9
    # while abs(cur_params - targeted_params) > int(targeted_params*0.0005):
    while targeted_params < cur_params:
        if self.method == 0:  # network slimming
            all_criteria = torch.tensor([
                abs(criterion) for layer_criteria in self.prune_criteria
                for criterion in layer_criteria[0]
            ]).cuda()
            prune_neurons_now = self.pruned_neurons + self.prune_per_iteration
            threshold_now = torch.sort(all_criteria)[0][prune_neurons_now]
            for layer, layer_criteria in enumerate(self.prune_criteria):
                for unit, criterion in enumerate(layer_criteria[0]):
                    if abs(criterion) <= threshold_now:
                        # do actual pruning
                        self.pruning_gates[layer][unit] *= 0.0
                        self.parameters[layer].data[unit] *= 0.0
                        self.prune_criteria[layer][0].data[unit] *= 0.0  # weight
                        self.prune_criteria[layer][1].data[unit] *= 0.0  # bias (not important)
            # count number of remaining neurons
            all_neuron_units, neuron_units = self._count_number_of_neurons()
            self.pruned_neurons = all_neuron_units - neuron_units
            cur_filter = self.compute_filter_number()
            cur_params = compute_params(cur_filter,
                                        classes=num_classes,
                                        model=self.model_name)
            print(cur_params, cur_filter)
        elif self.method == 1:  # uniform pruning across all layers
            cur_filter = [int(filt * ratio) for filt in filters]
            cur_params = compute_params(cur_filter,
                                        classes=num_classes,
                                        model=self.model_name)
            ratio *= 0.999
        else:
            print("No such method")
            exit()
    if self.method == 1:  # weight magnitude
        for layer, target_filt in enumerate(cur_filter):
            layer_criteria = np.asarray([
                torch.norm(filt, 1).data.cpu().item()
                for filt in self.prune_criteria[layer]
            ]).reshape(-1)
            # adaptively estimate threshold given a number of neurons to be
            # removed
            threshold_now = np.sort(layer_criteria)[::-1][target_filt]
            for unit, criterion in enumerate(layer_criteria):
                if abs(criterion) <= threshold_now:
                    # do actual pruning
                    self.pruning_gates[layer][unit] *= 0.0
                    self.parameters[layer].data[unit] *= 0.0
                    self.prune_criteria[layer].data[unit, :] *= 0.0
        cur_filter = self.compute_filter_number()
    # Set momentum buffer of pruned units to 0
    for layer in range(len(self.pruning_gates)):
        for unit in range(len(self.pruning_gates[layer])):
            if self.pruning_gates[layer][unit]:
                continue
            if 'momentum_buffer' in optimizer.state[
                    self.parameters[layer]].keys():
                optimizer.state[
                    self.parameters[layer]]['momentum_buffer'][unit] *= 0.0
    print("After: ", cur_filter)
    print("Target Params:", targeted_params, " Approx. Param:", cur_params)
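
# Why the momentum buffer of pruned units is zeroed above: with SGD momentum,
# a zeroed weight would otherwise be pushed off zero again by the stale
# buffer on the next optimizer step. A minimal, self-contained illustration:
import torch

w = torch.nn.Parameter(torch.randn(4, 3))
opt = torch.optim.SGD([w], lr=0.1, momentum=0.9)
w.sum().backward()
opt.step()                                  # populates the momentum buffer
w.data[0] *= 0.0                            # "prune" unit 0
opt.state[w]['momentum_buffer'][0] *= 0.0   # without this, unit 0 revives
w.grad = torch.zeros_like(w)                # next step sees zero gradient
opt.step()
print(w.data[0])                            # stays exactly zero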
def fit_params(iteration=None,
               prune_fname="filename",
               classes=10,
               model='vgg',
               in_channel=3,
               kernel=3):
    if iteration is None:
        f = open('prune_record/train.csv', newline='')
    else:
        f = open('prune_record/' + prune_fname + '_{}.csv'.format(iteration),
                 newline='')
    reader = csv.reader(f, delimiter=',')
    filters = []
    for row in reader:
        filters.append(list(map(int, row)))
    filters = np.array(filters, dtype=int)
    # Samples insufficient to get a good interpolation
    if filters.shape[0] < 6:
        filters = np.expand_dims(filters[0], axis=0)
    if filters.shape[0] == 1:
        # not all layers pruned at least once; opt for uniform scaling
        alpha = filters[0]
        beta = np.zeros(filters.shape[1])
        # =======================
        # save scaling parameters
        # =======================
        pickle_save = {
            "train_alpha": alpha,
            "train_beta": beta,
        }
        if iteration is not None:
            pickle_out = open("prune_record/param{}.pk".format(iteration),
                              "wb")
        else:
            pickle_out = open("prune_record/param.pk", "wb")
        pickle.dump(pickle_save, pickle_out)
        pickle_out.close()
        return alpha, beta
    # Compute total parameters over all iterations
    total_params = []
    for filt in filters:
        total_params.append(
            compute_params(filt,
                           classes=classes,
                           model=model,
                           in_channel=in_channel,
                           kernel=kernel,
                           last=True))
    total_params = np.array(total_params)
    # ######
    # LR (simple): closed-form linear regression in log space,
    # log(filters) = beta * log(tau) + log(alpha)
    # ######
    ln_tau = np.log(total_params)
    Tau = np.stack((ln_tau, np.ones(ln_tau.shape)), axis=1)
    Phi = np.log(filters)
    Theta = np.matmul(
        np.matmul(np.linalg.inv(np.matmul(Tau.transpose(), Tau)),
                  Tau.transpose()), Phi)
    beta = Theta[0, :]
    alpha = np.exp(Theta[1, :])
    f.close()
    # =======================
    # save scaling parameters
    # =======================
    pickle_save = {
        "train_alpha": alpha,
        "train_beta": beta,
    }
    if iteration is not None:
        pickle_out = open("prune_record/param{}.pk".format(iteration), "wb")
    else:
        pickle_out = open("prune_record/param.pk", "wb")
    pickle.dump(pickle_save, pickle_out)
    pickle_out.close()
    return alpha, beta
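
# Example use of the saved scaling parameters (a sketch; the file name
# follows the param{iteration}.pk convention above). Layer i's filter count
# follows alpha[i] * tau**beta[i]; in the real pipeline tau is solved so that
# compute_params(...) hits the target budget (see prepare_filters), but here
# tau is set directly for illustration:
import pickle

pkl = pickle.load(open("prune_record/param0.pk", "rb"))
alpha, beta = pkl["train_alpha"], pkl["train_beta"]
tau = 1e6  # hypothetical value; normally obtained from the tau iteration
print([int(a * tau**b) for a, b in zip(alpha, beta)])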