def generate_original_preds(train = True):
    """Generate the original model's predictions on the train/val sets.

    When ``train`` is True the model is trained first, checkpointing the
    best-val_acc weights; the best checkpoint is then reloaded and its
    predictions are saved under data/. When ``train`` is False the
    accuracies of the loaded weights are printed as well.
    """
    x_train, y_train, x_val, y_val, id_to_word = load_data()
    model = create_original_model()

    if train:
        ckpt_path = "models/original.hdf5"
        saver = ModelCheckpoint(ckpt_path, monitor='val_acc', verbose=1,
                                save_best_only=True, mode='max')
        model.fit(x_train, y_train,
                  validation_data=(x_val, y_val),
                  callbacks=[saver],
                  epochs=epochs, batch_size=batch_size)

    # Always predict with the best checkpointed weights.
    model.load_weights('./models/original.hdf5', by_name=True)
    pred_train = model.predict(x_train, verbose=1, batch_size=1000)
    pred_val = model.predict(x_val, verbose=1, batch_size=1000)

    if not train:
        print('The val accuracy is {}'.format(calculate_acc(pred_val, y_val)))
        print('The train accuracy is {}'.format(calculate_acc(pred_train, y_train)))

    np.save('data/pred_train.npy', pred_train)
    np.save('data/pred_val.npy', pred_val)
def simple_pred(params):
    """Fit a LAFR classifier per layer on saved K-transform features for
    several training-subset ratios, saving predictions and fitted models.

    Reads batched feature files "K_transform_batch_layer{l}_batch{j}" from
    params.save_data; writes prediction and model artifacts back there.
    """
    all_train_labels = load_data(
        os.path.join(params.save_data, "all_train_labels"))
    all_train_labels = all_train_labels.reshape((-1))
    for ratio in params.ratios:
        time_start = time.time()
        # Labels for the leading `ratio` fraction of the training set.
        subset_label = all_train_labels[:int(all_train_labels.shape[0] * ratio)]
        for layer in range(0, 2):  ## ATTENTION!!! hard-coded: only layers 0 and 1
            concate = []
            # Features were saved in batches of 5000 over 50000 samples —
            # TODO confirm these constants match the saved artifacts.
            for j in range(0, 50000, 5000):
                a_batch = load_data(
                    os.path.join(
                        params.save_data,
                        "K_transform_batch_layer{}_batch{}".format(
                            layer, j)))
                concate.append(a_batch)
            concate = np.concatenate(concate, axis=0)
            # Keep only the same leading subset as the labels.
            concate = concate[:int(all_train_labels.shape[0] * ratio)]
            # num_PCA_kernels = [80, 125]
            mo3 = LAFR()
            mo3.fit(concate, subset_label)
            prediction = mo3.predict(concate)
            # NOTE(review): "sinple_predict" looks like a typo for
            # "simple_predict" — kept as-is since saved artifacts may
            # already exist under this name.
            save_data(
                prediction,
                os.path.join(
                    params.save_data,
                    "sinple_predict_ratio{}_layer{}".format(ratio, layer)))
            calculate_acc(prediction, subset_label)
            save_data(
                mo3,
                os.path.join(
                    params.save_data,
                    'simple_pred_layer{}_ratio{}'.format(layer, ratio)))
        print("Time cost - simple_pred:", time.time() - time_start)
def generate_post_preds(train = True):
    """Train/evaluate the original architecture on the post-processed
    (2-loss) datasets and report train/val accuracies.
    """
    x_train = np.load('data/x_train_new_2_loss.npy')
    y_train = np.load('data/y_train.npy')
    x_val = np.load('data/x_val_new_2_loss.npy')
    y_val = np.load('data/y_val.npy')
    with open('data/id_to_word.pkl', 'rb') as f:
        id_to_word = pickle.load(f)

    model = create_original_model()
    if train:
        ckpt = "./models_new/post_2_loss.hdf5"
        callbacks_list = [ModelCheckpoint(ckpt, monitor='val_acc', verbose=1,
                                          save_best_only=True, mode='max')]
        model.fit(x_train, y_train,
                  validation_data=(x_val, y_val),
                  callbacks=callbacks_list,
                  epochs=epochs, batch_size=batch_size)

    # Reload the best checkpoint before predicting.
    model.load_weights('./models_new/post_2_loss.hdf5', by_name=True)
    pred_train = model.predict(x_train, verbose=1, batch_size=1000)
    pred_val = model.predict(x_val, verbose=1, batch_size=1000)
    if not train:
        print('The val accuracy is {}'.format(calculate_acc(pred_val, y_val)))
        print('The train accuracy is {}'.format(calculate_acc(pred_train, y_train)))
def train(net, train_loader, optimizer, loss_func):
    '''
    Performs one training epoch of LSTM.

    Arguments:
        net (nn.Module): RNN (currently LSTM)
        train_loader (DataLoader): load object for train data
        optimizer: optimizer object for net parameters
        loss_func: criterion function used for backprop

    Returns:
        epoch_loss (torch.float): mean loss value for all batches
        epoch_acc (torch.float): mean acc value for all batches
    '''
    net.train()
    total_loss = 0
    total_acc = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        preds = net(batch_x).squeeze(1)
        batch_loss = loss_func(preds, batch_y)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
        total_acc += calculate_acc(preds, batch_y)
    n_batches = len(train_loader)
    return total_loss / n_batches, total_acc / n_batches
def step(self, batch):
    """One optimization step on a source batch concatenated with an
    exemplar batch drawn from ``self.exemplar_dl``.

    Fix: the iterator-restart guard used a bare ``except``, which would
    also hide unrelated errors (e.g. CUDA failures inside the dataloader);
    it now catches ``StopIteration`` specifically.

    Returns the running averages ``(loss_avg, acc_avg)``.
    """
    self.model.train()
    self.optim.zero_grad()
    img, target = batch
    img, target = img.cuda(), target.cuda()

    # Draw the next exemplar batch, restarting the iterator when exhausted.
    try:
        inputs_exemplar = next(self.exemplar_iter)
    except StopIteration:
        self.exemplar_iter = iter(self.exemplar_dl)
        inputs_exemplar = next(self.exemplar_iter)
    img_exemplar, target_exemplar = inputs_exemplar
    img_exemplar, target_exemplar = img_exemplar.cuda(), target_exemplar.cuda()

    # Source and exemplar images are forwarded together in one batch.
    img = torch.cat([img, img_exemplar], dim=0)
    outputs = self.model(img)
    # NOTE(review): `target` covers only the source half of the batch —
    # presumably loss_func/calculate_acc handle the extra exemplar outputs;
    # confirm against their implementations.
    loss = self.loss_func(outputs, target)

    if self.mix_precision:
        with amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    self.optim.step()

    # acc = (score.max(1)[1] == target).float().mean()
    acc = calculate_acc(self.cfg, outputs, target)
    self.loss_avg.update(loss.cpu().item())
    self.acc_avg.update(acc.cpu().item())
    return self.loss_avg.avg, self.acc_avg.avg
def step(self, batch):
    """Single training step with triplet / CE / hist-label-CE losses,
    logging the individual loss components to TensorBoard every
    SOLVER.TENSORBOARD.LOG_PERIOD iterations."""
    self.model.train()
    self.optim.zero_grad()
    img, target, histlabels = batch
    img = img.cuda()
    target = target.cuda()
    histlabels = histlabels.cuda()

    outputs = self.model(img)
    loss, tpl, ce, hlce = self.loss_func(outputs, target, histlabels,
                                         in_detail=True)

    # Periodic scalar logging of the loss components.
    log_now = self.current_iteration % self.cfg.SOLVER.TENSORBOARD.LOG_PERIOD == 0
    if log_now and self.summary_writer:
        for tag, value in (('Train/tpl', tpl),
                           ('Train/ce', ce),
                           ('Train/hlce', hlce)):
            self.summary_writer.add_scalar(tag, value, self.current_iteration)

    if self.mix_precision:
        with amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    self.optim.step()

    # acc = (score.max(1)[1] == target).float().mean()
    acc = calculate_acc(self.cfg, outputs, target)
    self.loss_avg.update(loss.cpu().item())
    self.acc_avg.update(acc.cpu().item())
    return self.loss_avg.avg, self.acc_avg.avg
def train_net(model, loss, config, inputs, labels, batch_size, disp_freq):
    """Run one pass over the training data, updating the model after every
    mini-batch and logging mean loss/accuracy every ``disp_freq`` batches."""
    iteration = 0
    batch_losses, batch_accs = [], []
    for batch_input, batch_label in data_iterator(inputs, labels, batch_size):
        expected = onehot_encoding(batch_label, 10)
        iteration += 1

        # forward pass and loss
        prediction = model.forward(batch_input)
        batch_loss = loss.forward(prediction, expected)

        # backprop and weight update
        model.backward(loss.backward(prediction, expected))
        model.update(config)

        batch_losses.append(batch_loss)
        batch_accs.append(calculate_acc(prediction, batch_label))

        if iteration % disp_freq == 0:
            LOG_INFO(' Training iter %d, batch loss %.4f, batch acc %.4f' % (iteration, np.mean(batch_losses), np.mean(batch_accs)))
            batch_losses, batch_accs = [], []
def test_net(model, loss, inputs, labels, batch_size, epoch, layer_name):
    """Evaluate the model over the whole test set, log mean loss/accuracy,
    checkpoint the weights, and return the last batch's collapsed
    visualization map (batch_size x height x width)."""
    losses, accs = [], []
    for batch_input, batch_label in data_iterator(inputs, labels, batch_size,
                                                  shuffle=False):
        expected = onehot_encoding(batch_label, 10)
        output, output_visualize = model.forward(batch_input, visualize=True,
                                                 layer_name=layer_name)
        # collapse the channel axis so the map is 1-channel per sample
        output_visualize = np.sum(output_visualize, axis=(1))
        losses.append(loss.forward(output, expected))
        accs.append(calculate_acc(output, batch_label))

    LOG_INFO(' Testing, total mean loss %.5f, total acc %.5f' % (np.mean(losses), np.mean(accs)))
    # persist weights and biases for this epoch
    model.save_weights(loss.name, epoch)
    # output_visualize comes from the final batch only
    return np.mean(losses), np.mean(accs), output_visualize
def train_net(model, loss, config, inputs, labels, batch_size, disp_freq):
    """One training epoch: forward, loss, backward and weight update per
    mini-batch, reporting running means every ``disp_freq`` iterations."""
    running_loss = []
    running_acc = []
    for step_idx, (x, y) in enumerate(data_iterator(inputs, labels, batch_size),
                                      start=1):
        y_onehot = onehot_encoding(y, 10)
        out = model.forward(x)                        # forward pass
        running_loss.append(loss.forward(out, y_onehot))
        model.backward(loss.backward(out, y_onehot))  # backprop
        model.update(config)                          # apply gradients
        running_acc.append(calculate_acc(out, y))
        if step_idx % disp_freq == 0:
            LOG_INFO(' Training iter %d, batch loss %.4f, batch acc %.4f' % (step_idx, np.mean(running_loss), np.mean(running_acc)))
            running_loss, running_acc = [], []
def evaluate(net, set_loader, loss_func):
    '''
    Evaluates the performance of the RNN on the given set.

    Fix: evaluation now runs under torch.no_grad() — the original built
    autograd graphs for every batch, wasting memory during pure inference.

    Arguments:
        net (nn.Module): RNN (currently LSTM)
        set_loader (DataLoader): load object for val/test data
        loss_func: criterion function used for backprop

    Returns:
        eval_loss (torch.float): mean loss value for all batches
        eval_acc (torch.float): mean acc value for all batches
    '''
    net.eval()
    eval_loss = 0
    eval_acc = 0
    with torch.no_grad():  # no gradients needed for evaluation
        for inputs, labels in set_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            output = net(inputs).squeeze(1)
            loss = loss_func(output, labels)
            eval_loss += loss.item()
            eval_acc += calculate_acc(output, labels)
    eval_loss /= len(set_loader)
    eval_acc /= len(set_loader)
    return eval_loss, eval_acc
def step(self, batch):
    """Training step with optional pos/neg mixup regularization
    (enabled by cfg.SOLVER.MIXUP.USE)."""
    self.model.train()
    self.optim.zero_grad()
    #
    img, target = batch
    img, target = img.cuda(), target.cuda()
    outputs = self.model(img)
    #
    if not self.cfg.SOLVER.MIXUP.USE:
        loss = self.loss_func(outputs, target)
    else:
        # Build mixed samples and forward them through the same model.
        mx_img, mx_target1, mx_target2, lamb = self.posneg_mixup(
            img, target,
            self.cfg.DATALOADER.NUM_INSTANCE,
            self.cfg.SOLVER.MIXUP.NEG_INSTANCE,
            self.cfg.SOLVER.MIXUP.ALPHA)
        mx_outputs = self.model(mx_img)
        loss = self.loss_func(outputs, target, mx_outputs,
                              mx_target1, mx_target2, lamb)

    if self.mix_precision:
        with amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    self.optim.step()

    # acc = (score.max(1)[1] == target).float().mean()
    acc = calculate_acc(self.cfg, outputs, target)
    self.loss_avg.update(loss.cpu().item())
    self.acc_avg.update(acc.cpu().item())
    return self.loss_avg.avg, self.acc_avg.avg
def test_net(model, loss, inputs, labels, batch_size):
    """Return the mean per-batch accuracy over the whole set.

    ``loss`` is unused here; the parameter is kept for signature
    compatibility with the other test helpers."""
    acc_sum = 0.0
    batches = 0
    for x, y in data_iterator(inputs, labels, batch_size, shuffle=False):
        acc_sum += calculate_acc(model.forward(x), y)
        batches += 1
    return acc_sum / batches
def test_net(model, loss, input_feats, labels, test_mask, label_kind):
    """Full-batch evaluation on a graph: nodes outside ``test_mask`` are
    overwritten with ground truth so they contribute nothing to the
    reported loss/accuracy."""
    target = onehot_encoding(labels, label_kind)
    output = model.forward(input_feats)
    # neutralize non-test nodes
    output[~test_mask] = target[~test_mask]
    loss_value = loss.forward(output, target)
    # accuracy normalized by the number of masked (test) nodes
    acc_value = calculate_acc(output, labels, np.sum(test_mask))
    LOG_INFO(' Testing, total mean loss %.5f, total acc %.5f' % (loss_value, acc_value))
def train_net(model, loss, config, inputs, labels, batch_size, disp_freq, Loss, Acur):
    """One epoch with a loss-driven learning-rate schedule.

    Every ``disp_freq`` batches the window means are logged AND appended to
    ``Loss``/``Acur``; the whole-epoch means are appended once more at the
    end of the epoch."""
    step_count = 0
    window_loss, window_acc = [], []
    epoch_loss, epoch_acc = [], []
    for x, y in data_iterator(inputs, labels, batch_size):
        y_onehot = onehot_encoding(y, 10)
        step_count += 1

        # forward pass, loss and backprop
        out = model.forward(x)
        loss_value = loss.forward(out, y_onehot)
        model.backward(loss.backward(out, y_onehot))

        # crude schedule: shrink the learning rate as the batch loss falls
        if loss_value > 1:
            config['learning_rate'] = 0.2
        elif loss_value > 0.5:
            config['learning_rate'] = 0.1
        elif loss_value > 0.2:
            config['learning_rate'] = 0.05
        else:
            config['learning_rate'] = max(loss_value / 5.0, 0.005)
        model.update(config)

        acc_value = calculate_acc(out, y)
        window_loss.append(loss_value)
        window_acc.append(acc_value)
        epoch_loss.append(loss_value)
        epoch_acc.append(acc_value)

        if step_count % disp_freq == 0:
            msg = ' Training iter %d, batch loss %.4f, batch acc %.4f' % (step_count, np.mean(window_loss), np.mean(window_acc))
            Loss.append(np.mean(window_loss))
            Acur.append(np.mean(window_acc))
            window_loss, window_acc = [], []
            LOG_INFO(msg)

    Loss.append(np.mean(epoch_loss))
    Acur.append(np.mean(epoch_acc))
def test_net(model, loss, inputs, labels, batch_size):
    """Evaluate over all batches (no shuffling) and log the overall mean
    loss and accuracy."""
    losses, accs = [], []
    for x, y in data_iterator(inputs, labels, batch_size, shuffle=False):
        y_onehot = onehot_encoding(y, 10)
        out = model.forward(x)
        losses.append(loss.forward(out, y_onehot))
        accs.append(calculate_acc(out, y))
    LOG_INFO(' Testing, total mean loss %.5f, total acc %.5f' % (np.mean(losses), np.mean(accs)))
def step(self, batch):
    """One training step with triplet + CE + center losses; the center-loss
    centers have their own optimizer whose gradients are rescaled before
    stepping. Returns the running (loss, acc) averages."""
    self.model.train()
    self.optim.zero_grad()
    self.center_optim.zero_grad()
    img, target = batch
    img, target = img.cuda(), target.cuda()
    # USE_COS heads take the labels at forward time as well.
    if self.cfg.MODEL.USE_COS:
        outputs = self.model(img, target)
    else:
        outputs = self.model(img)
    loss, tpl, ce, ct = self.loss_func(outputs, target, in_detail=True)
    # Periodic TensorBoard logging of the individual loss components.
    if self.current_iteration % self.cfg.SOLVER.TENSORBOARD.LOG_PERIOD == 0:
        if self.summary_writer:
            self.summary_writer.add_scalar('Train/tpl', tpl,
                                           self.current_iteration)
            self.summary_writer.add_scalar('Train/ce', ce,
                                           self.current_iteration)
            self.summary_writer.add_scalar('Train/ct', ct,
                                           self.current_iteration)
    if self.mix_precision:
        with amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    self.optim.step()
    if self.mix_precision:
        # [todo] fix the center's step — under AMP the center parameters are
        # currently NOT updated at all.
        pass
        # for param in amp.master_params(self.center_optim):
        #     param.grad.data *= (0.5 / self.cfg.SOLVER.CENTER_LOSS.WEIGHT)
    else:
        # Rescale center gradients by ALPHA/WEIGHT — presumably to undo the
        # center-loss weighting applied inside loss_func; confirm there.
        for param in self.loss_func.center_criterion.parameters():
            # param.grad.data *= (1.0 / self.cfg.SOLVER.CENTER_LOSS.WEIGHT)
            param.grad.data *= (self.cfg.SOLVER.CENTER_LOSS.ALPHA /
                                self.cfg.SOLVER.CENTER_LOSS.WEIGHT)
        self.center_optim.step()
    # acc = (score.max(1)[1] == target).float().mean()
    acc = calculate_acc(self.cfg, outputs, target)
    self.loss_avg.update(loss.cpu().item())
    self.acc_avg.update(acc.cpu().item())
    return self.loss_avg.avg, self.acc_avg.avg
def train_net(model, loss, config, inputs, labels, batch_size, disp_freq, test_inputs, test_labels):
    """Train for one epoch, recording for every mini-batch both that batch's
    training loss/acc and the loss/acc over the full test set (evaluated
    before the update). Returns the four history lists.

    ``disp_freq`` is accepted for signature compatibility but unused."""
    counter = 0
    train_losses, train_accs = [], []
    test_losses, test_accs = [], []
    for x, y in data_iterator(inputs, labels, batch_size):
        # Test-set performance prior to this update; the huge batch_size
        # makes test_net evaluate the whole set as a single batch.
        t_loss, t_acc = test_net(model, loss, test_inputs, test_labels,
                                 10000000)
        test_losses.append(t_loss)
        test_accs.append(t_acc)

        y_onehot = onehot_encoding(y, 10)
        counter += 1
        out = model.forward(x)
        batch_loss = loss.forward(out, y_onehot)
        model.backward(loss.backward(out, y_onehot))
        model.update(config)

        train_losses.append(batch_loss)
        train_accs.append(calculate_acc(out, y))

    return train_losses, train_accs, test_losses, test_accs
def test_net(model, loss, inputs, labels, batch_size):
    """Run the model over the complete test set (no shuffling) and report
    the mean loss and accuracy via LOG_INFO."""
    loss_history = []
    acc_history = []
    for batch_x, batch_y in data_iterator(inputs, labels, batch_size,
                                          shuffle=False):
        onehot = onehot_encoding(batch_y, 10)
        pred = model.forward(batch_x)
        loss_history.append(loss.forward(pred, onehot))
        acc_history.append(calculate_acc(pred, batch_y))
    # the mean over all batches is the reported figure
    mean_loss = np.mean(loss_history)
    mean_acc = np.mean(acc_history)
    LOG_INFO(' Testing, total mean loss %.5f, total acc %.5f' % (mean_loss, mean_acc))
def train_net(model, loss, config, inputs, labels, batch_size, disp_freq, loss_file):
    """One training epoch; every ``disp_freq`` iterations the mean batch
    loss/accuracy are logged and appended to ``loss_file``.

    Fix: the original used the Python 2 ``file(...)`` builtin, which raises
    NameError on Python 3; replaced with ``open`` inside a ``with`` block so
    the handle is always closed even if the write fails.
    """
    iter_counter = 0
    loss_list = []
    acc_list = []
    for input, label in data_iterator(inputs, labels, batch_size):
        target = onehot_encoding(label, 10)
        iter_counter += 1
        # forward net
        output = model.forward(input)
        # calculate loss
        loss_value = loss.forward(output, target)
        # generate gradient w.r.t loss
        grad = loss.backward(output, target)
        # backward gradient
        model.backward(grad)
        # update layers' weights
        model.update(config)

        acc_value = calculate_acc(output, label)
        loss_list.append(loss_value)
        acc_list.append(acc_value)

        if iter_counter % disp_freq == 0:
            msg = ' Training iter %d, batch loss %.4f, batch acc %.4f' % (iter_counter, np.mean(loss_list), np.mean(acc_list))
            # append the window means to the loss log
            with open(loss_file, "a") as outf:
                outf.write(str(np.mean(loss_list)) + ' ' + str(np.mean(acc_list)) + '\n')
            loss_list = []
            acc_list = []
            LOG_INFO(msg)
def step(self, batch):
    """Training step combining the supervised loss on the source batch with
    an exemplar-memory loss, mixed by DATASETS.EXEMPLAR.MEMORY.LAMBDA.

    Fix: the iterator-restart guard used a bare ``except``; it now catches
    ``StopIteration`` specifically so real errors inside the dataloader are
    not silently swallowed.
    """
    self.model.train()
    self.optim.zero_grad()
    img, target = batch
    img, target = img.cuda(), target.cuda()

    # Next exemplar batch; restart the iterator once it is exhausted.
    try:
        inputs_exemplar = next(self.exemplar_iter)
    except StopIteration:
        self.exemplar_iter = iter(self.exemplar_dl)
        inputs_exemplar = next(self.exemplar_iter)
    img_exemplar, target_exemplar = inputs_exemplar
    img_exemplar, target_exemplar = img_exemplar.cuda(
    ), target_exemplar.cuda()

    # supervised loss on the source batch
    outputs = self.model(img)
    loss = self.loss_func(outputs, target)

    # exemplar-memory loss on the exemplar batch
    exemplar_outputs = self.model(img_exemplar, 'exemplar_feat')
    loss_un = self.exemplar_memory(exemplar_outputs, target_exemplar,
                                   epoch=self.train_epoch)
    lam = self.cfg.DATASETS.EXEMPLAR.MEMORY.LAMBDA
    loss = (1 - lam) * loss + lam * loss_un

    if self.mix_precision:
        with amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    self.optim.step()

    # acc = (score.max(1)[1] == target).float().mean()
    acc = calculate_acc(self.cfg, outputs, target)
    self.loss_avg.update(loss.cpu().item())
    self.acc_avg.update(acc.cpu().item())
    return self.loss_avg.avg, self.acc_avg.avg
def train_net(model, loss, config, inputs, labels, batch_size, disp_freq, Loss, Acur):
    """One epoch of mini-batch training. Logs window means every
    ``disp_freq`` steps and appends the epoch means to ``Loss``/``Acur``."""
    n_iter = 0
    win_loss, win_acc = [], []
    all_loss, all_acc = [], []
    for batch_x, batch_y in data_iterator(inputs, labels, batch_size):
        onehot = onehot_encoding(batch_y, 10)
        n_iter += 1
        # forward pass and batch loss
        pred = model.forward(batch_x)
        batch_loss = loss.forward(pred, onehot)
        # output-layer gradient, then backprop and weight update
        model.backward(loss.backward(pred, onehot))
        model.update(config)

        batch_acc = calculate_acc(pred, batch_y)
        win_loss.append(batch_loss)
        all_loss.append(batch_loss)
        win_acc.append(batch_acc)
        all_acc.append(batch_acc)

        if n_iter % disp_freq == 0:
            LOG_INFO(' Training iter %d, batch loss %.4f, batch acc %.4f' % (n_iter, np.mean(win_loss), np.mean(win_acc)))
            win_loss, win_acc = [], []

    Loss.append(np.mean(all_loss))
    Acur.append(np.mean(all_acc))
def train_net(model, loss, config, input_feats, labels, train_mask, label_kind):
    """Single full-batch training step on a graph: nodes outside
    ``train_mask`` are replaced with ground truth so they produce zero
    error signal."""
    target = onehot_encoding(labels, label_kind)
    # forward pass
    output = model.forward(input_feats)
    # mask out non-training nodes
    output[~train_mask] = target[~train_mask]
    loss_value = loss.forward(output, target)
    # backprop and weight update
    model.backward(loss.backward(output, target))
    model.update(config)
    # accuracy normalized by the number of training nodes
    acc_value = calculate_acc(output, labels, np.sum(train_mask))
    LOG_INFO(' Training batch loss %.4f, batch acc %.4f' % (loss_value, acc_value))
def step(self, batch):
    """Plain supervised training step; returns the running averages
    ``(loss_avg, acc_avg)``."""
    self.model.train()
    self.optim.zero_grad()
    img, target = batch
    img, target = img.cuda(), target.cuda()

    outputs = self.model(img)
    loss = self.loss_func(outputs, target)

    if not self.mix_precision:
        loss.backward()
    else:
        # AMP path: scale the loss before backprop
        with amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()
    self.optim.step()

    # acc = (score.max(1)[1] == target).float().mean()
    acc = calculate_acc(self.cfg, outputs, target)
    self.loss_avg.update(loss.cpu().item())
    self.acc_avg.update(acc.cpu().item())
    return self.loss_avg.avg, self.acc_avg.avg
def main(cfg):
    """Entry point: trains an MNIST MLP (``cfg.train``) or prunes a
    previously trained one (weight- or unit-pruning per ``cfg.prune_type``).
    All console output is redirected to a per-run file in cfg.stdout_dir.
    """
    # setting up output directories, and writing to stdout
    make_dirs(cfg.stdout_dir, replace=False)
    if cfg.train:
        run_type = 'train'
    else:
        if 'weight' in cfg.prune_type.lower():
            run_type = 'weight-prune'
        else:
            run_type = 'unit-prune'
    sys.stdout = open(
        '{}/stdout_{}_{}.txt'.format(cfg.stdout_dir, cfg.model_name,
                                     run_type), 'w')
    print(cfg)
    print('\n')
    sys.stdout.flush()

    # if train mode, replace the previous plot and ckpt directories; if in
    # prune mode, use existing directories
    if cfg.plot:
        make_dirs(os.path.join(cfg.plot_dir, cfg.model_name),
                  replace=cfg.train)
    if cfg.save_model:
        make_dirs(os.path.join(cfg.model_dir, cfg.model_name),
                  replace=cfg.train)

    # set random seed (0 means "pick one at random")
    if cfg.random_seed != 0:
        random_seed = cfg.random_seed
    else:
        random_seed = random.randint(1, 100000)
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)

    # set device as cuda or cpu
    if cfg.use_gpu and torch.cuda.is_available():
        # reproducibility using cuda
        torch.cuda.manual_seed(random_seed)
        cudnn.deterministic = True
        cudnn.benchmark = False
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
        if cfg.use_gpu:
            print('gpu option was to <True>, but no cuda device was found')
            print('\n')

    # datasets and dataloaders
    # normalizing training and validation images to [0, 1] suffices for the
    # purposes of our research objective
    # in training, <drop_last> minibatch in an epoch set to <True> for
    # simplicity in tracking training performance
    dataset_train = MNIST(root='./data/mnist', train=True, download=True,
                          transform=transforms.Compose([transforms.ToTensor()
                                                        ]),
                          target_transform=None)
    dataloader_train = DataLoader(dataset=dataset_train,
                                  batch_size=cfg.batch_size,
                                  shuffle=cfg.shuffle,
                                  num_workers=cfg.num_workers,
                                  pin_memory=True,
                                  drop_last=True)
    dataset_val = MNIST(root='./data/mnist', train=False, download=True,
                        transform=transforms.Compose([transforms.ToTensor()]),
                        target_transform=None)
    dataloader_val = DataLoader(dataset=dataset_val, batch_size=100,
                                shuffle=False, num_workers=cfg.num_workers,
                                pin_memory=True, drop_last=False)

    # automatically compute number of classes
    targets = np.asarray(dataset_train.targets)
    c = np.unique(targets).shape[0]

    # define model
    # weights initialized using Kaiming uniform (He initialization)
    # number of units per hidden layer is passed in as an argument
    # NOTE(review): np.product is deprecated in newer NumPy (use np.prod) —
    # confirm the pinned NumPy version.
    net = Net(np.product(cfg.img_size), c, cfg.units).to(device)
    criterion = nn.CrossEntropyLoss()

    if cfg.train:  # training mode
        if cfg.use_sgd:
            optimizer = optim.SGD(params=net.parameters(), lr=cfg.lr,
                                  momentum=cfg.momentum,
                                  nesterov=cfg.use_nesterov)
        else:
            optimizer = optim.Adam(params=net.parameters(), lr=cfg.lr,
                                   betas=(cfg.beta1, cfg.beta2))

        # tracking training and validation stats over epochs
        epochs = []
        train_loss_epochs, val_loss_epochs = [], []
        train_acc_epochs, val_acc_epochs = [], []

        # best model is defined as model with best performing validation loss
        best_loss = float('inf')
        for epoch in range(cfg.epochs):
            # tracking training and validation stats over a given epoch
            train_loss_epoch, val_loss_epoch = [], []
            train_acc_epoch, val_acc_epoch = [], []

            # training set
            for i, (x, y) in enumerate(dataloader_train):
                x, y = x.to(device), y.to(device)
                optimizer.zero_grad()
                logits = net(x)
                loss = criterion(logits, y)
                loss.backward()
                optimizer.step()
                acc = calculate_acc(logits, y)
                # project helper: appends each (list, value) pair
                append((train_loss_epoch, loss.item()),
                       (train_acc_epoch, acc.item()))

            # validation set
            with torch.no_grad():
                for i, (x, y) in enumerate(dataloader_val):
                    x, y = x.to(device), y.to(device)
                    logits = net(x)
                    loss = criterion(logits, y)
                    acc = calculate_acc(logits, y)
                    append((val_loss_epoch, loss.item()),
                           (val_acc_epoch, acc.item()))

            # collapse the per-batch lists into epoch means
            train_loss_epoch, val_loss_epoch = get_average(
                train_loss_epoch), get_average(val_loss_epoch)
            train_acc_epoch, val_acc_epoch = get_average(
                train_acc_epoch), get_average(val_acc_epoch)
            print('train_epoch{:0=3d}_loss{:.4f}_acc{:.4f}'.format(
                epoch + 1, train_loss_epoch, train_acc_epoch))
            print('valid_epoch{:0=3d}_loss{:.4f}_acc{:.4f}'.format(
                epoch + 1, val_loss_epoch, val_acc_epoch))
            print('\n')
            sys.stdout.flush()

            if cfg.plot:
                append((epochs, epoch + 1),
                       (train_loss_epochs, train_loss_epoch),
                       (val_loss_epochs, val_loss_epoch),
                       (train_acc_epochs, train_acc_epoch),
                       (val_acc_epochs, val_acc_epoch))
                plot_line(epochs, train_loss_epochs, val_loss_epochs,
                          'Epoch Number', 'Loss', cfg)
                plot_line(epochs, train_acc_epochs, val_acc_epochs,
                          'Epoch Number', 'Accuracy', cfg)

            if val_loss_epoch < best_loss:
                best_loss = val_loss_epoch
                print('New best model at epoch {:0=3d} with val_loss {:.4f}'.
                      format(epoch + 1, best_loss))
                print('\n')
                if cfg.save_model:
                    # save model when validation loss improves
                    save_name = '{}_net_epoch{:0=3d}_val_loss{:.4f}'.format(
                        cfg.model_name, epoch + 1, best_loss)
                    torch.save(
                        net.state_dict(),
                        os.path.join(cfg.model_dir, cfg.model_name,
                                     '{}.pth'.format(save_name)))
                    # record the latest-best checkpoint name for prune mode
                    with open(
                            os.path.join(cfg.model_dir, cfg.model_name,
                                         '{}.txt'.format(cfg.model_name)),
                            'w') as file:
                        file.write('{}.pth'.format(save_name))

    else:  # pruning mode
        # checks on arguments passed in
        for k in cfg.sparsity:
            assert 0 <= k <= 1
        if cfg.use_sparse_mul:
            assert cfg.to_sparse

        # load model named in the checkpoint-record file written above
        with open(
                os.path.join(cfg.model_dir, cfg.model_name,
                             '{}.txt'.format(cfg.model_name)), 'r') as file:
            load_name = file.readline()
        net.load_state_dict(
            torch.load(
                os.path.join(cfg.model_dir, cfg.model_name,
                             '{}'.format(load_name))))
        net.eval()

        # select pruning approach to use
        if 'weight' in cfg.prune_type.lower():
            prune = weight_prune
        else:
            prune = unit_prune

        sparsities = []
        val_loss_sparse, val_acc_sparse = [], []
        time_sparsities = []
        for k in cfg.sparsity:
            val_loss_k, val_acc_k = [], []
            time_k = []
            # copy network so that the sparsity changes are not additive for
            # each k
            net_sparse = copy.deepcopy(net)

            pruned_weights = []
            # prune model, except for the last layer
            for (i, p) in enumerate(net_sparse.parameters()):
                if i < len(cfg.units):
                    original_weights = copy.deepcopy(p.data)
                    if cfg.plot:
                        # plot magnitude of original weights (for comparison
                        # to post-pruned weights)
                        plot_hist([
                            torch.abs(
                                original_weights.flatten()).cpu().numpy()
                        ], ['b'], cfg.prune_type, i + 1, k,
                                  'Non-Pruned Weight Magnitudes', 'Counts',
                                  cfg)
                    # in-place pruning of this parameter tensor
                    prune(p.data, k)
                    if cfg.plot:
                        # plot original magnitudes of pruned weights, and
                        # magnitudes of remaining weights, separately
                        pruned_weights_non_zero = torch.abs(
                            original_weights.flatten()[p.data.flatten() != 0])
                        pruned_weights_zeroed = torch.abs(
                            original_weights.flatten()[p.data.flatten() == 0])
                        plot_hist([
                            pruned_weights_non_zero.cpu().numpy(),
                            pruned_weights_zeroed.cpu().numpy()
                        ], ['g', 'r'], cfg.prune_type, i + 1, k,
                                  'Weight Magnitudes', 'Counts', cfg)
                        plot_hist([pruned_weights_non_zero.cpu().numpy()],
                                  ['k'], cfg.prune_type, i + 1, k,
                                  'Surviving Weight Magnitudes', 'Counts',
                                  cfg)
                # optionally store hidden-layer tensors in sparse format
                if cfg.to_sparse and i < len(cfg.units):
                    pruned_weights.append(p.data.to_sparse())
                else:
                    pruned_weights.append(p.data)

            # evaluate the pruned weights, timing each forward pass
            with torch.no_grad():
                for i, (x, y) in enumerate(dataloader_val):
                    x, y = x.to(device), y.to(device)
                    start = time.time()
                    logits = forward(x, pruned_weights, cfg.use_sparse_mul)
                    end = time.time()
                    loss = criterion(logits, y)
                    acc = calculate_acc(logits, y)
                    append((val_loss_k, loss.item()), (val_acc_k, acc.item()),
                           (time_k, end - start))

            val_loss_k, val_acc_k, time_k = get_average(
                val_loss_k), get_average(val_acc_k), get_average(time_k)
            print('valid_{}_k{:.2f}_loss{:.4f}_acc{:.4f}'.format(
                run_type, k, val_loss_k, val_acc_k))
            print('valid_{}_k{:.2f}_time/minibatch{:.6f}'.format(
                run_type, k, time_k))
            print('\n')
            sys.stdout.flush()

            if cfg.plot:
                append((sparsities, k), (val_loss_sparse, val_loss_k),
                       (val_acc_sparse, val_acc_k), (time_sparsities, time_k))
                plot_line(sparsities, [], val_loss_sparse,
                          'Sparsity {} Prune'.format(cfg.prune_type), 'Loss',
                          cfg)
                plot_line(sparsities, [], val_acc_sparse,
                          'Sparsity {} Prune'.format(cfg.prune_type),
                          'Accuracy', cfg)
                plot_line(sparsities, [], time_sparsities,
                          'Sparsity {} Prune'.format(cfg.prune_type), 'Time',
                          cfg)

            if cfg.save_model:
                torch.save(
                    net_sparse.state_dict(),
                    os.path.join(
                        cfg.model_dir, cfg.model_name,
                        '{}_sparse_net_{}_val_loss{:.4f}.pth'.format(
                            cfg.model_name, run_type, val_loss_k)))
test_outputs = rnn_model(test_x) elif 'gru' in args.rnn_model_type: test_outputs, _, _ = rnn_model(test_x) elif 'lstm' in args.rnn_model_type: test_outputs, _, _ = rnn_model(test_x) test_outputs = test_outputs.view( test_x.size(1), num_class) if use_loss_weights: test_loss = rnn_loss_function( test_outputs, test_labels, weight=loss_weights) else: test_loss = rnn_loss_function( test_outputs, test_labels) test_average_loss += test_loss.item() curr_correct, curr_total, corr_labels, incorr_labels = calculate_acc( test_outputs, test_labels) correct_labels.extend(corr_labels.tolist()) incorrect_labels.extend(incorr_labels.tolist()) total += curr_total correct += curr_correct _ = print_per_label_accu(Counter(correct_labels), Counter( incorrect_labels), test_state_map) test_average_loss /= len(test_loader) accuracy = float(correct) / total print('[INFO][Test] Testing loss: {}. Overall testing accuracy: {}'.format( test_average_loss, accuracy)) rnn_model.train() # Now returning to train model else: rnn_model = torch.load(args.rnn_model).to(device)
for layer in range(0, 2): ## ATTENTION!!! K_transform = load_data(os.path.join(params.save_data, "K_transform_layer{}".format(layer))) batch_size = 5000 con = [] for b in range(0, ph_out[0].shape[0], batch_size): data = ph_out[layer][b: b + batch_size] data = K_transform.predict(data) # mean1 = np.mean(data, axis=1, keepdims=True) # data = mean1 - data # data = np.where(data < 0, 0, data) # data = np.sum(data, axis=1) con.append(data) con = np.concatenate(con, axis=0) mo3 = load_data(os.path.join(params.save_data, 'simple_pred_layer{}_ratio{}'.format(layer, ratio))) prediction = mo3.predict(con) calculate_acc(prediction, test_labels) concate.append(prediction) for layer in range(2, params.num_layers): ## ATTENTION!!! data = ph_out[layer] data = np.reshape(data, newshape=(data.shape[0], -1)) lag = load_data(os.path.join(params.save_data, 'LAG_{}_{}'.format(layer, ratio))) lag_pred = lag.predict_proba(data) concate.append(lag_pred) concate = np.concatenate(concate, axis=1) print("Concate shape:", concate.shape) rf = load_data(os.path.join(params.save_data, 'RF_{}'.format(ratio))) prediction = rf.predict(concate) # calculate_acc(prediction, subset_label) print("ACC=", np.sum(prediction.reshape((-1)) == test_labels.reshape((-1))) / test_labels.shape[0] * 100)
def cross_validation(k, X, y, params, regression):
    """Evaluate a regression function with k-fold cross validation.

    Trains `regression` on each of the k train/validation splits and
    averages the validation loss and accuracy.

    Args:
        k (int): number of folds.
        X (np.ndarray): training samples of shape (N, D).
        y (np.ndarray): training labels of shape (N,).
        params (dict): must contain "max_iters", "gamma" and "lambda_".
        regression (callable): regression function taking
            (tx, y, initial_w, max_iters, gamma, lambda_) and returning
            (weights, training_loss).

    Returns:
        float: mean loss on the validation folds (None on divergence).
        float: mean accuracy on the validation folds (None on divergence).
    """
    k_indices = build_k_indices(y, k)
    losses = []
    accuracies = []

    # One iteration per train/validation split.
    for k_iteration in range(k):
        X_train, Y_train, X_val, Y_val = cross_validation_iter(
            y, X, k_indices, k_iteration)

        # Fix: derive the weight dimension from the data itself instead of
        # relying on a global `D`, which is undefined in this function.
        W_init = np.random.rand(X_train.shape[1])

        args_train = {
            "tx": X_train,
            "y": Y_train,
            "initial_w": W_init,
            "max_iters": params["max_iters"],
            "gamma": params["gamma"],
            "lambda_": params["lambda_"],
        }

        # If the model diverges, abort the whole cross-validation run.
        try:
            W, loss_tr = regression(**args_train)
        except ValueError:
            print("Regression diverged with these parameters.")
            return None, None

        # NOTE(review): `f_name` is presumably a module-level name of the
        # regression being tuned — confirm it is set before calling this.
        # Logistic models predict probabilities through the sigmoid.
        if "Logistic" in f_name:
            prediction_val_regression = sigmoid(X_val @ W)
        else:
            prediction_val_regression = X_val @ W

        # Threshold raw predictions into class labels, then score the fold.
        prediction_val = create_labels(prediction_val_regression)
        losses.append(calculate_mse_loss(Y_val, prediction_val))
        accuracies.append(calculate_acc(Y_val, prediction_val))

    # Aggregate over folds.
    return np.mean(losses), np.mean(accuracies)
# validation_loss = 0.0 # for j, data in enumerate(testloader): # (10,000 / args.batch) batches # inputs, labels = data # inputs = inputs.to(device) # labels = labels.to(device) # outputs = net(inputs) # loss = criterion(outputs, labels) # validation_loss += loss.item() # Calculate training accuracy, top-1 # train_acc = calculate_acc(trainloader, net, device) # Calculate validation accuracy net.eval() val_acc = calculate_acc(testloader, net, device) if val_acc > stats['best_acc']: stats['best_acc'] = val_acc stats['best_epoch'] = epoch + 1 if args.save: # Save the checkpoint state = { 'epoch': epoch, 'optimizer': optimizer.state_dict(), 'net': net.state_dict(), 'stats': stats } torch.save(state, checkpoint_path) # Switch back to training mode net.train()
def event_tagger():
    """Fine-tune a multilingual BERT tagger on English event data and
    evaluate it on the Italian test set (cross-lingual transfer).

    Builds the tag vocabulary from the Italian training data, constructs
    parameter groups with discriminative learning rates (smaller LR for the
    BERT encoder than for the classifier head), and runs one fine-tuning
    epoch with a linear warmup schedule.
    """
    # Read event data from both languages.
    en_train = read_event_data('en/train.txt')
    en_dev = read_event_data('en/dev.txt')
    en_test = read_event_data('en/test.txt')
    it_train = read_event_data('it/train.txt')
    it_dev = read_event_data('it/dev.txt')
    it_test = read_event_data('it/test.txt')
    print('English TimeML:', len(en_train), len(en_dev), len(en_test))
    print('Italian News:', len(it_train), len(it_dev), len(it_test))

    # Tag set is taken from the Italian training data; each sentence is a
    # list of (word, label) pairs.
    tags = list(set(word_label[1] for sent in it_train for word_label in sent))
    print(len(tags))

    # By convention, the 0'th slot is reserved for padding.
    tags = ["<pad>"] + tags
    tag2idx = {tag: idx for idx, tag in enumerate(tags)}
    idx2tag = {idx: tag for idx, tag in enumerate(tags)}
    print(tag2idx)
    print(idx2tag)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Cased tokenizer: casing is informative for event/entity tagging.
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                              do_lower_case=False)
    model = Net(vocab_size=len(tag2idx), device=device)
    model.to(device)
    model = nn.DataParallel(model)

    # One fine-tuning step: train on English, evaluate on Italian test.
    train_dataset = EventDataset(en_train, tokenizer, tag2idx)
    train_iter = data.DataLoader(dataset=train_dataset, batch_size=8,
                                 shuffle=True, num_workers=1, collate_fn=pad)
    eval_dataset = EventDataset(it_test, tokenizer, tag2idx)
    test_iter = data.DataLoader(dataset=eval_dataset, batch_size=8,
                                shuffle=False, num_workers=1, collate_fn=pad)
    # ignore_index=0 masks the <pad> slot out of the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    num_epoch = 1
    # NOTE(review): base_lr, decay_factor and group_to_discriminate are never
    # read below — the per-group LRs are hard-coded; confirm and clean up.
    base_lr = 0.001
    decay_factor = 0.2
    discriminative_fine_tuning = True
    gradual_unfreezing = False
    # params order top to bottom
    group_to_discriminate = ['classifier', 'bert']
    # Parameters matching these substrings get no weight decay.
    no_decay = ['bias', 'LayerNorm.weight']

    if discriminative_fine_tuning:
        # Four groups: {classifier, bert} x {decay, no_decay}. The classifier
        # head uses a larger LR (1e-3) than the BERT encoder (2e-5).
        # 'layers' records the parameter names per group for the gradual
        # unfreezing logic below.
        optimizer_grouped_parameters = [{
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay) and not 'bert' in n],
            'layers': [n for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay) and not 'bert' in n],
            'lr': 0.001,
            'name': 'classifier.decay',
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay) and not 'bert' in n],
            'layers': [n for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay) and not 'bert' in n],
            'lr': 0.001,
            'name': 'classifier.no_decay',
            'weight_decay': 0.0
        }, {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay) and 'bert' in n],
            'layers': [n for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay) and 'bert' in n],
            'lr': 0.00002,
            'name': 'bert.decay',
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay) and 'bert' in n],
            'layers': [n for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay) and 'bert' in n],
            'lr': 0.00002,
            'name': 'bert.no_decay',
            'weight_decay': 0.0
        }]
    else:
        # Single learning rate; only split by weight-decay eligibility.
        optimizer_grouped_parameters = [{
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]

    optimizer = AdamW(optimizer_grouped_parameters)
    # Linear warmup over the first 10% of total training steps.
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=len(train_iter) * num_epoch // 10,
                                     t_total=len(train_iter) * num_epoch)

    for e in range(num_epoch):
        # True only at epoch 0: BERT layers are unfrozen on the first epoch
        # and re-frozen afterwards (only relevant when gradual_unfreezing).
        unfreeze = (True, False)[e != 0]
        if discriminative_fine_tuning and gradual_unfreezing:
            for pg in optimizer.param_groups:
                layers = ''
                for layer in pg['layers']:
                    layers += layer + ';'
                # print('epoch: {}, Layers: {}'.format(e, layers))
                if 'bert' in pg['name']:
                    for param in pg['params']:
                        param.requires_grad = unfreeze
        loss = train(model, train_iter, optimizer, scheduler, criterion)
        # NOTE(review): `eval` here is a project-defined evaluation helper
        # that shadows the builtin — confirm its definition elsewhere.
        acc = eval(model, test_iter, idx2tag)
        print("epoch: {}, loss: {}".format(e, loss))
        print("epoch: {}, acc: {}".format(e, acc))

    '''
    ## Second fine-tuning step (epoch=1)
    train_dataset = EventDataset(it_train, tokenizer, tag2idx)
    for e in range(num_epoch):
        unfreeze = (True, False)[e != 0]
        if discriminative_fine_tuning and gradual_unfreezing:
            for pg in optimizer.param_groups:
                layers = ''
                for layer in pg['layers']:
                    layers += layer + ';'
                # print('epoch: {}, Layers: {}'.format(e, layers))
                if 'bert' in pg['name']:
                    for param in pg['params']:
                        param.requires_grad = unfreeze
        loss = train(model, train_iter, optimizer, scheduler, criterion)
        acc = eval(model, test_iter, idx2tag)
        print("epoch: {}, loss: {}".format(e, loss))
        print("epoch: {}, acc: {}".format(e, acc))
    '''

    # NOTE(review): called with no arguments — presumably these read results
    # from module-level/global state; verify their definitions.
    calculate_acc()
    calculate_f1()