                                       num_classes=num_classes)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model, device_ids=[0, 1, 2, 3]).to(device)
    model_T = nn.DataParallel(model_T, device_ids=[0, 1, 2, 3]).to(device)
else:
    model = model.to(device)
    model_T = model_T.to(device)

num_params = sum(p.numel() for p in model.parameters()) / 1000000.0
logging.info('Total params: %.2fM' % num_params)

# Loss and optimizer (SGD with 0.9 momentum)
criterion = nn.CrossEntropyLoss()
if args.loss == "KL":
    criterion_T = utils.KL_Loss(args.temperature).to(device)
elif args.loss == "CE":
    criterion_T = utils.CE_Loss(args.temperature).to(device)
accuracy = utils.accuracy

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9,
                      nesterov=True, weight_decay=args.wd)

# Train the model
logging.info("Starting training for {} epoch(s)".format(args.num_epochs))
train_and_evaluate(model, model_T, train_loader, test_loader, optimizer,
                   criterion, criterion_T, accuracy, model_dir, args)
logging.info('Total time: {:.2f} hours'.format((time.time() - begin_time) / 3600.0))
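# ---------------------------------------------------------------------------
# For reference: utils.KL_Loss and utils.CE_Loss above are constructed with a
# temperature, but their bodies are not shown in this section. A minimal
# sketch of the standard temperature-scaled KL distillation loss (Hinton et
# al.) that such a module typically implements is given below; treat it as an
# assumption about the interface, not this repository's actual code.
# ---------------------------------------------------------------------------
import torch.nn as nn
import torch.nn.functional as F

class TemperatureKLSketch(nn.Module):
    """Hypothetical stand-in for utils.KL_Loss: KL(teacher || student) at temperature T."""

    def __init__(self, temperature):
        super().__init__()
        self.T = temperature

    def forward(self, student_logits, teacher_logits):
        log_p_student = F.log_softmax(student_logits / self.T, dim=1)
        p_teacher = F.softmax(teacher_logits / self.T, dim=1)
        # The T^2 rescaling keeps gradient magnitudes comparable across temperatures
        return F.kl_div(log_p_student, p_teacher, reduction='batchmean') * (self.T ** 2)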
elif "densenet" in args.model: model_cfg = getattr(model_fd, 'densenet_GL') model = getattr(model_cfg, args.model)(num_classes = num_classes, num_branches = args.num_branches) if torch.cuda.device_count() > 1: model = nn.DataParallel(model, device_ids=[0,1,2,3]).to(device) else: model = model.to(device) num_params = (sum(p.numel() for p in model.parameters())/1000000.0) logging.info('Total params: %.2fM' % num_params) # Loss and optimizer(SGD with 0.9 momentum) criterion = nn.CrossEntropyLoss() if args.loss == "KL": criterion_T = utils.KL_Loss(args.temperature).to(device) elif args.loss == "CE": criterion_T = utils.CE_Loss(args.temperature).to(device) accuracy = utils.accuracy optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, nesterov=True, weight_decay = args.wd) # Train the model logging.info("Starting training for {} epoch(s)".format(args.num_epochs)) train_and_evaluate(model, train_loader, test_loader, optimizer, criterion, criterion_T, accuracy, model_dir, args) logging.info('Total time: {:.2f} hours'.format((time.time() - begin_time)/3600.0)) state['Total params'] = num_params params_json_path = os.path.join(model_dir, "parameters.json") # save parameters utils.save_dict_to_json(state, params_json_path)
def train(self):
    best_acc = 0
    accuracy_dict = {}
    for epoch in tqdm(range(self.num_epochs)):
        for _ in range(self.ng):
            self.generator_optimizer.zero_grad()

            # Sample Gaussian noise and map it to a pseudo batch with the generator
            z = torch.randn((128, 100)).to(self.device)
            pseudo_datapoint = self.generator(z)

            # Teacher/student outputs: logits, attention1, attention2, attention3.
            # The generator is trained adversarially: it tries to maximize the
            # KL divergence between student and teacher predictions, hence the
            # negated loss.
            student_logits = self.student_model(pseudo_datapoint)[0]
            teacher_logits = self.teacher_model(pseudo_datapoint)[0]
            generator_loss = -utils.KL_Loss(student_logits, teacher_logits)

            generator_loss.backward()
            # Clip gradients, then apply the update
            torch.nn.utils.clip_grad_norm_(self.generator.parameters(), 5)
            self.generator_optimizer.step()

        # Freeze the last generated batch and cache the teacher's outputs for
        # the student updates
        pseudo_datapoint = pseudo_datapoint.detach()
        with torch.no_grad():
            teacher_outputs = self.teacher_model(pseudo_datapoint)

        for _ in range(self.ns):
            self.student_optimizer.zero_grad()

            # Teacher/student outputs: logits, attention1, attention2, attention3.
            # The student minimizes its divergence from the teacher on the same
            # pseudo batch.
            student_outputs = self.student_model(pseudo_datapoint)
            student_loss = utils.student_loss_zero_shot(student_outputs, teacher_outputs)

            student_loss.backward()
            # Clip gradients, then apply the update
            torch.nn.utils.clip_grad_norm_(self.student_model.parameters(), 5)
            self.student_optimizer.step()

        if (epoch + 1) % self.log_num == 0 or (epoch + 1) == self.num_epochs:
            acc = self.test()
            print(f"\nAccuracy: {acc:05.3f}")
            print(f"Student Loss: {student_loss:05.3f}")
            utils.writeMetrics({"accuracy": acc}, self.acc_counter)
            accuracy_dict[epoch] = acc
            utils.log_accuracy("zero_shot.csv", accuracy_dict)
            self.acc_counter += 1
            self.save_model()
            if acc > best_acc:
                best_acc = acc

        utils.writeMetrics({"Student Loss": student_loss,
                            "Generator Loss": generator_loss}, self.counter)
        self.counter += 1

        self.cosine_annealing_generator.step()
        self.cosine_annealing_student.step()
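# ---------------------------------------------------------------------------
# utils.student_loss_zero_shot above is not defined in this section. Given the
# output signature noted in the comments (logits followed by three attention
# maps), a plausible sketch in the spirit of Micaelli & Storkey's zero-shot
# knowledge transfer is a KL term on the logits plus an L2 attention-matching
# term per block. The beta weight below is a hypothetical hyperparameter
# chosen for illustration, not a value taken from this repository.
# ---------------------------------------------------------------------------
import torch.nn.functional as F

def student_loss_zero_shot_sketch(student_outputs, teacher_outputs, beta=250.0):
    student_logits, teacher_logits = student_outputs[0], teacher_outputs[0]
    # KL divergence between the teacher's and student's predictive distributions
    loss = F.kl_div(F.log_softmax(student_logits, dim=1),
                    F.softmax(teacher_logits, dim=1),
                    reduction='batchmean')
    # Match normalized, flattened spatial attention maps at each hooked block
    for a_s, a_t in zip(student_outputs[1:], teacher_outputs[1:]):
        loss = loss + beta * (F.normalize(a_s.flatten(1), dim=1)
                              - F.normalize(a_t.flatten(1), dim=1)).pow(2).mean()
    return loss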