Example #1
                                                 num_classes=num_classes)

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model, device_ids=[0, 1, 2, 3]).to(device)
        model_T = nn.DataParallel(model_T, device_ids=[0, 1, 2, 3]).to(device)
    else:
        model = model.to(device)
        model_T = model_T.to(device)

    num_params = (sum(p.numel() for p in model.parameters()) / 1000000.0)
    logging.info('Total params: %.2fM' % num_params)

    # Loss and optimizer (SGD with Nesterov momentum 0.9)
    criterion = nn.CrossEntropyLoss()
    if args.loss == "KL":
        criterion_T = utils.KL_Loss(args.temperature).to(device)
    elif args.loss == "CE":
        criterion_T = utils.CE_Loss(args.temperature).to(device)
    accuracy = utils.accuracy
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          nesterov=True,
                          weight_decay=args.wd)

    # Train the model
    logging.info("Starting training for {} epoch(s)".format(args.num_epochs))
    train_and_evaluate(model, model_T, train_loader, test_loader, optimizer,
                       criterion, criterion_T, accuracy, model_dir, args)

    logging.info('Total time: {:.2f} hours'.format((time.time() - begin_time) / 3600.0))
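Both examples construct utils.KL_Loss and utils.CE_Loss with a temperature and later apply them to student/teacher logits, but the utils implementations are not shown. Below is a minimal sketch of what such temperature-scaled distillation losses might look like; the class bodies are assumptions, not the project's actual code. (Example #3 further down calls utils.KL_Loss directly on two logits tensors, so its utils module presumably defines a functional variant.)

import torch.nn as nn
import torch.nn.functional as F

class KL_Loss(nn.Module):
    """Hypothetical temperature-scaled KL-divergence distillation loss."""
    def __init__(self, temperature=1.0):
        super().__init__()
        self.T = temperature

    def forward(self, student_logits, teacher_logits):
        # soften both distributions, then take KL(teacher || student)
        log_p_student = F.log_softmax(student_logits / self.T, dim=1)
        p_teacher = F.softmax(teacher_logits / self.T, dim=1)
        # the T^2 factor keeps gradient magnitudes comparable across temperatures
        return F.kl_div(log_p_student, p_teacher, reduction='batchmean') * (self.T ** 2)

class CE_Loss(nn.Module):
    """Hypothetical cross-entropy against softened teacher probabilities."""
    def __init__(self, temperature=1.0):
        super().__init__()
        self.T = temperature

    def forward(self, student_logits, teacher_logits):
        log_p_student = F.log_softmax(student_logits / self.T, dim=1)
        p_teacher = F.softmax(teacher_logits / self.T, dim=1)
        return -(p_teacher * log_p_student).sum(dim=1).mean()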
Example #2
     elif "densenet" in args.model:
         model_cfg = getattr(model_fd, 'densenet_GL')
         model = getattr(model_cfg, args.model)(num_classes = num_classes, num_branches = args.num_branches)
     
     
 if torch.cuda.device_count() > 1:
     model = nn.DataParallel(model, device_ids=[0,1,2,3]).to(device)
 else:
     model = model.to(device)
 
 num_params = (sum(p.numel() for p in model.parameters())/1000000.0)
 logging.info('Total params: %.2fM' % num_params)
 
# Loss and optimizer (SGD with Nesterov momentum 0.9)
criterion = nn.CrossEntropyLoss()
if args.loss == "KL":
    criterion_T = utils.KL_Loss(args.temperature).to(device)
elif args.loss == "CE":
    criterion_T = utils.CE_Loss(args.temperature).to(device)

accuracy = utils.accuracy
optimizer = optim.SGD(model.parameters(),
                      lr=args.lr,
                      momentum=0.9,
                      nesterov=True,
                      weight_decay=args.wd)
 
# Train the model
logging.info("Starting training for {} epoch(s)".format(args.num_epochs))
train_and_evaluate(model, train_loader, test_loader, optimizer, criterion,
                   criterion_T, accuracy, model_dir, args)

logging.info('Total time: {:.2f} hours'.format((time.time() - begin_time) / 3600.0))
state['Total params'] = num_params
params_json_path = os.path.join(model_dir, "parameters.json")  # save parameters
utils.save_dict_to_json(state, params_json_path)
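train_and_evaluate is only called in these examples, never defined. Below is a rough skeleton consistent with the arguments passed in Example #2; the body is guesswork: it omits the criterion_T distillation term (the model's multi-branch output format is not shown) and assumes device and logging from the surrounding script.

def train_and_evaluate(model, train_loader, test_loader, optimizer,
                       criterion, criterion_T, accuracy, model_dir, args):
    # Hypothetical skeleton: one cross-entropy training pass and one
    # evaluation pass per epoch; the branch-distillation term driven by
    # criterion_T in the real project is omitted for brevity.
    best_acc = 0.0
    for epoch in range(args.num_epochs):
        model.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()

        model.eval()
        correct = total = 0
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                preds = model(images).argmax(dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)
        acc = correct / total
        best_acc = max(best_acc, acc)
        logging.info('Epoch {}/{}: test accuracy {:.4f}'.format(
            epoch + 1, args.num_epochs, acc))
    return best_acc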
Example #3
    def train(self):
        best_acc = 0
        accuracy_dict = {}

        for epoch in tqdm(range(self.num_epochs)):

            # generator steps: push the generator towards inputs where
            # student and teacher disagree
            for _ in range(self.ng):
                self.generator_optimizer.zero_grad()

                # sample a batch of Gaussian noise as the generator input
                z = torch.randn((128, 100)).to(self.device)

                # generate pseudo datapoints from the noise
                pseudo_datapoint = self.generator(z)

                # both models return (logits, attention1, attention2, attention3);
                # only the logits are needed for the generator objective
                student_logits = self.student_model(pseudo_datapoint)[0]
                teacher_logits = self.teacher_model(pseudo_datapoint)[0]

                # negated KL divergence: the generator is trained adversarially
                # to find inputs on which the student mismatches the teacher
                generator_loss = -(utils.KL_Loss(student_logits, teacher_logits))
                generator_loss.backward()

                # clip gradients, then update the generator
                torch.nn.utils.clip_grad_norm_(self.generator.parameters(), 5)
                self.generator_optimizer.step()

            # detach so the student updates below do not backpropagate
            # into the generator
            pseudo_datapoint = pseudo_datapoint.detach()

            # the teacher is frozen; compute its outputs once for all student steps
            with torch.no_grad():
                teacher_outputs = self.teacher_model(pseudo_datapoint)

            # student steps: minimize the student/teacher disagreement
            # on the generated datapoints
            for _ in range(self.ns):
                self.student_optimizer.zero_grad()

                # full student outputs: (logits, attention1, attention2, attention3)
                student_outputs = self.student_model(pseudo_datapoint)

                # match the student to the teacher on logits and attention maps
                student_loss = utils.student_loss_zero_shot(student_outputs, teacher_outputs)
                student_loss.backward()

                # clip gradients, then update the student
                torch.nn.utils.clip_grad_norm_(self.student_model.parameters(), 5)
                self.student_optimizer.step()

            if (epoch + 1) % self.log_num == 0 or (epoch + 1) == self.num_epochs:
                acc = self.test()

                print(f"\nAccuracy: {acc:05.3f}")
                print(f'Student Loss: {student_loss:05.3f}')
                utils.writeMetrics({"accuracy": acc}, self.acc_counter)
                accuracy_dict[batch] = acc
                utils.log_accuracy("zero_shot.csv", accuracy_dict)
                self.acc_counter += 1
                self.save_model()

                if acc > best_acc:
                    best_acc = acc

            utils.writeMetrics({"Student Loss": student_loss, "Generator Loss": generator_loss}, self.counter)
            self.counter += 1
            self.cosine_annealing_generator.step()
            self.cosine_annealing_student.step()
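The generator in Example #3 maps 100-dimensional Gaussian noise to pseudo datapoints, but its architecture is not shown. One plausible choice for 32x32 images is a small upsampling convolutional generator along these lines; every layer and size below is an assumption.

import torch
import torch.nn as nn

class Generator(nn.Module):
    """Hypothetical generator: 100-d noise -> 3x32x32 pseudo images."""
    def __init__(self, z_dim=100, img_channels=3):
        super().__init__()
        self.project = nn.Linear(z_dim, 128 * 8 * 8)
        self.net = nn.Sequential(
            nn.BatchNorm2d(128),
            nn.Upsample(scale_factor=2),   # 8x8 -> 16x16
            nn.Conv2d(128, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Upsample(scale_factor=2),   # 16x16 -> 32x32
            nn.Conv2d(128, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(64, img_channels, 3, padding=1),
            nn.Tanh(),                     # pseudo images in [-1, 1]
        )

    def forward(self, z):
        x = self.project(z).view(z.size(0), 128, 8, 8)
        return self.net(x)

With this sketch, z = torch.randn(128, 100) produces Generator()(z) of shape (128, 3, 32, 32), matching the batch of noise sampled in the training loop above.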
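Likewise, utils.student_loss_zero_shot is not shown. Given that both models return logits plus three attention maps, a sketch in the spirit of attention transfer is given below; the KL term, the beta weight, and the normalization are all assumptions rather than the project's actual loss.

import torch.nn.functional as F

def student_loss_zero_shot(student_outputs, teacher_outputs, beta=250.0):
    # Hypothetical reconstruction: KL divergence on the logits plus an
    # attention-transfer term over the three attention maps.
    student_logits, teacher_logits = student_outputs[0], teacher_outputs[0]
    loss = F.kl_div(F.log_softmax(student_logits, dim=1),
                    F.softmax(teacher_logits, dim=1),
                    reduction='batchmean')
    for s_att, t_att in zip(student_outputs[1:], teacher_outputs[1:]):
        # compare L2-normalized, flattened attention maps
        s = F.normalize(s_att.view(s_att.size(0), -1), dim=1)
        t = F.normalize(t_att.view(t_att.size(0), -1), dim=1)
        loss = loss + beta * (s - t).pow(2).mean()
    return loss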